In [2]:
from google.cloud import bigquery
import pandas as pd
import numpy as np
import re
import tensorflow as tf
import string
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

Accessing the Big Query Data

In [3]:
client = bigquery.Client()
dataset_ref = client.dataset("stackoverflow", project="bigquery-public-data")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

Using Kaggle's public dataset BigQuery integration.


In [4]:
sql = """
        SELECT 
            id, title , tags 
        FROM 
            `bigquery-public-data.stackoverflow.stackoverflow_posts`
        WHERE
            title NOT LIKE '%None%' AND 
            (tags LIKE '%|python|%' OR tags LIKE '%|c#|%' OR
            tags LIKE '%|java|%' OR tags LIKE '%|r|%' OR
            tags LIKE '%|android|%' OR tags LIKE '%|html|%' OR
            tags LIKE '%|c++|%' OR tags LIKE '%|sql|%' OR
            tags LIKE '%|c|%' OR tags LIKE '%kotlin%') AND 
            LENGTH(tags) < 20
        LIMIT
             10000;
        """

results = client.query(sql).to_dataframe()

results.head()

  "Cannot create BigQuery Storage client, the dependency "


Unnamed: 0,id,title,tags
0,15267907,Failure running R CMD BATCH command with PHP,php|r|mamp|rscript
1,814546,Ejb sql [select * from where A & 1],java|sql|mysql|ejb
2,11586990,Should UI Element Code be at the top or bottom...,php|html|website
3,18427743,Can not execute update statement at toad for m...,mysql|sql|toad
4,24593012,exporting table to xlsx writes entire page con...,php|html|excel


Preprocessing Data

In [5]:
tags = []
for tag in results['tags']:
    tag = tag.split('|')
    if len(tag)<20:
        tags.append(tag)

In [6]:
df = pd.DataFrame({'id': results['id'], 'text': results['title'], 'tags' : tags})

Unnamed: 0,id,text,tags
0,15267907,Failure running R CMD BATCH command with PHP,"[php, r, mamp, rscript]"
1,814546,Ejb sql [select * from where A & 1],"[java, sql, mysql, ejb]"
2,11586990,Should UI Element Code be at the top or bottom...,"[php, html, website]"
3,18427743,Can not execute update statement at toad for m...,"[mysql, sql, toad]"
4,24593012,exporting table to xlsx writes entire page con...,"[php, html, excel]"


In [8]:
df.head()

Unnamed: 0,id,text,tags
0,15267907,Failure running R CMD BATCH command with PHP,"[php, r, mamp, rscript]"
1,814546,Ejb sql [select * from where A & 1],"[java, sql, mysql, ejb]"
2,11586990,Should UI Element Code be at the top or bottom...,"[php, html, website]"
3,18427743,Can not execute update statement at toad for m...,"[mysql, sql, toad]"
4,24593012,exporting table to xlsx writes entire page con...,"[php, html, excel]"


In [7]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

punctuations = list(string.punctuation)
punctuations.remove("#")
punctuations.remove("+")

[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


In [9]:
def remove_stopwords(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    return ' '.join(no_stopword_text)
def remove_punctuations(text):
    no_punctuations_text = [w for w in text.split() if not w in punctuations]
    return ' '.join(no_punctuations_text)

In [10]:
def clean_text(text):
    text = text.lower()
    text = remove_punctuations(text)
    text = remove_stopwords(text)
    return text

In [11]:
df['clean_text'] = df['text'].apply(lambda x: clean_text(x))
df.head()

Unnamed: 0,id,text,tags,clean_text
0,15267907,Failure running R CMD BATCH command with PHP,"[php, r, mamp, rscript]",failure running r cmd batch command php
1,814546,Ejb sql [select * from where A & 1],"[java, sql, mysql, ejb]",ejb sql [select 1]
2,11586990,Should UI Element Code be at the top or bottom...,"[php, html, website]",ui element code top bottom webpage?
3,18427743,Can not execute update statement at toad for m...,"[mysql, sql, toad]",execute update statement toad mysql tools
4,24593012,exporting table to xlsx writes entire page con...,"[php, html, excel]",exporting table xlsx writes entire page conten...


Preparing Target Variable

In [12]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(df['tags'])

y = multilabel_binarizer.transform(df['tags'])

In [13]:
y, y.shape

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]),
 (10000, 1572))

Train and Test Sets

In [14]:
train, test, y_train, y_test = train_test_split(df, y, test_size=0.2)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
train.shape, test.shape

((8000, 4), (2000, 4))

Tokenizing and applying Pad-Sequences

In [15]:
train_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='')
train_tokenizer.fit_on_texts(train['clean_text'].values)
train_word_index = train_tokenizer.word_index

In [16]:
vocab_length = len(train_word_index) + 1
vocab_length

6039

In [17]:
train_sequences = train_tokenizer.texts_to_sequences(train['clean_text'].values)
len(train_sequences)

8000

In [18]:
test_sequences = train_tokenizer.texts_to_sequences(test['clean_text'].values)
len(test_sequences)

2000

In [20]:
longest_sentence = len(max(train_sequences, key=len))
train_padded_seqeunces = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, padding='post', maxlen=longest_sentence, truncating='post')
test_padded_seqeunces = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, padding='post', maxlen=longest_sentence, truncating='post')
longest_sentence


18

LSTM Model


In [21]:
tf.keras.backend.clear_session()

embedding_dim = 50
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_length, embedding_dim, input_length=longest_sentence),
    
   tf.keras.layers.Dropout(0.5),
    tf.keras.layers.LSTM(300, return_sequences=True),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.LSTM(100, return_sequences=True),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    
    tf.keras.layers.Dense(y.shape[1], activation='softmax')
])

2022-11-01 09:43:27.941416: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [22]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 18, 50)            301950    
_________________________________________________________________
dropout (Dropout)            (None, 18, 50)            0         
_________________________________________________________________
lstm (LSTM)                  (None, 18, 300)           421200    
_________________________________________________________________
dropout_1 (Dropout)          (None, 18, 300)           0         
_________________________________________________________________
batch_normalization (BatchNo (None, 18, 300)           1200      
_________________________________________________________________
lstm_1 (LSTM)                (None, 18, 100)           160400    
_________________________________________________________________
batch_normalization_1 (Batch (None, 18, 100)           4

In [23]:
history = model.fit(train_padded_seqeunces, y_train, epochs=10, 
                    validation_data=(test_padded_seqeunces, y_test))

2022-11-01 09:43:48.996140: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [24]:
test_prob = model.predict(test_padded_seqeunces)
t = 0.2
test_pred = (test_prob >= t).astype(int)

In [31]:

for i in range(10):
    k = test.sample(1).index[0]
    print("Title: ", test['text'][k])
    print("Actual Tag: ", multilabel_binarizer.inverse_transform(y_test[k].reshape(1, -1))[0])
    print("Predicted tag: ", multilabel_binarizer.inverse_transform(test_pred[k].reshape(1, -1))[0])
    print('\n')

Title:  jQuery add and remove a class to a div by clicking on a different div
Actual Tag:  ('css', 'html', 'jquery')
Predicted tag:  ('html',)


Title:  context menu CSS and event partially working
Actual Tag:  ('css', 'html', 'javascript')
Predicted tag:  ('html',)


Title:  Compilation error after upgrading from ACRA 4.5 to ACRA 4.6
Actual Tag:  ('acra', 'android', 'java')
Predicted tag:  ('html',)


Title:  How do I repeat div classes using JavaScript only?
Actual Tag:  ('dom', 'html', 'javascript')
Predicted tag:  ('html',)


Title:  Change Listview items from previous Listview
Actual Tag:  ('android', 'java', 'xml')
Predicted tag:  ('html',)


Title:  kernel function parameter as const
Actual Tag:  ('c', 'c++', 'cuda')
Predicted tag:  ('html',)


Title:  PartialView to string
Actual Tag:  ('asp.net-mvc', 'c#', 'html')
Predicted tag:  ('html',)


Title:  CakePHP: Using timthumb with Html helper?
Actual Tag:  ('cakephp', 'html', 'php')
Predicted tag:  ('html',)


Title:  Center Outp

array([0, 0, 0, ..., 0, 0, 0])