## Import Library

In [314]:
import pandas as pd
import numpy as np
from google.cloud import bigquery
import re 
import string
import nltk
from nltk.corpus import stopwords
import tensorflow as tf
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

## Dataset access to Big Query Data

In [315]:
client = bigquery.Client()
to_refer = client.dataset("stackoverflow", project="bigquery-public-data")

data = client.get_dataset(to_refer)

Using Kaggle's public dataset BigQuery integration.


In [316]:
sql = """
        SELECT id, title , tags 
        FROM 
        `bigquery-public-data.stackoverflow.stackoverflow_posts`
        WHERE
        title NOT LIKE '%None%' AND 
        (tags LIKE '%|python|%' OR tags LIKE '%|c#|%' OR
        tags LIKE '%|java|%' OR tags LIKE '%|r|%' OR
        tags LIKE '%|android|%' OR tags LIKE '%|html|%' OR
        tags LIKE '%|c++|%' OR tags LIKE '%|sql|%' OR
        tags LIKE '%|c|%' OR tags LIKE '%kotlin%') AND 
        LENGTH(tags) < 20
        LIMIT
        10000;
        """

df = client.query(sql).to_dataframe()

df.head()

  "Cannot create BigQuery Storage client, the dependency "


Unnamed: 0,id,title,tags
0,35493392,How to make a search result Page with a button...,javascript|html|jsp
1,10623932,Refer to one single SQL query 3 times on a pag...,php|sql|forms
2,8850977,MySQL before trigger,mysql|sql|triggers
3,9626410,How do I cast the result of a UNION in SQLite?,c++|sql|sqlite
4,26137459,Initialize AudioClip Object at Runtime Unity C...,c#|android|unity3d


## Preprocessing

**Stopwords**

In [317]:
df.title  = df.title.str.lower()
stop_words = set(stopwords.words('english'))

df['title'] = df['title'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df.head()

Unnamed: 0,id,title,tags
0,35493392,make search result page button result item ope...,javascript|html|jsp
1,10623932,refer one single sql query 3 times page create...,php|sql|forms
2,8850977,mysql trigger,mysql|sql|triggers
3,9626410,cast result union sqlite?,c++|sql|sqlite
4,26137459,initialize audioclip object runtime unity c# a...,c#|android|unity3d


**Punctuation and hyperlinks except # and +**

In [318]:
punctuations = list(string.punctuation)
punctuations.remove("#")
punctuations.remove("+")

In [321]:
def remove_punctuations(text):
    for punctuation in punctuations:
        text = text.replace(punctuation, '')
    return text

In [322]:
df['title'] = df['title'].apply(remove_punctuations)
df.head()

Unnamed: 0,id,title,tags
0,35493392,make search result page button result item ope...,javascript|html|jsp
1,10623932,refer one single sql query 3 times page create...,php|sql|forms
2,8850977,mysql trigger,mysql|sql|triggers
3,9626410,cast result union sqlite,c++|sql|sqlite
4,26137459,initialize audioclip object runtime unity c# a...,c#|android|unity3d


**Convert Tags string column to list column**

In [323]:
df['tags'] = df['tags'].str.split('|')
df.head()

Unnamed: 0,id,title,tags
0,35493392,make search result page button result item ope...,"[javascript, html, jsp]"
1,10623932,refer one single sql query 3 times page create...,"[php, sql, forms]"
2,8850977,mysql trigger,"[mysql, sql, triggers]"
3,9626410,cast result union sqlite,"[c++, sql, sqlite]"
4,26137459,initialize audioclip object runtime unity c# a...,"[c#, android, unity3d]"


## MultiLabel Binarizer

In [324]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(df['tags'])

y = multilabel_binarizer.transform(df['tags'])
y, y.shape

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]),
 (10000, 1558))

## Train Test Split

In [325]:
x_train, x_test, y_train, y_test = train_test_split(df, y, test_size=0.2)
x_train = x_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)
x_train.shape, x_test.shape

((8000, 3), (2000, 3))

## Tokenizing and applying Pad Sequence

In [326]:
train_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<OOV>')
train_tokenizer.fit_on_texts(x_train['title'].values)
train_word_index = train_tokenizer.word_index

In [327]:
vocab_length = len(train_word_index) + 1
vocab_length

6449

In [328]:
train_sequences = train_tokenizer.texts_to_sequences(x_train['title'].values)
len(train_sequences)

8000

In [329]:
test_sequences = train_tokenizer.texts_to_sequences(x_test['title'].values)
len(test_sequences)

2000

In [330]:
longest_sentence = len(max(train_sequences, key=len))
train_padded_seqeunces = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, padding='post', maxlen=longest_sentence, truncating='post')
test_padded_seqeunces = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, padding='post', maxlen=longest_sentence, truncating='post')
longest_sentence

16

## LSTM

In [331]:
tf.keras.backend.clear_session()

embedding_dim = 50
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_length, embedding_dim, input_length=longest_sentence), 
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.LSTM(100, return_sequences=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.LSTM(50, return_sequences=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(16),
    tf.keras.layers.Dense(y.shape[1], activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 16, 50)            322450    
_________________________________________________________________
dropout (Dropout)            (None, 16, 50)            0         
_________________________________________________________________
lstm (LSTM)                  (None, 16, 100)           60400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 16, 100)           0         
_________________________________________________________________
batch_normalization (BatchNo (None, 16, 100)           400       
_________________________________________________________________
lstm_1 (LSTM)                (None, 16, 50)            30200     
_________________________________________________________________
dropout_2 (Dropout)          (None, 16, 50)            0

In [332]:
history = model.fit(train_padded_seqeunces, y_train, epochs=20,
                    validation_data=(test_padded_seqeunces, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## Predicting Tags

In [333]:
pred_prob = model.predict(test_padded_seqeunces)
t = 0.3
test_pred = (pred_prob >= t).astype(int)

In [338]:
for i in range(10):
    k = x_test.sample(1).index[0]
    print("Title: ", x_test['title'][k])
    print("Actual Tag: ", multilabel_binarizer.inverse_transform(y_test[k].reshape(1, -1))[0])
    print("Predicted tag: ", multilabel_binarizer.inverse_transform(test_pred[k].reshape(1, -1))[0])
    print('\n')

Title:  make accordioncss+javascript work
Actual Tag:  ('css', 'html', 'javascript')
Predicted tag:  ('css', 'html', 'javascript')


Title:  connect native cc++ dll wcf c# hosted iis
Actual Tag:  ('c#', 'c++', 'dll', 'iis', 'wcf')
Predicted tag:  ('c', 'c#', 'c++')


Title:  search inside pdf using phphtml
Actual Tag:  ('html', 'pdf', 'php', 'search')
Predicted tag:  ('html', 'php')


Title:  c++c# array shift equivilents
Actual Tag:  ('arrays', 'c#', 'c++')
Predicted tag:  ('c', 'c++', 'python')


Title:  copying c objects
Actual Tag:  ('c', 'c++', 'copy')
Predicted tag:  ('c', 'c#', 'c++')


Title:  characters allowed querying mysql database
Actual Tag:  ('mysql', 'php', 'sql')
Predicted tag:  ('c#', 'mysql', 'sql')


Title:  select statement returns data although given value clause false
Actual Tag:  ('indexing', 'mysql', 'sql')
Predicted tag:  ('mysql', 'php', 'sql')


Title:  save entered content html page android storage
Actual Tag:  ('android', 'java', 'save')
Predicted tag:  ('