In [39]:
from google.cloud import bigquery
import pandas as pd
import numpy as np
import re
import tensorflow as tf
import string
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

In [40]:
client = bigquery.Client()
to_refer = client.dataset("stackoverflow", project="bigquery-public-data")

df = client.get_dataset(to_refer)

Using Kaggle's public dataset BigQuery integration.


In [41]:
sql = """
        SELECT id, title , tags 
        FROM 
        `bigquery-public-data.stackoverflow.stackoverflow_posts`
        WHERE
        title NOT LIKE '%None%' AND 
        (tags LIKE '%|python|%' OR tags LIKE '%|c#|%' OR
        tags LIKE '%|java|%' OR tags LIKE '%|r|%' OR
        tags LIKE '%|android|%' OR tags LIKE '%|html|%' OR
        tags LIKE '%|c++|%' OR tags LIKE '%|sql|%' OR
        tags LIKE '%|c|%' OR tags LIKE '%kotlin%') AND 
        LENGTH(tags) < 20
        LIMIT
        10000;
        """

df = client.query(sql).to_dataframe()

df.head()

  "Cannot create BigQuery Storage client, the dependency "


Unnamed: 0,id,title,tags
0,30310354,Boost bjam tutorial keeps looking for the wron...,python|c++|boost
1,18401224,Does Java have any framework similar to WPF( a...,c#|java|wpf
2,35068061,curl url sent to flask server not working,php|python|curl
3,35920841,Generate files after compile (javac) in Androi...,java|android|gradle
4,24772652,Sending Dynamic NdefRecords to the constructor,java|android|nfc


**Preprocessing**

In [42]:
df.title  = df.title.str.lower()
stop_words = set(stopwords.words('english'))

df['title'] = df['title'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df.head()

Unnamed: 0,id,title,tags
0,30310354,boost bjam tutorial keeps looking wrong mcvs v...,python|c++|boost
1,18401224,java framework similar wpf( capability using s...,c#|java|wpf
2,35068061,curl url sent flask server working,php|python|curl
3,35920841,generate files compile (javac) androidstudio g...,java|android|gradle
4,24772652,sending dynamic ndefrecords constructor,java|android|nfc


In [43]:
punctuations = list(string.punctuation)
punctuations.remove("#")
punctuations.remove("+")

In [44]:
def remove_punctuations(text):
    for punctuation in punctuations:
        text = text.replace(punctuation, '')
    return text

In [45]:
df['title'] = df['title'].apply(remove_punctuations)
df.head()

Unnamed: 0,id,title,tags
0,30310354,boost bjam tutorial keeps looking wrong mcvs v...,python|c++|boost
1,18401224,java framework similar wpf capability using sh...,c#|java|wpf
2,35068061,curl url sent flask server working,php|python|curl
3,35920841,generate files compile javac androidstudio gradle,java|android|gradle
4,24772652,sending dynamic ndefrecords constructor,java|android|nfc


**Tags string column to list column**

In [46]:
df['tags'] = df['tags'].str.split('|')
df.head()

Unnamed: 0,id,title,tags
0,30310354,boost bjam tutorial keeps looking wrong mcvs v...,"[python, c++, boost]"
1,18401224,java framework similar wpf capability using sh...,"[c#, java, wpf]"
2,35068061,curl url sent flask server working,"[php, python, curl]"
3,35920841,generate files compile javac androidstudio gradle,"[java, android, gradle]"
4,24772652,sending dynamic ndefrecords constructor,"[java, android, nfc]"


**MultiLabel Binarizer**

In [47]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(df['tags'])

y = multilabel_binarizer.transform(df['tags'])
y, y.shape

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]),
 (10000, 1557))

**Train Test splitting**

In [48]:
x_train, x_test, y_train, y_test = train_test_split(df, y, test_size=0.2)
x_train = x_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)
x_train.shape, x_test.shape

((8000, 3), (2000, 3))

**Tokenizing and applying Pad Sequence**

In [50]:
train_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<OOV>')
train_tokenizer.fit_on_texts(x_train['title'].values)
train_word_index = train_tokenizer.word_index

In [51]:
vocab_length = len(train_word_index) + 1
vocab_length

6418

In [52]:
train_sequences = train_tokenizer.texts_to_sequences(x_train['title'].values)
len(train_sequences)

8000

In [53]:
test_sequences = train_tokenizer.texts_to_sequences(x_test['title'].values)
len(test_sequences)

2000

In [55]:
longest_sentence = len(max(train_sequences, key=len))
train_padded_seqeunces = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, padding='post', maxlen=longest_sentence, truncating='post')

test_padded_seqeunces = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, padding='post', maxlen=longest_sentence, truncating='post')

longest_sentence

16

Lstm

In [56]:
tf.keras.backend.clear_session()

embedding_dim = 50
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_length, embedding_dim, input_length=longest_sentence), 
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.LSTM(100, return_sequences=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.LSTM(50, return_sequences=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(16),
    tf.keras.layers.Dense(y.shape[1], activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 16, 50)            320900    
_________________________________________________________________
dropout (Dropout)            (None, 16, 50)            0         
_________________________________________________________________
lstm (LSTM)                  (None, 16, 100)           60400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 16, 100)           0         
_________________________________________________________________
batch_normalization (BatchNo (None, 16, 100)           400       
_________________________________________________________________
lstm_1 (LSTM)                (None, 16, 50)            30200     
_________________________________________________________________
dropout_2 (Dropout)          (None, 16, 50)            0

In [57]:
history = model.fit(train_padded_seqeunces, y_train, epochs=20,
                    validation_data=(test_padded_seqeunces, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Display Predict tags

In [58]:
pred_prob = model.predict(test_padded_seqeunces)
t = 0.3
test_pred = (pred_prob >= t).astype(int)

In [59]:
for i in range(10):
    k = x_test.sample(1).index[0]
    print("Title: ", x_test['title'][k])
    print("Actual Tag: ", multilabel_binarizer.inverse_transform(y_test[k].reshape(1, -1))[0])
    print("Predicted tag: ", multilabel_binarizer.inverse_transform(test_pred[k].reshape(1, -1))[0])
    print('\n')

Title:  c#  sql  speed code db
Actual Tag:  ('c#', 'performance', 'sql')
Predicted tag:  ('c', 'c#', 'c++')


Title:  send value one class different classesmore one class
Actual Tag:  ('android', 'bundle', 'java')
Predicted tag:  ('html', 'php')


Title:  php pregreplace nbsp
Actual Tag:  ('html', 'php', 'regex')
Predicted tag:  ('html', 'php')


Title:  procedure function biinsertcustomer expects parameter custid supplied
Actual Tag:  ('c#', 'sql', 'sql-server')
Predicted tag:  ('c#', 'mysql', 'sql')


Title:  get first div behave should
Actual Tag:  ('css', 'html', 'joomla2.5')
Predicted tag:  ('css', 'html', 'javascript', 'jquery')


Title:  php  handling html checkbox array
Actual Tag:  ('checkbox', 'html', 'php')
Predicted tag:  ('forms', 'html', 'jquery', 'php')


Title:  optimal way store multipleselection survey answers database
Actual Tag:  ('database', 'mysql', 'sql')
Predicted tag:  ('mysql', 'php', 'sql')


Title:  adding touch targets pdf document html page
Actual Tag:  ('