In [1]:
from google.cloud import bigquery
import pandas as pd
import numpy as np
import re
import tensorflow as tf
import string
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

### Accessing Big Query data

In [2]:
client = bigquery.Client()
dataset_ref = client.dataset("stackoverflow", project="bigquery-public-data")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

Using Kaggle's public dataset BigQuery integration.


In [3]:
sql = """SELECT id, title , tags 
        FROM bigquery-public-data.stackoverflow.stackoverflow_posts
        WHERE title NOT LIKE '%None%'
        LIMIT 10000
        """

results = client.query(sql).to_dataframe()

results.head()

  "Cannot create BigQuery Storage client, the dependency "


Unnamed: 0,id,title,tags
0,34404321,Android 6.0.1 Do Not Disturb Next Alarm value ...,alarmmanager|android-6.0-marshmallow
1,36500766,Binding Label to int causes the label to disap...,data-binding
2,31634443,Cast from 'SecCertificateRef *' (aka 'struct _...,objective-c|afnetworking-2|xcode7|xcode7-beta4
3,20711943,FancyTree persist and cookies,javascript|grails|cookies|groovy|fancytree
4,36584642,iOS Simulator: strange behavior with iPad,ipad|uiview|ios-simulator


### Preprocessing data

In [4]:
tags = []
for tag in results['tags']:
    tag = tag.split('|')
    if len(tag)<20:
        tags.append(tag)
        
df = pd.DataFrame({'id': results['id'], 'text': results['title'], 'tags' : tags})

In [5]:
df.head()

Unnamed: 0,id,text,tags
0,34404321,Android 6.0.1 Do Not Disturb Next Alarm value ...,"[alarmmanager, android-6.0-marshmallow]"
1,36500766,Binding Label to int causes the label to disap...,[data-binding]
2,31634443,Cast from 'SecCertificateRef *' (aka 'struct _...,"[objective-c, afnetworking-2, xcode7, xcode7-b..."
3,20711943,FancyTree persist and cookies,"[javascript, grails, cookies, groovy, fancytree]"
4,36584642,iOS Simulator: strange behavior with iPad,"[ipad, uiview, ios-simulator]"


In [6]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

punctuations = list(string.punctuation)
punctuations.remove("#")
punctuations.remove("+")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def remove_stopwords(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    return ' '.join(no_stopword_text)
def remove_punctuations(text):
    no_punctuations_text = [w for w in text.split() if not w in punctuations]
    return ' '.join(no_punctuations_text)

In [8]:
def clean_text(text):
    text = text.lower()
    text = remove_punctuations(text)
    text = remove_stopwords(text)
    return text

In [9]:
df['clean_text'] = df['text'].apply(lambda x: clean_text(x))
df.head()

Unnamed: 0,id,text,tags,clean_text
0,34404321,Android 6.0.1 Do Not Disturb Next Alarm value ...,"[alarmmanager, android-6.0-marshmallow]",android 6.0.1 disturb next alarm value incorrect
1,36500766,Binding Label to int causes the label to disap...,[data-binding],binding label int causes label disapear int va...
2,31634443,Cast from 'SecCertificateRef *' (aka 'struct _...,"[objective-c, afnetworking-2, xcode7, xcode7-b...",cast 'seccertificateref *' (aka 'struct __secc...
3,20711943,FancyTree persist and cookies,"[javascript, grails, cookies, groovy, fancytree]",fancytree persist cookies
4,36584642,iOS Simulator: strange behavior with iPad,"[ipad, uiview, ios-simulator]",ios simulator: strange behavior ipad


### Preparing target variable 

In [10]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(df['tags'])

y = multilabel_binarizer.transform(df['tags'])

In [11]:
y

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### Train & Test sets

In [12]:
train, test, y_train, y_test = train_test_split(df, y, test_size=0.2)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
train.shape, test.shape

((8000, 4), (2000, 4))

### Tokenizing & applying Pad-Sequences 

In [13]:
train_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<OOV>')
train_tokenizer.fit_on_texts(train['clean_text'].values)
train_word_index = train_tokenizer.word_index

In [14]:
vocab_length = len(train_word_index) + 1
vocab_length

9003

In [15]:
train_sequences = train_tokenizer.texts_to_sequences(train['clean_text'].values)
len(train_sequences)

8000

In [16]:
test_sequences = train_tokenizer.texts_to_sequences(test['clean_text'].values)
len(test_sequences)

2000

In [17]:
longest_sentence = len(max(train_sequences, key=len))
train_padded_seqeunces = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, padding='post', maxlen=longest_sentence, truncating='post')
test_padded_seqeunces = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, padding='post', maxlen=longest_sentence, truncating='post')
longest_sentence

20

### LSTM Model

In [30]:
tf.keras.backend.clear_session()

embedding_dim = 50
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_length, embedding_dim, input_length=longest_sentence),
    
    tf.keras.layers.LSTM(500, return_sequences=True),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.LSTM(250, return_sequences=True),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.LSTM(100, return_sequences=True),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    
    tf.keras.layers.Dense(y.shape[1], activation='softmax')
])

In [31]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 50)            450150    
_________________________________________________________________
lstm (LSTM)                  (None, 20, 500)           1102000   
_________________________________________________________________
batch_normalization (BatchNo (None, 20, 500)           2000      
_________________________________________________________________
dropout (Dropout)            (None, 20, 500)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 20, 250)           751000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 20, 250)           0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 20, 250)           1

In [32]:
history = model.fit(train_padded_seqeunces, y_train, epochs=10, 
                    validation_data=(test_padded_seqeunces, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Predicting Tags

In [33]:
test_prob = model.predict(test_padded_seqeunces)
t = 0.2
test_pred = (test_prob >= t).astype(int)

In [34]:
for i in range(10):
    k = test.sample(1).index[0]
    print("Title: ", test['text'][k])
    print("Actual Tag: ", multilabel_binarizer.inverse_transform(y_test[k].reshape(1, -1))[0])
    print("Predicted tag: ", multilabel_binarizer.inverse_transform(test_pred[k].reshape(1, -1))[0])
    print('\n')

Title:  Wrap div around img width and keep div 100% high
Actual Tag:  ('css', 'css3', 'html', 'javascript', 'jquery')
Predicted tag:  ('backbone.js', 'c++11')


Title:  #1054 - Unknown column 'id.especialidades' in 'where clause'
Actual Tag:  ('where',)
Predicted tag:  ('backbone.js', 'c++11')


Title:  Flink: Default Partitioning/Shuffling Strategy/Functions
Actual Tag:  ('apache-flink',)
Predicted tag:  ('backbone.js', 'c++11')


Title:  restlet 2.2 configure jackson format support
Actual Tag:  ('fasterxml', 'restlet')
Predicted tag:  ('backbone.js', 'c++11')


Title:  Programmaticly removing newlines inside Google Documents
Actual Tag:  ('google-apps-script', 'javascript', 'newline')
Predicted tag:  ('backbone.js', 'c++11')


Title:  Rails button options hash for API not working
Actual Tag:  ('filepicker.io', 'ruby-on-rails')
Predicted tag:  ('backbone.js', 'c++11')


Title:  Change background color of figure when using worldmap
Actual Tag:  ('mapping', 'matlab')
Predicted tag:  ('b