In [1]:
from google.cloud import bigquery
import pandas as pd
import numpy as np
import re
import tensorflow as tf
import string
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

### Accessing Big Query data

In [2]:
client = bigquery.Client()
dataset_ref = client.dataset("stackoverflow", project="bigquery-public-data")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

Using Kaggle's public dataset BigQuery integration.


In [3]:
sql = """
        SELECT 
            id, title , tags 
        FROM 
            `bigquery-public-data.stackoverflow.stackoverflow_posts`
        WHERE
            title NOT LIKE '%None%' AND 
            (tags LIKE '%|python|%' OR tags LIKE '%|c#|%' OR
            tags LIKE '%|java|%' OR tags LIKE '%|r|%' OR
            tags LIKE '%|android|%' OR tags LIKE '%|html|%' OR
            tags LIKE '%|c++|%' OR tags LIKE '%|sql|%' OR
            tags LIKE '%|c|%' OR tags LIKE '%kotlin%') AND 
            LENGTH(tags) < 20
        LIMIT
             10000;
        """

results = client.query(sql).to_dataframe()

results.head()

  "Cannot create BigQuery Storage client, the dependency "


Unnamed: 0,id,title,tags
0,3371822,Is there anything that i can do in C but not i...,c++|c|oop
1,36106611,Bitmaps swallowing more memory than expected,java|android|bitmap
2,36371514,Error connect Arduino to HTML net::ERR_CONNECT...,php|html|arduino
3,37169462,How can I check if the device has an audio out...,java|android|audio
4,18448210,Is there any library similar to conio.h?,c++|c|linux|ubuntu


### Preprocessing data

In [4]:
tags = []
for tag in results['tags']:
    tag = tag.split('|')
    if len(tag)<20:
        tags.append(tag)

In [5]:
df = pd.DataFrame({'id': results['id'], 'text': results['title'], 'tags' : tags})
df.head()

Unnamed: 0,id,text,tags
0,3371822,Is there anything that i can do in C but not i...,"[c++, c, oop]"
1,36106611,Bitmaps swallowing more memory than expected,"[java, android, bitmap]"
2,36371514,Error connect Arduino to HTML net::ERR_CONNECT...,"[php, html, arduino]"
3,37169462,How can I check if the device has an audio out...,"[java, android, audio]"
4,18448210,Is there any library similar to conio.h?,"[c++, c, linux, ubuntu]"


In [6]:
df.head()

Unnamed: 0,id,text,tags
0,3371822,Is there anything that i can do in C but not i...,"[c++, c, oop]"
1,36106611,Bitmaps swallowing more memory than expected,"[java, android, bitmap]"
2,36371514,Error connect Arduino to HTML net::ERR_CONNECT...,"[php, html, arduino]"
3,37169462,How can I check if the device has an audio out...,"[java, android, audio]"
4,18448210,Is there any library similar to conio.h?,"[c++, c, linux, ubuntu]"


In [7]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

punctuations = list(string.punctuation)
punctuations.remove("#")
punctuations.remove("+")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
def remove_stopwords(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    return ' '.join(no_stopword_text)
def remove_punctuations(text):
    no_punctuations_text = [w for w in text.split() if not w in punctuations]
    return ' '.join(no_punctuations_text)

In [9]:
def clean_text(text):
    text = text.lower()
    text = remove_punctuations(text)
    text = remove_stopwords(text)
    return text

In [10]:
df['clean_text'] = df['text'].apply(lambda x: clean_text(x))
df.head()

Unnamed: 0,id,text,tags,clean_text
0,3371822,Is there anything that i can do in C but not i...,"[c++, c, oop]",anything c c++?
1,36106611,Bitmaps swallowing more memory than expected,"[java, android, bitmap]",bitmaps swallowing memory expected
2,36371514,Error connect Arduino to HTML net::ERR_CONNECT...,"[php, html, arduino]",error connect arduino html net::err_connection...
3,37169462,How can I check if the device has an audio out...,"[java, android, audio]",check device audio output caused app
4,18448210,Is there any library similar to conio.h?,"[c++, c, linux, ubuntu]",library similar conio.h?


### Preparing target variable 

In [12]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(df['tags'])

y = multilabel_binarizer.transform(df['tags'])

In [13]:
y, y.shape

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]),
 (10000, 1595))

### Train & Test sets

In [14]:
train, test, y_train, y_test = train_test_split(df, y, test_size=0.2)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
train.shape, test.shape

((8000, 4), (2000, 4))

### Tokenizing & applying Pad-Sequences 

In [15]:
train_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<OOV>')
train_tokenizer.fit_on_texts(train['clean_text'].values)
train_word_index = train_tokenizer.word_index

In [16]:
vocab_length = len(train_word_index) + 1
vocab_length

6062

In [17]:
train_sequences = train_tokenizer.texts_to_sequences(train['clean_text'].values)
len(train_sequences)

8000

In [18]:
test_sequences = train_tokenizer.texts_to_sequences(test['clean_text'].values)
len(test_sequences)

2000

In [19]:
longest_sentence = len(max(train_sequences, key=len))
train_padded_seqeunces = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, padding='post', maxlen=longest_sentence, truncating='post')
test_padded_seqeunces = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, padding='post', maxlen=longest_sentence, truncating='post')
longest_sentence

19

### LSTM Model

In [30]:
tf.keras.backend.clear_session()

embedding_dim = 50
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_length, embedding_dim, input_length=longest_sentence),
    
   tf.keras.layers.Dropout(0.5),
    tf.keras.layers.LSTM(300, return_sequences=True),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.LSTM(100, return_sequences=True),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    
    tf.keras.layers.Dense(y.shape[1], activation='softmax')
])

In [31]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 19, 50)            303100    
_________________________________________________________________
dropout (Dropout)            (None, 19, 50)            0         
_________________________________________________________________
lstm (LSTM)                  (None, 19, 300)           421200    
_________________________________________________________________
dropout_1 (Dropout)          (None, 19, 300)           0         
_________________________________________________________________
batch_normalization (BatchNo (None, 19, 300)           1200      
_________________________________________________________________
lstm_1 (LSTM)                (None, 19, 100)           160400    
_________________________________________________________________
batch_normalization_1 (Batch (None, 19, 100)           4

In [32]:
history = model.fit(train_padded_seqeunces, y_train, epochs=10, 
                    validation_data=(test_padded_seqeunces, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Predicting Tags

In [33]:
test_prob = model.predict(test_padded_seqeunces)
t = 0.2
test_pred = (test_prob >= t).astype(int)

In [34]:
for i in range(10):
    k = test.sample(1).index[0]
    print("Title: ", test['text'][k])
    print("Actual Tag: ", multilabel_binarizer.inverse_transform(y_test[k].reshape(1, -1))[0])
    print("Predicted tag: ", multilabel_binarizer.inverse_transform(test_pred[k].reshape(1, -1))[0])
    print('\n')

Title:  How to pass data from Java code to R code periodically
Actual Tag:  ('java', 'jri', 'r', 'rserve')
Predicted tag:  ('c++',)


Title:  How do I make this Youtube Video clickable (css + z-index)?
Actual Tag:  ('css', 'html', 'javascript')
Predicted tag:  ('c++',)


Title:  How to get data from phpMyAdmin and use that data?
Actual Tag:  ('database', 'html', 'php')
Predicted tag:  ('c++',)


Title:  Copy entire php page to another _blank php page
Actual Tag:  ('copy', 'html', 'php')
Predicted tag:  ('c++',)


Title:  Pipe ESS to terminal outside of Emacs?
Actual Tag:  ('emacs', 'ess', 'r')
Predicted tag:  ('c++',)


Title:  How to increase resolution of gif image?
Actual Tag:  ('gif', 'linux', 'r', 'rgl')
Predicted tag:  ('c++',)


Title:  Execute Python game in Pygame from PHP script
Actual Tag:  ('php', 'pygame', 'python')
Predicted tag:  ('c++',)


Title:  jQuery set att value as integer
Actual Tag:  ('css', 'html', 'jquery')
Predicted tag:  ('c++',)


Title:  convert string yyy