In [33]:
from google.cloud import bigquery
import pandas as pd
client = bigquery.Client()
dataset_ref = client.dataset("stackoverflow", project="bigquery-public-data")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

Using Kaggle's public dataset BigQuery integration.


**Applied Require Query**

In [52]:
sql = """SELECT id, title , tags 
        FROM bigquery-public-data.stackoverflow.stackoverflow_posts
        WHERE (LENGTH(tags)<20) AND (tags LIKE '%python%'
            OR tags LIKE '%|r|%'
            OR tags LIKE 'r|%'
            OR tags LIKE '%|r'
            OR tags LIKE '%c#%'
            OR tags LIKE '%java%'
            OR tags LIKE '%android%'
            OR tags LIKE '%html%'
            OR tags LIKE '%kotlin%'
            OR tags LIKE '%|c|%'
            OR tags LIKE '%|c'
            OR tags LIKE 'c|%'
            OR tags LIKE '%C++%')
        LIMIT 10000
        """

results = client.query(sql).to_dataframe()

results.head(20)

  "Cannot create BigQuery Storage client, the dependency "


Unnamed: 0,id,title,tags
0,33179067,Problems configuring JNDI with CDI,java|tomcat|jndi
1,37331745,ChartJS (2.1.3): Override global option for si...,javascript|chart.js
2,37237666,Kivy set textinput to kivy's default keyboard,python|kivy
3,33256152,Image search engine using opencv,python|opencv3.0
4,35808182,insert query into json array,java|mysql|jdbc
5,32868786,How to build jars with and without test classe...,java|spring|maven
6,11061746,how to find printer's flow control whether it ...,c#|wpf
7,35845124,How could I dealing with weka output clusterin...,java|weka
8,37305560,How can one mark a flag as required with gflags?,python|gflags
9,23237658,Managing membership provider on web service th...,c#|wcf|sqlanywhere


In [53]:
import numpy as np
import re
import tensorflow as tf
import string
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

In [54]:
df = pd.DataFrame(results)
df.head()

Unnamed: 0,id,title,tags
0,33179067,Problems configuring JNDI with CDI,java|tomcat|jndi
1,37331745,ChartJS (2.1.3): Override global option for si...,javascript|chart.js
2,37237666,Kivy set textinput to kivy's default keyboard,python|kivy
3,33256152,Image search engine using opencv,python|opencv3.0
4,35808182,insert query into json array,java|mysql|jdbc


**Preprocessing**


In [37]:
stop_words = set(stopwords.words('english'))

punctuations = list(string.punctuation)
punctuations.remove("#")
punctuations.remove("+")

In [43]:
def remove_stopwords(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    return ' '.join(no_stopword_text)
def remove_punctuations(text):
    no_punctuations_text = [w for w in text.split() if not w in punctuations]
    return ' '.join(no_punctuations_text)


In [44]:
def clean_text(text):
    text = text.lower()
    text = remove_punctuations(text)
    text = remove_stopwords(text)
    return text

In [55]:
df['clean_text'] = df['title'].apply(lambda x: clean_text(x))
df.head()

Unnamed: 0,id,title,tags,clean_text
0,33179067,Problems configuring JNDI with CDI,java|tomcat|jndi,problems configuring jndi cdi
1,37331745,ChartJS (2.1.3): Override global option for si...,javascript|chart.js,chartjs (2.1.3): override global option single...
2,37237666,Kivy set textinput to kivy's default keyboard,python|kivy,kivy set textinput kivy's default keyboard
3,33256152,Image search engine using opencv,python|opencv3.0,image search engine using opencv
4,35808182,insert query into json array,java|mysql|jdbc,insert query json array


In [56]:
tags = []
for tag in df['tags']:
    tag = tag.split('|')
    tags.append(tag)
    


In [57]:
df = pd.DataFrame({'id': df['id'], 'text': df['title'], 'tags' : tags, 'clean_title' : df['clean_text']})
df.head()

Unnamed: 0,id,text,tags,clean_title
0,33179067,Problems configuring JNDI with CDI,"[java, tomcat, jndi]",problems configuring jndi cdi
1,37331745,ChartJS (2.1.3): Override global option for si...,"[javascript, chart.js]",chartjs (2.1.3): override global option single...
2,37237666,Kivy set textinput to kivy's default keyboard,"[python, kivy]",kivy set textinput kivy's default keyboard
3,33256152,Image search engine using opencv,"[python, opencv3.0]",image search engine using opencv
4,35808182,insert query into json array,"[java, mysql, jdbc]",insert query json array


In [58]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(df['tags'])

y = multilabel_binarizer.transform(df['tags'])

In [59]:
y, y.shape

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]),
 (10000, 2298))

In [60]:
train, test, y_train, y_test = train_test_split(df, y, test_size=0.2)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
train.shape, test.shape

((8000, 4), (2000, 4))

In [61]:
train_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<OOV>')
train_tokenizer.fit_on_texts(train['clean_title'].values)
train_word_index = train_tokenizer.word_index

In [62]:
vocab_length = len(train_word_index) + 1
vocab_length

7390

In [63]:
train_sequences = train_tokenizer.texts_to_sequences(train['clean_title'].values)
len(train_sequences)

8000

In [65]:
test_sequences = train_tokenizer.texts_to_sequences(test['clean_title'].values)
len(test_sequences)

2000

In [66]:
longest_sentence = len(max(train_sequences, key=len))
train_padded_seqeunces = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, padding='post', maxlen=longest_sentence, truncating='post')
test_padded_seqeunces = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, padding='post', maxlen=longest_sentence, truncating='post')
longest_sentence

22

In [71]:
tf.keras.backend.clear_session()

embedding_dim = 50
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_length, embedding_dim, input_length=longest_sentence),
    
    tf.keras.layers.LSTM(500, return_sequences=True),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.LSTM(250, return_sequences=True),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.LSTM(100, return_sequences=True),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    
    tf.keras.layers.Dense(y.shape[1], activation='softmax')
])

In [72]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 22, 50)            369500    
_________________________________________________________________
lstm (LSTM)                  (None, 22, 500)           1102000   
_________________________________________________________________
batch_normalization (BatchNo (None, 22, 500)           2000      
_________________________________________________________________
dropout (Dropout)            (None, 22, 500)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 22, 250)           751000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 22, 250)           0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 22, 250)           1

In [73]:
history = model.fit(train_padded_seqeunces, y_train, epochs=10, 
                    validation_data=(test_padded_seqeunces, y_test))

2022-10-20 07:11:36.922100: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [74]:
test_prob = model.predict(test_padded_seqeunces)
t = 0.2
test_pred = (test_prob >= t).astype(int)

In [75]:
for i in range(10):
    k = test.sample(1).index[0]
    print("Title: ", test['text'][k])
    print("Actual Tag: ", multilabel_binarizer.inverse_transform(y_test[k].reshape(1, -1))[0])
    print("Predicted tag: ", multilabel_binarizer.inverse_transform(test_pred[k].reshape(1, -1))[0])
    print('\n')

Title:  Regular expression matching digits and space
Actual Tag:  ('java', 'regex')
Predicted tag:  ('c',)


Title:  Matplotlib.animation: display points after plotting?
Actual Tag:  ('matplotlib', 'python')
Predicted tag:  ('c',)


Title:  3 div liquid design
Actual Tag:  ('css', 'html')
Predicted tag:  ('c',)


Title:  When I am Going to update my database valuesThen changes will not affecteing in database
Actual Tag:  ('c#', 'ms-access-2010')
Predicted tag:  ('c',)


Title:  How to use wcf service in desktop application with one stance for one session
Actual Tag:  ('asp.net', 'c#-4.0', 'wcf')
Predicted tag:  ('c',)


Title:  Running command using exec channel with JSch does not return any output
Actual Tag:  ('android', 'jsch', 'ssh')
Predicted tag:  ('c',)


Title:  addition in while( )
Actual Tag:  ('javascript', 'jquery')
Predicted tag:  ('c',)


Title:  not displaying all the buttons
Actual Tag:  ('java', 'layout', 'swing')
Predicted tag:  ('c',)


Title:  Collision of two diffr