In [1]:
from google.cloud import bigquery
import pandas as pd
import numpy as np
import re
import tensorflow as tf
import string
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

In [2]:
client = bigquery.Client()
dataset_ref = client.dataset("stackoverflow", project="bigquery-public-data")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

Using Kaggle's public dataset BigQuery integration.


In [3]:
sql = """
        SELECT 
            id, title , tags 
        FROM 
            `bigquery-public-data.stackoverflow.stackoverflow_posts`
        WHERE
            title NOT LIKE '%None%' AND 
            (tags LIKE '%|python|%' OR tags LIKE '%|c#|%' OR
            tags LIKE '%|java|%' OR tags LIKE '%|r|%' OR
            tags LIKE '%|android|%' OR tags LIKE '%|html|%' OR
            tags LIKE '%|c++|%' OR tags LIKE '%|sql|%' OR
            tags LIKE '%|c|%' OR tags LIKE '%kotlin%') AND 
            LENGTH(tags) < 20
        LIMIT
             10000;
        """

results = client.query(sql).to_dataframe()

results.head()

  "Cannot create BigQuery Storage client, the dependency "


Unnamed: 0,id,title,tags
0,30310354,Boost bjam tutorial keeps looking for the wron...,python|c++|boost
1,18401224,Does Java have any framework similar to WPF( a...,c#|java|wpf
2,35068061,curl url sent to flask server not working,php|python|curl
3,35920841,Generate files after compile (javac) in Androi...,java|android|gradle
4,24772652,Sending Dynamic NdefRecords to the constructor,java|android|nfc


In [4]:
tags = []
for tag in results['tags']:
    tag = tag.split('|')
    if len(tag)<20:
        tags.append(tag)

In [5]:
df = pd.DataFrame({'id': results['id'], 'text': results['title'], 'tags' : tags})


In [6]:
df.head()

Unnamed: 0,id,text,tags
0,30310354,Boost bjam tutorial keeps looking for the wron...,"[python, c++, boost]"
1,18401224,Does Java have any framework similar to WPF( a...,"[c#, java, wpf]"
2,35068061,curl url sent to flask server not working,"[php, python, curl]"
3,35920841,Generate files after compile (javac) in Androi...,"[java, android, gradle]"
4,24772652,Sending Dynamic NdefRecords to the constructor,"[java, android, nfc]"


In [7]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

punctuations = list(string.punctuation)
punctuations.remove("#")
punctuations.remove("+")

[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


In [8]:
def remove_stopwords(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    return ' '.join(no_stopword_text)
def remove_punctuations(text):
    no_punctuations_text = [w for w in text.split() if not w in punctuations]
    return ' '.join(no_punctuations_text)

In [9]:
def clean_text(text):
    text = text.lower()
    text = remove_punctuations(text)
    text = remove_stopwords(text)
    return text

In [10]:
df['clean_text'] = df['text'].apply(lambda x: clean_text(x))
df.head()

Unnamed: 0,id,text,tags,clean_text
0,30310354,Boost bjam tutorial keeps looking for the wron...,"[python, c++, boost]",boost bjam tutorial keeps looking wrong mcvs v...
1,18401224,Does Java have any framework similar to WPF( a...,"[c#, java, wpf]",java framework similar wpf( capability using s...
2,35068061,curl url sent to flask server not working,"[php, python, curl]",curl url sent flask server working
3,35920841,Generate files after compile (javac) in Androi...,"[java, android, gradle]",generate files compile (javac) androidstudio g...
4,24772652,Sending Dynamic NdefRecords to the constructor,"[java, android, nfc]",sending dynamic ndefrecords constructor


In [11]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(df['tags'])

y = multilabel_binarizer.transform(df['tags'])

In [12]:
y, y.shape

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]),
 (10000, 1578))

In [13]:
train, test, y_train, y_test = train_test_split(df, y, test_size=0.2)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
train.shape, test.shape

((8000, 4), (2000, 4))

In [14]:
train_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='')
train_tokenizer.fit_on_texts(train['clean_text'].values)
train_word_index = train_tokenizer.word_index

In [15]:
vocab_length = len(train_word_index) + 1
vocab_length

6033

In [16]:
train_sequences = train_tokenizer.texts_to_sequences(train['clean_text'].values)
len(train_sequences)

8000

In [17]:
test_sequences = train_tokenizer.texts_to_sequences(test['clean_text'].values)
len(test_sequences)

2000

In [18]:
longest_sentence = len(max(train_sequences, key=len))
train_padded_seqeunces = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, padding='post', maxlen=longest_sentence, truncating='post')
test_padded_seqeunces = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, padding='post', maxlen=longest_sentence, truncating='post')
longest_sentence

20

In [19]:
tf.keras.backend.clear_session()

embedding_dim = 50
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_length, embedding_dim, input_length=longest_sentence),
    
   tf.keras.layers.Dropout(0.5),
    tf.keras.layers.LSTM(300, return_sequences=True),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.LSTM(100, return_sequences=True),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    
    tf.keras.layers.Dense(y.shape[1], activation='softmax')
])

2022-11-05 15:58:34.633781: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [20]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 50)            301650    
_________________________________________________________________
dropout (Dropout)            (None, 20, 50)            0         
_________________________________________________________________
lstm (LSTM)                  (None, 20, 300)           421200    
_________________________________________________________________
dropout_1 (Dropout)          (None, 20, 300)           0         
_________________________________________________________________
batch_normalization (BatchNo (None, 20, 300)           1200      
_________________________________________________________________
lstm_1 (LSTM)                (None, 20, 100)           160400    
_________________________________________________________________
batch_normalization_1 (Batch (None, 20, 100)           4

In [21]:
history = model.fit(train_padded_seqeunces, y_train, epochs=10, 
                    validation_data=(test_padded_seqeunces, y_test))

2022-11-05 15:58:58.138943: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [22]:
test_prob = model.predict(test_padded_seqeunces)
t = 0.2
test_pred = (test_prob >= t).astype(int)

In [23]:
for i in range(10):
    k = test.sample(1).index[0]
    print("Title: ", test['text'][k])
    print("Actual Tag: ", multilabel_binarizer.inverse_transform(y_test[k].reshape(1, -1))[0])
    print("Predicted tag: ", multilabel_binarizer.inverse_transform(test_pred[k].reshape(1, -1))[0])
    print('\n')

Title:  Issue requesting a post on android
Actual Tag:  ('android', 'http', 'java')
Predicted tag:  ('sql',)


Title:  Deploying C# application with SQL database
Actual Tag:  ('c#', 'deployment', 'sql')
Predicted tag:  ('sql',)


Title:  Range<Date> issue
Actual Tag:  ('android', 'guava', 'java')
Predicted tag:  ('sql',)


Title:  Internet explorer Div alignment issue
Actual Tag:  ('css', 'html', 'jquery')
Predicted tag:  ('sql',)


Title:  Reuse code when using screens in Libgdx
Actual Tag:  ('android', 'java', 'libgdx')
Predicted tag:  ('sql',)


Title:  Unable to load DLL The specified module could not be found. (Exception from HRESULT: 0x8007007E)
Actual Tag:  ('c#', 'c++', 'dll')
Predicted tag:  ('sql',)


Title:  Static Method as Function Pointer
Actual Tag:  ('c', 'c++', 'oop')
Predicted tag:  ('sql',)


Title:  Top 5 average scores?
Actual Tag:  ('codeigniter', 'php', 'sql')
Predicted tag:  ('sql',)


Title:  Socket communication between Java and C# application
Actual Tag:  ('c