<a href="https://www.kaggle.com/code/akshadagaonkar/stack-overflow-tags-prediction?scriptVersionId=108480795" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
from google.cloud import bigquery
import pandas as pd
import numpy as np
import re
import tensorflow as tf
import string
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

### Accessing Big Query data

In [2]:
client = bigquery.Client()
dataset_ref = client.dataset("stackoverflow", project="bigquery-public-data")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

Using Kaggle's public dataset BigQuery integration.


In [3]:
sql = """SELECT id, title , tags 
        FROM bigquery-public-data.stackoverflow.stackoverflow_posts
        WHERE title NOT LIKE '%None%'
        LIMIT 10000
        """

results = client.query(sql).to_dataframe()

results.head()

  "Cannot create BigQuery Storage client, the dependency "


Unnamed: 0,id,title,tags
0,34404321,Android 6.0.1 Do Not Disturb Next Alarm value ...,alarmmanager|android-6.0-marshmallow
1,36500766,Binding Label to int causes the label to disap...,data-binding
2,31634443,Cast from 'SecCertificateRef *' (aka 'struct _...,objective-c|afnetworking-2|xcode7|xcode7-beta4
3,20711943,FancyTree persist and cookies,javascript|grails|cookies|groovy|fancytree
4,36584642,iOS Simulator: strange behavior with iPad,ipad|uiview|ios-simulator


### Preprocessing data

In [4]:
tags = []
for tag in results['tags']:
    tag = tag.split('|')
    if len(tag)<20:
        tags.append(tag)

In [5]:
df = pd.DataFrame({'id': results['id'], 'text': results['title'], 'tags' : tags})
df.head()

Unnamed: 0,id,text,tags
0,34404321,Android 6.0.1 Do Not Disturb Next Alarm value ...,"[alarmmanager, android-6.0-marshmallow]"
1,36500766,Binding Label to int causes the label to disap...,[data-binding]
2,31634443,Cast from 'SecCertificateRef *' (aka 'struct _...,"[objective-c, afnetworking-2, xcode7, xcode7-b..."
3,20711943,FancyTree persist and cookies,"[javascript, grails, cookies, groovy, fancytree]"
4,36584642,iOS Simulator: strange behavior with iPad,"[ipad, uiview, ios-simulator]"


In [6]:
req_tags = ['python', 'r', 'c#', 'java', 'android', 'html', 'kotlin', 'c', 'c++']

def clean_tags(tag):
    clean_tag = []
    for t in tag:
        if t in req_tags:
            clean_tag.append(t)
        return clean_tag 

In [7]:
df['clean_tags'] = df['tags'].apply(lambda x: clean_tags(x))

In [8]:
for i, tag in enumerate(df['clean_tags']):
    if len(tag)==0:
        df.drop(i, inplace=True)

In [9]:
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,id,text,tags,clean_tags
0,37600153,Socket Rocket alternative for Android,"[android, websocket, autobahn]",[android]
1,37745304,AbandonedMutexException isn't raised,"[c#, exception, mutex, abandonedmutexexception]",[c#]
2,34824220,Is it possible to bind administered objects in...,"[java, jms, jndi, hornetq]",[java]
3,37502406,Java: send button id value to a text area upon...,"[java, button, textarea, value]",[java]
4,36949957,Loading a .WAV file for OpenAL,"[c++, audio, wav, openal]",[c++]


In [11]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

punctuations = list(string.punctuation)
punctuations.remove("#")
punctuations.remove("+")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
def remove_stopwords(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    return ' '.join(no_stopword_text)
def remove_punctuations(text):
    no_punctuations_text = [w for w in text.split() if not w in punctuations]
    return ' '.join(no_punctuations_text)

In [13]:
def clean_text(text):
    text = text.lower()
    text = remove_punctuations(text)
    text = remove_stopwords(text)
    return text

In [14]:
df['clean_text'] = df['text'].apply(lambda x: clean_text(x))
df.head()

Unnamed: 0,id,text,tags,clean_tags,clean_text
0,37600153,Socket Rocket alternative for Android,"[android, websocket, autobahn]",[android],socket rocket alternative android
1,37745304,AbandonedMutexException isn't raised,"[c#, exception, mutex, abandonedmutexexception]",[c#],abandonedmutexexception raised
2,34824220,Is it possible to bind administered objects in...,"[java, jms, jndi, hornetq]",[java],possible bind administered objects hornetq mul...
3,37502406,Java: send button id value to a text area upon...,"[java, button, textarea, value]",[java],java: send button id value text area upon butt...
4,36949957,Loading a .WAV file for OpenAL,"[c++, audio, wav, openal]",[c++],loading .wav file openal


### Preparing target variable 

In [15]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(df['clean_tags'])

y = multilabel_binarizer.transform(df['clean_tags'])

In [16]:
y, y.shape

(array([[1, 0, 0, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [1, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0]]),
 (2966, 9))

### Train & Test sets

In [17]:
train, test, y_train, y_test = train_test_split(df, y, test_size=0.2)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
train.shape, test.shape

((2372, 5), (594, 5))

### Tokenizing & applying Pad-Sequences 

In [18]:
train_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<OOV>')
train_tokenizer.fit_on_texts(train['clean_text'].values)
train_word_index = train_tokenizer.word_index

In [19]:
vocab_length = len(train_word_index) + 1
vocab_length

4348

In [20]:
train_sequences = train_tokenizer.texts_to_sequences(train['clean_text'].values)
len(train_sequences)

2372

In [21]:
test_sequences = train_tokenizer.texts_to_sequences(test['clean_text'].values)
len(test_sequences)

594

In [22]:
longest_sentence = len(max(train_sequences, key=len))
train_padded_seqeunces = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, padding='post', maxlen=longest_sentence, truncating='post')
test_padded_seqeunces = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, padding='post', maxlen=longest_sentence, truncating='post')
longest_sentence

20

### LSTM Model

In [23]:
tf.keras.backend.clear_session()

embedding_dim = 50
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_length, embedding_dim, input_length=longest_sentence),
    
    tf.keras.layers.LSTM(500, return_sequences=True),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.LSTM(250, return_sequences=True),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.LSTM(100, return_sequences=True),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    
    tf.keras.layers.Dense(y.shape[1], activation='softmax')
])

2022-10-18 18:01:10.171507: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-18 18:01:10.181803: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-18 18:01:10.182548: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-18 18:01:10.183540: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [24]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 50)            217400    
_________________________________________________________________
lstm (LSTM)                  (None, 20, 500)           1102000   
_________________________________________________________________
batch_normalization (BatchNo (None, 20, 500)           2000      
_________________________________________________________________
dropout (Dropout)            (None, 20, 500)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 20, 250)           751000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 20, 250)           0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 20, 250)           1

In [25]:
history = model.fit(train_padded_seqeunces, y_train, epochs=10, 
                    validation_data=(test_padded_seqeunces, y_test))

Epoch 1/10


2022-10-18 18:01:12.094719: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2022-10-18 18:01:16.429862: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Predicting Tags

In [26]:
test_prob = model.predict(test_padded_seqeunces)
t = 0.2
test_pred = (test_prob >= t).astype(int)

In [27]:
for i in range(10):
    k = test.sample(1).index[0]
    print("Title: ", test['text'][k])
    print("Actual Tag: ", multilabel_binarizer.inverse_transform(y_test[k].reshape(1, -1))[0])
    print("Predicted tag: ", multilabel_binarizer.inverse_transform(test_pred[k].reshape(1, -1))[0])
    print('\n')

Title:  Trying to test class with foreign key
Actual Tag:  ('c#',)
Predicted tag:  ('c#',)


Title:  How to bring the app to front when starting 2 intents from a widget?
Actual Tag:  ('android',)
Predicted tag:  ('android',)


Title:  What is the best method to code physical address in html?
Actual Tag:  ('html',)
Predicted tag:  ('c#',)


Title:  Android Gallery and Adapters
Actual Tag:  ('android',)
Predicted tag:  ('android',)


Title:  Linking issues with pantheios - undefined reference pantheios_init
Actual Tag:  ('c++',)
Predicted tag:  ('c++',)


Title:  How Can I Prevent Activation For Some ListView Items When The Selection Mode Is MultiChoiceModal?
Actual Tag:  ('android',)
Predicted tag:  ('android', 'c#')


Title:  To create a CrystalReport from a Specific date to other
Actual Tag:  ('c#',)
Predicted tag:  ('java',)


Title:  Label disappears after pass my mouse over an option in pyqt
Actual Tag:  ('python',)
Predicted tag:  ('c#',)


Title:  Creating class instances based o