In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Import Library

In [1]:
import pandas as pd
import numpy as np
from google.cloud import bigquery
import re 
import string
import nltk
from nltk.corpus import stopwords
import tensorflow as tf
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

### Dataset access to Big Query Data

In [3]:
client = bigquery.Client()
to_refer = client.dataset("stackoverflow", project="bigquery-public-data")

data = client.get_dataset(to_refer)


Using Kaggle's public dataset BigQuery integration.


In [4]:
sql = """
        SELECT id, title , tags 
        FROM 
        `bigquery-public-data.stackoverflow.stackoverflow_posts`
        WHERE
        title NOT LIKE '%None%' AND 
        (tags LIKE '%|python|%' OR tags LIKE '%|c#|%' OR
        tags LIKE '%|java|%' OR tags LIKE '%|r|%' OR
        tags LIKE '%|android|%' OR tags LIKE '%|html|%' OR
        tags LIKE '%|c++|%' OR tags LIKE '%|sql|%' OR
        tags LIKE '%|c|%' OR tags LIKE '%kotlin%') AND 
        LENGTH(tags) < 20
        LIMIT
        10000;
        """

df = client.query(sql).to_dataframe()

df.head()

  "Cannot create BigQuery Storage client, the dependency "


Unnamed: 0,id,title,tags
0,3371822,Is there anything that i can do in C but not i...,c++|c|oop
1,36106611,Bitmaps swallowing more memory than expected,java|android|bitmap
2,36371514,Error connect Arduino to HTML net::ERR_CONNECT...,php|html|arduino
3,37169462,How can I check if the device has an audio out...,java|android|audio
4,18448210,Is there any library similar to conio.h?,c++|c|linux|ubuntu


### Preprocessing

### Stopwords

In [5]:
df.title  = df.title.str.lower()
stop_words = set(stopwords.words('english'))

df['title'] = df['title'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df.head()


Unnamed: 0,id,title,tags
0,3371822,anything c c++?,c++|c|oop
1,36106611,bitmaps swallowing memory expected,java|android|bitmap
2,36371514,error connect arduino html net::err_connection...,php|html|arduino
3,37169462,check device audio output caused app,java|android|audio
4,18448210,library similar conio.h?,c++|c|linux|ubuntu


### Punctuation and hyperlinks except # and +

In [6]:
punctuations = list(string.punctuation)
punctuations.remove("#")
punctuations.remove("+")

In [7]:
def remove_punctuations(text):
    for punctuation in punctuations:
        text = text.replace(punctuation, '')
    return text

In [8]:
df['title'] = df['title'].apply(remove_punctuations)
df.head()

Unnamed: 0,id,title,tags
0,3371822,anything c c++,c++|c|oop
1,36106611,bitmaps swallowing memory expected,java|android|bitmap
2,36371514,error connect arduino html neterrconnectionref...,php|html|arduino
3,37169462,check device audio output caused app,java|android|audio
4,18448210,library similar conioh,c++|c|linux|ubuntu


#### Convert Tags string column to list column

In [9]:
df['tags'] = df['tags'].str.split('|')
df.head()

Unnamed: 0,id,title,tags
0,3371822,anything c c++,"[c++, c, oop]"
1,36106611,bitmaps swallowing memory expected,"[java, android, bitmap]"
2,36371514,error connect arduino html neterrconnectionref...,"[php, html, arduino]"
3,37169462,check device audio output caused app,"[java, android, audio]"
4,18448210,library similar conioh,"[c++, c, linux, ubuntu]"


### MultiLabel Binarizer

In [10]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(df['tags'])

y = multilabel_binarizer.transform(df['tags'])
y, y.shape

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]),
 (10000, 1591))

### Train Test Split

In [11]:
x_train, x_test, y_train, y_test = train_test_split(df, y, test_size=0.2)
x_train = x_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)
x_train.shape, x_test.shape

((8000, 3), (2000, 3))

### Tokenizing and on apply Pad Sequence

In [12]:
train_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='')
train_tokenizer.fit_on_texts(x_train['title'].values)
train_word_index = train_tokenizer.word_index

In [13]:
vocab_length = len(train_word_index) + 1
vocab_length

6560

In [14]:
train_sequences = train_tokenizer.texts_to_sequences(x_train['title'].values)
len(train_sequences)

8000

In [15]:
test_sequences = train_tokenizer.texts_to_sequences(x_test['title'].values)
len(test_sequences)

2000

In [16]:
longest_sentence = len(max(train_sequences, key=len))
train_padded_seqeunces = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, padding='post', maxlen=longest_sentence, truncating='post')
test_padded_seqeunces = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, padding='post', maxlen=longest_sentence, truncating='post')
longest_sentence

17

### LSTM Model

In [17]:
tf.keras.backend.clear_session()

embedding_dim = 50
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_length, embedding_dim, input_length=longest_sentence), 
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.LSTM(100, return_sequences=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.LSTM(50, return_sequences=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(16),
    tf.keras.layers.Dense(y.shape[1], activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

2022-11-05 16:10:54.597768: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 17, 50)            328000    
_________________________________________________________________
dropout (Dropout)            (None, 17, 50)            0         
_________________________________________________________________
lstm (LSTM)                  (None, 17, 100)           60400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 17, 100)           0         
_________________________________________________________________
batch_normalization (BatchNo (None, 17, 100)           400       
_________________________________________________________________
lstm_1 (LSTM)                (None, 17, 50)            30200     
_________________________________________________________________
dropout_2 (Dropout)          (None, 17, 50)            0

In [18]:
history = model.fit(train_padded_seqeunces, y_train, epochs=20,
                    validation_data=(test_padded_seqeunces, y_test))


2022-11-05 16:11:07.507720: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### Predicting Tags

In [19]:
pred_prob = model.predict(test_padded_seqeunces)
t = 0.3
test_pred = (pred_prob >= t).astype(int)

In [20]:
for i in range(10):
    k = x_test.sample(1).index[0]
    print("Title: ", x_test['title'][k])
    print("Actual Tag: ", multilabel_binarizer.inverse_transform(y_test[k].reshape(1, -1))[0])
    print("Predicted tag: ", multilabel_binarizer.inverse_transform(test_pred[k].reshape(1, -1))[0])
    print('\n')

Title:  screenorientation fullsensor got error manifest
Actual Tag:  ('android', 'java', 'sensor')
Predicted tag:  ('android', 'java')


Title:  two javascript functions called return type function first called
Actual Tag:  ('css', 'html', 'javascript')
Predicted tag:  ('css', 'html', 'javascript')


Title:  send json php file connection refused
Actual Tag:  ('android', 'json', 'php')
Predicted tag:  ('html', 'java', 'php')


Title:  order displayed items id descending
Actual Tag:  ('php', 'sql', 'website', 'yii')
Predicted tag:  ('css', 'html', 'php')


Title:  zebra style css html table
Actual Tag:  ('css', 'html', 'php')
Predicted tag:  ('css', 'html', 'javascript')


Title:  sql syntax error
Actual Tag:  ('mysql', 'sql', 'syntax')
Predicted tag:  ('mysql', 'php', 'sql')


Title:  access email recipients html email
Actual Tag:  ('email', 'html', 'php')
Predicted tag:  ('html', 'php')


Title:  attach function form element
Actual Tag:  ('css', 'html', 'javascript')
Predicted tag:  ('