In [1]:
# Project 1: Multiclass, multi-label tag prediction for Stack Overflow questions

import numpy as np
import pandas as pd

# Data: https://www.dropbox.com/s/5721wcs2guuykzl/stacksample.zip?dl=0
# Input data files are kept in the "../Project1/Data/" directory.

import os
# List the data directory so the available CSV files are visible in the output.
print(os.listdir("../Project1/Data"))

# Plotting
import matplotlib.pyplot as plt
import matplotlib.cm as cm

%matplotlib inline

import collections

['Answers.csv', 'Questions.csv', 'Tags.csv']


In [2]:
# Load Tags.csv; later cells read its 'Id' and 'Tag' columns.
df_tags = pd.read_csv('../Project1/Data/Tags.csv')

In [3]:
def plot_tags(tagCount):
    """Plot each tag's question count as a marker whose size grows with the count.

    Parameters
    ----------
    tagCount : iterable of (tag, count) pairs
        For example the result of ``collections.Counter(...).most_common(n)``.

    Returns
    -------
    None
        The figure is shown as a side effect. Nothing is drawn (and no
        exception is raised) when ``tagCount`` is empty.
    """
    pairs = list(tagCount)
    if not pairs:
        # zip(*[]) yields nothing, so the two-name unpacking below would raise ValueError.
        return

    x, y = zip(*pairs)

    # Marker size is a quarter of the count, scaled down for visibility.
    area = [count / 4 for count in y]

    plt.figure(figsize=(9, 8))
    plt.ylabel("Number of question associations")
    # One marker per tag: x is the tag's position in tagCount, y its count.
    for i in range(len(y)):
        plt.plot(i, y[i], marker='o', linestyle='', ms=area[i], label=x[i])

    plt.legend(numpoints=1)
    plt.show()

In [4]:
# NOTE: an earlier, now-disabled draft of this cell built the counts with
# collections.Counter(...).most_common(10) and passed them to plot_tags().
# The tags of interest are listed by hand instead.
# Despite its name, `top10` holds repeated entries for only three distinct tags.
top10 = ['python', 'java', 'python', 'c++', 'java', 'python', 'c++', 'c++']

# (tag, occurrences) pairs for the three most frequent entries of the hand-written list.
tag_frequencies = collections.Counter(top10)
tagCount = tag_frequencies.most_common(3)

In [5]:
# Keep only the rows of df_tags whose Tag value appears in `top10`.
is_selected_tag = df_tags['Tag'].isin(top10)
tag_top10 = df_tags[is_selected_tag]
print(tag_top10.shape)
tag_top10.head()

(227404, 2)


Unnamed: 0,Id,Tag
18,330,c++
107,3150,c++
112,3230,c++
127,4080,java
145,4630,java


In [6]:
# Count the selected-tag rows per question Id and show the ten Ids with the most rows.
tag_top10['Id'].value_counts().head(10)

25073130    3
21153850    3
36914550    3
1742750     3
5730330     3
30478040    3
28857740    3
8783690     3
10561010    3
14079650    3
Name: Id, dtype: int64

In [7]:
# Preview the first rows of tag_top10 again (same frame as filtered above).
tag_top10.head()

Unnamed: 0,Id,Tag
18,330,c++
107,3150,c++
112,3230,c++
127,4080,java
145,4630,java


In [8]:
def add_tags(question_id):
    """Return every selected tag attached to the question of the given row.

    Parameters
    ----------
    question_id : row of ``tag_top10`` (anything indexable by ``'Id'``)

    Returns
    -------
    numpy.ndarray
        The ``Tag`` values of all ``tag_top10`` rows that share that Id.

    Each call scans the whole frame, so applying it to every row is quadratic
    in the number of rows; it is kept for single-row lookups only.
    """
    return tag_top10[tag_top10['Id'] == question_id['Id']].Tag.values

# Collect the tag array of each Id in a single pass instead of rescanning the
# frame for every row via tag_top10.apply(add_tags, axis=1). Rows keep their
# original order inside each group, so every array equals what add_tags()
# returns for that Id.
tags_by_id = {
    question: tags.values
    for question, tags in tag_top10.groupby('Id', sort=False)['Tag']
}

# NOTE: this rebinds `top10` from the list of tag names to a Series holding one
# tag array per row of tag_top10; later cells read the Series under this name.
top10 = pd.Series([tags_by_id[question] for question in tag_top10['Id']],
                  index=tag_top10.index, dtype=object)

In [9]:
# Compare the number of per-row tag arrays with the shape of tag_top10.
len(top10),tag_top10.shape

(227404, (227404, 2))

In [10]:
# Attach the per-row tag arrays as a new 'Tags' column beside the existing columns.
tags_column = top10.rename('Tags')
tag_top10 = pd.concat([tag_top10, tags_column], axis=1)
tag_top10.head()

Unnamed: 0,Id,Tag,Tags
18,330,c++,[c++]
107,3150,c++,[c++]
112,3230,c++,[c++]
127,4080,java,[java]
145,4630,java,[java]


In [11]:
# Drop the single-tag column; the 'Tags' column added earlier is what later cells use.
# Reassigning instead of inplace=True, together with errors="ignore", keeps this
# cell re-runnable: a second execution no longer raises KeyError for the
# already-removed column.
tag_top10 = tag_top10.drop(columns=["Tag"], errors="ignore")
tag_top10.shape

(227404, 2)

In [12]:
# Rows are de-duplicated through their string form (presumably because the
# 'Tags' arrays cannot be hashed directly); the surviving index labels then
# select the corresponding original rows.
deduplicated_as_text = tag_top10.astype(str).drop_duplicates()
top10_tags = tag_top10.loc[deduplicated_as_text.index]

In [13]:
# Load question_clean.csv (read as ISO-8859-1) and preview the first rows.
ques = pd.read_csv('../Project1/question_clean.csv', encoding='iso-8859-1')
ques.head()

Unnamed: 0,Id,Title,Body
0,80,SQLStatement.execute() - multiple queries in o...,I've written a database generation script in S...
1,90,Good branching and merging tutorials for Torto...,Are there any really good tutorials explaining...
2,120,ASP.NET Site Maps,Has anyone got experience creating SQL-based A...
3,180,Function for creating color wheels,This is something I've pseudo-solved many time...
4,260,Adding scripting functionality to .NET applica...,I have a little game written in C#. It uses a ...


In [14]:
# Join questions with their de-duplicated tag arrays on the shared 'Id' column
# (default inner join: only questions that have a selected tag remain).
total = ques.merge(top10_tags, on='Id')
print(total.shape)
total.head()

(72282, 4)


Unnamed: 0,Id,Title,Body,Tags
0,330,Should I use nested classes in this case?,I am working on a collection of classes used f...,[c++]
1,3150,How to set up unit testing for Visual Studio C++,I'm having trouble figuring out how to get the...,[c++]
2,3230,How do you pack a visual studio c++ project fo...,I'm wondering how to make a release build that...,[c++]
3,4080,What code analysis tools do you use for your J...,What code analysis tools do you use on your Ja...,[java]
4,4630,"How can I Java webstart multiple, dependent, n...",Example: I have two shared objects (same shoul...,[java]


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from nltk import word_tokenize

In [16]:
!pip install tensorflow



In [17]:
from tensorflow import keras 
import tensorflow as tf

In [18]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, BatchNormalization, GRU ,concatenate
from tensorflow.keras.models import Model

In [19]:
# Learn the label vocabulary from every tag array in the merged table,
# then display the resulting class order.
multilabel_binarizer = MultiLabelBinarizer().fit(total.Tags)
labels = multilabel_binarizer.classes_
labels

array(['c++', 'java', 'python'], dtype=object)

In [20]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
# At most the first 550000 rows are used (the merged table printed earlier has
# 72282 rows, so this cap currently removes nothing).
capped_total = total.iloc[:550000]
train, test = train_test_split(capped_total, test_size=0.25, random_state=24)

In [21]:
# Show the row/column counts of the two splits.
train.shape,test.shape

((54211, 4), (18071, 4))

In [22]:
# Text inputs: titles (_t) and bodies (_b) of the training and test questions.
X_train_t, X_train_b = train['Title'], train['Body']
X_test_t, X_test_b = test['Title'], test['Body']

# Targets: one indicator column per class of the fitted binarizer.
y_train = multilabel_binarizer.transform(train['Tags'])
y_test = multilabel_binarizer.transform(test['Tags'])

In [23]:
# Token count of every training title, followed by the longest one.
sent_lens_t = [len(word_tokenize(title)) for title in train['Title']]
max(sent_lens_t)

43

In [24]:
# 0.97 quantile of the training-title token counts.
np.quantile(sent_lens_t,0.97)

19.0

In [25]:
# Length (in tokens) that title sequences are padded/truncated to further below.
# NOTE(review): the 0.97 quantile computed earlier is 19.0 — confirm 18 is intended.
max_len_t = 18

# Word-level tokenizer splitting on spaces; its vocabulary comes from the training titles only.
tok = Tokenizer(split=' ', char_level=False)
tok.fit_on_texts(X_train_t)
sequences_train_t = tok.texts_to_sequences(X_train_t)

In [26]:
# Number of distinct words the title tokenizer has indexed.
vocab_len_t = len(tok.index_word)
vocab_len_t

20674

In [27]:
# Pad (or truncate) every training title to max_len_t word ids; 0 is the padding value.
sequences_matrix_train_t = pad_sequences(sequences_train_t, maxlen=max_len_t)
sequences_matrix_train_t

array([[    0,     0,     0, ...,   379,     2,    11],
       [    0,     0,     0, ...,   502,   305,   944],
       [    0,     0,     0, ...,  1811,   797,  1263],
       ...,
       [    0,     0,     0, ...,  2896,   332, 20674],
       [    0,     0,     0, ...,     2,    11,   586],
       [    0,     0,     0, ...,  6616,     2,     8]])

In [28]:
# Encode the test titles with the tokenizer currently bound to `tok`
# (the title tokenizer when cells run in order), then pad to max_len_t.
sequences_test_t = tok.texts_to_sequences(X_test_t)
sequences_matrix_test_t = pad_sequences(sequences_test_t, maxlen=max_len_t)

In [29]:
# Shapes of the padded title matrices and of the binarized targets.
sequences_matrix_train_t.shape,sequences_matrix_test_t.shape,y_train.shape,y_test.shape

((54211, 18), (18071, 18), (54211, 3), (18071, 3))

In [30]:
# Token count of every training body, followed by the longest one.
sent_lens_b = [len(word_tokenize(body)) for body in train['Body']]
max(sent_lens_b)

8883

In [31]:
# 0.90 quantile of the training-body token counts.
np.quantile(sent_lens_b,0.90)

491.0

In [32]:
# Length (in tokens) that body sequences are padded/truncated to further below.
max_len_b = 600

# NOTE: this rebinds `tok`, replacing the tokenizer fitted on titles; any title
# text must already have been converted to sequences before this cell runs.
tok = Tokenizer(split=' ', char_level=False)
tok.fit_on_texts(X_train_b)
sequences_train_b = tok.texts_to_sequences(X_train_b)

In [33]:
# Number of distinct words the body tokenizer has indexed.
vocab_len_b = len(tok.index_word)
vocab_len_b

233663

In [34]:
# Pad (or truncate) every training body to max_len_b word ids; 0 is the padding value.
sequences_matrix_train_b = pad_sequences(sequences_train_b, maxlen=max_len_b)
sequences_matrix_train_b

array([[    0,     0,     0, ...,  4589,  1290,   140],
       [    0,     0,     0, ...,  6106,  3955, 39573],
       [    0,     0,     0, ...,   140,     4,   626],
       ...,
       [    0,     0,     0, ...,   140,     8,   513],
       [    0,     0,     0, ...,   131,     4,   155],
       [    0,     0,     0, ...,   219,    30,  3345]])

In [35]:
# Encode the test bodies with the body tokenizer, then pad to max_len_b.
sequences_test_b = tok.texts_to_sequences(X_test_b)
sequences_matrix_test_b = pad_sequences(sequences_test_b, maxlen=max_len_b)

In [36]:
# Shapes of the training inputs (titles, bodies) and training targets.
sequences_matrix_train_t.shape,sequences_matrix_train_b.shape,y_train.shape

((54211, 18), (54211, 600), (54211, 3))

In [37]:
# Shapes of the test inputs (titles, bodies) and test targets.
sequences_matrix_test_t.shape,sequences_matrix_test_b.shape,y_test.shape

((18071, 18), (18071, 600), (18071, 3))

In [38]:
def RNN(num_classes=3):
    """Build the two-input (title + body) GRU model with a main and an auxiliary output.

    Parameters
    ----------
    num_classes : int, default 3
        Width of both sigmoid output layers, i.e. the number of labels predicted.

    Returns
    -------
    Model
        Uncompiled Keras model taking ``[title_input, body_input]`` and
        producing ``[main_output, aux_output]``.

    Notes
    -----
    Reads the notebook-level ``max_len_t``, ``max_len_b``, ``vocab_len_t`` and
    ``vocab_len_b``, which must be defined before the function is called.
    """
    # Title branch: padded word ids -> embedding -> GRU summary vector.
    title_input = Input(name='title_input', shape=[max_len_t])
    # Word ids run from 1 to vocab_len_t and 0 is the (masked) padding value,
    # hence vocab_len_t + 1 embedding rows. The sequence length comes from the
    # Input shape, so the deprecated `input_length` argument is not passed.
    # NOTE(review): the 2000-wide title embedding accounts for ~41M parameters
    # in the model summary — confirm this width is intended.
    title_Embed = Embedding(vocab_len_t + 1, 2000, mask_zero=True, name='title_Embed')(title_input)
    gru_out_t = GRU(300)(title_Embed)

    # Auxiliary output on the title GRU alone, meant to tune the GRU weights smoothly.
    auxiliary_output = Dense(num_classes, activation='sigmoid', name='aux_output')(gru_out_t)

    # Body branch: same structure with its own vocabulary and sizes.
    body_input = Input(name='body_input', shape=[max_len_b])
    body_Embed = Embedding(vocab_len_b + 1, 170, mask_zero=True, name='body_Embed')(body_input)
    gru_out_b = GRU(200)(body_Embed)

    # Concatenate both GRU summaries and feed them to the dense layers.
    com = concatenate([gru_out_t, gru_out_b])
    dense1 = Dense(400, activation='relu')(com)
    dp1 = Dropout(0.5)(dense1)
    bn = BatchNormalization()(dp1)
    dense2 = Dense(150, activation='relu')(bn)

    # Main output: one independent sigmoid per label.
    main_output = Dense(num_classes, activation='sigmoid', name='main_output')(dense2)

    model = Model(inputs=[title_input, body_input], outputs=[main_output, auxiliary_output])
    return model

In [39]:
# Build the network and print its layer-by-layer summary.
model = RNN()
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 title_input (InputLayer)       [(None, 18)]         0           []                               
                                                                                                  
 body_input (InputLayer)        [(None, 600)]        0           []                               
                                                                                                  
 title_Embed (Embedding)        (None, 18, 2000)     41350000    ['title_input[0][0]']            
                                                                                                  
 body_Embed (Embedding)         (None, 600, 170)     39722880    ['body_input[0][0]']             
                                                                                              

In [40]:
# Count how often each distinct value occurs across all entries of the binarized training targets.
print(np.unique(y_train, return_counts=True))

(array([0, 1]), array([107965,  54668], dtype=int64))


In [41]:
# Both outputs are independent per-label sigmoids trained on multi-hot targets
# (a question can carry several tags), so each label is scored with binary
# cross-entropy. Categorical cross-entropy expects one probability distribution
# over mutually exclusive classes, which these outputs do not form.
model.compile(optimizer='adam',
              loss={'main_output': 'binary_crossentropy', 'aux_output': 'binary_crossentropy'},
              metrics=['accuracy'])

In [None]:
# Inputs and targets are keyed by the names given to the model's input and output layers.
train_inputs = {'title_input': sequences_matrix_train_t, 'body_input': sequences_matrix_train_b}
train_targets = {'main_output': y_train, 'aux_output': y_train}
val_inputs = {'title_input': sequences_matrix_test_t, 'body_input': sequences_matrix_test_b}
val_targets = {'main_output': y_test, 'aux_output': y_test}

# validation_data is passed as the documented (inputs, targets) tuple rather than a list.
# NOTE(review): the test split doubles as validation data here, so scores
# computed on it afterwards are not from data unseen during training.
results = model.fit(train_inputs, train_targets,
                    validation_data=(val_inputs, val_targets),
                    epochs=5, batch_size=800)

Epoch 1/2
Epoch 2/2

In [None]:
# Predict on the test split; the model returns the main output first, then the auxiliary one.
test_inputs = {'title_input': sequences_matrix_test_t, 'body_input': sequences_matrix_test_b}
predicted_main, predicted_aux = model.predict(test_inputs, verbose=1)

In [None]:
from sklearn.metrics import classification_report,f1_score

In [None]:
# A label counts as predicted when the main output exceeds 0.55;
# the F1 score is averaged per sample.
main_predicted_labels = predicted_main > .55
print(f1_score(y_test, main_predicted_labels, average='samples'))

In [None]:
# Per-label precision/recall/F1 at the 0.55 decision threshold. target_names makes
# the report rows show the tag names (binarizer class order) instead of bare indices.
print(classification_report(y_test, predicted_main > .55, target_names=labels))

In [None]:
# Inspect a single test question (positional row 24).
test.iloc[24]

In [None]:
# Main-output scores for that same row, rounded to two decimals.
predicted_main[24].round(decimals = 2)

In [None]:
# Class order of the binarizer, i.e. which tag each score column refers to.
labels

In [None]:
# Save the trained model to the working directory in Hierarchical Data Format (HDF5, .h5).
model.save('./prj1_stackoverflow_tags.h5')