In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

# Objective

Build a model to automatically predict tags for a given a StackExchange question by using the text of the question.
![alt text](https://cdn.sstatic.net/Sites/stackoverflow/company/img/logos/se/se-logo.svg?v=d29f0785ebb7)

__Dataset Specs__: Over 85,000 questions

[Download Link](https://www.kaggle.com/stackoverflow/statsquestions#Questions.csv)

# Steps to Follow



1. Load Data and Import Libraries
2. Text Cleaning
3. Merge Tags with Questions
4. Dataset Prepartion
5. Text Representation
6. Model Building
    1. Define Model Architecture
    2. Train the Model
7. Model Predictions
8. Model Evaluation
9. Inference



# Load Data and Import Libraries

In [None]:
# extract data from the ZIP file
# !unzip '/content/drive/statsquestions.zip'

In [1]:
#string matching
import re 

#reading files
import pandas as pd

#handling html data
from bs4 import BeautifulSoup

#visualization
import matplotlib.pyplot as plt  

pd.set_option('display.max_colwidth', 200)

In [23]:
# load the stackoverflow questions dataset
questions_df = pd.read_csv('../../../../../LargeData/Analytics_Vidhya/StackOverflow_tagging/Questions.csv',encoding='latin-1')

# load the tags dataset
tags_df = pd.read_csv('../../../../../LargeData/Analytics_Vidhya/StackOverflow_tagging/Tags.csv')

In [24]:
#print first 5 rows
questions_df.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,6,5.0,2010-07-19T19:14:44Z,272,The Two Cultures: statistics vs. machine learning?,"<p>Last year, I read a blog post from <a href=""http://anyall.org/"">Brendan O'Connor</a> entitled <a href=""http://anyall.org/blog/2008/12/statistics-vs-machine-learning-fight/"">""Statistics vs. Mach..."
1,21,59.0,2010-07-19T19:24:36Z,4,Forecasting demographic census,<p>What are some of the ways to forecast demographic census with some validation and calibration techniques?</p>\n\n<p>Some of the concerns:</p>\n\n<ul>\n<li>Census blocks vary in sizes as rural\n...
2,22,66.0,2010-07-19T19:25:39Z,208,Bayesian and frequentist reasoning in plain English,<p>How would you describe in plain English the characteristics that distinguish Bayesian from Frequentist reasoning?</p>\n
3,31,13.0,2010-07-19T19:28:44Z,138,What is the meaning of p values and t values in statistical tests?,"<p>After taking a statistics course and then trying to help fellow students, I noticed one subject that inspires much head-desk banging is interpreting the results of statistical hypothesis tests...."
4,36,8.0,2010-07-19T19:31:47Z,58,Examples for teaching: Correlation does not mean causation,"<p>There is an old saying: ""Correlation does not mean causation"". When I teach, I tend to use the following standard examples to illustrate this point:</p>\n\n<ol>\n<li>number of storks and birth ..."


In [25]:
tags_df.head()

Unnamed: 0,Id,Tag
0,1,bayesian
1,1,prior
2,1,elicitation
3,2,distributions
4,2,normality


# Text Cleaning

Let's define a function to clean the text data.

In [26]:
def cleaner(text):

  text = BeautifulSoup(text).get_text()
  
  # fetch alphabetic characters
  text = re.sub("[^a-zA-Z]", " ", text)

  # convert text to lower case
  text = text.lower()

  # split text into tokens to remove whitespaces
  tokens = text.split()

  return " ".join(tokens)

In [27]:
# call preprocessing function
questions_df['cleaned_text'] = questions_df['Body'].apply(cleaner)

In [28]:
questions_df['Body'][1]

"<p>What are some of the ways to forecast demographic census with some validation and calibration techniques?</p>\n\n<p>Some of the concerns:</p>\n\n<ul>\n<li>Census blocks vary in sizes as rural\nareas are a lot larger than condensed\nurban areas. Is there a need to account for the area size difference?</li>\n<li>if let's say I have census data\ndating back to 4 - 5 census periods,\nhow far can i forecast it into the\nfuture?</li>\n<li>if some of the census zone change\nlightly in boundaries, how can i\naccount for that change?</li>\n<li>What are the methods to validate\ncensus forecasts? for example, if i\nhave data for existing 5 census\nperiods, should I model the first 3\nand test it on the latter two? or is\nthere another way?</li>\n<li>what's the state of practice in\nforecasting census data, and what are\nsome of the state of the art methods?</li>\n</ul>\n"

In [29]:
questions_df['cleaned_text'][1]

'what are some of the ways to forecast demographic census with some validation and calibration techniques some of the concerns census blocks vary in sizes as rural areas are a lot larger than condensed urban areas is there a need to account for the area size difference if let s say i have census data dating back to census periods how far can i forecast it into the future if some of the census zone change lightly in boundaries how can i account for that change what are the methods to validate census forecasts for example if i have data for existing census periods should i model the first and test it on the latter two or is there another way what s the state of practice in forecasting census data and what are some of the state of the art methods'

# Merge Tags with Questions

Let's now explore the tags data.

In [30]:
tags_df.head()

Unnamed: 0,Id,Tag
0,1,bayesian
1,1,prior
2,1,elicitation
3,2,distributions
4,2,normality


In [31]:
# count of unique tags
len(tags_df['Tag'].unique())

1315

In [32]:
tags_df['Tag'].value_counts()

r                   13236
regression          10959
machine-learning     6089
time-series          5559
probability          4217
                    ...  
network-layout          1
concept-drift           1
replicability           1
qsar                    1
shapley-value           1
Name: Tag, Length: 1315, dtype: int64

In [33]:
# remove "-" from the tags
tags_df['Tag']= tags_df['Tag'].apply(lambda x:re.sub("-"," ",x))

In [34]:
# group tags Id wise
tags_df = tags_df.groupby('Id').apply(lambda x:x['Tag'].values).reset_index(name='tags')
tags_df.head()

Unnamed: 0,Id,tags
0,1,"[bayesian, prior, elicitation]"
1,2,"[distributions, normality]"
2,3,"[software, open source]"
3,4,"[distributions, statistical significance]"
4,6,[machine learning]


In [35]:
# merge tags and questions
df = pd.merge(questions_df,tags_df,how='inner',on='Id')

In [36]:
df = df[['Id','Body','cleaned_text','tags']]
df.head()

Unnamed: 0,Id,Body,cleaned_text,tags
0,6,"<p>Last year, I read a blog post from <a href=""http://anyall.org/"">Brendan O'Connor</a> entitled <a href=""http://anyall.org/blog/2008/12/statistics-vs-machine-learning-fight/"">""Statistics vs. Mach...",last year i read a blog post from brendan o connor entitled statistics vs machine learning fight that discussed some of the differences between the two fields andrew gelman responded favorably to ...,[machine learning]
1,21,<p>What are some of the ways to forecast demographic census with some validation and calibration techniques?</p>\n\n<p>Some of the concerns:</p>\n\n<ul>\n<li>Census blocks vary in sizes as rural\n...,what are some of the ways to forecast demographic census with some validation and calibration techniques some of the concerns census blocks vary in sizes as rural areas are a lot larger than conde...,"[forecasting, population, census]"
2,22,<p>How would you describe in plain English the characteristics that distinguish Bayesian from Frequentist reasoning?</p>\n,how would you describe in plain english the characteristics that distinguish bayesian from frequentist reasoning,"[bayesian, frequentist]"
3,31,"<p>After taking a statistics course and then trying to help fellow students, I noticed one subject that inspires much head-desk banging is interpreting the results of statistical hypothesis tests....",after taking a statistics course and then trying to help fellow students i noticed one subject that inspires much head desk banging is interpreting the results of statistical hypothesis tests it s...,"[hypothesis testing, t test, p value, interpretation, intuition]"
4,36,"<p>There is an old saying: ""Correlation does not mean causation"". When I teach, I tend to use the following standard examples to illustrate this point:</p>\n\n<ol>\n<li>number of storks and birth ...",there is an old saying correlation does not mean causation when i teach i tend to use the following standard examples to illustrate this point number of storks and birth rate in denmark number of ...,"[correlation, teaching]"


In [37]:
df.shape

(85085, 4)

There are over 85,000 unique questions and over 1300 tags.

# Dataset Preparation

In [38]:
# check frequency of occurence of each tag
freq= {}
for i in df['tags']:
  for j in i:
    if j in freq.keys():
      freq[j] = freq[j] + 1
    else:
      freq[j] = 1

Let's find out the most frequent tags.

In [39]:
# sort the dictionary in descending order
freq = dict(sorted(freq.items(), key=lambda x:x[1],reverse=True))

In [40]:
# freq.items() # too much data returned

# Simple way to slice a dictionary
import itertools

dict(itertools.islice(freq.items(), 10))

{'r': 13236,
 'regression': 10959,
 'machine learning': 6089,
 'time series': 5559,
 'probability': 4217,
 'hypothesis testing': 3869,
 'self study': 3732,
 'distributions': 3501,
 'logistic': 3316,
 'classification': 2881}

In [41]:
# Top 10 most frequent tags
common_tags = list(freq.keys())[:10]
common_tags

['r',
 'regression',
 'machine learning',
 'time series',
 'probability',
 'hypothesis testing',
 'self study',
 'distributions',
 'logistic',
 'classification']

We will use only those questions/queries that have the above 10 tags associated with it.

In [42]:
x=[]
y=[]

for i in range(len(df['tags'])):
  
  temp=[]
  for j in df['tags'][i]:
    if j in common_tags:
      temp.append(j)

  if(len(temp)>1):
    x.append(df['cleaned_text'][i])
    y.append(temp)

In [43]:
# number of questions left
len(x)

11106

In [44]:
y[:10]

[['r', 'time series'],
 ['regression', 'distributions'],
 ['distributions', 'probability', 'hypothesis testing'],
 ['hypothesis testing', 'self study'],
 ['r', 'regression', 'time series'],
 ['r', 'time series', 'self study'],
 ['probability', 'hypothesis testing'],
 ['r', 'regression'],
 ['r', 'regression'],
 ['regression', 'logistic']]

We will the input sequences to our model to the length of 100

In [45]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
 
y = mlb.fit_transform(y)
y.shape

(11106, 10)

In [46]:
y[0,:]

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 1])

In [47]:
mlb.classes_

array(['classification', 'distributions', 'hypothesis testing',
       'logistic', 'machine learning', 'probability', 'r', 'regression',
       'self study', 'time series'], dtype=object)

We can now split the dataset into training set and validation set. 

In [48]:
from sklearn.model_selection import train_test_split
x_tr,x_val,y_tr,y_val=train_test_split(x, y, test_size=0.2, random_state=0,shuffle=True)

# Text Representation

In [49]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences 

#prepare a tokenizer
x_tokenizer = Tokenizer() 

#prepare vocabulary
x_tokenizer.fit_on_texts(x_tr)

In [52]:
# x_tokenizer.word_index # returns too much data

# List the first 20 items in the dictionary.  Note the numbers is the row, not count of occurence!
dict(itertools.islice(x_tokenizer.word_index.items(), 20))

{'the': 1,
 'i': 2,
 'to': 3,
 'a': 4,
 'of': 5,
 'is': 6,
 'and': 7,
 'in': 8,
 'l': 9,
 'x': 10,
 'for': 11,
 'that': 12,
 'data': 13,
 'this': 14,
 't': 15,
 'have': 16,
 'y': 17,
 'with': 18,
 'model': 19,
 'it': 20}

In [53]:
len(x_tokenizer.word_index)

25315

There are around 25,000 tokens in the training dataset. Let's see how many tokens appear at least 5 times in the dataset.

In [54]:
thresh = 3

cnt=0
for key,value in x_tokenizer.word_counts.items():
  if value>=thresh:
    cnt=cnt+1

print(cnt)

12575


Over 12,000 tokens have appeared three times or more in the training set.

In [55]:
# prepare the tokenizer again
x_tokenizer = Tokenizer(num_words=cnt,oov_token='unk')

#prepare vocabulary
x_tokenizer.fit_on_texts(x_tr)

Now that we have encoded every token to an integer, let's convert the text sequences to integer sequences. After that we will pad the integer sequences to the maximum sequence length, i.e., 100.

In [56]:
#define threshold for maximum length of a setence
max_len=100

#convert text sequences into integer sequences
x_tr_seq = x_tokenizer.texts_to_sequences(x_tr) 
x_val_seq = x_tokenizer.texts_to_sequences(x_val)

#padding up with zero 
x_tr_seq = pad_sequences(x_tr_seq,  padding='post', maxlen=max_len)
x_val_seq = pad_sequences(x_val_seq, padding='post', maxlen=max_len)

Since we are padding the sequences with zeros, we must increment the vocabulary size by one.

In [57]:
#no. of unique words
x_voc_size = x_tokenizer.num_words + 1
x_voc_size

12576

In [58]:
x_tr_seq[0]

array([1953, 5711,  416, 2023,    1,  226, 1747, 3740,  609,   43,  181,
       1953,  372,   19,  100,  416,    9, 1747, 3839,  238,   27,   27,
         27,   27,   27,   70,    6, 6919,    8, 1163,   70,    6,   43,
         43, 1802, 1802, 1802,   36,   36,   36,   36, 4308, 5410,    4,
        124,  592,  107,   22,    2, 1747, 4065,   27,   10, 1309,   10,
       6415,   10,  190,   10,  416,   10,   27,   10, 1309,   10, 6415,
         10,  190,   10,  416,   10,  456,  139,   15,    7,    2, 4610,
        164,   27,   10, 1309,   10, 6415,   10,  190,   10,  416,   10,
         27,   76,   27, 1309,   76,   27, 6415,   76,   27,  190,   76,
         27])

# Model Building

In [63]:
from keras.models import *
from keras.layers import *
from keras.callbacks import *
from keras import backend as K

### Define Model Architecture

In [65]:
# define model architecture
K.clear_session()
model =  Sequential()
model.add(Embedding(x_voc_size, 50, trainable=True, input_shape=(max_len,)))  #embedding layer
  
model.add(Conv1D(64,3,padding='same'))  #conv1d layer: 64 filters and 3 is height of filter and 
# this is what what feature maps as output
model.add(Dropout(0.1))

model.add(GlobalMaxPooling1D()) # applies max pooling on the feature maps
  
model.add(Dense(128,activation='relu'))  #dense layer

model.add(Dense(10,activation='sigmoid')) #output layer
model.summary() #summary) of model

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 50)           628800    
_________________________________________________________________
conv1d (Conv1D)              (None, 100, 64)           9664      
_________________________________________________________________
dropout (Dropout)            (None, 100, 64)           0         
_________________________________________________________________
global_max_pooling1d (Global (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 128)               8320      
_________________________________________________________________
dense_1 (Dense)              (None, 10)                1290      
Total params: 648,074
Trainable params: 648,074
Non-trainable params: 0
__________________________________________________

In [66]:
#define optimizer and loss
model.compile(optimizer='adam',loss='binary_crossentropy')

#checkpoint to save best model during training
mc = ModelCheckpoint("weights.best.hdf5", monitor='val_loss', verbose=1, save_best_only=True, mode='min')

### Train the Model

This is MUCH faster!

In [68]:
#train the model 
model.fit(x_tr_seq, y_tr, batch_size=128, epochs=10, verbose=1, validation_data=(x_val_seq, y_val), callbacks=[mc])

Epoch 1/10
Epoch 00001: val_loss did not improve from 0.26730
Epoch 2/10
Epoch 00002: val_loss did not improve from 0.26730
Epoch 3/10
Epoch 00003: val_loss did not improve from 0.26730
Epoch 4/10
Epoch 00004: val_loss did not improve from 0.26730
Epoch 5/10
Epoch 00005: val_loss did not improve from 0.26730
Epoch 6/10
Epoch 00006: val_loss did not improve from 0.26730
Epoch 7/10
Epoch 00007: val_loss did not improve from 0.26730
Epoch 8/10
Epoch 00008: val_loss did not improve from 0.26730
Epoch 9/10
Epoch 00009: val_loss did not improve from 0.26730
Epoch 10/10
Epoch 00010: val_loss did not improve from 0.26730


<tensorflow.python.keras.callbacks.History at 0x2b3ac0870a0>

# Model Predictions 

In [69]:
# load weights into new model
model.load_weights("weights.best.hdf5")

#predict probabilities
pred_prob = model.predict(x_val_seq)

In [70]:
pred_prob[0]

array([0.05173081, 0.05248803, 0.00771904, 0.09648392, 0.24082986,
       0.04391071, 0.9517    , 0.47870407, 0.03080544, 0.05613467],
      dtype=float32)

The predictions are in terms of probabilities for each of the 10 tags. Hence we need to have a threshold value to convert these probabilities to 0 or 1.

Let's specify a set of candidate threshold values. We will select the threshold value that performs the best for the validation set.

In [72]:
import numpy as np
#define candidate threshold values
threshold  = np.arange(0,0.5,0.01)
threshold

array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,
       0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,
       0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,
       0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,
       0.44, 0.45, 0.46, 0.47, 0.48, 0.49])

Let's define a function that takes a threshold value and uses it to convert probabilities into 1 or 0.

In [73]:
# convert probabilities into classes or tags based on a threshold value
def classify(pred_prob,thresh):
  y_pred_seq = []

  for i in pred_prob:
    temp=[]
    for j in i:
      if j>=thresh:
        temp.append(1)
      else:
        temp.append(0)
    y_pred_seq.append(temp)

  return y_pred_seq

In [74]:
from sklearn import metrics
score=[]

#convert to 1 array
y_true = np.array(y_val).ravel() 

for thresh in threshold:
    
    #classes for each threshold
    y_pred_seq = classify(pred_prob,thresh) 

    #convert to 1d array
    y_pred = np.array(y_pred_seq).ravel()

    score.append(metrics.f1_score(y_true,y_pred))

In [75]:
# find the optimal threshold
opt = threshold[score.index(max(score))]
opt

0.36

# Model Evaluation

macro avg is 0.83.  This outperfomrs the others!!  This is not always the case but it works better this time.  For auto tagging, CNNs can be the champion.

Surely CNN can perform better in a few cases than RNN but this is not always true. For problems like machine translation and text summarization RNN performs significantly well. Both models have their respective advantages.

In [76]:
#predictions for optimal threshold
y_pred_seq = classify(pred_prob,opt)
y_pred = np.array(y_pred_seq).ravel()

In [None]:
print(metrics.classification_report(y_true,y_pred))

              precision    recall  f1-score   support

           0       0.93      0.94      0.93     17520
           1       0.75      0.72      0.74      4700

    accuracy                           0.89     22220
   macro avg       0.84      0.83      0.83     22220
weighted avg       0.89      0.89      0.89     22220



In [None]:
y_pred = mlb.inverse_transform(np.array(y_pred_seq))
y_true = mlb.inverse_transform(np.array(y_val))

df = pd.DataFrame({'comment':x_val,'actual':y_true,'predictions':y_pred})

In [None]:
df.sample(10)

Unnamed: 0,comment,actual,predictions
780,is there any function for m estimation in multivariate linear regression model in r i can estimate the beta s in my model by using the rlm by rewriting the y variables into one column but i would ...,"(r, regression)","(r, regression)"
1554,thanks in advance for the help i am writing a paper and for the life of me can t remember the proper term for a model that works as follows rawdata model outputmodel model outputmodel more specifi...,"(machine learning, regression)","(regression,)"
823,i am trying to understand the coefficients retrieved from running auto arima in r on my monthly time series of the annual change in house prices when doing so i obtain the following outcome series...,"(r, time series)","(r, time series)"
445,let f y prod i binom n i y i pi x i y i pi x i n i y i where pi x i frac e sum j x ij b j e sum j x ij b j then the likelihood is l propto prod i pi x i y i pi x i n i y i l sum i y i log frac pi ...,"(logistic, regression)","(regression, self study)"
1192,i recently received the following question via email i ll post an answer below but i was interested to hear what others thought would you call logistic regression a non parametric test my understa...,"(hypothesis testing, logistic)","(logistic,)"
1918,the background first i am currently working on some predictive modelling of some client shopping data to see if it is possible to categorise clients into one of nine ordinal categories according t...,"(classification, machine learning)","(classification, machine learning)"
616,begin eqnarray e left left left x k right right left x k right right e left k left left x k right right right e left x left left x k right right right end eqnarray k is a constant and x is a rando...,"(probability, self study)","(distributions, probability, self study)"
1882,suppose there is a vector v that contains the body height of every person over years old on earth it looks something like this and is normally distributed also suppose there is simple function f t...,"(distributions, hypothesis testing)","(distributions, probability)"
1059,i have a record of one climate variable with a data point every year and another one which has sample spacing that varies between and years i even have a few ages in that series for which i have t...,"(regression, time series)","(r, regression)"
762,i m trying to analyse the effect of smoking on health the health variable is binary healthy or not but there are different exposures to smoking active variable a and passive variable b variable a ...,"(hypothesis testing, r)","(r, regression)"


# Inference

In [None]:
def predict_tag(comment):  
  text=[]

  #preprocess  
  text = [cleaner(comment)]

  #convert to integer sequences
  seq = x_tokenizer.texts_to_sequences(text)

  #pad the sequence
  pad_seq = pad_sequences(seq,  padding='post', maxlen=max_len)

  #make predictions
  pred_prob = model.predict(pad_seq)
  classes = classify(pred_prob,opt)[0]
  
  classes = np.array([classes])
  classes = mlb.inverse_transform(classes)  
  return classes

In [None]:
comment = "For example, in the case of logistic regression, the learning function is a Sigmoid function that tries to separate the 2 classes"

print("Comment:",comment)
print("Predicted Tags:",predict_tag(comment))

Comment: For example, in the case of logistic regression, the learning function is a Sigmoid function that tries to separate the 2 classes
Predicted Tags: [('classification', 'logistic', 'machine learning', 'regression')]
