# StackOverflow Tag Predictor - Model Optimization

**By Aziz Presswala**

In [22]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import sqlite3
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from wordcloud import WordCloud
import re
import os
from sqlalchemy import create_engine # database connection
import datetime as dt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.metrics import f1_score,precision_score,recall_score
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from datetime import datetime
from sklearn.model_selection import GridSearchCV

In [2]:
def create_connection(db_file):
    """Open a connection to the SQLite database stored at ``db_file``.

    :param db_file: path of the database file
    :return: Connection object, or None if the connection could not be opened
    """
    try:
        return sqlite3.connect(db_file)
    except sqlite3.Error as e:
        # Catch the sqlite3 exception base class explicitly: a bare ``Error``
        # is not defined in this notebook, so a failed connect used to raise
        # NameError instead of reporting the problem and returning None.
        print(e)
        return None

In [3]:
def tags_to_choose(n):
    """Return the columns of ``multilabel_y`` for the ``n`` most frequent tags.

    Reads the module-level ``multilabel_y`` label matrix. Columns in the
    result are ordered from the highest column sum to the lowest.

    :param n: number of top columns (tags) to keep
    :return: ``multilabel_y`` restricted to those columns
    """
    # Column sums = number of questions carrying each tag.
    # assumes the column sum comes back as a single-row 2-D result, hence [0] — TODO confirm
    tag_counts = multilabel_y.sum(axis=0).tolist()[0]
    by_frequency = sorted(range(len(tag_counts)), key=tag_counts.__getitem__, reverse=True)
    top_columns = by_frequency[:n]
    return multilabel_y[:, top_columns]

def questions_explained_fn(n):
    """Count the questions that carry none of the ``n`` most frequent tags.

    Despite the name, the returned value is the number of rows whose label
    sum over the selected tags is zero, i.e. questions *not* covered.

    :param n: number of top tags passed on to ``tags_to_choose``
    :return: number of rows with a zero row sum
    """
    top_n_labels = tags_to_choose(n)
    tags_per_question = top_n_labels.sum(axis=1)
    return np.count_nonzero(tags_per_question == 0)

In [4]:
# Load the preprocessed questions and their tags from the SQLite file.
write_db = 'Titlemoreweight.db'

# Fail early with a clear message; previously a missing file or a failed
# connection surfaced later as NameError / AttributeError on conn_r.
if not os.path.isfile(write_db):
    raise FileNotFoundError("database file not found: {}".format(write_db))

conn_r = create_connection(write_db)
if conn_r is None:
    raise RuntimeError("could not open database: {}".format(write_db))

try:
    preprocessed_data = pd.read_sql_query("""SELECT question, Tags FROM QuestionsProcessed""", conn_r)
finally:
    # Read-only query: nothing to commit, but always release the connection,
    # even if the query itself raises.
    conn_r.close()

In [5]:
# Preview the first five rows of the loaded frame.
preprocessed_data.head(n=5)

Unnamed: 0,question,tags
0,dynam datagrid bind silverlight dynam datagrid...,c# silverlight data-binding
1,dynam datagrid bind silverlight dynam datagrid...,c# silverlight data-binding columns
2,java.lang.noclassdeffounderror javax servlet j...,jsp jstl
3,java.sql.sqlexcept microsoft odbc driver manag...,java jdbc
4,better way updat feed fb php sdk better way up...,facebook api facebook-php-sdk


In [6]:
# Report the row and column counts of the loaded sample.
n_points, n_dims = preprocessed_data.shape[0], preprocessed_data.shape[1]
print("number of data points in sample :", n_points)
print("number of dimensions :", n_dims)

number of data points in sample : 500000
number of dimensions : 2


**Converting string Tags to multilabel output variables**

In [11]:
# One column per distinct tag; tags are whitespace-separated in the 'tags' column.
# binary must be the boolean True: the string 'true' only worked by truthiness
# and is rejected by scikit-learn releases that validate constructor parameters.
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(), binary=True)
multilabel_y = vectorizer.fit_transform(preprocessed_data['tags'])

**Selecting 500 Tags**

In [12]:
total_tags = multilabel_y.shape[1]
total_qs = preprocessed_data.shape[0]

# Percentage of questions with at least one of the top-k tags,
# for k = 500, 600, ... up to (but excluding) total_tags.
questions_explained = [
    np.round(((total_qs - questions_explained_fn(n_tags)) / total_qs) * 100, 3)
    for n_tags in range(500, total_tags, 100)
]

In [13]:
# Keep only the 500 most frequent tags as prediction targets.
multilabel_yx = tags_to_choose(500)

# Questions left with no label once the tag set is restricted.
uncovered_qs = questions_explained_fn(500)
print("number of questions that are not covered :", uncovered_qs, "out of ", total_qs)

number of questions that are not covered : 45221 out of  500000


In [14]:
# Positional split: the first 400000 rows train, the remainder test.
train_size = 400000
n_rows = preprocessed_data.shape[0]

x_train = preprocessed_data.head(train_size)
x_test = preprocessed_data.tail(n_rows - train_size)

# Slice the label matrix at the same row boundary so X and Y stay aligned.
y_train = multilabel_yx[0:train_size, :]
y_test = multilabel_yx[train_size:n_rows, :]

In [15]:
# Shapes of the label matrices produced by the split.
train_label_shape = y_train.shape
test_label_shape = y_test.shape
print("Number of data points in train data :", train_label_shape)
print("Number of data points in test data :", test_label_shape)

Number of data points in train data : (400000, 500)
Number of data points in test data : (100000, 500)


### Part 1 - Logistic Regression with BoW

<h4> Featurizing data with BoW vectorizer - ngram_range=(1,4)</h4>

In [16]:
start = datetime.now()

# Bag-of-words over whitespace tokens, using 1- to 4-grams.
vectorizer = CountVectorizer(
    ngram_range=(1, 4),
    min_df=0.00009,
    max_features=200000,
    tokenizer=lambda doc: doc.split(),
)

# Fit the vocabulary on the training questions only, then reuse it for test.
train_questions = x_train['question']
test_questions = x_test['question']
x_train_multilabel = vectorizer.fit_transform(train_questions)
x_test_multilabel = vectorizer.transform(test_questions)

print("Time taken to run this cell :", datetime.now() - start)

Time taken to run this cell : 3:41:09.156093


In [17]:
# Feature and label matrix shapes for each split.
train_x_shape, train_y_shape = x_train_multilabel.shape, y_train.shape
test_x_shape, test_y_shape = x_test_multilabel.shape, y_test.shape
print("Dimensions of train data X:", train_x_shape, "Y :", train_y_shape)
print("Dimensions of test data X:", test_x_shape, "Y:", test_y_shape)

Dimensions of train data X: (400000, 95585) Y : (400000, 500)
Dimensions of test data X: (100000, 95585) Y: (100000, 500)


<h4>Applying Logistic Regression with OneVsRestClassifier</h4>

In [20]:
start = datetime.now()

# One binary SGD classifier (log loss, L1 penalty) per tag.
base_estimator = SGDClassifier(loss='log', alpha=0.00001, penalty='l1')
classifier = OneVsRestClassifier(base_estimator)
classifier.fit(x_train_multilabel, y_train)
predictions = classifier.predict(x_test_multilabel)

print("Accuracy :", metrics.accuracy_score(y_test, predictions))
print("Hamming loss ", metrics.hamming_loss(y_test, predictions))

# Same three scores under micro and then macro averaging.
for average, header in (("micro", "Micro-average quality numbers"),
                        ("macro", "Macro-average quality numbers")):
    precision = precision_score(y_test, predictions, average=average)
    recall = recall_score(y_test, predictions, average=average)
    f1 = f1_score(y_test, predictions, average=average)
    print(header)
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print(metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.10774
Hamming loss  0.0060036
Micro-average quality numbers
Precision: 0.2847, Recall: 0.4805, F1-measure: 0.3575
Macro-average quality numbers
Precision: 0.2056, Recall: 0.4105, F1-measure: 0.2661
              precision    recall  f1-score   support

           0       0.73      0.80      0.76      5519
           1       0.44      0.44      0.44      8190
           2       0.52      0.52      0.52      6529
           3       0.49      0.60      0.54      3231
           4       0.54      0.53      0.53      6430
           5       0.42      0.53      0.47      2879
           6       0.58      0.62      0.60      5086
           7       0.58      0.68      0.63      4533
           8       0.23      0.22      0.22      3000
           9       0.54      0.63      0.59      2765
          10       0.32      0.34      0.33      3051
          11       0.46      0.52      0.49      3009
          12       0.38      0.44      0.41      2630
          13       0.35      0.4

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


### Part 2 - Hyperparameter tuning for Logistic Regression

In [25]:
start = datetime.now()

# Candidate alpha values: powers of ten from 10**-3 through 10**3.
alpha_grid = [10**exponent for exponent in range(-3, 4)]
params = {'estimator__alpha': alpha_grid}

# 'estimator__' routes the parameter to the SGDClassifier wrapped by OneVsRest.
classifier = OneVsRestClassifier(SGDClassifier(loss='log', penalty='l1'))
gscv = GridSearchCV(
    classifier,
    param_grid=params,
    scoring='f1_micro',
    return_train_score=True,
    n_jobs=-1,
)
gscv.fit(x_train_multilabel, y_train)

print(gscv.best_score_)
print(gscv.best_params_)
print("Time taken to run this cell :", datetime.now() - start)

0.4485524752691557
{'estimator__alpha': 0.001}
Time taken to run this cell : 4:37:02.728530


In [28]:
start = datetime.now()

# Extend the search below the first grid: alpha = 10**-5 and 10**-4.
alpha_grid = [10**exponent for exponent in (-5, -4)]
params = {'estimator__alpha': alpha_grid}

# 'estimator__' routes the parameter to the SGDClassifier wrapped by OneVsRest.
classifier = OneVsRestClassifier(SGDClassifier(loss='log', penalty='l1'))
gscv = GridSearchCV(
    classifier,
    param_grid=params,
    scoring='f1_micro',
    return_train_score=True,
    n_jobs=-1,
)
gscv.fit(x_train_multilabel, y_train)

print(gscv.best_score_)
print(gscv.best_params_)
print("Time taken to run this cell :", datetime.now() - start)

0.4426540793261227
{'estimator__alpha': 0.0001}
Time taken to run this cell : 1:42:20.344928


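**From the above 2 cells, we observe that the best value of alpha is *0.001*** (cross-validated micro-F1 of 0.4486, versus 0.4427 for the best value in the second grid, alpha = 0.0001).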

In [27]:
# Training the model with the optimal value of alpha
start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.001, penalty='l1'))
classifier.fit(x_train_multilabel, y_train)
predictions = classifier.predict (x_test_multilabel)


print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.18564
Hamming loss  0.00322702
Micro-average quality numbers
Precision: 0.5618, Recall: 0.3258, F1-measure: 0.4125
Macro-average quality numbers
Precision: 0.4054, Recall: 0.2385, F1-measure: 0.2821
              precision    recall  f1-score   support

           0       0.80      0.68      0.74      5519
           1       0.53      0.23      0.32      8190
           2       0.64      0.42      0.51      6529
           3       0.60      0.48      0.53      3231
           4       0.81      0.36      0.50      6430
           5       0.65      0.42      0.51      2879
           6       0.76      0.56      0.64      5086
           7       0.84      0.57      0.68      4533
           8       0.53      0.14      0.22      3000
           9       0.76      0.48      0.59      2765
          10       0.58      0.14      0.22      3051
          11       0.57      0.38      0.45      3009
          12       0.63      0.24      0.35      2630
          13       0.53      0.

### Part 3 - Linear SVM with OneVsRestClassifier

In [24]:
start = datetime.now()

# One binary SGD classifier (hinge loss, L1 penalty) per tag.
base_estimator = SGDClassifier(loss='hinge', alpha=0.00001, penalty='l1')
classifier = OneVsRestClassifier(base_estimator)
classifier.fit(x_train_multilabel, y_train)
predictions = classifier.predict(x_test_multilabel)

print("Accuracy :", metrics.accuracy_score(y_test, predictions))
print("Hamming loss ", metrics.hamming_loss(y_test, predictions))

# Same three scores under micro and then macro averaging.
for average, header in (("micro", "Micro-average quality numbers"),
                        ("macro", "Macro-average quality numbers")):
    precision = precision_score(y_test, predictions, average=average)
    recall = recall_score(y_test, predictions, average=average)
    f1 = f1_score(y_test, predictions, average=average)
    print(header)
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print(metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.10822
Hamming loss  0.00591506
Micro-average quality numbers
Precision: 0.2886, Recall: 0.4787, F1-measure: 0.3601
Macro-average quality numbers
Precision: 0.2087, Recall: 0.4072, F1-measure: 0.2683
              precision    recall  f1-score   support

           0       0.71      0.81      0.75      5519
           1       0.45      0.47      0.46      8190
           2       0.49      0.53      0.51      6529
           3       0.54      0.57      0.55      3231
           4       0.53      0.54      0.53      6430
           5       0.40      0.50      0.44      2879
           6       0.59      0.62      0.60      5086
           7       0.60      0.67      0.64      4533
           8       0.22      0.24      0.23      3000
           9       0.57      0.64      0.60      2765
          10       0.32      0.34      0.33      3051
          11       0.45      0.53      0.48      3009
          12       0.37      0.39      0.38      2630
          13       0.35      0.

## Conclusion:-

In this assignment we improved the performance of the model by applying different techniques such as:-
1. Bag of Words with ngram_range=(1,4)
2. Hyperparameter tuning for alpha of Logistic Regression
3. Training the model using Linear SVM

After performing the above-mentioned steps, the best-performing model was **Logistic Regression** with **alpha=0.001**, trained using a **Bag of Words** vectorizer with an **ngram_range=(1,4)**.