## Predict tags on StackOverflow with linear models

In this project we predict tags for posts from [StackOverflow](https://stackoverflow.com). To solve this task we use a multilabel classification approach.

The libraries used are: numpy, pandas, scipy, sklearn, nltk

In [400]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import sys
import re
from ast import literal_eval

import nltk
# Required to remove english stop words from the corpus
nltk.download('stopwords')
from nltk.corpus import stopwords 

from sklearn.preprocessing import MultiLabelBinarizer
from scipy import sparse as sp_sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [401]:
# Load the train / validation / test splits from tab-separated files.
# The 'tags' column is stored as the string form of a Python list, so
# literal_eval is applied to turn every entry back into a real list.
# Only train['tags'] and validation['tags'] are parsed here; the test
# frame is loaded as-is.

train = pd.read_csv('data/train.tsv', sep='\t')
validation = pd.read_csv('data/validation.tsv', sep='\t')
test = pd.read_csv('data/test.tsv', sep='\t')

train['tags'] = train['tags'].apply(literal_eval)
validation['tags'] = validation['tags'].apply(literal_eval)


In [402]:
# Preview the first 5 rows of the loaded train dataset (5 is head()'s default)
train.head()

Unnamed: 0,title,tags
0,How to draw a stacked dotplot in R?,[r]
1,mysql select all records where a datetime fiel...,"[php, mysql]"
2,How to terminate windows phone 8.1 app,[c#]
3,get current time in a specific country via jquery,"[javascript, jquery]"
4,Configuring Tomcat to Use SSL,[java]


In [403]:
# Split each frame into model inputs X (the 'title' column) and labels y
# (the 'tags' column). .values returns the column's underlying array.
# The test frame only contributes titles here.
X_train = train['title'].values
y_train = train['tags'].values

X_val = validation['title'].values
y_val = validation['tags'].values

X_test = test['title'].values


In [None]:
# Text processing...

# Patterns and stop-word set used by text_prepare().
# Raw string literals are used so the regex backslashes are not parsed as
# (invalid) string escape sequences; the compiled patterns are unchanged.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')  # characters replaced by a space
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')  # anything outside this set is deleted
STOPWORDS = set(stopwords.words('english'))  # set gives fast membership tests

def text_prepare(text):
    """Normalise a title string and strip English stop words from it.

    The text is lowercased, every character matched by REPLACE_BY_SPACE_RE
    is replaced by a space, every character matched by BAD_SYMBOLS_RE is
    deleted, and the whitespace-separated tokens that are not in STOPWORDS
    are joined back together with single spaces.

    Returns the cleaned string.
    """
    lowered = text.lower()
    spaced = REPLACE_BY_SPACE_RE.sub(" ", lowered)
    cleaned = BAD_SYMBOLS_RE.sub("", spaced)

    # Removing English stopwords; split() also collapses runs of whitespace.
    kept_tokens = (token for token in cleaned.split() if token not in STOPWORDS)
    return ' '.join(kept_tokens)

# Clean every title in each split; each result is a plain list of strings.
X_train = list(map(text_prepare, X_train))
X_val = list(map(text_prepare, X_val))
X_test = list(map(text_prepare, X_test))


We consider two approaches for training. In the first approach we train on a bag-of-words representation built from the 5000 most frequent words in the corpus. In the second approach we train on features produced by sklearn's TfidfVectorizer().

In [None]:
# Preparing train data for the BOW (first approach)
tags_counts = {}   # tag  -> number of occurrences in the training tag lists
words_counts = {}  # word -> number of occurrences in the cleaned training titles

# Iterate over the items directly rather than indexing through
# range(len(...)); dict.get supplies the 0 for a first occurrence, so each
# update needs a single lookup instead of a membership test plus a lookup.
for tags in y_train:
    for tag in tags:
        tags_counts[tag] = tags_counts.get(tag, 0) + 1

for title in X_train:
    for word in title.split():
        words_counts[word] = words_counts.get(word, 0) + 1


In [None]:
DICT_SIZE = 5000  # number of most frequent words kept in the vocabulary

# (word, count) pairs ordered from the most to the least frequent word.
W_COUNTS = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)

# Forward and reverse lookups between a word and its column index.
WORDS_TO_INDEX = {word: idx for idx, (word, _) in enumerate(W_COUNTS[:DICT_SIZE])}
INDEX_TO_WORDS = {idx: word for idx, (word, _) in enumerate(W_COUNTS[:DICT_SIZE])}

# The vocabulary words in index order. This must read from W_COUNTS: a
# lowercase `w_counts` is never defined and raises NameError.
ALL_WORDS = [word for word, _ in W_COUNTS[:DICT_SIZE]]

# The below function takes text, size of dictionary and return a vector which is a bag-of-words representation of 'text'
def my_bag_of_words(text, words_to_index, dict_size):
    """Build a bag-of-words count vector for ``text``.

    text: string whose whitespace-separated tokens are counted.
    words_to_index: mapping from a word to its position in the vector.
    dict_size: length of the returned vector.

    Returns a NumPy array of length ``dict_size`` in which position i holds
    the number of times the word mapped to i occurs in ``text``. Tokens
    missing from ``words_to_index`` are ignored.
    """
    counts = np.zeros(dict_size)

    for token in text.split():
        if token not in words_to_index:
            continue  # out-of-vocabulary token
        counts[words_to_index[token]] += 1

    return counts

# The count vectors are mostly zeros, so each one is converted to a sparse
# row and the rows are stacked into a single sparse matrix per split.
def _stack_bag_of_words(texts):
    """Return the bag-of-words vectors of ``texts`` stacked as a sparse matrix."""
    sparse_rows = [
        sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE))
        for text in texts
    ]
    return sp_sparse.vstack(sparse_rows)


X_train_mybag = _stack_bag_of_words(X_train)
X_val_mybag = _stack_bag_of_words(X_val)
X_test_mybag = _stack_bag_of_words(X_test)

print('X_train shape ', X_train_mybag.shape)


X_train shape  (100000, 5000)


In [None]:
# Here we do our second approach i.e. TfidfVectorizer
# Input X_train, X_val, X_test — samples        
# Output TF-IDF vectorized representation of each sample and vocabulary
    
def tfidf_features(X_train, X_val, X_test):
    """Fit a TF-IDF vectorizer on the training texts and vectorize all splits.

    X_train, X_val, X_test: iterables of text samples.

    The vectorizer uses unigrams and bigrams (ngram_range=(1, 2)), the
    min_df=5 / max_df=0.9 document-frequency cut-offs, and treats every run
    of non-whitespace characters as a token. It is fitted on ``X_train``
    only; ``X_val`` and ``X_test`` are transformed with the fitted
    vocabulary.

    Returns a 4-tuple: the vectorized train, validation and test samples,
    followed by the fitted vocabulary (term -> column index mapping).
    """
    # Raw string: in a normal literal '\S' is an invalid escape sequence.
    tfidf_vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        min_df=5,
        max_df=0.9,
        token_pattern=r'(\S+)',
    )

    # Separate local names keep the caller's inputs distinct from the results.
    train_matrix = tfidf_vectorizer.fit_transform(X_train)
    val_matrix = tfidf_vectorizer.transform(X_val)
    test_matrix = tfidf_vectorizer.transform(X_test)

    return train_matrix, val_matrix, test_matrix, tfidf_vectorizer.vocabulary_

# Vectorize the three splits and keep the fitted vocabulary.
(X_train_tfidf,
 X_val_tfidf,
 X_test_tfidf,
 tfidf_vocab) = tfidf_features(X_train, X_val, X_test)

# Inverse lookup: column index -> term.
tfidf_reversed_vocab = dict((column, term) for term, column in tfidf_vocab.items())


In [None]:
# This is a multi-label classification problem, so the tag lists are
# binarized into indicator rows with one column per tag seen in training.
mlb = MultiLabelBinarizer(classes=sorted(tags_counts))
y_train = mlb.fit_transform(y_train)
# The binarizer is fitted on the training labels only; validation labels are
# just transformed so that validation data never takes part in fitting.
y_val = mlb.transform(y_val)


In [None]:
# Function to train the classifier. Input - training data. Output - trained classifier

def train_classifier(X_train, y_train):
    """Fit a one-vs-rest logistic-regression classifier and return it.

    X_train: training feature matrix.
    y_train: training labels, passed straight to the classifier's fit().

    The base estimator is an L1-penalised LogisticRegression using the
    'saga' solver, wrapped in OneVsRestClassifier.
    """
    base_estimator = LogisticRegression(solver='saga', penalty="l1")
    # fit() returns the fitted estimator itself, so it can be returned directly.
    return OneVsRestClassifier(base_estimator).fit(X_train, y_train)


In [None]:
# Fit one classifier per feature representation (bag-of-words, then tf-idf),
# both against the same label matrix.
classifier_mybag = train_classifier(X_train=X_train_mybag, y_train=y_train)
classifier_tfidf = train_classifier(X_train=X_train_tfidf, y_train=y_train)




In [None]:
# Validation-set predictions from both classifiers, in two forms.

# 1) Predicted labels, as returned by predict().
y_val_predicted_labels_mybag = classifier_mybag.predict(X_val_mybag)
y_val_predicted_labels_tfidf = classifier_tfidf.predict(X_val_tfidf)

# 2) Scores, as returned by decision_function().
y_val_predicted_scores_mybag = classifier_mybag.decision_function(X_val_mybag)
y_val_predicted_scores_tfidf = classifier_tfidf.decision_function(X_val_tfidf)


In [None]:
# Evaluation 1.) Accuracy 2.) Weighted F1-score 3.) Average precision score
def print_evaluation_scores(y_val, predicted):
    """Print accuracy, weighted F1 and average precision for ``predicted``.

    y_val: true labels.
    predicted: predicted labels, compared against ``y_val``.

    Nothing is returned; one line is printed per metric.
    """
    # Each metric is wrapped in a callable so it is computed right before
    # its own line is printed.
    metrics = (
        ("Accuracy score: ", lambda: accuracy_score(y_val, predicted)),
        ("F-1 score weighted: ", lambda: f1_score(y_val, predicted, average='weighted')),
        ("Average precision score: ", lambda: average_precision_score(y_val, predicted)),
    )
    for label, compute in metrics:
        print(label, compute())
   

In [None]:
# Report the validation metrics for each approach, separated by a blank line.
print('Bag-of-words')
print_evaluation_scores(y_val=y_val, predicted=y_val_predicted_labels_mybag)
print('')
print('Tfidf')
print_evaluation_scores(y_val=y_val, predicted=y_val_predicted_labels_tfidf)


In [None]:
# Predictions for the test dataset, made with the classifier trained on TF-IDF features
test_predictions = classifier_tfidf.predict(X_test_tfidf)
# Convert the predicted indicator rows back into tag labels using the fitted binarizer
test_pred_inversed = mlb.inverse_transform(test_predictions)
