# Predict tags on StackOverflow

## Text preprocessing

In [3]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/blackbird/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
from ast import literal_eval
import pandas as pd
import numpy as np

In [5]:
def read_data(filename):
    data = pd.read_csv(filename, sep='\t')
    data['tags'] = data['tags'].apply(literal_eval)
    return data

In [6]:
train = read_data('data/train.tsv')
validation = read_data('data/validation.tsv')
test = pd.read_csv('data/test.tsv', sep='\t')

In [7]:
train.head()

Unnamed: 0,title,tags
0,How to draw a stacked dotplot in R?,[r]
1,mysql select all records where a datetime fiel...,"[php, mysql]"
2,How to terminate windows phone 8.1 app,[c#]
3,get current time in a specific country via jquery,"[javascript, jquery]"
4,Configuring Tomcat to Use SSL,[java]


In [8]:
X_train, y_train = train['title'].values, train['tags'].values
X_val, y_val = validation['title'].values, validation['tags'].values
X_test = test['title'].values

In [9]:
import re
from nltk import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/blackbird/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    text = text.lower()
    text = re.sub(REPLACE_BY_SPACE_RE, ' ', text)
    text = re.sub(BAD_SYMBOLS_RE, '', text)
    token_words = word_tokenize(text)
    filtered_words = [word for word in token_words if word not in STOPWORDS]
    text = ''
    for word in filtered_words:
        if word != filtered_words[len(filtered_words)-1]:
            text = text + word + ' '
        else:
            text = text + word
    return text

In [11]:
X_train = [text_prepare(x) for x in X_train]
X_val = [text_prepare(x) for x in X_val]
X_test = [text_prepare(x) for x in X_test]

### Words Tag Counter

In [12]:
from collections import Counter
tags_counts = Counter() #{}
words_counts = Counter() #{}

for sentence in X_train:
    for word in sentence.split():
        words_counts[word] += 1

for l in y_train:
    for tag in l:
        tags_counts[tag] += 1

In [13]:
tags_counts.keys()

dict_keys(['r', 'php', 'mysql', 'c#', 'javascript', 'jquery', 'java', 'ruby-on-rails', 'ruby', 'ruby-on-rails-3', 'json', 'spring', 'spring-mvc', 'codeigniter', 'class', 'html', 'ios', 'c++', 'eclipse', 'python', 'list', 'objective-c', 'swift', 'xaml', 'asp.net', 'wpf', 'multithreading', 'image', 'performance', 'twitter-bootstrap', 'linq', 'xml', 'numpy', 'ajax', 'django', 'laravel', 'android', 'rest', 'asp.net-mvc', 'web-services', 'string', 'excel', 'winforms', 'arrays', 'c', 'sockets', 'osx', 'entity-framework', 'mongodb', 'opencv', 'xcode', 'uitableview', 'algorithm', 'python-2.7', 'angularjs', 'dom', 'swing', '.net', 'vb.net', 'google-maps', 'hibernate', 'wordpress', 'iphone', 'sql', 'visual-studio', 'linux', 'facebook', 'database', 'file', 'generics', 'visual-studio-2010', 'regex', 'html5', 'jsp', 'csv', 'forms', 'validation', 'parsing', 'function', 'pandas', 'sorting', 'qt', 'wcf', 'css', 'date', 'node.js', 'sql-server', 'unit-testing', 'python-3.x', 'loops', 'windows', 'pointer

## 1. Bag of words
The bag-of-words model is a simplifying representation used in natural language processing and information retrieval (IR). Also known as the vector space model. In this model, a text (such as a sentence or a document) is represented as the bag (multiset) of its words, disregarding grammar and even word order but keeping multiplicity. 

In [14]:
DICT_SIZE = 5000
most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[0:DICT_SIZE]
WORDS_TO_INDEX = {}
INDEX_TO_WORDS = {}
ALL_WORDS = WORDS_TO_INDEX.keys()

for i in range(DICT_SIZE):
    WORDS_TO_INDEX[most_common_words[i]] = i
    INDEX_TO_WORDS[i] = most_common_words[i]

def my_bag_of_words(text, words_to_index, dict_size):
    result_vector = np.zeros(dict_size)
    for word in text.split():
        if word in words_to_index:
            result_vector[words_to_index[word]] += 1
    return result_vector

In [15]:
from scipy import sparse as sp_sparse

In [16]:
X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])
X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val])
X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test])
print('X_train shape ', X_train_mybag.shape)
print('X_val shape ', X_val_mybag.shape)
print('X_test shape ', X_test_mybag.shape)

X_train shape  (100000, 5000)
X_val shape  (30000, 5000)
X_test shape  (20000, 5000)


## 2.TF-IDF

The second approach extends the bag-of-words framework by taking into account total frequencies of words in the corpora. It helps to penalize too frequent words and provide better features space.

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
def tfidf_features(X_train, X_val, X_test):
    tfidf_vectorizer = TfidfVectorizer(min_df=5,max_df=0.9,ngram_range=(1,2),token_pattern= '(\S+)')
    X_train = tfidf_vectorizer.fit_transform(X_train)
    X_val = tfidf_vectorizer.transform(X_val)
    X_test = tfidf_vectorizer.transform(X_test)
    return X_train, X_val, X_test, tfidf_vectorizer.vocabulary_

In [22]:
X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vocab = tfidf_features(X_train, X_val, X_test)
tfidf_reversed_vocab = {i:word for word,i in tfidf_vocab.items()}

In [23]:
tfidf_vocab

{'draw': 4672,
 'stacked': 14705,
 'r': 12538,
 'mysql': 10223,
 'select': 13797,
 'records': 12798,
 'datetime': 3976,
 'field': 5686,
 'less': 8922,
 'specified': 14567,
 'value': 16867,
 'mysql select': 10256,
 'select records': 13831,
 'datetime field': 3980,
 'terminate': 15462,
 'windows': 17578,
 'phone': 11523,
 '81': 335,
 'app': 971,
 'windows phone': 17601,
 'phone 81': 11526,
 '81 app': 336,
 'get': 6532,
 'current': 3623,
 'time': 15643,
 'specific': 14522,
 'country': 3431,
 'via': 17153,
 'jquery': 8499,
 'get current': 6563,
 'current time': 3639,
 'time specific': 15670,
 'via jquery': 17167,
 'configuring': 3050,
 'tomcat': 15731,
 'use': 16415,
 'ssl': 14691,
 'awesome': 1449,
 'nested': 10378,
 'set': 14098,
 'plugin': 11801,
 'add': 506,
 'new': 10442,
 'children': 2502,
 'tree': 15809,
 'various': 17104,
 'levels': 8935,
 'add new': 552,
 'create': 3453,
 'map': 9519,
 'json': 8620,
 'response': 13187,
 'ruby': 13476,
 'rails': 12567,
 '3': 238,
 'json response': 

### Classification

In [30]:
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
mlb = MultiLabelBinarizer(classes=sorted(tags_counts.keys()))
y_train = mlb.fit_transform(y_train)
y_val = mlb.fit_transform(y_val)

In [33]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier

### MultilabelClassification

In [35]:
def train_classifier(X_train, y_train, penalty):
    model = OneVsRestClassifier(LogisticRegression(penalty = penalty))
    model = model.fit(X_train, y_train)       
    return model

classifier_mybag_l1 = train_classifier(X_train_mybag, y_train, 'l1')
classifier_mybag_l2 = train_classifier(X_train_mybag, y_train, 'l2')
classifier_tfidf_l1 = train_classifier(X_train_tfidf, y_train, 'l1')
classifier_tfidf_l2 = train_classifier(X_train_tfidf, y_train, 'l2')

y_val_predicted_labels_mybag_l1 = classifier_mybag_l1.predict(X_val_mybag)
y_val_predicted_scores_mybag_l1 = classifier_mybag_l1.decision_function(X_val_mybag)
y_val_predicted_labels_mybag_l2 = classifier_mybag_l2.predict(X_val_mybag)
y_val_predicted_scores_mybag_l2 = classifier_mybag_l2.decision_function(X_val_mybag)

y_val_predicted_labels_tfidf_l1 = classifier_tfidf_l1.predict(X_val_tfidf)
y_val_predicted_scores_tfidf_l1 = classifier_tfidf_l1.decision_function(X_val_tfidf)
y_val_predicted_labels_tfidf_l2 = classifier_tfidf_l2.predict(X_val_tfidf)
y_val_predicted_scores_tfidf_l2 = classifier_tfidf_l2.decision_function(X_val_tfidf)

In [36]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

def print_evaluation_scores(y_val, predicted):
    accuracy=accuracy_score(y_val, predicted)
    f1_score_macro=f1_score(y_val, predicted, average='macro')
    f1_score_micro=f1_score(y_val, predicted, average='micro')
    f1_score_weighted=f1_score(y_val, predicted, average='weighted')
    precision_macro=average_precision_score(y_val, predicted, average='macro')
    precision_micro=average_precision_score(y_val, predicted, average='micro')
    precision_weighted=average_precision_score(y_val, predicted, average='weighted')
    print("Accuracy:",accuracy,"\nF1-Score:",f1_score_macro,f1_score_micro,f1_score_weighted,"\nPrecision:",precision_macro,precision_micro,precision_weighted)

print('Bag-of-words_l1')
print_evaluation_scores(y_val, y_val_predicted_labels_mybag_l1)
print('Bag-of-words_l2')
print_evaluation_scores(y_val, y_val_predicted_labels_mybag_l2)
print('Tfidf_l1')
print_evaluation_scores(y_val, y_val_predicted_labels_tfidf_l1)
print('Tfidf_l2')
print_evaluation_scores(y_val, y_val_predicted_labels_tfidf_l2)

Bag-of-words_l1


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Accuracy: 0.0 
F1-Score: 0.0 0.0 0.0 
Precision: 0.01950333333333333 0.019503333333333334 0.08404558878824134
Bag-of-words_l2
Accuracy: 0.0 
F1-Score: 0.0 0.0 0.0 
Precision: 0.01950333333333333 0.019503333333333334 0.08404558878824134
Tfidf_l1
Accuracy: 0.36393333333333333 
F1-Score: 0.5080348321977922 0.6735434614823005 0.6510180465751839 
Precision: 0.3481820426698627 0.4855875341642438 0.5152161577529164
Tfidf_l2
Accuracy: 0.33363333333333334 
F1-Score: 0.4442030131093959 0.6403418428837515 0.6128137266508615 
Precision: 0.3007538166371564 0.45526079280229065 0.48358673078825803


In [46]:
y_test_predicted_labels_tfidf_l1 = classifier_tfidf_l1.predict(X_test_tfidf)
y_test_predicted_scores_tfidf_l1 = classifier_tfidf_l1.decision_function(X_test_tfidf)

y_text_predicted_inversed = mlb.inverse_transform(y_test_predicted_labels_tfidf_l1)

In [53]:
for i in range(20):
    print(test['title'].values[i],"\n",y_text_predicted_inversed[i],"\n")

 ('mysql', 'php') 

get click coordinates from <input type='image'> via javascript 
 ('javascript',) 

How to implement cloud storage for media assets in ZF? 
 () 

What is catcomplete in jQuery's autocomplete plugin? 
 ('javascript', 'jquery') 

Error building Android app with Cordova 3.1 CLI 
 ('android', 'java') 

How to Parse XML File in PHP 
 ('php', 'xml') 

Uploading files via JSON Post request to a Web Service provided by Teambox 
 ('json',) 

Adding rows to JTable in the right order. 
 ('java', 'swing') 

How to read input file in Python? 
 ('python',) 

PDF generation from an html containing images and text 
 ('html',) 

Trying to get sql query to be dynamic with jquery 
 ('jquery',) 

Fiting 2-parameters weibull distribution for tabulated data 
 () 

Add six months in php 
 ('php',) 

Where/How to code Constants in Rails 3 Application 
 ('ruby-on-rails', 'ruby-on-rails-3') 

Comparing list of items with one of the column in DataTable 
 ('c#',) 

python sort upper case and lo

### Analysis

In [38]:
def print_words_for_tag(classifier, tag, tags_classes, index_to_words, all_words):
    print('Tag:\t{}'.format(tag))
    est = classifier.estimators_[tags_classes.index(tag)]
    top_positive_words = [index_to_words[index] for index in est.coef_.argsort().tolist()[0][-5:]]  # top-5 words sorted by the coefficiens.
    top_negative_words = [index_to_words[index] for index in est.coef_.argsort().tolist()[0][:5]] # bottom-5 words  sorted by the coefficients.
    print('Top positive words:\t{}'.format(', '.join(top_positive_words)))
    print('Top negative words:\t{}\n'.format(', '.join(top_negative_words)))

In [41]:
print_words_for_tag(classifier_tfidf_l1, 'c', mlb.classes, tfidf_reversed_vocab, ALL_WORDS)
print_words_for_tag(classifier_tfidf_l1, 'c++', mlb.classes, tfidf_reversed_vocab, ALL_WORDS)
print_words_for_tag(classifier_tfidf_l1, 'linux', mlb.classes, tfidf_reversed_vocab, ALL_WORDS)
print_words_for_tag(classifier_tfidf_l1, 'python', mlb.classes, tfidf_reversed_vocab, ALL_WORDS)
print_words_for_tag(classifier_tfidf_l1, 'java', mlb.classes, tfidf_reversed_vocab, ALL_WORDS)
print_words_for_tag(classifier_tfidf_l1, 'android', mlb.classes, tfidf_reversed_vocab, ALL_WORDS)
print_words_for_tag(classifier_tfidf_l1, 'r', mlb.classes, tfidf_reversed_vocab, ALL_WORDS)
print_words_for_tag(classifier_tfidf_l1, 'ios', mlb.classes, tfidf_reversed_vocab, ALL_WORDS)
print_words_for_tag(classifier_tfidf_l1, 'windows', mlb.classes, tfidf_reversed_vocab, ALL_WORDS)

Tag:	c
Top positive words:	printf, fscanf, malloc, scanf, c
Top negative words:	c #, php, javascript, java, python

Tag:	c++
Top positive words:	stl, mfc, qt, boost, c++
Top negative words:	php, java, javascript, python, jquery

Tag:	linux
Top positive words:	address, system call, ubuntu, signal, linux
Top negative words:	#, javascript, jquery, aspnet, array

Tag:	python
Top positive words:	tkinter, matplotlib, numpy, pandas, python
Top negative words:	php, java, c, django python, jquery

Tag:	java
Top positive words:	jtable, jar, hibernate, spring, java
Top negative words:	php, python, ruby, rails, django

Tag:	android
Top positive words:	intent, edittext, asynctask, retrofit, android
Top negative words:	python, c, swift, iphone, phonegap android

Tag:	r
Top positive words:	rstudio, shiny, ggplot, ggplot2, r
Top negative words:	android, php, java, python, c

Tag:	ios
Top positive words:	afnetworking, objective, uicollectionview, swift, ios
Top negative words:	java, python, php, jquery