In [35]:
import os
import numpy as np
import pandas as pd
from scipy.sparse import hstack
import collections
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import joblib

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from skmultilearn.problem_transform import LabelPowerset, ClassifierChain, BinaryRelevance
from sklearn.metrics import roc_auc_score, accuracy_score, hamming_loss, f1_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

## Load data and extract features

In [4]:
data = pd.read_pickle('tokenized_data.pkl')

In [3]:
data.head()

Unnamed: 0,Id,Title_token,Body_token,Tag
0,80,"[multiple, queries, one, statement]","[written, database, generation, script, sql, w...","[flex, actionscript-3, air]"
1,90,"[good, branching, merging, tutorials, tortoise...","[really, good, tutorials, explaining, branchin...","[svn, tortoisesvn, branch, branching-and-merging]"
2,120,"[asp.net, site, maps]","[anyone, got, experience, creating, asp.net, p...","[sql, asp.net, sitemap]"
3,180,"[function, creating, color, wheels]","[something, many, times, never, quite, found, ...","[algorithm, language-agnostic, colors, color-s..."
4,260,"[adding, scripting, functionality, .net, appli...","[little, game, written, c#, uses, database, tr...","[c#, .net, scripting, compiler-construction]"


##### Due to the high volume of data that exist in the dataset, we have to only keep some of them due to hardware limitations. We will only keep these data that better describe most of the data.

In [6]:
data_init_shape = data.shape[0]

tags_collection = collections.Counter()

def count_tag(tags):
    for tag in tags:
        tags_collection[tag] += 1
    return tags_collection

data['Tag'].apply(count_tag)

tqdm.pandas()

most_common = tags_collection.most_common(50)
most_common_tags = [count[0] for count in most_common]
joblib.dump(most_common_tags, 'most_common_tags.pkl')
#data['Tag'] = data['Tag'].progress_apply(lambda Tag: [tag for tag in Tag if tag in most_common_tags])

#common_data = data[data['Tag'].map(lambda tags: len(tags) > 0)]
#common_data_shape = common_data.shape[0]
#print(common_data_shape)

['most_common_tags.pkl']

In [9]:
most_common_tags

['javascript',
 'java',
 'c#',
 'php',
 'android',
 'jquery',
 'python',
 'html',
 'c++',
 'ios',
 'mysql',
 'css',
 'sql',
 'asp.net',
 'objective-c',
 'ruby-on-rails',
 '.net',
 'c',
 'iphone',
 'angularjs',
 'arrays',
 'sql-server',
 'json',
 'ruby',
 'r',
 'ajax',
 'regex',
 'xml',
 'node.js',
 'asp.net-mvc',
 'linux',
 'django',
 'wpf',
 'database',
 'swift',
 'xcode',
 'string',
 'excel',
 'vb.net',
 'windows',
 'spring',
 'wordpress',
 'eclipse',
 'html5',
 'multithreading',
 'oracle',
 'git',
 'facebook',
 'forms',
 'bash']

In [8]:
common_data.sample(10)

Unnamed: 0,Id,Title_token,Body_token,Tag
1104678,35954210,"[uialertcontroller, presenting, weird, location]","[whenever, try, present, alert, keeps, appeari...",[ios]
751114,25461050,"[function, showing, success, echo, executing]","[assigning, loop, function, working, fine, enr...",[php]
399037,14123180,"[want, help, jquery, finding, imagebutton, id,...","[creating, dynamic, table, inside, div, tag, i...","[jquery, asp.net]"
3049,233490,"[need, impersonate, user, foraccessing, networ...","[need, access, network, resource, given, domai...","[c#, .net, asp.net]"
609061,20985090,"[mfc, c++, wpf, c#, default, directories, open...","[trying, determine, default, directories, assi...","[c#, .net, wpf]"
772345,26128360,"[array, wont, br, javascript]","[try, get, array, writen, webpage, write, webp...",[javascript]
28945,1506110,"[asp.net, unit, testing, dependency, web, serv...","[let, say, class, like, following, public, cla...","[c#, .net, asp.net]"
1178088,37921970,"[choose, image, gallery, fit, imageview, box, ...","[might, similar, question, able, find, satisfa...",[android]
1105387,35974430,"[android, image, appear, gridview]","[good, day, trying, capture, image, camera, pr...",[android]
828669,27901640,"[laravel, eloquent, relationships]","[troubles, setup, right, eloquent, relationshi...",[php]


In [19]:
for tag in most_common_tags:
    tmp_tag = []
    print(tag)
    for inst in common_data['Tag']:
        if tag in inst:
            tmp_tag.append(1)
        else:
            tmp_tag.append(0)
    
    #print(tmp_tag)
    common_data[tag] = tmp_tag

#len(tmp_tag)

javascript
java
c#
php
android
jquery
python
html
c++
ios
mysql
css
sql
asp.net
objective-c
ruby-on-rails
.net
c
iphone
angularjs
arrays
sql-server
json
ruby
r
ajax
regex
xml
node.js
asp.net-mvc
linux
django
wpf
database
swift
xcode
string
excel
vb.net
windows
spring
wordpress
eclipse
html5
multithreading
oracle
git
facebook
forms
bash


##### By keeping 5000 tags, we manage to keep a great deal of the questions, therefore, we will only keep so many questions, as to reach the 5000 tags. So we managed to keep most of the questions which is good for our model to be trained properly.

In [22]:
common_data.sample(5)

Unnamed: 0,Id,Title_token,Body_token,Tag,javascript,java,c#,php,android,jquery,...,spring,wordpress,eclipse,html5,multithreading,oracle,git,facebook,forms,bash
1104019,35936650,"[error, resourcemanager, unable, find, resourc...","[controller, public, class, foremancontroller,...","[java, spring]",0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1045804,34348170,"[wrting, http, proxy, java, using, socket, class]","[trying, write, http, proxy, java, using, sock...",[java],0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
418606,14763430,"[c++, python, beginner]","[totally, new, python, code, snippet, c++, cou...",[python],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
390587,13845020,"[iterate, spans, using, javascript]","[using, javascript, iterate, every, span, id, ...",[javascript],1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37128,1848830,"[moving, picturebox, timer]","[trying, move, picturebox, containing, control...",[c#],0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
common_data.to_pickle('most_common50split.pkl')

### Extract features and split data for the training procedure

In [8]:
data = pd.read_pickle('most_common50split.pkl')

In [9]:
# Load the vectorizer. We have performed most of the actions that are offered from Vectorizer, 
#however we need a blank tokenizer due to limitations from sklearn.

def tokenizer_init(string): return string

#Splitting the dataset in order to fit on training and transform on testing sets
#X_train, X_test, y_train, y_test = train_test_split(data[['Title_token', 'Body_token']], data[['Tag']], test_size=0.3, random_state=42, shuffle=True)
train, test = train_test_split(data, test_size=0.3, random_state=42, shuffle=True)

# We choose to get the 10.000 most common words from the title, in order not to extend our dataset
title_vectorizer = TfidfVectorizer(tokenizer=tokenizer_init, lowercase=False, max_features=2000)
#X_train_title = title_vectorizer.fit_transform(X_train['Title_token'])
#X_test_title = title_vectorizer.transform(X_test['Title_token'])
title_train = title_vectorizer.fit_transform(train['Title_token'])
title_test = title_vectorizer.transform(test['Title_token'])

#Respectivelly, we choose to keep 100.000 features from the body of the questions
body_vectorizer = TfidfVectorizer(tokenizer=tokenizer_init, lowercase=False, max_features=10000)
#X_train_body = body_vectorizer.fit_transform(X_train['Body_token'])
#X_test_body = body_vectorizer.transform(X_test['Body_token'])
body_train = body_vectorizer.fit_transform(train['Body_token'])
body_test = body_vectorizer.transform(test['Body_token'])



In [10]:
joblib.dump(train, 'train.pkl')
joblib.dump(test, 'test.pkl')

['test.pkl']

In [11]:
y_train = train.drop(labels = ['Id','Body_token', 'Title_token', 'Tag'], axis=1)
y_test = test.drop(labels = ['Id','Body_token', 'Title_token', 'Tag'], axis=1)

In [12]:
x_train = hstack([title_train, body_train])
x_test = hstack([title_test, body_test])

In [5]:
x_train = hstack([X_train_title, X_train_body])
x_test = hstack([X_test_title, X_test_body])

In [6]:
print(x_train.shape)
print(x_test.shape)

(690648, 6000)
(295993, 6000)


In [13]:
# Regarding the tags, we want to avoid NaN issues

y_train = y_train.apply(lambda Tag: [tag if not isinstance(tag, float) else "nan" for tag in Tag])
y_test = y_test.apply(lambda Tag: [tag if not isinstance(tag, float) else "nan" for tag in Tag])

In [8]:
#multi_label_binarizer = MultiLabelBinarizer()
#y_train = multi_label_binarizer.fit_transform(y_train)
#y_test = multi_label_binarizer.transform(y_test)

In [7]:
print(y_train.shape, y_train)

(690648, 50)          javascript  java  c#  php  android  jquery  python  html  c++  ios  \
485826            0     0   1    0        0       0       0     0    0    0   
392867            0     0   0    1        0       1       0     0    0    0   
1257240           0     1   0    0        0       0       0     0    0    0   
1022537           0     1   0    0        0       0       0     0    0    0   
56581             0     0   0    1        0       0       0     0    0    0   
...             ...   ...  ..  ...      ...     ...     ...   ...  ...  ...   
334493            1     0   0    0        0       1       0     0    0    0   
471698            0     0   0    0        0       0       0     0    0    0   
172161            1     0   0    1        0       0       0     0    0    0   
855488            0     0   0    0        0       0       0     0    0    1   
159320            0     1   0    0        0       0       0     0    0    0   

         ...  spring  wordpress  eclip

In [14]:
joblib.dump(x_train, 'x_train.pkl')
joblib.dump(x_test, 'x_test.pkl')
joblib.dump(y_train, 'y_train.pkl')
joblib.dump(y_test, 'y_test.pkl')
#joblib.dump(multi_label_binarizer.classes_, 'y_classes.pkl')

['y_test.pkl']

## Insert sklearn classification methods

In [18]:
x_train = joblib.load('x_train.pkl')
x_test = joblib.load('x_test.pkl')
y_train = joblib.load('y_train.pkl')
y_test = joblib.load('y_test.pkl')
most_common_tags = joblib.load('most_common_tags.pkl')
train = joblib.load('train.pkl')
test = joblib.load('test.pkl')
#y_classes = joblib.load('y_classes.pkl')

In [19]:
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)),])

acc = []
f1_all = []
hamm = []
auc = []

for category in most_common_tags:
    print('**Processing {} comments...**'.format(category))
    
    # Training logistic regression model on train data
    LogReg_pipeline.fit(x_train, train[category])
    
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(x_test)
    acc.append(accuracy_score(test[category], prediction))
    f1_all.append(f1_score(test[category], prediction, average="micro"))
    hamm.append(hamming_loss(test[category], prediction))
    auc.append(roc_auc_score(test[category], prediction))

**Processing javascript comments...**
**Processing java comments...**
**Processing c# comments...**
**Processing php comments...**
**Processing android comments...**
**Processing jquery comments...**
**Processing python comments...**
**Processing html comments...**
**Processing c++ comments...**
**Processing ios comments...**
**Processing mysql comments...**
**Processing css comments...**
**Processing sql comments...**
**Processing asp.net comments...**
**Processing objective-c comments...**
**Processing ruby-on-rails comments...**
**Processing .net comments...**
**Processing c comments...**
**Processing iphone comments...**
**Processing angularjs comments...**
**Processing arrays comments...**
**Processing sql-server comments...**
**Processing json comments...**
**Processing ruby comments...**
**Processing r comments...**
**Processing ajax comments...**
**Processing regex comments...**
**Processing xml comments...**
**Processing node.js comments...**
**Processing asp.net-mvc comments.

In [20]:
# More features with OnevsRest

avg_acc = np.mean(acc)
std = np.std(acc)

avg_f1 = np.mean(f1_all)
std_f1 = np.std(f1_all)

avg_hamm = np.mean(hamm)
std_hamm = np.std(hamm)

avg_auc = np.mean(auc)
std_auc = np.std(auc)

print('Results are:\n Accuracy = {:.2f}+-{:.2f}\n F1-score = {:.2f}+-{:.2f}\n Hamming Loss = {:.2f}+-{:.2f}\n AUC = {:.2f}+-{:.2f}\
'.format(avg_acc, std, avg_f1, std_f1, avg_hamm, std_hamm, avg_auc, std_auc))

Results are:
 Accuracy = 0.98+-0.01
 F1-score = 0.98+-0.01
 Hamming Loss = 0.02+-0.01
 AUC = 0.78+-0.09


In [29]:
# Less features with OnevsRest

avg_acc = np.mean(acc)
std = np.std(acc)

avg_f1 = np.mean(f1_all)
std_f1 = np.std(f1_all)

avg_hamm = np.mean(hamm)
std_hamm = np.std(hamm)

avg_auc = np.mean(auc)
std_auc = np.std(auc)

print('Results are:\n Accuracy = {:.2f}+-{:.2f}\n F1-score = {:.2f}+-{:.2f}\n Hamming Loss = {:.2f}+-{:.2f}\n AUC = {:.2f}+-{:.2f}\
'.format(avg_acc, std, avg_f1, std_f1, avg_hamm, std_hamm, avg_auc, std_auc))

Results are:
 Accuracy = 0.98+-0.01
 F1-score = 0.98+-0.01
 Hamming Loss = 0.02+-0.01
 AUC = 0.77+-0.09


In [21]:
sgd_classifier = SGDClassifier(n_jobs=-1)
clf = OneVsRestClassifier(sgd_classifier)

acc = []
f1_all = []
hamm = []
auc = []

for category in most_common_tags:
    print('**Processing {} comments...**'.format(category))
    
    # Training logistic regression model on train data
    clf.fit(x_train, train[category])
    
    # calculating test accuracy
    prediction = clf.predict(x_test)
    acc.append(accuracy_score(test[category], prediction))
    f1_all.append(f1_score(test[category], prediction, average="micro"))
    hamm.append(hamming_loss(test[category], prediction))
    auc.append(roc_auc_score(test[category], prediction))

**Processing javascript comments...**
**Processing java comments...**
**Processing c# comments...**
**Processing php comments...**
**Processing android comments...**
**Processing jquery comments...**
**Processing python comments...**
**Processing html comments...**
**Processing c++ comments...**
**Processing ios comments...**
**Processing mysql comments...**
**Processing css comments...**
**Processing sql comments...**
**Processing asp.net comments...**
**Processing objective-c comments...**
**Processing ruby-on-rails comments...**
**Processing .net comments...**
**Processing c comments...**
**Processing iphone comments...**
**Processing angularjs comments...**
**Processing arrays comments...**
**Processing sql-server comments...**
**Processing json comments...**
**Processing ruby comments...**
**Processing r comments...**
**Processing ajax comments...**
**Processing regex comments...**
**Processing xml comments...**
**Processing node.js comments...**
**Processing asp.net-mvc comments.

In [22]:
# More features with SGD

avg_acc = np.mean(acc)
std = np.std(acc)

avg_f1 = np.mean(f1_all)
std_f1 = np.std(f1_all)

avg_hamm = np.mean(hamm)
std_hamm = np.std(hamm)

avg_auc = np.mean(auc)
std_auc = np.std(auc)

print('Results are:\n Accuracy = {:.2f}+-{:.2f}\n F1-score = {:.2f}+-{:.2f}\n Hamming Loss = {:.2f}+-{:.2f}\n AUC = {:.2f}+-{:.2f}\
'.format(avg_acc, std, avg_f1, std_f1, avg_hamm, std_hamm, avg_auc, std_auc))

Results are:
 Accuracy = 0.98+-0.02
 F1-score = 0.98+-0.02
 Hamming Loss = 0.02+-0.02
 AUC = 0.72+-0.10


In [7]:
# Less features with SGD

avg_acc = np.mean(acc)
std = np.std(acc)

avg_f1 = np.mean(f1_all)
std_f1 = np.std(f1_all)

avg_hamm = np.mean(hamm)
std_hamm = np.std(hamm)

avg_auc = np.mean(auc)
std_auc = np.std(auc)

print('Results are:\n Accuracy = {:.2f}+-{:.2f}\n F1-score = {:.2f}+-{:.2f}\n Hamming Loss = {:.2f}+-{:.2f}\n AUC = {:.2f}+-{:.2f}\
'.format(avg_acc, std, avg_f1, std_f1, avg_hamm, std_hamm, avg_auc, std_auc))

Results are:
 Accuracy = 0.98+-0.02
 F1-score = 0.98+-0.02
 Hamming Loss = 0.02+-0.02
 AUC = 0.72+-0.10


In [24]:
lr_classifier = LogisticRegression(C=1., solver='sag', max_iter=200)

acc = []
f1_all = []
hamm = []
auc = []

for category in most_common_tags:
    print('**Processing {} comments...**'.format(category))
    
    # Training logistic regression model on train data
    lr_classifier.fit(x_train, train[category])
    
    # calculating test accuracy
    prediction = lr_classifier.predict(x_test)
    acc.append(accuracy_score(test[category], prediction))
    f1_all.append(f1_score(test[category], prediction, average="micro"))
    hamm.append(hamming_loss(test[category], prediction))
    auc.append(roc_auc_score(test[category], prediction))

**Processing javascript comments...**
**Processing java comments...**
**Processing c# comments...**
**Processing php comments...**
**Processing android comments...**
**Processing jquery comments...**
**Processing python comments...**
**Processing html comments...**
**Processing c++ comments...**
**Processing ios comments...**
**Processing mysql comments...**
**Processing css comments...**
**Processing sql comments...**
**Processing asp.net comments...**
**Processing objective-c comments...**
**Processing ruby-on-rails comments...**
**Processing .net comments...**
**Processing c comments...**
**Processing iphone comments...**
**Processing angularjs comments...**
**Processing arrays comments...**
**Processing sql-server comments...**
**Processing json comments...**
**Processing ruby comments...**
**Processing r comments...**
**Processing ajax comments...**
**Processing regex comments...**
**Processing xml comments...**
**Processing node.js comments...**
**Processing asp.net-mvc comments.

In [25]:
# More features with LR

avg_acc = np.mean(acc)
std = np.std(acc)

avg_f1 = np.mean(f1_all)
std_f1 = np.std(f1_all)

avg_hamm = np.mean(hamm)
std_hamm = np.std(hamm)

avg_auc = np.mean(auc)
std_auc = np.std(auc)

print('Results are:\n Accuracy = {:.2f}+-{:.2f}\n F1-score = {:.2f}+-{:.2f}\n Hamming Loss = {:.2f}+-{:.2f}\n AUC = {:.2f}+-{:.2f}\
'.format(avg_acc, std, avg_f1, std_f1, avg_hamm, std_hamm, avg_auc, std_auc))

Results are:
 Accuracy = 0.98+-0.01
 F1-score = 0.98+-0.01
 Hamming Loss = 0.02+-0.01
 AUC = 0.78+-0.09


In [36]:
mlb = MultinomialNB()

acc = []
f1_all = []
hamm = []
auc = []

for category in most_common_tags:
    print('**Processing {} comments...**'.format(category))
    
    # Training logistic regression model on train data
    mlb.fit(x_train, train[category])
    
    # calculating test accuracy
    prediction = mlb.predict(x_test)
    acc.append(accuracy_score(test[category], prediction))
    f1_all.append(f1_score(test[category], prediction, average="micro"))
    hamm.append(hamming_loss(test[category], prediction))
    auc.append(roc_auc_score(test[category], prediction))

**Processing javascript comments...**
**Processing java comments...**
**Processing c# comments...**
**Processing php comments...**
**Processing android comments...**
**Processing jquery comments...**
**Processing python comments...**
**Processing html comments...**
**Processing c++ comments...**
**Processing ios comments...**
**Processing mysql comments...**
**Processing css comments...**
**Processing sql comments...**
**Processing asp.net comments...**
**Processing objective-c comments...**
**Processing ruby-on-rails comments...**
**Processing .net comments...**
**Processing c comments...**
**Processing iphone comments...**
**Processing angularjs comments...**
**Processing arrays comments...**
**Processing sql-server comments...**
**Processing json comments...**
**Processing ruby comments...**
**Processing r comments...**
**Processing ajax comments...**
**Processing regex comments...**
**Processing xml comments...**
**Processing node.js comments...**
**Processing asp.net-mvc comments.

In [37]:
# More features with MultinomialNB

avg_acc = np.mean(acc)
std = np.std(acc)

avg_f1 = np.mean(f1_all)
std_f1 = np.std(f1_all)

avg_hamm = np.mean(hamm)
std_hamm = np.std(hamm)

avg_auc = np.mean(auc)
std_auc = np.std(auc)

print('Results are:\n Accuracy = {:.2f}+-{:.2f}\n F1-score = {:.2f}+-{:.2f}\n Hamming Loss = {:.2f}+-{:.2f}\n AUC = {:.2f}+-{:.2f}\
'.format(avg_acc, std, avg_f1, std_f1, avg_hamm, std_hamm, avg_auc, std_auc))

Results are:
 Accuracy = 0.98+-0.02
 F1-score = 0.98+-0.02
 Hamming Loss = 0.02+-0.02
 AUC = 0.78+-0.08
