In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import sqlite3
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
#from wordcloud import WordCloud
import re
import os
from sqlalchemy import create_engine # database connection
import datetime as dt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.metrics import f1_score,precision_score,recall_score
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from skmultilearn.adapt import mlknn
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import LabelPowerset
from sklearn.naive_bayes import GaussianNB
from datetime import datetime

In [2]:
data = pd.read_csv('mpst_full_data.csv')
data.head()

Unnamed: 0,imdb_id,title,plot_synopsis,tags,split,synopsis_source
0,tt0057603,I tre volti della paura,Note: this synopsis is for the orginal Italian...,"cult, horror, gothic, murder, atmospheric",train,imdb
1,tt1733125,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...",violence,train,imdb
2,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",romantic,test,imdb
3,tt0113862,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","inspiring, romantic, stupid, feel-good",train,imdb
4,tt0086250,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","cruelty, murder, dramatic, cult, violence, atm...",val,imdb


In [3]:
# function for cleaning the plots of the movies
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    #text = re.sub('\W', ' ', text)
    #text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

# function for text cleaning 
def cleaned(text):
    # remove backslash-apostrophe 
    text = re.sub("\'", "", text) 
    # remove everything except alphabets 
    text = re.sub("[^a-zA-Z]"," ",text) 
    # remove whitespaces 
    text = ' '.join(text.split()) 
    # convert text to lowercase 
    text = text.lower() 
    
    return text

In [4]:
data['cleaned'] = list(data['plot_synopsis'].apply(clean_text))

In [5]:
data['cleaned'] = list(data['cleaned'].apply(cleaned))

In [7]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ankan_rokr/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# function to remove stopwords
def remove_stopwords(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    return ' '.join(no_stopword_text)

data['cleaned'] = data['cleaned'].apply(lambda x: remove_stopwords(x))

In [9]:
data.head()

Unnamed: 0,imdb_id,title,plot_synopsis,tags,split,synopsis_source,cleaned
0,tt0057603,I tre volti della paura,Note: this synopsis is for the orginal Italian...,"cult, horror, gothic, murder, atmospheric",train,imdb,note synopsis orginal italian release segments...
1,tt1733125,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...",violence,train,imdb,two thousand years ago nhagruul foul sorcerer ...
2,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",romantic,test,imdb,matuschek gift store budapest workplace alfred...
3,tt0113862,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","inspiring, romantic, stupid, feel-good",train,imdb,glenn holland morning person anyone standards ...
4,tt0086250,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","cruelty, murder, dramatic, cult, violence, atm...",val,imdb,may cuban man named tony montana al pacino cla...


In [10]:
train_data =  data['split']=='train'
train = data[train_data]

   
    
test_data =  data['split']=='test'
test = data[test_data]

  
    
validation_data =  data['split']=='val'
val = data[validation_data]


# BoW with 15 features


In [11]:
vectorizer_tags = CountVectorizer(tokenizer = lambda x: x.split(','), binary='true', max_features = 15)
y_train = vectorizer_tags.fit_transform(train['tags'])
y_test = vectorizer_tags.transform(test['tags'])

In [12]:
y_train.shape

(9489, 15)

# Uni

In [13]:

start = datetime.now()
vectorizer_unigram = TfidfVectorizer(min_df=5, max_features=20000, smooth_idf=True, norm="l2", \
                             tokenizer = lambda x: x.split(), sublinear_tf=False, ngram_range=(1,1))
x_train_multilabel_unigram = vectorizer_unigram.fit_transform(train['cleaned'])
x_test_multilabel_unigram = vectorizer_unigram.transform(test['cleaned'])
print("Time taken to run this cell :", datetime.now() - start)

Time taken to run this cell : 0:00:04.237314


In [14]:
start = datetime.now()
classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.001, penalty='l2', class_weight="balanced"), n_jobs=-1)
classifier.fit(x_train_multilabel_unigram,y_train)
predictions = classifier.predict (x_test_multilabel_unigram)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.04888739042481457
Hamming loss  0.22852326365475387
Micro-average quality numbers
Precision: 0.2453, Recall: 0.4939, F1-measure: 0.3278
Macro-average quality numbers
Precision: 0.2288, Recall: 0.4491, F1-measure: 0.2975
              precision    recall  f1-score   support

           0       0.23      0.47      0.31       351
           1       0.24      0.44      0.31       515
           2       0.11      0.27      0.15       150
           3       0.49      0.64      0.55       885
           4       0.15      0.39      0.22       229
           5       0.19      0.46      0.27       268
           6       0.18      0.36      0.24       311
           7       0.37      0.65      0.48       593
           8       0.14      0.35      0.20       248
           9       0.11      0.27      0.15       200
          10       0.18      0.41      0.25       270
          11       0.41      0.44      0.43       166
          12       0.17      0.39      0.24       239
          

In [15]:

start = datetime.now()
vectorizer_bigram = TfidfVectorizer(min_df=5, max_features=20000, smooth_idf=True, norm="l2", \
                             tokenizer = lambda x: x.split(), sublinear_tf=False, ngram_range=(2,2))
x_train_multilabel_bigram = vectorizer_bigram.fit_transform(train['cleaned'])
x_test_multilabel_bigram = vectorizer_bigram.transform(test['cleaned'])
print("Time taken to run this cell :", datetime.now() - start)

Time taken to run this cell : 0:00:28.340605


In [16]:
start = datetime.now()
classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.001, penalty='l2', class_weight="balanced"), n_jobs=-1)
classifier.fit(x_train_multilabel_bigram,y_train)
predictions = classifier.predict (x_test_multilabel_bigram)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.08664868509777478
Hamming loss  0.18419869633625535
Micro-average quality numbers
Precision: 0.2763, Recall: 0.3907, F1-measure: 0.3237
Macro-average quality numbers
Precision: 0.2462, Recall: 0.3344, F1-measure: 0.2785
              precision    recall  f1-score   support

           0       0.24      0.37      0.29       351
           1       0.27      0.39      0.32       515
           2       0.16      0.19      0.18       150
           3       0.48      0.58      0.52       885
           4       0.18      0.23      0.20       229
           5       0.22      0.36      0.27       268
           6       0.19      0.31      0.24       311
           7       0.39      0.55      0.46       593
           8       0.16      0.26      0.20       248
           9       0.11      0.13      0.12       200
          10       0.17      0.24      0.20       270
          11       0.47      0.31      0.37       166
          12       0.16      0.19      0.17       239
          

In [17]:
start = datetime.now()
vectorizer_trigram = TfidfVectorizer(min_df=10, max_features=20000, smooth_idf=True, norm="l2", \
                             tokenizer = lambda x: x.split(), sublinear_tf=False, ngram_range=(3,3))
x_train_multilabel_trigram = vectorizer_trigram.fit_transform(train['cleaned'])
x_test_multilabel_trigram = vectorizer_trigram.transform(test['cleaned'])
print("Time taken to run this cell :", datetime.now() - start)

Time taken to run this cell : 0:00:40.036978


In [18]:
start = datetime.now()
classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.001, penalty='l2', class_weight="balanced"), n_jobs=-1)
classifier.fit(x_train_multilabel_trigram,y_train)
predictions = classifier.predict (x_test_multilabel_trigram)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.0684423465947404
Hamming loss  0.2823331085637222
Micro-average quality numbers
Precision: 0.1540, Recall: 0.3343, F1-measure: 0.2108
Macro-average quality numbers
Precision: 0.1428, Recall: 0.3102, F1-measure: 0.1898
              precision    recall  f1-score   support

           0       0.14      0.33      0.20       351
           1       0.22      0.39      0.28       515
           2       0.07      0.31      0.12       150
           3       0.37      0.40      0.39       885
           4       0.09      0.32      0.14       229
           5       0.13      0.35      0.19       268
           6       0.13      0.36      0.20       311
           7       0.26      0.38      0.31       593
           8       0.09      0.29      0.14       248
           9       0.07      0.19      0.10       200
          10       0.13      0.29      0.18       270
          11       0.07      0.20      0.11       166
          12       0.08      0.21      0.11       239
          13

In [19]:
start = datetime.now()
vectorizer_fourgram = TfidfVectorizer(min_df=10, max_features=20000, smooth_idf=True, norm="l2", \
                             tokenizer = lambda x: x.split(), sublinear_tf=False, ngram_range=(4,4))
x_train_multilabel_fourgram = vectorizer_fourgram.fit_transform(train['cleaned'])
x_test_multilabel_fourgram = vectorizer_fourgram.transform(test['cleaned'])
print("Time taken to run this cell :", datetime.now() - start)

Time taken to run this cell : 0:00:41.245621


In [20]:
start = datetime.now()
classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.001, penalty='l2', class_weight="balanced"), n_jobs=-1)
classifier.fit(x_train_multilabel_fourgram,y_train)
predictions = classifier.predict (x_test_multilabel_fourgram)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.0
Hamming loss  0.3870532703978422
Micro-average quality numbers
Precision: 0.0915, Recall: 0.2724, F1-measure: 0.1370
Macro-average quality numbers
Precision: 0.1718, Recall: 0.3425, F1-measure: 0.0787
              precision    recall  f1-score   support

           0       0.28      0.03      0.06       351
           1       0.37      0.02      0.04       515
           2       0.08      0.01      0.02       150
           3       0.52      0.03      0.05       885
           4       0.09      0.02      0.03       229
           5       0.10      0.02      0.03       268
           6       0.10      0.98      0.19       311
           7       0.24      0.02      0.04       593
           8       0.13      0.01      0.02       248
           9       0.15      0.04      0.06       200
          10       0.09      0.98      0.17       270
          11       0.06      0.98      0.11       166
          12       0.08      0.99      0.15       239
          13       0.16    

In [21]:
start = datetime.now()
vectorizer_ngram = TfidfVectorizer(min_df=10, max_features=20000, smooth_idf=True, norm="l2", \
                             tokenizer = lambda x: x.split(), sublinear_tf=False, ngram_range=(1,4))
x_train_multilabel_ngram = vectorizer_ngram.fit_transform(train['cleaned'])
x_test_multilabel_ngram = vectorizer_ngram.transform(test['cleaned'])
print("Time taken to run this cell :", datetime.now() - start)

Time taken to run this cell : 0:02:26.303597


In [22]:
start = datetime.now()
classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.001, penalty='l2', class_weight="balanced"), n_jobs=-1)
classifier.fit(x_train_multilabel_ngram,y_train)
predictions = classifier.predict (x_test_multilabel_ngram)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.04686446392447741
Hamming loss  0.23429984266127218
Micro-average quality numbers
Precision: 0.2437, Recall: 0.5119, F1-measure: 0.3302
Macro-average quality numbers
Precision: 0.2294, Recall: 0.4699, F1-measure: 0.3023
              precision    recall  f1-score   support

           0       0.22      0.50      0.31       351
           1       0.25      0.47      0.33       515
           2       0.12      0.33      0.18       150
           3       0.49      0.65      0.56       885
           4       0.15      0.40      0.22       229
           5       0.19      0.49      0.27       268
           6       0.18      0.38      0.24       311
           7       0.38      0.66      0.49       593
           8       0.14      0.38      0.21       248
           9       0.10      0.28      0.15       200
          10       0.18      0.43      0.25       270
          11       0.40      0.46      0.43       166
          12       0.17      0.44      0.25       239
          

In [23]:
from scipy.sparse import coo_matrix, hstack

In [24]:
x_train_1 = hstack((x_train_multilabel_unigram, x_train_multilabel_bigram, x_train_multilabel_trigram),format="csr",dtype='float64')

In [25]:
x_test_1 = hstack((x_test_multilabel_unigram, x_test_multilabel_bigram, x_test_multilabel_trigram),format="csr",dtype='float64')

In [26]:
from sklearn.model_selection import GridSearchCV
start = datetime.now()

model = OneVsRestClassifier(SGDClassifier(loss='log', penalty='l2', class_weight="balanced", n_jobs = -1 ), n_jobs = -1)

param_grid = {
    "estimator__alpha": [10**-4,10**-3, 10**-2,10**-1, 10**0, 10**1, 10**2, 10**3, 10**4]
}

model = GridSearchCV(model, param_grid, scoring='f1_micro',n_jobs=-1)

model.fit(x_train_1, y_train)

print (model.best_score_)
print (model.best_params_)
print("Time taken to run this cell :", datetime.now() - start)

0.30083370327655323
{'estimator__alpha': 0.01}
Time taken to run this cell : 0:00:05.205670


In [27]:
# train model

start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.001, penalty='l2', class_weight="balanced"), n_jobs=-1)
classifier.fit(x_train_1, y_train)
predictions = classifier.predict(x_test_1)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.07687120701281187
Hamming loss  0.19150370869858396
Micro-average quality numbers
Precision: 0.2712, Recall: 0.4134, F1-measure: 0.3275
Macro-average quality numbers
Precision: 0.2420, Recall: 0.3600, F1-measure: 0.2862
              precision    recall  f1-score   support

           0       0.25      0.39      0.30       351
           1       0.27      0.38      0.32       515
           2       0.14      0.24      0.18       150
           3       0.50      0.62      0.56       885
           4       0.16      0.26      0.20       229
           5       0.20      0.33      0.25       268
           6       0.19      0.30      0.23       311
           7       0.39      0.56      0.46       593
           8       0.14      0.23      0.17       248
           9       0.10      0.16      0.12       200
          10       0.19      0.33      0.24       270
          11       0.45      0.40      0.43       166
          12       0.16      0.23      0.19       239
          

In [28]:
x_train_2 = hstack((x_train_multilabel_unigram, x_train_multilabel_bigram, x_train_multilabel_trigram, x_train_multilabel_fourgram),format="csr",dtype='float64')
x_test_2 = hstack((x_test_multilabel_unigram, x_test_multilabel_bigram, x_test_multilabel_trigram, x_test_multilabel_fourgram),format="csr",dtype='float64')

In [29]:
start = datetime.now()

model = OneVsRestClassifier(SGDClassifier(loss='log', penalty='l2', class_weight="balanced", n_jobs = -1 ), n_jobs = -1)

param_grid = {
    "estimator__alpha": [10**-4,10**-3, 10**-2,10**-1, 10**0, 10**1, 10**2, 10**3, 10**4]
}

model = GridSearchCV(model, param_grid, scoring='f1_micro',n_jobs=-1)

model.fit(x_train_2, y_train)

print (model.best_score_)
print (model.best_params_)
print("Time taken to run this cell :", datetime.now() - start)

0.29446349584195286
{'estimator__alpha': 0.01}
Time taken to run this cell : 0:00:05.681882


In [33]:


start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.001, penalty='l2', class_weight="balanced"), n_jobs=-1)
classifier.fit(x_train_2, y_train)
predictions = classifier.predict(x_test_2)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.07484828051247472
Hamming loss  0.19154866262081366
Micro-average quality numbers
Precision: 0.2708, Recall: 0.4124, F1-measure: 0.3270
Macro-average quality numbers
Precision: 0.2396, Recall: 0.3583, F1-measure: 0.2844
              precision    recall  f1-score   support

           0       0.25      0.41      0.31       351
           1       0.27      0.39      0.32       515
           2       0.14      0.23      0.18       150
           3       0.50      0.62      0.55       885
           4       0.16      0.27      0.20       229
           5       0.19      0.32      0.24       268
           6       0.19      0.28      0.23       311
           7       0.39      0.56      0.46       593
           8       0.14      0.24      0.17       248
           9       0.09      0.14      0.11       200
          10       0.19      0.32      0.24       270
          11       0.42      0.40      0.41       166
          12       0.16      0.22      0.18       239
          

In [34]:
train_glove = np.load('glove_tarin.npy') 

In [35]:
test_glove = np.load('glove_test.npy') 

In [36]:
x_train_2 = hstack((x_train_multilabel_unigram, x_train_multilabel_bigram, x_train_multilabel_trigram, x_train_multilabel_fourgram),format="csr",dtype='float64')
x_test_2 = hstack((x_test_multilabel_unigram, x_test_multilabel_bigram, x_test_multilabel_trigram, x_test_multilabel_fourgram),format="csr",dtype='float64')

In [37]:
x_train_2 = x_train_2.toarray()

In [38]:
x_train_2.shape

(9489, 40857)

In [39]:
x_test_2 = x_test_2.toarray()

In [40]:
x_test_2.shape

(2966, 40857)

In [41]:
train_glove.shape

(9489, 300)

In [42]:
test_glove.shape

(2966, 300)

In [43]:
x_train_3 = np.hstack((x_train_2, train_glove))

In [44]:
x_train_3.shape

(9489, 41157)

In [45]:
x_test_3 = np.hstack((x_test_2, test_glove))

In [46]:
x_test_3.shape

(2966, 41157)

In [47]:
np.save('gove+train2.npy', x_train_3)
np.save('gove+test2.npy', x_test_3)

In [48]:
start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.001, penalty='l2', class_weight="balanced"), n_jobs=-1)
classifier.fit(x_train_3, y_train)
predictions = classifier.predict(x_test_3)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.08125421443020904
Hamming loss  0.18314227916385706
Micro-average quality numbers
Precision: 0.3486, Recall: 0.7179, F1-measure: 0.4693
Macro-average quality numbers
Precision: 0.3368, Recall: 0.6919, F1-measure: 0.4436
              precision    recall  f1-score   support

           0       0.33      0.66      0.44       351
           1       0.53      0.87      0.66       515
           2       0.20      0.61      0.31       150
           3       0.57      0.75      0.65       885
           4       0.20      0.63      0.31       229
           5       0.33      0.79      0.46       268
           6       0.30      0.74      0.43       311
           7       0.53      0.76      0.62       593
           8       0.25      0.70      0.36       248
           9       0.23      0.40      0.29       200
          10       0.36      0.76      0.49       270
          11       0.50      0.70      0.58       166
          12       0.17      0.59      0.27       239
          

In [49]:
start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='hinge', alpha=0.001, penalty='l2', class_weight="balanced"))
classifier.fit(x_train_3, y_train)
predictions = classifier.predict(x_test_3)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.09743762643290627
Hamming loss  0.174645987862441
Micro-average quality numbers
Precision: 0.3595, Recall: 0.7011, F1-measure: 0.4753
Macro-average quality numbers
Precision: 0.3409, Recall: 0.6613, F1-measure: 0.4438
              precision    recall  f1-score   support

           0       0.29      0.67      0.41       351
           1       0.56      0.85      0.68       515
           2       0.24      0.53      0.33       150
           3       0.53      0.78      0.63       885
           4       0.23      0.47      0.31       229
           5       0.45      0.69      0.54       268
           6       0.30      0.73      0.43       311
           7       0.51      0.79      0.62       593
           8       0.23      0.65      0.34       248
           9       0.21      0.47      0.29       200
          10       0.27      0.83      0.41       270
          11       0.41      0.67      0.51       166
          12       0.20      0.58      0.30       239
          13

In [50]:
x_train_1 = x_train_1.toarray()

In [51]:
x_test_1 = x_test_1.toarray()

In [52]:
x_train_4 = np.hstack((x_train_1, train_glove))

In [53]:
x_test_4 = np.hstack((x_test_1, test_glove))

In [54]:
np.save('gove+train1.npy', x_train_4)
np.save('gove+test1.npy', x_test_4)

In [55]:
start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.001, penalty='l2', class_weight="balanced"))
classifier.fit(x_train_4, y_train)
predictions = classifier.predict(x_test_4)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.08496291301416048
Hamming loss  0.18152393796358732
Micro-average quality numbers
Precision: 0.3501, Recall: 0.7113, F1-measure: 0.4692
Macro-average quality numbers
Precision: 0.3323, Recall: 0.6786, F1-measure: 0.4394
              precision    recall  f1-score   support

           0       0.31      0.67      0.42       351
           1       0.50      0.91      0.65       515
           2       0.20      0.65      0.31       150
           3       0.57      0.76      0.65       885
           4       0.19      0.60      0.29       229
           5       0.40      0.73      0.51       268
           6       0.30      0.73      0.43       311
           7       0.53      0.77      0.62       593
           8       0.21      0.69      0.33       248
           9       0.20      0.47      0.28       200
          10       0.38      0.74      0.50       270
          11       0.42      0.72      0.53       166
          12       0.22      0.44      0.30       239
          

In [56]:
start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='hinge', alpha=0.001, penalty='l2', class_weight="balanced", n_jobs = -1))
classifier.fit(x_train_4, y_train)
predictions = classifier.predict(x_test_4)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.05933917734322319
Hamming loss  0.19082939986513822
Micro-average quality numbers
Precision: 0.3352, Recall: 0.7031, F1-measure: 0.4539
Macro-average quality numbers
Precision: 0.3353, Recall: 0.6869, F1-measure: 0.4380
              precision    recall  f1-score   support

           0       0.28      0.72      0.41       351
           1       0.58      0.84      0.69       515
           2       0.20      0.65      0.31       150
           3       0.59      0.65      0.62       885
           4       0.20      0.52      0.29       229
           5       0.35      0.75      0.47       268
           6       0.35      0.66      0.46       311
           7       0.46      0.81      0.59       593
           8       0.21      0.69      0.32       248
           9       0.16      0.62      0.25       200
          10       0.42      0.75      0.54       270
          11       0.40      0.66      0.50       166
          12       0.18      0.64      0.28       239
          

In [57]:
x_train_5 = np.hstack((x_train_4, x_train_3))

In [58]:
x_test_5 = np.hstack((x_test_4, x_test_3))

In [59]:
np.save('all1.npy', x_train_5)
np.save('all2.npy', x_test_5)

In [61]:
start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.001, penalty='l2', class_weight="balanced"), n_jobs = -1)
classifier.fit(x_train_5, y_train)
predictions = classifier.predict(x_test_5)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.0832771409305462
Hamming loss  0.16174421218251292
Micro-average quality numbers
Precision: 0.3789, Recall: 0.6788, F1-measure: 0.4864
Macro-average quality numbers
Precision: 0.3609, Recall: 0.6495, F1-measure: 0.4515
              precision    recall  f1-score   support

           0       0.30      0.71      0.42       351
           1       0.64      0.81      0.72       515
           2       0.15      0.71      0.25       150
           3       0.59      0.72      0.65       885
           4       0.22      0.50      0.31       229
           5       0.38      0.75      0.50       268
           6       0.35      0.69      0.46       311
           7       0.52      0.76      0.62       593
           8       0.24      0.62      0.35       248
           9       0.33      0.34      0.33       200
          10       0.37      0.76      0.50       270
          11       0.33      0.71      0.45       166
          12       0.32      0.33      0.33       239
          1

In [65]:
start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.0001, penalty='l2', class_weight="balanced"), n_jobs = -1)
classifier.fit(x_train_5, y_train)
predictions = classifier.predict(x_test_5)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.15677680377612946
Hamming loss  0.13011912789390875
Micro-average quality numbers
Precision: 0.4417, Recall: 0.5808, F1-measure: 0.5018
Macro-average quality numbers
Precision: 0.4260, Recall: 0.5289, F1-measure: 0.4610
              precision    recall  f1-score   support

           0       0.34      0.50      0.41       351
           1       0.64      0.80      0.71       515
           2       0.34      0.37      0.36       150
           3       0.58      0.72      0.64       885
           4       0.24      0.37      0.29       229
           5       0.29      0.71      0.41       268
           6       0.44      0.51      0.47       311
           7       0.55      0.64      0.59       593
           8       0.30      0.45      0.36       248
           9       0.44      0.24      0.31       200
          10       0.41      0.69      0.51       270
          11       0.76      0.60      0.67       166
          12       0.26      0.30      0.27       239
          

In [66]:
start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.00001, penalty='l2', class_weight="balanced"), n_jobs = -1)
classifier.fit(x_train_5, y_train)
predictions = classifier.predict(x_test_5)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.21072151045178691
Hamming loss  0.1135760845133738
Micro-average quality numbers
Precision: 0.4959, Recall: 0.4144, F1-measure: 0.4515
Macro-average quality numbers
Precision: 0.4417, Recall: 0.3552, F1-measure: 0.3819
              precision    recall  f1-score   support

           0       0.46      0.26      0.34       351
           1       0.68      0.65      0.67       515
           2       0.49      0.23      0.32       150
           3       0.57      0.64      0.61       885
           4       0.37      0.17      0.24       229
           5       0.49      0.34      0.41       268
           6       0.44      0.32      0.37       311
           7       0.62      0.51      0.56       593
           8       0.36      0.21      0.27       248
           9       0.31      0.28      0.29       200
          10       0.45      0.28      0.35       270
          11       0.30      0.56      0.39       166
          12       0.26      0.21      0.23       239
          1

In [62]:
start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='hinge', alpha=0.001, penalty='l2', class_weight="balanced"), n_jobs = -1)
classifier.fit(x_train_5, y_train)
predictions = classifier.predict(x_test_5)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.11598111935266352
Hamming loss  0.14495392222971454
Micro-average quality numbers
Precision: 0.4096, Recall: 0.6455, F1-measure: 0.5012
Macro-average quality numbers
Precision: 0.3929, Recall: 0.6023, F1-measure: 0.4663
              precision    recall  f1-score   support

           0       0.35      0.61      0.44       351
           1       0.59      0.83      0.69       515
           2       0.23      0.47      0.31       150
           3       0.61      0.69      0.65       885
           4       0.24      0.43      0.30       229
           5       0.41      0.67      0.51       268
           6       0.36      0.65      0.47       311
           7       0.54      0.78      0.64       593
           8       0.28      0.54      0.37       248
           9       0.22      0.42      0.29       200
          10       0.42      0.74      0.53       270
          11       0.65      0.60      0.62       166
          12       0.34      0.28      0.31       239
          

In [68]:
start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='hinge', alpha=0.0001, penalty='l2', class_weight="balanced"), n_jobs = -1)
classifier.fit(x_train_5, y_train)
predictions = classifier.predict(x_test_5)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.19453809844908967
Hamming loss  0.11692515171948753
Micro-average quality numbers
Precision: 0.4810, Recall: 0.4620, F1-measure: 0.4713
Macro-average quality numbers
Precision: 0.4324, Recall: 0.4278, F1-measure: 0.4221
              precision    recall  f1-score   support

           0       0.47      0.27      0.35       351
           1       0.69      0.74      0.71       515
           2       0.23      0.19      0.21       150
           3       0.58      0.54      0.56       885
           4       0.29      0.24      0.26       229
           5       0.44      0.59      0.50       268
           6       0.43      0.51      0.47       311
           7       0.61      0.50      0.55       593
           8       0.36      0.33      0.34       248
           9       0.37      0.23      0.28       200
          10       0.46      0.44      0.45       270
          11       0.55      0.67      0.61       166
          12       0.27      0.41      0.32       239
          

In [69]:
vectorizer_char = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode', analyzer='char', ngram_range=(1, 1),  max_features=20000)
x_train_char = vectorizer_char.fit_transform(train['cleaned'])
x_test_char = vectorizer_char.transform(test['cleaned'])
print("Time taken to run this cell :", datetime.now() - start)

Time taken to run this cell : 0:03:33.371687


In [70]:
start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.001, penalty='l2', class_weight="balanced"), n_jobs = -1)
classifier.fit(x_train_char, y_train)
predictions = classifier.predict(x_test_char)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.0
Hamming loss  0.3751854349291976
Micro-average quality numbers
Precision: 0.1506, Recall: 0.5013, F1-measure: 0.2316
Macro-average quality numbers
Precision: 0.1373, Recall: 0.4702, F1-measure: 0.1808
              precision    recall  f1-score   support

           0       0.15      0.77      0.25       351
           1       0.23      0.26      0.24       515
           2       0.07      0.67      0.13       150
           3       0.31      0.82      0.45       885
           4       0.10      0.56      0.18       229
           5       0.11      0.69      0.20       268
           6       0.15      0.18      0.17       311
           7       0.28      0.30      0.29       593
           8       0.10      0.74      0.18       248
           9       0.05      0.01      0.01       200
          10       0.00      0.00      0.00       270
          11       0.12      0.47      0.19       166
          12       0.15      0.02      0.04       239
          13       0.13    

In [71]:
vectorizer_char_2 = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode', analyzer='char', ngram_range=(2, 2),  max_features=20000)
x_train_char_2 = vectorizer_char_2.fit_transform(train['cleaned'])
x_test_char_2 = vectorizer_char_2.transform(test['cleaned'])
print("Time taken to run this cell :", datetime.now() - start)

Time taken to run this cell : 0:00:16.204881


In [72]:
start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.001, penalty='l2', class_weight="balanced"), n_jobs = -1)
classifier.fit(x_train_char_2, y_train)
predictions = classifier.predict(x_test_char_2)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.010114632501685773
Hamming loss  0.3314452685996853
Micro-average quality numbers
Precision: 0.1836, Recall: 0.5621, F1-measure: 0.2767
Macro-average quality numbers
Precision: 0.1776, Recall: 0.5486, F1-measure: 0.2568
              precision    recall  f1-score   support

           0       0.21      0.56      0.31       351
           1       0.26      0.43      0.32       515
           2       0.08      0.60      0.14       150
           3       0.41      0.65      0.50       885
           4       0.13      0.44      0.20       229
           5       0.17      0.50      0.26       268
           6       0.16      0.58      0.25       311
           7       0.31      0.60      0.41       593
           8       0.12      0.58      0.20       248
           9       0.10      0.13      0.12       200
          10       0.11      0.72      0.20       270
          11       0.11      0.71      0.20       166
          12       0.11      0.50      0.19       239
          

In [73]:
vectorizer_char_3 = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode', analyzer='char', ngram_range=(3, 3),  max_features=20000)
x_train_char_3 = vectorizer_char_3.fit_transform(train['cleaned'])
x_test_char_3 = vectorizer_char_3.transform(test['cleaned'])
print("Time taken to run this cell :", datetime.now() - start)

Time taken to run this cell : 0:00:21.718450


In [74]:
start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.001, penalty='l2', class_weight="balanced"),n_jobs = -1)
classifier.fit(x_train_char_3, y_train)
predictions = classifier.predict(x_test_char_3)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.028658125421443022
Hamming loss  0.2658125421443021
Micro-average quality numbers
Precision: 0.2346, Recall: 0.5993, F1-measure: 0.3372
Macro-average quality numbers
Precision: 0.2211, Recall: 0.5724, F1-measure: 0.3130
              precision    recall  f1-score   support

           0       0.22      0.64      0.33       351
           1       0.28      0.54      0.37       515
           2       0.12      0.44      0.19       150
           3       0.49      0.68      0.57       885
           4       0.15      0.53      0.23       229
           5       0.19      0.59      0.29       268
           6       0.19      0.62      0.29       311
           7       0.36      0.70      0.48       593
           8       0.16      0.54      0.24       248
           9       0.11      0.45      0.18       200
          10       0.17      0.49      0.25       270
          11       0.27      0.61      0.37       166
          12       0.16      0.51      0.24       239
          

In [75]:
vectorizer_char_4 = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode', analyzer='char', ngram_range=(4, 4),  max_features=20000)
x_train_char_4 = vectorizer_char_4.fit_transform(train['cleaned'])
x_test_char_4 = vectorizer_char_4.transform(test['cleaned'])
print("Time taken to run this cell :", datetime.now() - start)

Time taken to run this cell : 0:00:30.628558


In [76]:
start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.001, penalty='l2', class_weight="balanced"), n_jobs = -1)
classifier.fit(x_train_char_4, y_train)
predictions = classifier.predict(x_test_char_4)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.034726904922454484
Hamming loss  0.25830523713193976
Micro-average quality numbers
Precision: 0.2424, Recall: 0.6069, F1-measure: 0.3465
Macro-average quality numbers
Precision: 0.2327, Recall: 0.5787, F1-measure: 0.3248
              precision    recall  f1-score   support

           0       0.24      0.63      0.34       351
           1       0.27      0.62      0.37       515
           2       0.12      0.51      0.19       150
           3       0.50      0.69      0.58       885
           4       0.15      0.53      0.23       229
           5       0.20      0.60      0.30       268
           6       0.20      0.50      0.29       311
           7       0.38      0.69      0.49       593
           8       0.16      0.49      0.24       248
           9       0.12      0.47      0.19       200
          10       0.17      0.56      0.27       270
          11       0.35      0.58      0.43       166
          12       0.18      0.49      0.26       239
         

In [77]:
vectorizer_char_5 = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode', analyzer='char', ngram_range=(5, 5),  max_features=20000)
x_train_char_5 = vectorizer_char_5.fit_transform(train['cleaned'])
x_test_char_5 = vectorizer_char_5.transform(test['cleaned'])
print("Time taken to run this cell :", datetime.now() - start)

Time taken to run this cell : 0:00:40.414777


In [78]:
start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.001, penalty='l2', class_weight="balanced"), n_jobs = -1)
classifier.fit(x_train_char_5, y_train)
predictions = classifier.predict(x_test_char_5)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.037086985839514496
Hamming loss  0.2539896605978872
Micro-average quality numbers
Precision: 0.2452, Recall: 0.6023, F1-measure: 0.3486
Macro-average quality numbers
Precision: 0.2318, Recall: 0.5707, F1-measure: 0.3236
              precision    recall  f1-score   support

           0       0.24      0.62      0.34       351
           1       0.27      0.58      0.37       515
           2       0.13      0.55      0.21       150
           3       0.49      0.71      0.58       885
           4       0.15      0.52      0.23       229
           5       0.20      0.63      0.30       268
           6       0.21      0.51      0.30       311
           7       0.39      0.72      0.50       593
           8       0.16      0.46      0.24       248
           9       0.12      0.40      0.18       200
          10       0.18      0.46      0.26       270
          11       0.33      0.57      0.41       166
          12       0.17      0.51      0.25       239
          

In [79]:
vectorizer_char_6 = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode', analyzer='char', ngram_range=(6, 6),  max_features=20000)
x_train_char_6 = vectorizer_char_6.fit_transform(train['cleaned'])
x_test_char_6 = vectorizer_char_6.transform(test['cleaned'])
print("Time taken to run this cell :", datetime.now() - start)

Time taken to run this cell : 0:00:53.580652


In [80]:
start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.001, penalty='l2', class_weight="balanced"), n_jobs = -1)
classifier.fit(x_train_char_6, y_train)
predictions = classifier.predict(x_test_char_6)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.03405259608900876
Hamming loss  0.252596089008766
Micro-average quality numbers
Precision: 0.2448, Recall: 0.5943, F1-measure: 0.3468
Macro-average quality numbers
Precision: 0.2315, Recall: 0.5653, F1-measure: 0.3222
              precision    recall  f1-score   support

           0       0.23      0.60      0.33       351
           1       0.28      0.57      0.38       515
           2       0.13      0.51      0.21       150
           3       0.50      0.68      0.58       885
           4       0.15      0.51      0.23       229
           5       0.20      0.65      0.30       268
           6       0.21      0.54      0.30       311
           7       0.39      0.71      0.50       593
           8       0.16      0.49      0.24       248
           9       0.11      0.41      0.17       200
          10       0.18      0.50      0.26       270
          11       0.30      0.56      0.39       166
          12       0.17      0.47      0.25       239
          13

In [81]:
vectorizer_char_8 = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode', analyzer='char', ngram_range=(7, 7),  max_features=20000)
x_train_char_8 = vectorizer_char_8.fit_transform(train['cleaned'])
x_test_char_8 = vectorizer_char_8.transform(test['cleaned'])
print("Time taken to run this cell :", datetime.now() - start)

Time taken to run this cell : 0:01:18.730078


In [82]:
start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.001, penalty='l2', class_weight="balanced"), n_jobs = -1)
classifier.fit(x_train_char_8, y_train)
predictions = classifier.predict(x_test_char_8)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.03607552258934592
Hamming loss  0.24747134187457856
Micro-average quality numbers
Precision: 0.2483, Recall: 0.5888, F1-measure: 0.3493
Macro-average quality numbers
Precision: 0.2332, Recall: 0.5581, F1-measure: 0.3228
              precision    recall  f1-score   support

           0       0.23      0.58      0.33       351
           1       0.30      0.57      0.39       515
           2       0.14      0.53      0.22       150
           3       0.50      0.68      0.58       885
           4       0.15      0.48      0.23       229
           5       0.21      0.63      0.32       268
           6       0.20      0.52      0.29       311
           7       0.40      0.71      0.51       593
           8       0.16      0.52      0.25       248
           9       0.11      0.34      0.16       200
          10       0.18      0.50      0.26       270
          11       0.30      0.54      0.38       166
          12       0.17      0.46      0.25       239
          

In [83]:
vectorizer_charngram = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode', analyzer='char', ngram_range=(1, 6),  max_features=20000)
x_train_charngram = vectorizer_charngram.fit_transform(train['cleaned'])
x_test_charngram = vectorizer_charngram.transform(test['cleaned'])
print("Time taken to run this cell :", datetime.now() - start)

Time taken to run this cell : 0:03:19.747493


In [84]:
start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.001, penalty='l2', class_weight="balanced"), n_jobs = -1)
classifier.fit(x_train_charngram, y_train)
predictions = classifier.predict(x_test_charngram)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.030681051921780174
Hamming loss  0.2696561024949427
Micro-average quality numbers
Precision: 0.2335, Recall: 0.6089, F1-measure: 0.3375
Macro-average quality numbers
Precision: 0.2244, Recall: 0.5908, F1-measure: 0.3171
              precision    recall  f1-score   support

           0       0.23      0.67      0.34       351
           1       0.28      0.53      0.37       515
           2       0.12      0.56      0.20       150
           3       0.50      0.68      0.58       885
           4       0.14      0.59      0.23       229
           5       0.18      0.64      0.29       268
           6       0.20      0.52      0.29       311
           7       0.38      0.69      0.49       593
           8       0.15      0.52      0.23       248
           9       0.12      0.50      0.19       200
          10       0.17      0.54      0.26       270
          11       0.28      0.60      0.38       166
          12       0.17      0.49      0.25       239
          

In [85]:
vectorizer_charngram = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode', analyzer='char', ngram_range=(2, 4),  max_features=20000)
x_train_charngram = vectorizer_charngram.fit_transform(train['cleaned'])
x_test_charngram = vectorizer_charngram.transform(test['cleaned'])
print("Time taken to run this cell :", datetime.now() - start)

Time taken to run this cell : 0:01:20.744426


In [86]:
start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.001, penalty='l2', class_weight="balanced"), n_jobs = -1)
classifier.fit(x_train_charngram, y_train)
predictions = classifier.predict(x_test_charngram)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.037086985839514496
Hamming loss  0.25567543268150145
Micro-average quality numbers
Precision: 0.2410, Recall: 0.5890, F1-measure: 0.3420
Macro-average quality numbers
Precision: 0.2255, Recall: 0.5577, F1-measure: 0.3149
              precision    recall  f1-score   support

           0       0.25      0.58      0.35       351
           1       0.26      0.61      0.37       515
           2       0.12      0.49      0.19       150
           3       0.50      0.68      0.58       885
           4       0.15      0.51      0.23       229
           5       0.19      0.64      0.29       268
           6       0.19      0.54      0.28       311
           7       0.38      0.68      0.49       593
           8       0.15      0.55      0.24       248
           9       0.13      0.30      0.18       200
          10       0.18      0.38      0.24       270
          11       0.28      0.58      0.38       166
          12       0.17      0.51      0.25       239
         

In [87]:
x_train_char_6 = x_train_char_6.toarray()

In [88]:
x_train_char_6 .shape

(9489, 20000)

In [89]:
x_test_6 = x_test_char_6.toarray()

In [90]:
x_train_6 = np.hstack((x_train_5, x_train_char_6))

In [92]:
x_test_6 = np.hstack((x_test_5, x_test_char_6))

In [93]:
np.save('final_train.npy', x_train_6)
np.save('final_test.npy', x_test_6)

In [94]:
start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.001, penalty='l2', class_weight="balanced"), n_jobs = -1)
classifier.fit(x_train_6, y_train)
predictions = classifier.predict(x_test_6)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.08968307484828052
Hamming loss  0.15652955720386605
Micro-average quality numbers
Precision: 0.3892, Recall: 0.6808, F1-measure: 0.4953
Macro-average quality numbers
Precision: 0.3645, Recall: 0.6481, F1-measure: 0.4599
              precision    recall  f1-score   support

           0       0.32      0.70      0.44       351
           1       0.60      0.82      0.70       515
           2       0.21      0.61      0.31       150
           3       0.61      0.69      0.65       885
           4       0.21      0.56      0.31       229
           5       0.38      0.73      0.50       268
           6       0.35      0.67      0.46       311
           7       0.49      0.83      0.62       593
           8       0.25      0.61      0.36       248
           9       0.28      0.36      0.31       200
          10       0.41      0.74      0.53       270
          11       0.43      0.69      0.53       166
          12       0.28      0.40      0.33       239
          

In [95]:
start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.0001, penalty='l2', class_weight="balanced"), n_jobs = -1)
classifier.fit(x_train_6, y_train)
predictions = classifier.predict(x_test_6)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.19082939986513822
Hamming loss  0.124881995954147
Micro-average quality numbers
Precision: 0.4581, Recall: 0.5850, F1-measure: 0.5138
Macro-average quality numbers
Precision: 0.4342, Recall: 0.5203, F1-measure: 0.4638
              precision    recall  f1-score   support

           0       0.42      0.42      0.42       351
           1       0.54      0.88      0.67       515
           2       0.38      0.43      0.40       150
           3       0.60      0.74      0.66       885
           4       0.32      0.25      0.28       229
           5       0.52      0.50      0.51       268
           6       0.39      0.34      0.36       311
           7       0.52      0.77      0.62       593
           8       0.36      0.40      0.38       248
           9       0.32      0.31      0.32       200
          10       0.36      0.64      0.46       270
          11       0.78      0.57      0.66       166
          12       0.28      0.43      0.34       239
          13

In [98]:
start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.00001, penalty='l2', class_weight="balanced"), n_jobs = -1)
classifier.fit(x_train_6, y_train)
predictions = classifier.predict(x_test_6)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.25151719487525287
Hamming loss  0.10570914812317375
Micro-average quality numbers
Precision: 0.5405, Recall: 0.4198, F1-measure: 0.4726
Macro-average quality numbers
Precision: 0.4959, Recall: 0.3713, F1-measure: 0.4141
              precision    recall  f1-score   support

           0       0.45      0.30      0.36       351
           1       0.72      0.67      0.70       515
           2       0.45      0.25      0.32       150
           3       0.61      0.53      0.57       885
           4       0.39      0.14      0.21       229
           5       0.57      0.39      0.46       268
           6       0.45      0.30      0.36       311
           7       0.66      0.53      0.59       593
           8       0.38      0.21      0.27       248
           9       0.42      0.19      0.26       200
          10       0.44      0.41      0.43       270
          11       0.71      0.60      0.65       166
          12       0.38      0.11      0.17       239
          

In [96]:
start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='hinge', alpha=0.001, penalty='l2', class_weight="balanced"), n_jobs = -1)
classifier.fit(x_train_6, y_train)
predictions = classifier.predict(x_test_6)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.12306136210384357
Hamming loss  0.13639020004495392
Micro-average quality numbers
Precision: 0.4297, Recall: 0.6392, F1-measure: 0.5139
Macro-average quality numbers
Precision: 0.4046, Recall: 0.5886, F1-measure: 0.4728
              precision    recall  f1-score   support

           0       0.37      0.62      0.47       351
           1       0.61      0.84      0.71       515
           2       0.38      0.39      0.38       150
           3       0.58      0.72      0.65       885
           4       0.24      0.37      0.29       229
           5       0.37      0.70      0.48       268
           6       0.34      0.65      0.45       311
           7       0.57      0.75      0.65       593
           8       0.32      0.44      0.37       248
           9       0.27      0.31      0.29       200
          10       0.42      0.74      0.54       270
          11       0.63      0.63      0.63       166
          12       0.31      0.40      0.35       239
          

In [97]:
start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='hinge', alpha=0.0001, penalty='l2', class_weight="balanced"), n_jobs = -1)
classifier.fit(x_train_6, y_train)
predictions = classifier.predict(x_test_6)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.1830748482805125
Hamming loss  0.12485951899303215
Micro-average quality numbers
Precision: 0.4464, Recall: 0.4443, F1-measure: 0.4453
Macro-average quality numbers
Precision: 0.4296, Recall: 0.4241, F1-measure: 0.4106
              precision    recall  f1-score   support

           0       0.42      0.31      0.36       351
           1       0.65      0.76      0.70       515
           2       0.40      0.41      0.41       150
           3       0.61      0.43      0.51       885
           4       0.31      0.21      0.25       229
           5       0.32      0.47      0.38       268
           6       0.41      0.48      0.44       311
           7       0.70      0.45      0.55       593
           8       0.34      0.37      0.36       248
           9       0.31      0.25      0.28       200
          10       0.41      0.13      0.19       270
          11       0.67      0.60      0.63       166
          12       0.20      0.35      0.26       239
          1

In [99]:
start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='hinge', alpha=0.01, penalty='l2', class_weight="balanced"), n_jobs = -1)
classifier.fit(x_train_6, y_train)
predictions = classifier.predict(x_test_6)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)

Accuracy : 0.044504383007417395
Hamming loss  0.2279838165879973
Micro-average quality numbers
Precision: 0.2950, Recall: 0.7346, F1-measure: 0.4210
Macro-average quality numbers
Precision: 0.2974, Recall: 0.7303, F1-measure: 0.4081
              precision    recall  f1-score   support

           0       0.27      0.74      0.39       351
           1       0.49      0.93      0.64       515
           2       0.15      0.72      0.25       150
           3       0.57      0.63      0.60       885
           4       0.17      0.75      0.27       229
           5       0.28      0.80      0.41       268
           6       0.28      0.76      0.41       311
           7       0.47      0.79      0.59       593
           8       0.20      0.76      0.32       248
           9       0.18      0.47      0.26       200
          10       0.42      0.74      0.53       270
          11       0.35      0.72      0.47       166
          12       0.14      0.77      0.24       239
          