In [56]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pickle
import nltk
from nltk import *
from sklearn import tree
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.kernel_approximation import RBFSampler
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.linear_model import RidgeClassifier
from sklearn.neural_network import MLPClassifier

def pickle_file(filename, obj):
    with open('dumps/' + filename, 'wb') as f:
        pickle.dump(obj, f)


def unpickle_file(filename):
    with open('dumps/' + filename, 'rb') as f:
        return pickle.load(f)
    
def print_classification_report(true, pred):
    print(classification_report(true, pred))
    

In [17]:
def preprocess(text):
# Remove punctuation, stopword and then stemmering
    stop = set(stopwords.words("english"))
    stemmer = stem.PorterStemmer()
    punctuation = set(string.punctuation)

    doc = [stemmer.stem(word) for word in nltk.word_tokenize(text) if (word not in punctuation) and (word not in stop)]

    doc = ' '.join(w for w in doc)
    return doc

data_train = pd.read_csv('data_train.csv', encoding='utf-8')

data_train["Summary"] = data_train["Summary"].apply(preprocess)

summaries_train = data_train['Summary'].as_matrix()

ydf = data_train.drop('Summary', axis = 1)
ydf = ydf.drop(ydf.columns[0], axis = 1)
ydf = ydf.drop(ydf.columns[0], axis = 1)
ydf = ydf.as_matrix()

tfidfVect = TfidfVectorizer()
tfidf = tfidfVect.fit_transform(summaries_train)

pickle_file('tfidf.dat', tfidf)
pickle_file('tfidf_vocab.dat', tfidfVect.vocabulary_)
pickle_file('ydf.dat', ydf)



In [18]:
tfidf_vocab = TfidfVectorizer(decode_error="replace",vocabulary=unpickle_file('tfidf_vocab.dat'))
tfidf_train = unpickle_file('tfidf.dat')
ydf = unpickle_file('ydf.dat')
X_train = tfidf_train
y_train = ydf

In [63]:
def train_linear_svc(features_train, labels_train):
    print ("Training linear SVC")
    clf = LinearSVC(C=1, dual=True, class_weight='balanced')
    clf = OneVsRestClassifier(clf).fit(features_train, labels_train)
    return clf


def train_randomForest(features_train, labels_train):
    print ("Training random forest")
    clf = RandomForestClassifier(n_estimators=70, max_depth=70)
    clf = OneVsRestClassifier(clf).fit(features_train, labels_train)
    return clf

def train_decisionTree(features_train, labels_train):
    print ("Training decision tree")
    clf = tree.DecisionTreeClassifier()
    clf = OneVsRestClassifier(clf).fit(features_train, labels_train)
    return clf

def train_adaboost_decision_tree(features_train, labels_train):
    print ("Training adaboost decision tree")
    clf = AdaBoostClassifier( tree.DecisionTreeClassifier(max_depth=5),
                              n_estimators=600,
                              learning_rate=1)
    clf = OneVsRestClassifier(clf).fit(features_train, labels_train)
    return clf

def train_SGD_l1(features_train, labels_train):
    print ("Training SGD l1")
    clf = SGDClassifier(alpha=.0001, n_iter=50, penalty="l1")
    clf = OneVsRestClassifier(clf).fit(features_train, labels_train)
    return clf
                        
def train_SGD_l2(features_train, labels_train):
    print ("Training SGD l2")
    clf = SGDClassifier(alpha=.0001, n_iter=50, penalty="l2")
    clf = OneVsRestClassifier(clf).fit(features_train, labels_train)
    return clf

def train_SGD_elasticnet(features_train, labels_train):
    print ("Training SGD elascticnet")
    clf = SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet")
    clf = OneVsRestClassifier(clf).fit(features_train, labels_train)
    return clf

def train_PassiveAggressiveClassifier(features_train, labels_train):
    print ("Training PassiveAggressiveClassifier")
    clf = PassiveAggressiveClassifier(n_iter=50)
    clf = OneVsRestClassifier(clf).fit(features_train, labels_train)
    return clf

def train_RidgeClassifier(features_train, labels_train):
    print ("Training RidgeClassifier")
    clf = RidgeClassifier(tol=1e-2, solver="sag")
    clf = OneVsRestClassifier(clf).fit(features_train, labels_train)
    return clf

def train_KNeighborsClassifier(features_train, labels_train):
    print ("Training KNeighborsClassifier")
    clf = KNeighborsClassifier(n_neighbors=10)
    clf = OneVsRestClassifier(clf).fit(features_train, labels_train)
    return clf

def train_MultinomialNB(features_train, labels_train):
    print ("Training MultinomialNB")
    clf = MultinomialNB(alpha=.01)
    clf = OneVsRestClassifier(clf).fit(features_train, labels_train)
    return clf

def train_BernoulliNB(features_train, labels_train):
    print ("Training BernoulliNB")
    clf = BernoulliNB(alpha=.01)
    clf = OneVsRestClassifier(clf).fit(features_train, labels_train)
    return clf

def train_MLP(features_train, labels_train):
    print ("Training MLP")
    clf = MLPClassifier(random_state=0, max_iter=400)
    clf = OneVsRestClassifier(clf).fit(features_train, labels_train)
    return clf

def train_NearestCentroid(features_train, labels_train):
    print ("Training NearestCentroid")
    clf = NearestCentroid()
    clf = OneVsRestClassifier(clf).fit(features_train, labels_train)
    return clf
                        


In [32]:
data_test = pd.read_csv('data_test.csv', encoding='utf-8')

data_test["Summary"] = data_test["Summary"].apply(preprocess)

summaries_test = data_test['Summary'].as_matrix()

transformer = TfidfTransformer()
tfidf_test = transformer.fit_transform(tfidf_vocab.fit_transform(summaries_test))
pickle_file('tfidf_test.dat', tfidf)
X_test = tfidf_test


ydf_test = pd.read_csv('labels_test.csv', encoding='utf-8')
ydf_test = ydf_test.drop(ydf_test.columns[0], axis = 1)
ydf_test = ydf_test.as_matrix()

In [35]:
svm_classifier_linear = train_linear_svc(X_train, y_train)
pickle_file('svm_classifier_linear', svm_classifier_linear)
#svm_classifier_linear = unpickle_file('svm_classifier_linear')
labels_pred = svm_classifier_linear.predict(X_test)
print_classification_report(ydf_test, labels_pred)

Training linear SVC
             precision    recall  f1-score   support

          0       0.74      0.71      0.73     11088
          1       0.55      0.50      0.53      3786
          2       0.52      0.46      0.49      3435
          3       0.70      0.63      0.67      1376
          4       0.51      0.44      0.47      2825
          5       0.46      0.31      0.37      1227
          6       0.39      0.28      0.33      1519
          7       0.37      0.25      0.29      1374
          8       0.36      0.25      0.30      1171
          9       0.47      0.36      0.41      3759
         10       0.73      0.54      0.62      1313
         11       0.64      0.33      0.43       359
         12       0.56      0.46      0.51      3883
         13       0.67      0.43      0.52       868
         14       0.78      0.49      0.60       612
         15       0.57      0.47      0.51      1790

avg / total       0.59      0.50      0.54     40385



In [37]:
random_forest = train_randomForest(X_train, y_train)
pickle_file('random_forest', random_forest)
labels_pred = random_forest.predict(X_test)
print_classification_report(ydf_test, labels_pred)

Training random forest
             precision    recall  f1-score   support

          0       0.69      0.88      0.78     11088
          1       0.82      0.04      0.08      3786
          2       0.78      0.01      0.02      3435
          3       0.91      0.04      0.08      1376
          4       1.00      0.01      0.01      2825
          5       1.00      0.00      0.00      1227
          6       1.00      0.00      0.00      1519
          7       0.00      0.00      0.00      1374
          8       0.00      0.00      0.00      1171
          9       0.67      0.00      0.00      3759
         10       0.93      0.06      0.12      1313
         11       1.00      0.01      0.03       359
         12       0.81      0.01      0.02      3883
         13       1.00      0.01      0.01       868
         14       0.95      0.06      0.11       612
         15       0.97      0.02      0.04      1790

avg / total       0.76      0.26      0.24     40385



  'precision', 'predicted', average, warn_for)


In [38]:
decision_tree = train_decisionTree(X_train, y_train)
pickle_file('decision_tree', decision_tree)
labels_pred = decision_tree.predict(X_test)
print_classification_report(ydf_test, labels_pred)

Training decision tree
             precision    recall  f1-score   support

          0       0.65      0.69      0.67     11088
          1       0.45      0.36      0.40      3786
          2       0.38      0.31      0.34      3435
          3       0.45      0.41      0.43      1376
          4       0.41      0.33      0.37      2825
          5       0.31      0.23      0.26      1227
          6       0.23      0.17      0.19      1519
          7       0.20      0.13      0.16      1374
          8       0.20      0.13      0.16      1171
          9       0.36      0.20      0.26      3759
         10       0.53      0.50      0.52      1313
         11       0.36      0.27      0.31       359
         12       0.43      0.28      0.34      3883
         13       0.44      0.34      0.38       868
         14       0.50      0.51      0.50       612
         15       0.34      0.30      0.32      1790

avg / total       0.46      0.40      0.42     40385



In [41]:
adaboost_decision_tree = train_adaboost_decision_tree(X_train, y_train)
labels_pred = adaboost_decision_tree.predict(X_test)
pickle_file('adaboost_decision_tree', adaboost_decision_tree)
print_classification_report(ydf_test, labels_pred)

Training adaboost decision tree
             precision    recall  f1-score   support

          0       0.70      0.73      0.71     11088
          1       0.53      0.36      0.43      3786
          2       0.51      0.32      0.39      3435
          3       0.71      0.38      0.50      1376
          4       0.56      0.29      0.39      2825
          5       0.47      0.11      0.18      1227
          6       0.46      0.12      0.18      1519
          7       0.48      0.09      0.15      1374
          8       0.45      0.09      0.16      1171
          9       0.48      0.19      0.27      3759
         10       0.78      0.39      0.52      1313
         11       0.73      0.19      0.31       359
         12       0.56      0.32      0.40      3883
         13       0.65      0.28      0.39       868
         14       0.82      0.40      0.54       612
         15       0.60      0.24      0.34      1790

avg / total       0.59      0.39      0.45     40385



In [44]:
SGD_l1 = train_SGD_l1(X_train, y_train)
labels_pred = SGD_l1.predict(X_test)
pickle_file('SGD_l1.dat', SGD_l1)
print_classification_report(ydf_test, labels_pred)

Training SGD l1
             precision    recall  f1-score   support

          0       0.71      0.88      0.78     11088
          1       0.74      0.13      0.23      3786
          2       0.72      0.05      0.10      3435
          3       0.87      0.28      0.43      1376
          4       0.70      0.11      0.19      2825
          5       0.56      0.01      0.02      1227
          6       0.25      0.00      0.00      1519
          7       0.33      0.00      0.00      1374
          8       0.61      0.02      0.04      1171
          9       0.60      0.02      0.05      3759
         10       0.85      0.28      0.42      1313
         11       0.84      0.06      0.11       359
         12       0.73      0.10      0.17      3883
         13       0.77      0.14      0.23       868
         14       0.81      0.21      0.33       612
         15       0.80      0.12      0.21      1790

avg / total       0.68      0.31      0.33     40385



In [45]:
SGD_l2 = train_SGD_l2(X_train, y_train)
labels_pred = SGD_l2.predict(X_test)
pickle_file('SGD_l2', SGD_l2)
print_classification_report(ydf_test, labels_pred)

Training SGD l2
             precision    recall  f1-score   support

          0       0.73      0.82      0.78     11088
          1       0.74      0.24      0.37      3786
          2       0.75      0.16      0.27      3435
          3       0.90      0.41      0.56      1376
          4       0.73      0.17      0.27      2825
          5       0.78      0.06      0.10      1227
          6       0.69      0.03      0.06      1519
          7       0.48      0.01      0.01      1374
          8       0.77      0.03      0.05      1171
          9       0.69      0.07      0.13      3759
         10       0.90      0.33      0.48      1313
         11       0.89      0.11      0.19       359
         12       0.74      0.19      0.30      3883
         13       0.87      0.24      0.38       868
         14       0.89      0.25      0.39       612
         15       0.83      0.21      0.34      1790

avg / total       0.75      0.35      0.40     40385



In [46]:
SGD_elasticnet = train_SGD_elasticnet(X_train, y_train)
labels_pred = SGD_elasticnet.predict(X_test)
pickle_file('SGD_elasticnet', SGD_elasticnet)
print_classification_report(ydf_test, labels_pred)

Training SGD elascticnet
             precision    recall  f1-score   support

          0       0.73      0.85      0.78     11088
          1       0.75      0.19      0.30      3786
          2       0.76      0.10      0.17      3435
          3       0.90      0.35      0.50      1376
          4       0.74      0.13      0.23      2825
          5       0.78      0.01      0.03      1227
          6       0.76      0.01      0.02      1519
          7       1.00      0.00      0.00      1374
          8       0.77      0.02      0.04      1171
          9       0.69      0.04      0.08      3759
         10       0.89      0.28      0.43      1313
         11       0.90      0.08      0.14       359
         12       0.76      0.15      0.25      3883
         13       0.86      0.18      0.30       868
         14       0.87      0.18      0.29       612
         15       0.85      0.18      0.29      1790

avg / total       0.77      0.32      0.36     40385



In [47]:
PassiveAggressiveClassifier = train_PassiveAggressiveClassifier(X_train, y_train)
labels_pred = PassiveAggressiveClassifier.predict(X_test)
pickle_file('PassiveAggressiveClassifier', PassiveAggressiveClassifier)
print_classification_report(ydf_test, labels_pred)

Training SGD PassiveAggressiveClassifier
             precision    recall  f1-score   support

          0       0.71      0.70      0.70     11088
          1       0.52      0.41      0.46      3786
          2       0.50      0.37      0.42      3435
          3       0.73      0.59      0.65      1376
          4       0.48      0.36      0.41      2825
          5       0.48      0.25      0.33      1227
          6       0.38      0.21      0.27      1519
          7       0.36      0.19      0.25      1374
          8       0.34      0.18      0.23      1171
          9       0.42      0.30      0.35      3759
         10       0.76      0.49      0.60      1313
         11       0.70      0.31      0.43       359
         12       0.51      0.38      0.44      3883
         13       0.67      0.37      0.48       868
         14       0.79      0.47      0.59       612
         15       0.57      0.40      0.47      1790

avg / total       0.57      0.45      0.50     40385



In [50]:
RidgeClassifier = train_RidgeClassifier(X_train, y_train)
labels_pred = RidgeClassifier.predict(X_test)
pickle_file('RidgeClassifier', RidgeClassifier)
print_classification_report(ydf_test, labels_pred)

Training SGD RidgeClassifier




             precision    recall  f1-score   support

          0       0.73      0.79      0.76     11088
          1       0.70      0.26      0.38      3786
          2       0.68      0.22      0.33      3435
          3       0.89      0.40      0.55      1376
          4       0.69      0.20      0.31      2825
          5       0.69      0.10      0.18      1227
          6       0.59      0.06      0.11      1519
          7       0.49      0.04      0.07      1374
          8       0.65      0.05      0.08      1171
          9       0.64      0.14      0.23      3759
         10       0.90      0.33      0.48      1313
         11       0.88      0.13      0.22       359
         12       0.69      0.23      0.34      3883
         13       0.82      0.24      0.38       868
         14       0.89      0.26      0.40       612
         15       0.81      0.24      0.37      1790

avg / total       0.71      0.36      0.43     40385



In [51]:
KNeighborsClassifier = train_KNeighborsClassifier(X_train, y_train)
labels_pred = KNeighborsClassifier.predict(X_test)
pickle_file('KNeighborsClassifier', KNeighborsClassifier)
print_classification_report(ydf_test, labels_pred)

Training SGD KNeighborsClassifier
             precision    recall  f1-score   support

          0       0.67      0.70      0.68     11088
          1       0.60      0.10      0.17      3786
          2       0.60      0.09      0.15      3435
          3       0.82      0.35      0.49      1376
          4       0.55      0.05      0.09      2825
          5       0.60      0.02      0.04      1227
          6       0.56      0.02      0.04      1519
          7       0.48      0.02      0.03      1374
          8       0.63      0.03      0.06      1171
          9       0.38      0.07      0.11      3759
         10       0.87      0.14      0.24      1313
         11       0.78      0.06      0.11       359
         12       0.48      0.07      0.12      3883
         13       0.76      0.10      0.17       868
         14       0.77      0.07      0.12       612
         15       0.72      0.22      0.33      1790

avg / total       0.61      0.26      0.30     40385



In [54]:
MultinomialNB = train_MultinomialNB(X_train, y_train)
labels_pred = MultinomialNB.predict(X_test)
pickle_file('MultinomialNB', MultinomialNB)
print_classification_report(ydf_test, labels_pred)

Training MultinomialNB
             precision    recall  f1-score   support

          0       0.71      0.79      0.75     11088
          1       0.57      0.33      0.42      3786
          2       0.57      0.33      0.42      3435
          3       0.81      0.52      0.63      1376
          4       0.53      0.17      0.26      2825
          5       0.57      0.19      0.28      1227
          6       0.34      0.12      0.17      1519
          7       0.32      0.12      0.17      1374
          8       0.44      0.06      0.11      1171
          9       0.45      0.18      0.26      3759
         10       0.74      0.39      0.51      1313
         11       0.70      0.11      0.19       359
         12       0.55      0.23      0.32      3883
         13       0.70      0.30      0.42       868
         14       0.84      0.25      0.39       612
         15       0.69      0.36      0.47      1790

avg / total       0.60      0.40      0.45     40385



In [55]:
BernoulliNB = train_BernoulliNB(X_train, y_train)
labels_pred = BernoulliNB.predict(X_test)
pickle_file('BernoulliNB', BernoulliNB)
print_classification_report(ydf_test, labels_pred)

Training BernoulliNB
             precision    recall  f1-score   support

          0       0.72      0.85      0.78     11088
          1       0.55      0.48      0.51      3786
          2       0.47      0.52      0.49      3435
          3       0.64      0.58      0.61      1376
          4       0.58      0.43      0.49      2825
          5       0.31      0.46      0.37      1227
          6       0.36      0.28      0.32      1519
          7       0.33      0.28      0.30      1374
          8       0.38      0.27      0.32      1171
          9       0.48      0.54      0.51      3759
         10       0.45      0.54      0.49      1313
         11       0.84      0.30      0.45       359
         12       0.48      0.45      0.46      3883
         13       0.64      0.48      0.55       868
         14       0.32      0.64      0.43       612
         15       0.55      0.46      0.50      1790

avg / total       0.55      0.57      0.55     40385



In [58]:
MLP = train_MLP(X_train, y_train)
labels_pred = MLP.predict(X_test)
pickle_file('MLP', MLP)
print_classification_report(ydf_test, labels_pred)

Training MLP
             precision    recall  f1-score   support

          0       0.72      0.72      0.72     11088
          1       0.54      0.41      0.47      3786
          2       0.52      0.37      0.43      3435
          3       0.75      0.60      0.66      1376
          4       0.49      0.34      0.40      2825
          5       0.52      0.23      0.32      1227
          6       0.39      0.18      0.25      1519
          7       0.36      0.16      0.22      1374
          8       0.41      0.16      0.24      1171
          9       0.45      0.30      0.36      3759
         10       0.77      0.48      0.59      1313
         11       0.69      0.23      0.35       359
         12       0.54      0.38      0.45      3883
         13       0.70      0.37      0.48       868
         14       0.83      0.49      0.62       612
         15       0.62      0.41      0.49      1790

avg / total       0.59      0.45      0.50     40385



In [64]:
NearestCentroid = train_NearestCentroid(X_train, y_train)
labels_pred = NearestCentroid.predict(X_test)
pickle_file('NearestCentroid', NearestCentroid)
print_classification_report(ydf_test, labels_pred)

Training NearestCentroid


TypeError: 'OneVsRestClassifier' object is not callable