In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pickle
import nltk
from nltk import *
from sklearn import tree
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.kernel_approximation import RBFSampler
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

def pickle_file(filename, obj):
    with open('dumps/' + filename, 'wb') as f:
        pickle.dump(obj, f)


def unpickle_file(filename):
    with open('dumps/' + filename, 'rb') as f:
        return pickle.load(f)
    
def print_classification_report(true, pred):
    print(classification_report(true, pred))
    

In [17]:
def preprocess(text):
# Remove punctuation, stopword and then stemmering
    stop = set(stopwords.words("english"))
    stemmer = stem.PorterStemmer()
    punctuation = set(string.punctuation)

    doc = [stemmer.stem(word) for word in nltk.word_tokenize(text) if (word not in punctuation) and (word not in stop)]

    doc = ' '.join(w for w in doc)
    return doc

data_train = pd.read_csv('data_train.csv', encoding='utf-8')

data_train["Summary"] = data_train["Summary"].apply(preprocess)

summaries_train = data_train['Summary'].as_matrix()

ydf = data_train.drop('Summary', axis = 1)
ydf = ydf.drop(ydf.columns[0], axis = 1)
ydf = ydf.drop(ydf.columns[0], axis = 1)
ydf = ydf.as_matrix()

tfidfVect = TfidfVectorizer()
tfidf = tfidfVect.fit_transform(summaries_train)

pickle_file('tfidf.dat', tfidf)
pickle_file('tfidf_vocab.dat', tfidfVect.vocabulary_)
pickle_file('ydf.dat', ydf)



In [18]:
tfidf_vocab = TfidfVectorizer(decode_error="replace",vocabulary=unpickle_file('tfidf_vocab.dat'))
tfidf_train = unpickle_file('tfidf.dat')
ydf = unpickle_file('ydf.dat')
X_train = tfidf_train
y_train = ydf

In [19]:
def train_linear_svc(features_train, labels_train):
    print ("Training linear SVC")
    clf = LinearSVC(C=1, dual=True, class_weight='balanced')
    clf = OneVsRestClassifier(clf).fit(features_train, labels_train)
    return clf


def train_randomForest(features_train, labels_train):
    print ("Training random forest")
    clf = RandomForestClassifier(n_estimators=70, max_depth=70)
    clf = OneVsRestClassifier(clf).fit(features_train, labels_train)
    return clf

def train_decisionTree(features_train, labels_train):
    print ("Training decision tree")
    clf = tree.DecisionTreeClassifier()
    clf = OneVsRestClassifier(clf).fit(features_train, labels_train)
    return clf

def train_adaboost_decision_tree(features_train, labels_train):
    print ("Training adaboost decision tree")
    clf = AdaBoostClassifier( tree.DecisionTreeClassifier(max_depth=2),
                              n_estimators=600,
                              learning_rate=1)
    clf = OneVsRestClassifier(clf).fit(features_train, labels_train)
    return clf

In [32]:
data_test = pd.read_csv('data_test.csv', encoding='utf-8')

data_test["Summary"] = data_test["Summary"].apply(preprocess)

summaries_test = data_test['Summary'].as_matrix()

transformer = TfidfTransformer()
tfidf_test = transformer.fit_transform(tfidf_vocab.fit_transform(summaries_test))
pickle_file('tfidf_test.dat', tfidf)
X_test = tfidf_test


ydf_test = pd.read_csv('labels_test.csv', encoding='utf-8')
ydf_test = ydf_test.drop(ydf_test.columns[0], axis = 1)
ydf_test = ydf_test.as_matrix()

In [35]:
svm_classifier_linear = train_linear_svc(X_train, y_train)
pickle_file('svm_classifier_linear', svm_classifier_linear)
#svm_classifier_linear = unpickle_file('svm_classifier_linear')
labels_pred = svm_classifier_linear.predict(X_test)
print_classification_report(ydf_test, labels_pred)

Training linear SVC
             precision    recall  f1-score   support

          0       0.74      0.71      0.73     11088
          1       0.55      0.50      0.53      3786
          2       0.52      0.46      0.49      3435
          3       0.70      0.63      0.67      1376
          4       0.51      0.44      0.47      2825
          5       0.46      0.31      0.37      1227
          6       0.39      0.28      0.33      1519
          7       0.37      0.25      0.29      1374
          8       0.36      0.25      0.30      1171
          9       0.47      0.36      0.41      3759
         10       0.73      0.54      0.62      1313
         11       0.64      0.33      0.43       359
         12       0.56      0.46      0.51      3883
         13       0.67      0.43      0.52       868
         14       0.78      0.49      0.60       612
         15       0.57      0.47      0.51      1790

avg / total       0.59      0.50      0.54     40385



In [37]:
random_forest = train_randomForest(X_train, y_train)
pickle_file('random_forest', random_forest)
labels_pred = random_forest.predict(X_test)
print_classification_report(ydf_test, labels_pred)

Training random forest
             precision    recall  f1-score   support

          0       0.69      0.88      0.78     11088
          1       0.82      0.04      0.08      3786
          2       0.78      0.01      0.02      3435
          3       0.91      0.04      0.08      1376
          4       1.00      0.01      0.01      2825
          5       1.00      0.00      0.00      1227
          6       1.00      0.00      0.00      1519
          7       0.00      0.00      0.00      1374
          8       0.00      0.00      0.00      1171
          9       0.67      0.00      0.00      3759
         10       0.93      0.06      0.12      1313
         11       1.00      0.01      0.03       359
         12       0.81      0.01      0.02      3883
         13       1.00      0.01      0.01       868
         14       0.95      0.06      0.11       612
         15       0.97      0.02      0.04      1790

avg / total       0.76      0.26      0.24     40385



  'precision', 'predicted', average, warn_for)


In [38]:
decision_tree = train_decisionTree(X_train, y_train)
pickle_file('decision_tree', decision_tree)
labels_pred = decision_tree.predict(X_test)
print_classification_report(ydf_test, labels_pred)

Training decision tree
             precision    recall  f1-score   support

          0       0.65      0.69      0.67     11088
          1       0.45      0.36      0.40      3786
          2       0.38      0.31      0.34      3435
          3       0.45      0.41      0.43      1376
          4       0.41      0.33      0.37      2825
          5       0.31      0.23      0.26      1227
          6       0.23      0.17      0.19      1519
          7       0.20      0.13      0.16      1374
          8       0.20      0.13      0.16      1171
          9       0.36      0.20      0.26      3759
         10       0.53      0.50      0.52      1313
         11       0.36      0.27      0.31       359
         12       0.43      0.28      0.34      3883
         13       0.44      0.34      0.38       868
         14       0.50      0.51      0.50       612
         15       0.34      0.30      0.32      1790

avg / total       0.46      0.40      0.42     40385



In [39]:
adaboost_decision_tree = train_adaboost_decision_tree(X_train, y_train)
labels_pred = adaboost_decision_tree.predict(X_test)
pickle_file('adaboost_decision_tree', adaboost_decision_tree)
print_classification_report(ydf_test, labels_pred)

Training adaboost decision tree
             precision    recall  f1-score   support

          0       0.71      0.72      0.71     11088
          1       0.54      0.42      0.47      3786
          2       0.50      0.37      0.42      3435
          3       0.60      0.47      0.53      1376
          4       0.51      0.40      0.45      2825
          5       0.38      0.24      0.29      1227
          6       0.39      0.22      0.28      1519
          7       0.32      0.17      0.22      1374
          8       0.34      0.18      0.23      1171
          9       0.46      0.28      0.35      3759
         10       0.67      0.47      0.56      1313
         11       0.63      0.35      0.45       359
         12       0.56      0.40      0.47      3883
         13       0.63      0.41      0.50       868
         14       0.79      0.51      0.62       612
         15       0.50      0.32      0.39      1790

avg / total       0.56      0.45      0.50     40385

