Progress Summary: 3 classifiers (SVM, Log Reg, Multinomial NB) running on all data.

# Reading data ( Stage 1 )

In [1]:
# Reading data
import numpy as np
import pandas as pd
import scipy.io
import classifiers as clf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import os

In [2]:
# Build the path to the training TSV from the current working directory.
current_dir = os.getcwd()
train_tsv_path = current_dir + '/data/movie_reviews/train.tsv'

# Tab-separated file, decoded as ISO-8859-1.
df = pd.read_csv(train_tsv_path, sep='\t', encoding="ISO-8859-1")
#print(df.info())
print(df.head())

# Spliting data to obtain comments and labels
# Technically should also include summary at this point but ditched to save time
# (the two commented lines below use column names that the frame printed above does not have)

#comments = df['Text']
#score_label = df['Score']

# Report how many phrases and sentiment labels were loaded.
phrase_shape = df['Phrase'].shape
sentiment_shape = df['Sentiment'].shape
print('\n\nComments size: ', phrase_shape, "\t", "Labels size: ", sentiment_shape)

   PhraseId  SentenceId                                             Phrase  \
0         1           1  A series of escapades demonstrating the adage ...   
1         2           1  A series of escapades demonstrating the adage ...   
2         3           1                                           A series   
3         4           1                                                  A   
4         5           1                                             series   

   Sentiment  
0          1  
1          2  
2          2  
3          2  
4          2  


Comments size:  (156060,) 	 Labels size:  (156060,)


In [18]:
# NOT SURE NECESSARY FOR THIS IMPLEMENTATION
#toxic_labels = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]

id comment_text toxic severe_toxic obscene threat insult identity_hate



# Preprocessing data ( Stage 2 )

#### Preprocessing function

In [6]:
import nltk
# Downloading components of nltk (nltk.download() only needs to be executed once)
# nltk.download()

from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer 
import string 
import pandas as pd 
from nltk import pos_tag 
from nltk.stem import PorterStemmer

# Function to preprocess the text data
# NLTK helpers shared by every call to preprocessing(). They are created lazily on the
# first call so that merely defining the function does not require the NLTK corpora.
_PREPROCESSING_RESOURCES = {}


def _get_preprocessing_resources():
    """Return the (stop-word set, stemmer, lemmatizer) triple, building it on first use."""
    if not _PREPROCESSING_RESOURCES:
        # Build everything first so a failure leaves the cache empty rather than half-filled.
        resources = {
            'stopwords': frozenset(stopwords.words('english')),
            'stemmer': PorterStemmer(),
            'lemmatizer': WordNetLemmatizer(),
        }
        _PREPROCESSING_RESOURCES.update(resources)
    return (_PREPROCESSING_RESOURCES['stopwords'],
            _PREPROCESSING_RESOURCES['stemmer'],
            _PREPROCESSING_RESOURCES['lemmatizer'])


# Function to preprocess the text data
def preprocessing(text):
    """Normalise one piece of raw text into a space-separated string of processed tokens.

    Pipeline (same order as before, so the output is unchanged):
      1. replace every character of ``string.punctuation`` with a space and collapse whitespace;
      2. tokenize with NLTK (sentence tokenizer, then word tokenizer);
      3. lower-case each token;
      4. drop English stop words and tokens shorter than 3 characters;
      5. stem with the Porter stemmer;
      6. POS-tag the stems and lemmatize them with WordNet: verb tags are lemmatized
         as verbs ('v'), every other tag as a noun ('n').

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The processed tokens joined by single spaces (empty string if nothing survives).
    """
    stop_words, stemmer, lemmatizer = _get_preprocessing_resources()

    # Removing standard punctuation (each punctuation character becomes a space),
    # then collapsing runs of whitespace into single spaces.
    without_punct = "".join(" " if ch in string.punctuation else ch for ch in text)
    cleaned = " ".join(without_punct.split())

    # Tokenizing, lower-casing (to reduce duplicates), and filtering in one pass:
    # stop words carry little meaning, and neither do words shorter than 3 characters.
    tokens = []
    for sent in nltk.sent_tokenize(cleaned):
        for word in nltk.word_tokenize(sent):
            word = word.lower()
            if word not in stop_words and len(word) >= 3:
                tokens.append(word)

    # Using PorterStemmer to stem suffixes in words
    tokens = [stemmer.stem(word) for word in tokens]

    # Tagging the words. Verb tags:
    # VB (verb, base form), VBD (verb, past tense),
    # VBG (verb, present participle), VBN (verb, past participle),
    # VBP (verb, present tense, not third person singular),
    # VBZ (verb, present tense, third person singular)
    # NOTE(review): the tagger receives stems rather than the original words because
    # stemming runs first; the order is kept as-is so existing outputs stay identical.
    tagged_corpus = pos_tag(tokens)
    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')

    # Validating tags and lemmatizing accordingly
    def prat_lemmatize(token, tag):
        # Verbs are lemmatized as verbs; nouns (NN, NNP, NNPS, NNS) and any other
        # tag are lemmatized as nouns.
        pos = 'v' if tag in verb_tags else 'n'
        return lemmatizer.lemmatize(token, pos)

    # Reconstructing text
    pre_proc_text = " ".join(prat_lemmatize(token, tag) for token, tag in tagged_corpus)
    # Return reconstructed text
    return pre_proc_text


#### Preprocess and storage process

In [7]:
#from tqdm import tqdm
import pickle
import sys

# Increasing depth in recursion limit
#sys.setrecursionlimit(5000)

# Initialising array to store the preprocessed data
#preprocessed_data = []


# Pre-processing

#df['Phrase'] = df['Phrase'].apply(preprocessing)

#i = 0
#for line in tqdm(comments):
#    i = i+1
#    preprocessed_data.append(preprocessing(line))

In [9]:
# Saving the preprocessed data
#pickle_out = open("movie_preprocessed.pickle","wb")
#pickle.dump(df, pickle_out)
#pickle_out.close()

# Splitting in training and test set ( Stage 3 )

In [3]:
import pickle
from sklearn.model_selection import train_test_split


# Importing preprocessed text data.
# NOTE: no active cell creates "movie_preprocessed.pickle" (the cells that apply
# preprocessing() and call pickle.dump are commented out), so the file must already
# exist in the working directory.
# NOTE: pickle.load can execute arbitrary code; only load a file you produced yourself.
# The `with` block closes the file handle as soon as the data has been read.
with open("movie_preprocessed.pickle", "rb") as pickle_in:
    trainData = pickle.load(pickle_in)

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    trainData['Phrase'], trainData['Sentiment'], test_size=0.2, random_state=37)

#x_train, x_test, y_train, y_test = train_test_split(
#    df['Phrase'], df['Sentiment'], test_size=0.2, random_state=37)


print( "Training set size:\t", len(x_train), "\nTest set size:\t\t", len(x_test) )

In [4]:
#ys_train = toxic_labels.as_matrix()[0:train_size];
#ys_test = toxic_labels.as_matrix()[train_size + 1:len(toxic_label)];

# Computing TF-IDF features ( Stage 4 )

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer model
# Ignores terms that appear in fewer than 2 documents (min_df), uses word sequences of
# length 1 to 2 (unigrams and bigrams), drops English stop words, keeps at most the
# 4000 most frequent terms, strips accents and L2-normalises each row.
vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2),  stop_words='english',
                             max_features= 4000,strip_accents='unicode',  norm='l2')

#vectorizer = CountVectorizer(min_df=2, ngram_range=(1, 2),  stop_words='english',
#                             max_features= 4000,strip_accents='unicode')

# Keep the feature matrices sparse. Calling .todense() materialised every zero of a
# (n_samples x 4000) matrix and produced numpy.matrix objects, which newer scikit-learn
# releases reject. The estimators fitted below accept sparse input directly.
# The vocabulary is learned on the training split only and reused for the test split.
features_train = vectorizer.fit_transform(x_train)
features_test = vectorizer.transform(x_test)

# Classifying ( Stage 5 )

#### Multinomial Naive Bayes Classifier

In [19]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default hyper-parameters, fitted on the training features.
mnb_clf = MultinomialNB()
mnb_clf.fit(features_train, y_train)

# Predictions on both splits, used by the metrics cells further down.
mnb_predicted_train = mnb_clf.predict(features_train)
mnb_predicted_test = mnb_clf.predict(features_test)

#### SVM Classifier

In [20]:
from sklearn.svm import LinearSVC #,SVC

# c = penalty parameter
c = 1.0
# random_state pins the solver's random number generator so that re-running the cell
# reproduces the same model (37 is the seed already used for the train/test split).
svm_clf = LinearSVC(C = c, random_state = 37).fit(features_train, y_train)

# Predictions on both splits, used by the metrics cells further down.
svm_predicted_train = svm_clf.predict(features_train)
svm_predicted_test = svm_clf.predict(features_test)

#### XGBoost

In [21]:
# CANNOT CURRENTLY RUN THIS: XGBOOST IS NOT INSTALLED

#from xgboost.sklearn import XGBClassifier

# Extreme Gradient Boost
# md = Max depth
# ss = Subsample ratio of training instance
# cs = Subsample ratio of columns when constructing each tree

#md = 1
#ss = 0.8
#cs = 0.8
#clf = XGBClassifier( max_depth = md, subsample = ss,
#                        colsample_bytree = cs).fit(features_train, y_train)

#predicted_train = clf.predict(features_train)
#predicted_test = clf.predict(features_test)

#### Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression

# C = Inverse of regularization strength
c = 1.0
# random_state makes the fit repeatable for the solvers that shuffle the data
# (liblinear, sag, saga); 37 is the seed already used for the train/test split.
log_clf = LogisticRegression(C = c, random_state = 37).fit(features_train, y_train)

# Predictions on both splits, used by the metrics cells further down.
log_predicted_train = log_clf.predict(features_train)
log_predicted_test = log_clf.predict(features_test)

In [23]:
#### Random Forest
# NOTE(review): this cell only prints the literal 1. It looks like leftover scratch
# output, and the heading above is a code comment rather than a markdown header.
# TODO confirm it can be replaced by a markdown "Random Forest" heading.
print(1)

1


In [24]:
from sklearn.ensemble import RandomForestClassifier

# Random forests are stochastic (bootstrap samples and random feature subsets), so an
# unseeded model gives different metrics on every run. Pin the seed (37, the same value
# used for the train/test split) to make the reported numbers reproducible.
rnf_clf = RandomForestClassifier(random_state = 37).fit(features_train, y_train)

# Predictions on both splits, used by the metrics cells further down.
rnf_predicted_train = rnf_clf.predict(features_train)
rnf_predicted_test = rnf_clf.predict(features_test)

### Ensemble 

In [None]:
# Voting Ensemble for Classification
import pandas
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import ensemble

# create the sub models
# Seeds are fixed (37, same as the train/test split) so the ensemble is reproducible.
estimators = [
    ('logistic', LogisticRegression(random_state=37)),
    ('cart', DecisionTreeClassifier(random_state=37)),
    # NOTE(review): SVC() uses a non-linear kernel by default, and its training time grows
    # much faster than linearly with the number of samples. This is likely to be very slow
    # on the full training set; consider LinearSVC or a subsample.
    ('svm', SVC()),
]

# create the ensemble model
## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html
# Named voting_clf (like mnb_clf / svm_clf / log_clf / rnf_clf) instead of clf, because
# `clf` is the alias of the `classifiers` module imported at the top of the notebook and
# rebinding it here would silently shadow that module.
voting_clf = ensemble.VotingClassifier(estimators)
voting_clf.fit(features_train, y_train)

## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
# clf = ensemble.GradientBoostingClassifier(n_estimators=20, random_state=7, verbose=3)
# clf.fit(features_train, y_train)

## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
# kfold = model_selection.KFold(n_splits=10, random_state=7)
# model = ensemble.AdaBoostClassifier(n_estimators=10, random_state=7)
# results = model_selection.cross_val_score(model, features_train, y_train, cv=kfold)
# print(results.mean())

# Metrics ( Stage 6 )

In [25]:
from sklearn.metrics import classification_report,accuracy_score

# ---- Naive Bayes on the training split ----
# Confusion matrix: rows are the true labels, columns the predicted ones.
mnb_train_confusion = pd.crosstab(
    y_train, mnb_predicted_train, rownames=["Actual"], colnames=["Predicted"])
print("\nNaive Bayes - Train Confusion Matrix\n\n", mnb_train_confusion)

# Overall accuracy, rounded to three decimals.
mnb_train_accuracy = round(accuracy_score(y_train, mnb_predicted_train), 3)
print("\nNaive Bayes- Train accuracy", mnb_train_accuracy)

# Per-class precision / recall / f1.
mnb_train_report = classification_report(y_train, mnb_predicted_train)
print("\nNaive Bayes  - Train Classification Report\n", mnb_train_report)

print("------------------------------------------------------------")

# ---- Naive Bayes on the held-out test split ----
mnb_test_confusion = pd.crosstab(
    y_test, mnb_predicted_test, rownames=["Actual"], colnames=["Predicted"])
print("\nNaive Bayes - Test Confusion Matrix\n\n", mnb_test_confusion)

mnb_test_accuracy = round(accuracy_score(y_test, mnb_predicted_test), 3)
print("\nNaive Bayes- Test accuracy", mnb_test_accuracy)

mnb_test_report = classification_report(y_test, mnb_predicted_test)
print("\nNaive Bayes  - Test Classification Report\n", mnb_test_report)

print("------------------------------------------------------------")





Naive Bayes - Train Confusion Matrix

 Predicted    0     1      2     3    4
Actual                                
0          361  1891   3243   130    4
1          106  4555  16504   614   15
2           27  1476  59539  2501   55
3            6   259  17334  8567  225
4            0    22   3132  3555  727

Naive Bayes- Train accuracy 0.591

Naive Bayes  - Train Classification Report
              precision    recall  f1-score   support

          0       0.72      0.06      0.12      5629
          1       0.56      0.21      0.30     21794
          2       0.60      0.94      0.73     63598
          3       0.56      0.32      0.41     26391
          4       0.71      0.10      0.17      7436

avg / total       0.59      0.59      0.53    124848

------------------------------------------------------------

Naive Bayes - Test Confusion Matrix

 Predicted   0     1      2     3    4
Actual                               
0          66   475    862    38    2
1          31  1020

In [26]:
# ---- SVM on the training split ----
# Confusion matrix: rows are the true labels, columns the predicted ones.
svm_train_confusion = pd.crosstab(
    y_train, svm_predicted_train, rownames=["Actual"], colnames=["Predicted"])
print("\nSVM - Train Confusion Matrix\n\n", svm_train_confusion)

# Overall accuracy, rounded to three decimals.
svm_train_accuracy = round(accuracy_score(y_train, svm_predicted_train), 3)
print("\nSVM- Train accuracy", svm_train_accuracy)

# Per-class precision / recall / f1.
svm_train_report = classification_report(y_train, svm_predicted_train)
print("\nSVM  - Train Classification Report\n", svm_train_report)

print("------------------------------------------------------------")

# ---- SVM on the held-out test split ----
svm_test_confusion = pd.crosstab(
    y_test, svm_predicted_test, rownames=["Actual"], colnames=["Predicted"])
print("\nSVM - Test Confusion Matrix\n\n", svm_test_confusion)

svm_test_accuracy = round(accuracy_score(y_test, svm_predicted_test), 3)
print("\nSVM- Test accuracy", svm_test_accuracy)

svm_test_report = classification_report(y_test, svm_predicted_test)
print("\nSVM  - Test Classification Report\n", svm_test_report)

print("------------------------------------------------------------")


SVM - Train Confusion Matrix

 Predicted     0     1      2      3     4
Actual                                   
0          1412  2429   1532    243    13
1           596  8391  11435   1311    61
2           205  3287  55795   4082   229
3            49   819  11903  12757   863
4             2    94   1256   3892  2192

SVM- Train accuracy 0.645

SVM  - Train Classification Report
              precision    recall  f1-score   support

          0       0.62      0.25      0.36      5629
          1       0.56      0.39      0.46     21794
          2       0.68      0.88      0.77     63598
          3       0.57      0.48      0.52     26391
          4       0.65      0.29      0.41      7436

avg / total       0.63      0.65      0.62    124848

------------------------------------------------------------

SVM - Test Confusion Matrix

 Predicted    0     1      2     3    4
Actual                                
0          253   670    436    76    8
1          203  1818   3060

In [27]:
# ---- Logistic Regression on the training split ----
# Confusion matrix: rows are the true labels, columns the predicted ones.
log_train_confusion = pd.crosstab(
    y_train, log_predicted_train, rownames=["Actual"], colnames=["Predicted"])
print("\nLogistic Regression - Train Confusion Matrix\n\n", log_train_confusion)

# Overall accuracy, rounded to three decimals.
log_train_accuracy = round(accuracy_score(y_train, log_predicted_train), 3)
print("\nLogistic Regression- Train accuracy", log_train_accuracy)

# Per-class precision / recall / f1.
log_train_report = classification_report(y_train, log_predicted_train)
print("\nLogistic Regression  - Train Classification Report\n", log_train_report)

print("------------------------------------------------------------")

# ---- Logistic Regression on the held-out test split ----
log_test_confusion = pd.crosstab(
    y_test, log_predicted_test, rownames=["Actual"], colnames=["Predicted"])
print("\nLogistic Regression - Test Confusion Matrix\n\n", log_test_confusion)

log_test_accuracy = round(accuracy_score(y_test, log_predicted_test), 3)
print("\nLogistic Regression- Test accuracy", log_test_accuracy)

log_test_report = classification_report(y_test, log_predicted_test)
print("\nLogistic Regression  - Test Classification Report\n", log_test_report)

print("------------------------------------------------------------")


Logistic Regression - Train Confusion Matrix

 Predicted    0     1      2      3     4
Actual                                  
0          980  2516   1868    253    12
1          378  7432  12670   1273    41
2          135  2609  57001   3672   181
3           36   681  12896  12179   599
4            2    88   1547   4120  1679

Logistic Regression- Train accuracy 0.635

Logistic Regression  - Train Classification Report
              precision    recall  f1-score   support

          0       0.64      0.17      0.27      5629
          1       0.56      0.34      0.42     21794
          2       0.66      0.90      0.76     63598
          3       0.57      0.46      0.51     26391
          4       0.67      0.23      0.34      7436

avg / total       0.62      0.63      0.60    124848

------------------------------------------------------------

Logistic Regression - Test Confusion Matrix

 Predicted    0     1      2     3    4
Actual                                
0        

In [28]:
# ---- Random Forest on the training split ----
# Confusion matrix: rows are the true labels, columns the predicted ones.
rnf_train_confusion = pd.crosstab(
    y_train, rnf_predicted_train, rownames=["Actual"], colnames=["Predicted"])
print("\nRandom Forest - Train Confusion Matrix\n\n", rnf_train_confusion)

# Overall accuracy, rounded to three decimals.
rnf_train_accuracy = round(accuracy_score(y_train, rnf_predicted_train), 3)
print("\nRandom Forest- Train accuracy", rnf_train_accuracy)

# Per-class precision / recall / f1.
rnf_train_report = classification_report(y_train, rnf_predicted_train)
print("\nRandom Forest - Train Classification Report\n", rnf_train_report)

print("------------------------------------------------------------")

# ---- Random Forest on the held-out test split ----
rnf_test_confusion = pd.crosstab(
    y_test, rnf_predicted_test, rownames=["Actual"], colnames=["Predicted"])
print("\nRandom Forest - Test Confusion Matrix\n\n", rnf_test_confusion)

rnf_test_accuracy = round(accuracy_score(y_test, rnf_predicted_test), 3)
print("\nRandom Forest - Test accuracy", rnf_test_accuracy)

rnf_test_report = classification_report(y_test, rnf_predicted_test)
print("\nRandom Forest - Test Classification Report\n", rnf_test_report)

print("------------------------------------------------------------")


Random Forest - Train Confusion Matrix

 Predicted     0      1      2      3     4
Actual                                    
0          3804   1182    582     58     3
1           836  14706   5845    384    23
2           266   2558  57733   2861   180
3            33    346   6253  18700  1059
4             4     38    462   1739  5193

Random Forest- Train accuracy 0.802

Random Forest - Train Classification Report
              precision    recall  f1-score   support

          0       0.77      0.68      0.72      5629
          1       0.78      0.67      0.72     21794
          2       0.81      0.91      0.86     63598
          3       0.79      0.71      0.75     26391
          4       0.80      0.70      0.75      7436

avg / total       0.80      0.80      0.80    124848

------------------------------------------------------------

Random Forest - Test Confusion Matrix

 Predicted    0     1      2     3    4
Actual                                
0          519   612

In [29]:
# Getting feature names from vectorizer
feature_names = vectorizer.get_feature_names()

# Getting weights assigned to the features (it works only with linear kernels)
# Empirical log probability of features given a class, P(x_i|y).
coefs = mnb_clf.coef_

# Smoothed empirical log probability for each class.
intercept = mnb_clf.intercept_

# Sorted coefs
coefs_with_fns = sorted(zip(mnb_clf.coef_[0], feature_names))

print ("\n\nTop 10 features - First ten & Last ten\n")
n = 10
top_n_coefs = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
for (coef_1, fn_1), (coef_2, fn_2) in top_n_coefs:
    # %-15s is for padding left
    print('|\t%.4f\t%-16s\t\t|\t%.4f\t%-16s|' % (coef_1, fn_1, coef_2, fn_2))



Top 10 features - First ten & Last ten

|	-9.6672	1960            		|	-4.4556	movi            |
|	-9.6672	1999            		|	-4.5999	bad             |
|	-9.6672	20th            		|	-4.9988	film            |
|	-9.6672	20th centuri    		|	-5.4037	like            |
|	-9.6672	50              		|	-5.4243	minut           |
|	-9.6672	60              		|	-5.4630	make            |
|	-9.6672	acclaim         		|	-5.6182	dull            |
|	-9.6672	accomplish      		|	-5.6322	worst           |
|	-9.6672	accumul         		|	-5.7489	bore            |
|	-9.6672	ach             		|	-5.7684	charact         |


In [30]:
# Count the test samples whose label is 2 (presumably to gauge a majority-class baseline).
(y_test == 2).sum()
#15984/31212


15984