# Reding data ( Stage 1 )

In [4]:
# Reading data
import numpy as np
import pandas as pd
import scipy.io
import classifiers as clf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import os

In [5]:
current_dir = os.getcwd()
df1 = pd.read_csv(current_dir + '/data/tweet_sentiment/train1',encoding = "ISO-8859-1")
df2 = pd.read_csv(current_dir + '/data/tweet_sentiment/train2',encoding = "ISO-8859-1")
frames = [df1, df2]
df = pd.concat(frames)
#print(df.info())
print(df.head())

# Spliting data to obtain comments and labels
# Technically should also include summary at this point but ditched to save time

#comments = df['Text']
#score_label = df['Score']
print('\n\nComments size: ', df['5'].shape, "\t", "Labels size: ", df['0'].shape)

   Unnamed: 0                                                  5  0
0      172348  @KMC1121 lol... im going to log off for about ...  0
1      802252  @MissGC : Hahaha. Unfortunately not. My lesbia...  4
2      108054  @vissy i know it does  super cute!! get an iph...  0
3      406610  iss gonna redo her resume and apply at cineple...  0
4     1181642    @amypalko wow, exciting. Nice hard bound copy.   4


Comments size:  (1360000,) 	 Labels size:  (1360000,)


In [18]:
# NOT SURE NECESSARY FOR THIS IMPLEMENTATION
#toxic_labels = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]

id comment_text toxic severe_toxic obscene threat insult identity_hate



# Preprocessing data ( Stage 2 )

#### Preprocessing function

In [5]:
import nltk
# Downloading componnents of nltk (execute just one time nltk.download())
# nltk.download()

from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer 
import string 
import pandas as pd 
from nltk import pos_tag 
from nltk.stem import PorterStemmer

# Function to preprocess the text data
def preprocessing(text):
    # Removing standar punctuation (replacing with blank "" spaces)
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())
    
    # Tokenizing the text into words (based on white spaces to build the list)
    tokens = [word for sent in nltk.sent_tokenize(text2) for word in nltk.word_tokenize(sent)]
    
    # Changing to lower case every word in the list to reduce duplicates
    tokens = [word.lower() for word in tokens]
    
    # Removing english stop words from the list (stop words do not carry much weight in understanding the sentence)
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    
    # Removing words which length is lower than 3 (do not apport much of a meaning)
    tokens = [word for word in tokens if len(word) >= 3]
    
    # Using PorterStemmer to stem suffixes in words
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    
    # Tagging the words
    # “NN (noun, common, singular), NNP (noun, proper, singular), 
    # NNPS (noun, proper, plural), NNS (noun, common, plural), 
    # VB (verb, base form), VBD (verb, past tense), 
    # VBG (verb, present participle), VBN (verb, past participle), 
    # VBP (verb, present tense, not third person singular), 
    # VBZ (verb, present tense, third person singular)”
    tagged_corpus = pos_tag(tokens)
    Noun_tags = ['NN','NNP','NNPS','NNS']
    Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']
    
    # Lemmatizing model
    lemmatizer = WordNetLemmatizer()
    
    # Validating tags and lemmatizing accordingly
    def prat_lemmatize(token,tag):
        # Nouns
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token,'n')
        # Verbs
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token,'v')
        # Any other
        else:
            return lemmatizer.lemmatize(token,'n')
    
    # Reconstructing text
    pre_proc_text =  " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus])             
    # Return reconstructed text
    return pre_proc_text


#### Preprocess and storage process

In [None]:
#from tqdm import tqdm
import pickle
import sys

# Increasing depth in recursion limit
#sys.setrecursionlimit(5000)

# Initialising array to storage preprocessed data
#preprocessed_data = []


# Pre-processing
df['5'] = df['5'].apply(preprocessing)

#i = 0
#for line in tqdm(comments):
#    i = i+1
#    preprocessed_data.append(preprocessing(line))

In [16]:
# Saving the preprocessed data
pickle_out = open("tweet_preprocessed.pickle","wb")
pickle.dump(df, pickle_out)
pickle_out.close()

# Splitting in training and test set ( Stage 3 )

In [6]:
import pickle
from sklearn.model_selection import train_test_split


# Importing preprocessed text data
#pickle_in = open("tweet_preprocessed.pickle","rb")
#trainData = pickle.load(pickle_in)

#x_train, x_test, y_train, y_test = train_test_split(
#    trainData['5'], trainData['0'], test_size=0.2, random_state=37)

x_train, x_test, y_train, y_test = train_test_split(
    df['5'], df['0'], test_size=0.2, random_state=37)

# Splitting into train and test set (75% train - 25% set)
#train_size = int( round( len(preprocessed_data) * 0.75 ) )

# Filling the training set
#x_train = np.array([''.join(rec) for rec in preprocessed_data[0 : train_size]])
#y_train = np.array([rec for rec in toxic_label[0 : train_size]])

# Filling the test set
#x_test = np.array([''.join(rec) for rec in preprocessed_data[train_size + 1 : len(preprocessed_data)]])
#y_test = np.array([rec for rec in toxic_label[train_size + 1 : len(toxic_label)]])

print( "Training set size:\t", len(x_train), "\nTest set size:\t\t", len(x_test) )

Training set size:	 1088000 
Test set size:		 272000


In [32]:
#ys_train = toxic_labels.as_matrix()[0:train_size];
#ys_test = toxic_labels.as_matrix()[train_size + 1:len(toxic_label)];

(119678, 6)
(39892, 6)


# Computing TF-IDF features ( Stage 4 )

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer model 
# Ignoring terms with lower frequency than 2, range of sequences of words from 1 to 2,
# most frequent 4000 words and normalising with l2
#vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2),  stop_words='english', 
#                             max_features= 4000,strip_accents='unicode',  norm='l2')

vectorizer = CountVectorizer(min_df=2, ngram_range=(1, 2),  stop_words='english', 
                             max_features= 4000,strip_accents='unicode')

features_train = vectorizer.fit_transform(x_train).todense()
features_test = vectorizer.transform(x_test).todense()

MemoryError: 

# Classifying ( Stage 5 )

#### Multinomial Naive Bayes Classifier

In [24]:
from sklearn.naive_bayes import MultinomialNB

mnb_clf = MultinomialNB().fit(features_train, y_train)

mnb_predicted_train = mnb_clf.predict(features_train)
mnb_predicted_test = mnb_clf.predict(features_test)

#### SVM Classifier

In [26]:
from sklearn.svm import LinearSVC #,SVC

# c = penalty parameter
c = 1.0
svm_clf = LinearSVC(C = c).fit(features_train, y_train)

svm_predicted_train = svm_clf.predict(features_train)
svm_predicted_test = svm_clf.predict(features_test)

#### XGBoost

In [22]:
# CANOT CURRENTLY RUN THIS S NOT INSTALLED

#from xgboost.sklearn import XGBClassifier

# Extreme Gradient Boost
# md = Max depth
# ss = Subsample ratio of training instance
# cs = Subsample ratio of columns when constructing each tree

#md = 1
#ss = 0.8
#cs = 0.8
#clf = XGBClassifier( max_depth = md, subsample = ss,
#                        colsample_bytree = cs).fit(features_train, y_train)

#predicted_train = clf.predict(features_train)
#predicted_test = clf.predict(features_test)

ModuleNotFoundError: No module named 'xgboost'

#### Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression

# C = Inverse of regularization strength
c = 1.0
log_clf = LogisticRegression(C = c).fit(features_train, y_train)

log_predicted_train = log_clf.predict(features_train)
log_predicted_test = log_clf.predict(features_test)

### Ensemble

In [None]:
# Voting Ensemble for Classification
import pandas
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import ensemble

# create the sub models
estimators = []
estimators.append(('logistic', LogisticRegression()))
estimators.append(('cart', DecisionTreeClassifier()))
estimators.append(('svm', SVC()))

## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html
# clf = ensemble.VotingClassifier(estimators)

## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
# clf = ensemble.GradientBoostingClassifier(n_estimators=20, random_state=7, verbose=3)
# clf.fit(features_train, y_train)

## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
kfold = model_selection.KFold(n_splits=10, random_state=7)
model = ensemble.AdaBoostClassifier(n_estimators=10, random_state=7)
results = model_selection.cross_val_score(model, features_train, y_train, cv=kfold)
print(results.mean())

# Metrics ( Stage 6 )

In [28]:
from sklearn.metrics import classification_report,accuracy_score

# Training confusion matrix
print ("\nNaive Bayes - Train Confusion Matrix\n\n",
       pd.crosstab(y_train, mnb_predicted_train, rownames = ["Actual"], colnames = ["Predicted"]))
# Training accuracy
print ("\nNaive Bayes- Train accuracy",
       round(accuracy_score(y_train, mnb_predicted_train),3))
# Training report
print ("\nNaive Bayes  - Train Classification Report\n",
       classification_report(y_train, mnb_predicted_train))

print("------------------------------------------------------------")

# Test confusion matrix
print ("\nNaive Bayes - Test Confusion Matrix\n\n",
       pd.crosstab(y_test,mnb_predicted_test,rownames = ["Actual"], colnames = ["Predicted"]))  
# Test accuracy
print ("\nNaive Bayes- Test accuracy",
       round(accuracy_score(y_test,mnb_predicted_test),3))
# Test report
print ("\nNaive Bayes  - Test Classification Report\n",
       classification_report(y_test,mnb_predicted_test))

print("------------------------------------------------------------")





Naive Bayes - Train Confusion Matrix

 Predicted    1   2   3    4      5
Actual                            
1          921   0   7   14   1968
2          145  11   6   41   1490
3           79   0  47   75   2239
4           27   0   6  238   4238
5           53   0   4   41  20350

Naive Bayes- Train accuracy 0.674

Naive Bayes  - Train Classification Report
              precision    recall  f1-score   support

          1       0.75      0.32      0.45      2910
          2       1.00      0.01      0.01      1693
          3       0.67      0.02      0.04      2440
          4       0.58      0.05      0.10      4509
          5       0.67      1.00      0.80     20448

avg / total       0.68      0.67      0.57     32000

------------------------------------------------------------

Naive Bayes - Test Confusion Matrix

 Predicted    1  2  3   4     5
Actual                        
1          192  1  0   4   543
2           27  1  3   9   386
3           16  0  2  22   570
4     

In [29]:
# Training confusion matrix
print ("\nSVM - Train Confusion Matrix\n\n",
       pd.crosstab(y_train, svm_predicted_train, rownames = ["Actual"], colnames = ["Predicted"]))
# Training accuracy
print ("\nSVM- Train accuracy",
       round(accuracy_score(y_train, svm_predicted_train),3))
# Training report
print ("\nSVM  - Train Classification Report\n",
       classification_report(y_train, svm_predicted_train))

print("------------------------------------------------------------")

# Test confusion matrix
print ("\nSVM - Test Confusion Matrix\n\n",
       pd.crosstab(y_test,svm_predicted_test,rownames = ["Actual"], colnames = ["Predicted"]))  
# Test accuracy
print ("\nSVM- Test accuracy",
       round(accuracy_score(y_test,svm_predicted_test),3))
# Test report
print ("\nSVM  - Test Classification Report\n",
       classification_report(y_test,svm_predicted_test))

print("------------------------------------------------------------")


SVM - Train Confusion Matrix

 Predicted     1    2     3     4      5
Actual                                 
1          2233   39    63    48    527
2           247  713   115   118    500
3           154   61  1057   255    913
4            74   35   140  1526   2734
5           135   30   102   376  19805

SVM- Train accuracy 0.792

SVM  - Train Classification Report
              precision    recall  f1-score   support

          1       0.79      0.77      0.78      2910
          2       0.81      0.42      0.55      1693
          3       0.72      0.43      0.54      2440
          4       0.66      0.34      0.45      4509
          5       0.81      0.97      0.88     20448

avg / total       0.78      0.79      0.77     32000

------------------------------------------------------------

SVM - Test Confusion Matrix

 Predicted    1   2    3    4     5
Actual                            
1          408  32   58   37   205
2           90  57   63   41   175
3           65  39

In [30]:
# Training confusion matrix
print ("\nLogistic Regression - Train Confusion Matrix\n\n",
       pd.crosstab(y_train, mnb_predicted_train, rownames = ["Actual"], colnames = ["Predicted"]))
# Training accuracy
print ("\nLogistic Regression- Train accuracy",
       round(accuracy_score(y_train, mnb_predicted_train),3))
# Training report
print ("\nLogistic Regression  - Train Classification Report\n",
       classification_report(y_train, mnb_predicted_train))

print("------------------------------------------------------------")

# Test confusion matrix
print ("\nLogistic Regression - Test Confusion Matrix\n\n",
       pd.crosstab(y_test,log_predicted_test,rownames = ["Actual"], colnames = ["Predicted"]))  
# Test accuracy
print ("\nLogistic Regression- Test accuracy",
       round(accuracy_score(y_test,log_predicted_test),3))
# Test report
print ("\nLogistic Regression  - Test Classification Report\n",
       classification_report(y_test,log_predicted_test))

print("------------------------------------------------------------")


Logistic Regression - Train Confusion Matrix

 Predicted    1   2   3    4      5
Actual                            
1          921   0   7   14   1968
2          145  11   6   41   1490
3           79   0  47   75   2239
4           27   0   6  238   4238
5           53   0   4   41  20350

Logistic Regression- Train accuracy 0.674

Logistic Regression  - Train Classification Report
              precision    recall  f1-score   support

          1       0.75      0.32      0.45      2910
          2       1.00      0.01      0.01      1693
          3       0.67      0.02      0.04      2440
          4       0.58      0.05      0.10      4509
          5       0.67      1.00      0.80     20448

avg / total       0.68      0.67      0.57     32000

------------------------------------------------------------

Logistic Regression - Test Confusion Matrix

 Predicted    1   2   3    4     5
Actual                           
1          370   6  37   20   307
2           86  22  35   28

In [31]:
# Getting feature names from vectorizer
feature_names = vectorizer.get_feature_names()

# Getting weights assigned to the features (it works only with linear kernels)
# Empirical log probability of features given a class, P(x_i|y).
coefs = mnb_clf.coef_

# Smoothed empirical log probability for each class.
intercept = mnb_clf.intercept_

# Sorted coefs
coefs_with_fns = sorted(zip(mnb_clf.coef_[0], feature_names))

print ("\n\nTop 10 features - First ten & Last ten\n")
n = 10
top_n_coefs = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
for (coef_1, fn_1), (coef_2, fn_2) in top_n_coefs:
    # %-15s is for padding left
    print('|\t%.4f\t%-16s\t\t|\t%.4f\t%-16s|' % (coef_1, fn_1, coef_2, fn_2))



Top 10 features - First ten & Last ten

|	-9.7700	140             		|	-5.1244	tast            |
|	-9.7700	abl buy         		|	-5.1302	product         |
|	-9.7700	absolut best    		|	-5.2070	like            |
|	-9.7700	afternoon snack 		|	-5.2512	buy             |
|	-9.7700	agav nectar     		|	-5.5338	coffe           |
|	-9.7700	alway love      		|	-5.6061	order           |
|	-9.7700	amazon best     		|	-5.6212	food            |
|	-9.7700	auto ship       		|	-5.6276	tri             |
|	-9.7700	best cup        		|	-5.6356	box             |
|	-9.7700	best deal       		|	-5.6976	dog             |


In [3]:
y_test.head()

NameError: name 'y_test' is not defined