# Andreas Pappas 
# ID: 1115201500201 

## Exemptive Data Mining Project for spring-2020

First and foremost we'll load the libraries needed: 

In [134]:
########################################
## import packages
########################################
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd
import operator
import nltk
import pickle
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords, wordnet
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
#from keras.preprocessing.text import Tokenizer
#from keras.preprocessing.sequence import pad_sequences

import sys
import unidecode

import sklearn
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split as cv
from sklearn import svm # Support Vector Machine
from sklearn.ensemble import RandomForestClassifier # Random Forest
from sklearn.naive_bayes import MultinomialNB # Naive Bayes

Now we'll load the dataset we'll work with:

In [2]:
path = 'data/'
train_data_file = path + 'train.csv'
test_data_file = path + 'impermium_verification_set.csv'
eval_data_file = path + 'impermium_verification_labels.csv'

# Now that we got the paths, we'll load the data into pandas dataframe:

train_data = pd.read_csv(train_data_file)
test_data = pd.read_csv(test_data_file)
eval_data = pd.read_csv(eval_data_file)

In [3]:
train_data.head()

Unnamed: 0,Insult,Date,Comment
0,1,20120618192155Z,"""You fuck your dad."""
1,0,20120528192215Z,"""i really don't understand your point.\xa0 It ..."
2,0,,"""A\\xc2\\xa0majority of Canadians can and has ..."
3,0,,"""listen if you dont wanna get married to a man..."
4,0,20120619094753Z,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd..."


In [9]:
test_data.head()

Unnamed: 0,id,Insult,Date,Comment,Usage
0,1,,20120603163526Z,"""like this if you are a tribe fan""",PrivateTest
1,2,,20120531215447Z,"""you're idiot.......................""",PrivateTest
2,3,,20120823164228Z,"""I am a woman Babs, and the only ""war on women...",PrivateTest
3,4,,20120826010752Z,"""WOW & YOU BENEFITTED SO MANY WINS THIS YEAR F...",PrivateTest
4,5,,20120602223825Z,"""haha green me red you now loser whos winning ...",PrivateTest


In [10]:
eval_data.head()

Unnamed: 0,id,Insult,Date,Comment,Usage
0,1,0,20120603163526Z,"""like this if you are a tribe fan""",PrivateTest
1,2,1,20120531215447Z,"""you're idiot.......................""",PrivateTest
2,3,1,20120823164228Z,"""I am a woman Babs, and the only ""war on women...",PrivateTest
3,4,1,20120826010752Z,"""WOW & YOU BENEFITTED SO MANY WINS THIS YEAR F...",PrivateTest
4,5,1,20120602223825Z,"""haha green me red you now loser whos winning ...",PrivateTest


# Preprocessing & Cleaning of data 

*Now that we loaded our data we're going to clean them, so they're in a more readable form for our algorithms, and so we get a better precision and understanding of the data* 

In [4]:
########################################
# Load the cleaned words
########################################

cl_path = 'cleanwords.txt'
clean_word_dict = {}
with open(cl_path, 'r', encoding='utf-8') as cl:
    for line in cl:
        line = line.strip('\n')
        typo, correct = line.split(',')
        clean_word_dict[typo] = correct

In [5]:
########################################
## process texts in datasets
########################################
print('Processing text dataset')
# Regex to remove all Non-Alpha Numeric and space
special_character_removal=re.compile(r'[^?!.,:a-z\d ]',re.IGNORECASE)

# regex to replace all numerics
replace_numbers=re.compile(r'\d+',re.IGNORECASE)
word_count_dict = defaultdict(int)

def clean_text(text, remove_stopwords=True, stem_words=True, count_null_words=True, clean_wiki_tokens=True):
    # Clean the text, with the option to remove stopwords and to stem words.
    # dirty words
    #non-ASCII characters to their closest ASCII equivalent automatically.
    #text = unidecode.unidecode(text) 

    text = text.lower() #lower all text
    text = re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", "", text)
    text = re.sub(r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}", "", text)
    text = re.sub(r"\\xa|\\xc|\\|\\xe|\\u", " ", text)
    # remove @mentions
    text = re.sub('@[A-Za-z0-9]+', '', text)
    
    if clean_wiki_tokens:
        # Drop the image
        text = re.sub(r"image:[a-zA-Z0-9]*\.jpg", " ", text)
        text = re.sub(r"image:[a-zA-Z0-9]*\.png", " ", text)
        text = re.sub(r"image:[a-zA-Z0-9]*\.gif", " ", text)
        text = re.sub(r"image:[a-zA-Z0-9]*\.bmp", " ", text)

        # Drop css
        text = re.sub(r"#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3})", " ",text)
        text = re.sub(r"\{\|[^\}]*\|\}", " ", text)
        
        # Clean templates
        text = re.sub(r"\[?\[user:.*\]", " ", text)
        text = re.sub(r"\[?\[user:.*\|", " ", text)        
        text = re.sub(r"\[?\[wikipedia:.*\]", " ", text)
        text = re.sub(r"\[?\[wikipedia:.*\|", " ", text)
        text = re.sub(r"\[?\[special:.*\]", " ", text)
        text = re.sub(r"\[?\[special:.*\|", " ", text)
        text = re.sub(r"\[?\[category:.*\]", " ", text)
        text = re.sub(r"\[?\[category:.*\|", " ", text)
    
    for typo, correct in clean_word_dict.items():
        text = re.sub(typo, " " + correct + " ", text)

    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\?", " ? ", text)
    text = re.sub(r"\!", " ! ", text)
    text = re.sub(r"\"", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = replace_numbers.sub(' ', text)
    #text = special_character_removal.sub('',text)

    if count_null_words:
        text = text.split()
        for t in text:
            word_count_dict[t] += 1
        text = " ".join(text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    return (text)

list_sentences_train = train_data["Comment"].fillna("noComment").values
list_sentences_test = test_data["Comment"].fillna("noComment").values
list_sentences_eval = eval_data["Comment"].fillna("noComment").values

train_comments = [clean_text(text) for text in list_sentences_train]
test_comments = [clean_text(text) for text in list_sentences_test]
eval_comments = [clean_text(text) for text in list_sentences_eval]

print("Cleaned.")

Processing text dataset
Cleaned.


**Now that we cleaned our data, we'll create new files to seperate uncleaned data from cleaned data**

In [6]:
train_data['Comment'] = train_comments
test_data['Comment'] = test_comments
eval_data['Comment'] = eval_comments

#save the cleaned data
train_data.to_csv('data/cleaned_train.csv', index = False)
test_data.to_csv('data/cleaned_test.csv', index = False)
eval_data.to_csv('data/cleaned_eval.csv', index = False)

In [7]:
#load the cleaned data into pandas dataframe:

cl_train_df = pd.read_csv('data/cleaned_train.csv')
cl_test_df = pd.read_csv('data/cleaned_test.csv')
cl_eval_df = pd.read_csv('data/cleaned_eval.csv')

In [8]:
cl_train_df.head()

Unnamed: 0,Insult,Date,Comment
0,1,20120618192155Z,you fuck your dad
1,0,20120528192215Z,i realli do not understand your point it seem ...
2,0,,a major of canadian can and has been wrong bef...
3,0,,listen if you dont wanna get marri to a man or...
4,0,20120619094753Z,c xe c b u ea n xu u ed ng u u b u eddng bi u ...


In [42]:
cl_test_df.head()

Unnamed: 0,id,Insult,Date,Comment,Usage
0,1,,20120603163526Z,like this if you are a tribe fan,PrivateTest
1,2,,20120531215447Z,you are idiot,PrivateTest
2,3,,20120823164228Z,i am a woman babs and the only war on women i ...,PrivateTest
3,4,,20120826010752Z,wow & you benefitted so many wins this year fr...,PrivateTest
4,5,,20120602223825Z,haha green me red you now loser whos winning n...,PrivateTest


In [43]:
cl_eval_df.head()

Unnamed: 0,id,Insult,Date,Comment,Usage
0,1,0,20120603163526Z,like this if you are a tribe fan,PrivateTest
1,2,1,20120531215447Z,you are idiot,PrivateTest
2,3,1,20120823164228Z,i am a woman babs and the only war on women i ...,PrivateTest
3,4,1,20120826010752Z,wow & you benefitted so many wins this year fr...,PrivateTest
4,5,1,20120602223825Z,haha green me red you now loser whos winning n...,PrivateTest


**So as we can see the Comments are perfectly cleaned now, compared to the firstly given csv's**

In [9]:
X_train, X_test, y_train, y_test = train_test_split(cl_train_df['Comment'], 
                                                    cl_train_df['Insult'], 
                                                    test_size=0.20, 
                                                    random_state=8)

# Classification with the classic NaiveBayes Algorithm

## Comments to word vectors using CountVectorizer

In [49]:
CountVectorizer().get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [52]:
# Parameter election
ngram_range = (1,1) #we will use bigrams later on the improvement part
min_df = 10
max_df = 1.
max_features = 222

In [53]:
bow = CountVectorizer(encoding='utf-8',
                      ngram_range=ngram_range,
                     stop_words=None,
                     max_df=max_df,
                     min_df=min_df,
                     max_features=max_features)
        
bow_train = bow.fit_transform(X_train.astype('U')).toarray()
print(bow_train.shape)
bow_test = bow.fit_transform(X_test.astype('U')).toarray()
print(bow_test.shape)

(3157, 222)
(790, 222)


## Basic NaiveBayes 

So at this point we will not tune any hyperparameter and we will pass the word vectors produced by the CountVectorizer as asked in the project definition. 
**Later on we will improve the NaiveBayes and compare the results for each improvement**

In [90]:
mnbc = MultinomialNB(alpha=0) #alpha = 0 means we will not use laplace smoothing in this step
mnbc

MultinomialNB(alpha=0, class_prior=None, fit_prior=True)

In [91]:
mnbc.fit(bow_train, Y_train)

  'setting alpha = %.1e' % _ALPHA_MIN)


MultinomialNB(alpha=0, class_prior=None, fit_prior=True)

In [92]:
mnbc_pred = mnbc.predict(bow_test)

In [93]:
# Training accuracy
print("The training accuracy is: ")
print(accuracy_score(Y_train, mnbc.predict(bow_train)))

The training accuracy is: 
0.7700348432055749


In [95]:
# Test accuracy
print("The test accuracy is: ")
print(accuracy_score(Y_test, mnbc_pred))

The test accuracy is: 
0.649367088607595


In [96]:
# Classification report
print("Classification report")
print(classification_report(Y_test,mnbc_pred))

Classification report
              precision    recall  f1-score   support

           0       0.84      0.65      0.73       585
           1       0.39      0.64      0.49       205

    accuracy                           0.65       790
   macro avg       0.62      0.65      0.61       790
weighted avg       0.72      0.65      0.67       790



In [98]:
d = {
     'Model': 'Basic Naïve Bayes',
     'Training Set Accuracy': accuracy_score(Y_train, mnbc.predict(bow_train)),
     'Test Set Accuracy': accuracy_score(Y_test, mnbc_pred)
}

df_models_mnbc = pd.DataFrame(d, index=[0])
df_models_mnbc

Unnamed: 0,Model,Training Set Accuracy,Test Set Accuracy
0,Basic Naïve Bayes,0.770035,0.649367


In [307]:
with open('df_models_mnbc.pickle', 'wb') as output: 
    pickle.dump(df_models_mnbc, output)

**Now that we got the accuracy and the F1-Score of the basic NaiveBayes, we will improve it by doing lemmatization, removing stop words, using bigrams, and using laplace Smoothing to check if we're going to get any better results!**

## 1. Lemmatization

In [10]:
# Saving the lemmatizer into an object
wordnet_lemmatizer = WordNetLemmatizer()

In [11]:
# IN order to lemmatize, we have to iterate through every word:

nrows = len(train_data)
lemmatized_text_list = []

for row in range(0, nrows):
    
    # Create an empty list containing lemmatized words
    lemmatized_list = []
    
    # Save the text and its words into an object
    text = train_data.loc[row]['Comment']
    text_words = text.split(" ")

    # Iterate through every word to lemmatize
    for word in text_words:
        lemmatized_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
        
    # Join the list
    lemmatized_text = " ".join(lemmatized_list)
    
    # Append to the list containing the texts
    lemmatized_text_list.append(lemmatized_text)

## 2. Stop Words

In [14]:
# Downloading the stop words list
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/andrewpap22/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
# Loading the stop words in english
stop_words = list(stopwords.words('english'))
stop_words[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [37]:
#To remove the stop words, we'll handle a regular expression only detecting whole words, as seen in the following example:

example = "me eating a meal"
word = "me"

# The regular expression is:
regex = r"\b" + word + r"\b"  # we need to build it like that to work properly

re.sub(regex, "StopWord", example)

'StopWord eating a meal'

In [38]:
# We can now loop through all the stop words:
for stop_word in stop_words:

    regex_stopword = r"\b" + stop_word + r"\b"
    train_data['Comment'].str.replace(regex_stopword, '')

In [39]:
train_data.head()

Unnamed: 0,Insult,Date,Comment
0,1,20120618192155Z,"[fuck, dad]"
1,0,20120528192215Z,"[realli, understand, point, seem, mix, appl, o..."
2,0,,"[major, canadian, wrong, befor, n, nunless, su..."
3,0,,"[listen, dont, wan, na, get, marri, man, woman..."
4,0,20120619094753Z,"[c, xe, c, b, u, ea, n, xu, u, ed, ng, u, u, b..."


**Now that we have done lemmatization and have removed the stopwords, we'll tune in NaiveBayes again with the newly cleaned data, using bigrams this time and applying Laplace Smoothing by setting alpha = 0.01**

In [62]:
x_train, x_test, labels_train, labels_test = train_test_split(train_data['Comment'], 
                                                    train_data['Insult'], 
                                                    test_size=0.20, 
                                                    random_state=8)

In [9]:
train_data.head()

Unnamed: 0,Insult,Date,Comment
0,1,20120618192155Z,you fuck your dad
1,0,20120528192215Z,i realli do not understand your point it seem ...
2,0,,a major of canadian can and has been wrong bef...
3,0,,listen if you dont wanna get marri to a man or...
4,0,20120619094753Z,c xe c b u ea n xu u ed ng u u b u eddng bi u ...


In [248]:
# Parameter election
ngram_range2 = (1,2) #using bigrams this time!
min_df2 = 10
max_df2 = 1.
max_features2 = 222

In [295]:
bow2 = CountVectorizer(encoding='utf-8',
                     ngram_range=ngram_range2,
                     stop_words=stop_words,
                     max_df=max_df2,
                     min_df=min_df2,
                     max_features=max_features2)

bow_train2 = bow2.fit_transform(x_train.astype('U')).toarray()
print(bow_train2.shape)
bow_test2 = bow2.fit_transform(x_test.astype('U')).toarray()
print(bow_test2.shape)

(3157, 222)
(790, 222)


# Improved NaiveBayes

In [296]:
mnbc2 = MultinomialNB(alpha=0.01) #using Laplace Smoothing with value of 22
mnbc2

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [297]:
mnbc2.fit(bow_train2, labels_train)

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [298]:
mnbc_pred2 = mnbc2.predict(bow_test2)

In [299]:
# Training accuracy
print("The training accuracy is: ")
print(accuracy_score(labels_train, mnbc2.predict(bow_train2)))

The training accuracy is: 
0.8150142540386442


In [300]:
# Test accuracy
print("The test accuracy is: ")
print(accuracy_score(labels_test, mnbc_pred2))

The test accuracy is: 
0.6594936708860759


In [302]:
# Classification report
print("Classification report")
print(classification_report(labels_test,mnbc_pred2))

Classification report
              precision    recall  f1-score   support

           0       0.75      0.82      0.78       585
           1       0.28      0.20      0.23       205

    accuracy                           0.66       790
   macro avg       0.51      0.51      0.51       790
weighted avg       0.62      0.66      0.64       790



In [311]:
d = {
     'Model': 'Improved Naïve Bayes',
     'Training Set Accuracy': accuracy_score(labels_train, mnbc2.predict(bow_train2)),
     'Test Set Accuracy': accuracy_score(labels_test, mnbc_pred2)
}

df_models_mnbc2 = pd.DataFrame(d, index=[0])
df_models_mnbc2

Unnamed: 0,Model,Training Set Accuracy,Test Set Accuracy
0,Improved Naïve Bayes,0.815014,0.659494


In [312]:
with open('df_models_mnbc2.pickle', 'wb') as output: 
    pickle.dump(df_models_mnbc2, output)

In [313]:
path_pickles = "/home/andrewpap22/Desktop/dataMining_MainProject/"

list_pickles = [
    "df_models_mnbc.pickle",
    "df_models_mnbc2.pickle"
]

df_summary = pd.DataFrame()

for pickle_ in list_pickles:
    
    path = path_pickles + pickle_
    
    with open(path, 'rb') as data:
        df = pickle.load(data)

    df_summary = df_summary.append(df)

df_summary = df_summary.reset_index().drop('index', axis=1)

In [314]:
df_summary

Unnamed: 0,Model,Training Set Accuracy,Test Set Accuracy
0,Basic Naïve Bayes,0.770035,0.649367
1,Improved Naïve Bayes,0.815014,0.659494


**So as we can see the Improved NaiveBayes algorithm, after performing lemmatization, removing stopwords, using bigrams and using laplace smoothing has some *'litle'* but still there is improvement!**

## Part - Of - Speech

In [23]:
def get_wordnet_pos(word):
    """ Map POS tag to first character lemmatize() accepts. """
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

Now we will provide the correct 'part-of-speech' tag as the second argument to lemmatize(). That way our Comment column of the dataframe will have the pos tags (features) of the whole text

In [27]:
def lemmatize(tokens):
    """ Lemmatize all words in given list of tokens. """

    lemmatizer = WordNetLemmatizer()
    lems = [lemmatizer.lemmatize(token, get_wordnet_pos(token))
            for token in tokens]

    return lems


x_train = x_train.apply(lambda x: lemmatize(x))

x_train.head()

3586                                  [love, chick, voic]
231                              [oh, hey, yousaid, like]
1119                                        [fuck, idiot]
3330    [photograph, avoid, urg, steal, brink, ?, woul...
968     [know, work, lot, countri, europ, brazil, isre...
Name: Comment, dtype: object

In [80]:
def dummy(doc):
    """ Dummy tokenizer to use when data are already tokenized. """
    return doc

def tf_idf(series):
    """ Tf-Idf vectorization of Comments. Return a series of the vectors. """

    comment_list = series.tolist()

    tfidf_vectorizer = TfidfVectorizer(
        tokenizer=dummy, preprocessor=dummy, max_features=222)

    matr = tfidf_vectorizer.fit_transform(comment_list)
    ser = pd.Series(matr.toarray().tolist())
    # return series of vectors for Comments
    return ser


tfidf_train_pos = tf_idf(x_train) 
tfidf_test_pos = tf_idf(x_test)
tfidf_train_pos.head()

0    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4    [0.0, 0.0, 0.0, 0.0, 0.21913705030672145, 0.20...
dtype: object

In [81]:
print(tfidf_train_pos.shape)
print(tfidf_test_pos.shape)

(3157,)
(790,)


In [82]:
tfidf_test_pos.head()

0    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
dtype: object

## TF - IDF

## **This one will not be in use, as we have made above tf-idf representation given the pos-tags of the text data, so we'll use them combined on our models below as we won't get any better improvement testing tf-idf features alone!**

But since the project definition needs the code implementation of both pos and tf-df seperataly, i'm providing my code of the 2nd project on tf-idf

In [34]:
# Parameter election
ngram_range3 = (1,2)
min_df3 = 10
max_df3 = 1.
max_features3 = 222

In [40]:
tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range3,
                        stop_words=stop_words,
                        lowercase=True,
                        max_df=max_df3,
                        min_df=min_df3,
                        max_features=max_features3,
                        norm='l2',
                        sublinear_tf=True)
                        
tf_idf_train = tfidf.fit_transform(x_train.astype('U')).toarray()
print(tf_idf_train.shape)

tf_idf_test = tfidf.transform(x_test.astype('U')).toarray()
print(tf_idf_test.shape)

(3157, 222)
(790, 222)


**Now that we got our POS tags and our TF-IDF representation, we'll try them on SVM & Random Decision Forest**

# 1. SVM

In [83]:
# Made them as lists to prevent the error: setting an array element with a sequence
tfidf_trainpos = list(tfidf_train_pos)
tfidf_testpos = list(tfidf_test_pos)
Labels_train = list(labels_train)
Labels_test = list(labels_test)

In [51]:
C = [.0001, .001, .01, .1]
degree = [3, 4, 5]
gamma = [1, 10, 100]
probability = [True]

param_grid = [
  {'C': C, 'kernel':['linear'], 'probability':probability},
  {'C': C, 'kernel':['poly'], 'degree':degree, 'probability':probability},
  {'C': C, 'kernel':['rbf'], 'gamma':gamma, 'probability':probability}
]

# Create a base model
svc = svm.SVC(random_state=8)

# Manually create the splits in CV in order to be able to fix a random_state (GridSearchCV doesn't have that argument)
cv_sets = ShuffleSplit(n_splits = 3, test_size = .22, random_state = 8)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=svc, 
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=cv_sets,
                           verbose=1)

# Fit the grid search to the data
grid_search.fit(tfidf_trainpos, Labels_train)

Fitting 3 folds for each of 28 candidates, totalling 84 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  84 out of  84 | elapsed: 10.6min finished


GridSearchCV(cv=ShuffleSplit(n_splits=3, random_state=8, test_size=0.22, train_size=None),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=8, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated'...one,
             param_grid=[{'C': [0.0001, 0.001, 0.01, 0.1], 'kernel': ['linear'],
                          'probability': [True]},
                         {'C': [0.0001, 0.001, 0.01, 0.1], 'degree': [3, 4, 5],
                          'kernel': ['poly'], 'probability': [True]},
                         {'C': [0.0001, 0.001, 0.01, 0.1],
                          'gamma': [1, 10, 100], 'kernel': ['rbf'],
                          'probability':

In [38]:
# best hyperparameters: 

print("The best hyperparameters from Grid Search are:")
print(grid_search.best_params_)
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(grid_search.best_score_)

The best hyperparameters from Grid Search are:
{'C': 0.1, 'kernel': 'linear', 'probability': True}

The mean accuracy of a model with these hyperparameters is:
0.7863309352517985


In [54]:
#saving the model as best_svc:

best_svc = grid_search.best_estimator_
best_svc

SVC(C=0.1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=True, random_state=8, shrinking=True, tol=0.001,
    verbose=False)

In [55]:
best_svc.fit(tfidf_trainpos, Labels_train)

SVC(C=0.1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=True, random_state=8, shrinking=True, tol=0.001,
    verbose=False)

In [84]:
svc_pred = best_svc.predict(tfidf_testpos)

In [93]:
# Training accuracy
print("The training accuracy is: ")
print(accuracy_score(Labels_train, best_svc.predict(tfidf_trainpos)))

The training accuracy is: 
0.7427937915742794


In [92]:
# Test accuracy
print("The test accuracy is: ")
print(accuracy_score(Labels_test, svc_pred))

The test accuracy is: 
0.730379746835443


In [86]:
# Classification report
print("Classification report")
print(classification_report(Labels_test,svc_pred))

Classification report
              precision    recall  f1-score   support

           0       0.74      0.98      0.84       585
           1       0.25      0.02      0.04       205

    accuracy                           0.73       790
   macro avg       0.50      0.50      0.44       790
weighted avg       0.61      0.73      0.63       790



In [87]:
# We'll create a dataset with a model summary to compare models:

d = {
     'Model': 'SVM',
     'Training Set Accuracy': accuracy_score(Labels_train, best_svc.predict(tfidf_trainpos)),
     'Test Set Accuracy': accuracy_score(Labels_test, svc_pred)
}

df_models_svc = pd.DataFrame(d, index=[0])

In [88]:
df_models_svc

Unnamed: 0,Model,Training Set Accuracy,Test Set Accuracy
0,SVM,0.742794,0.73038


In [94]:
with open('df_models_svc.pickle', 'wb') as output:
    pickle.dump(df_models_svc, output)

# 2. Random Forest

In [96]:
# Hyperparameters for Random Forest: 


rf_0 = RandomForestClassifier(random_state = 8)

print('Parameters currently in use:\n')
rf_0.get_params()

Parameters currently in use:



{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 8,
 'verbose': 0,
 'warm_start': False}

In [97]:
# n_estimators
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 5)]

# max_features
max_features = ['auto', 'sqrt']

# max_depth
max_depth = [int(x) for x in np.linspace(20, 100, num = 5)]
max_depth.append(None)

# min_samples_split
min_samples_split = [2, 5, 10]

# min_samples_leaf
min_samples_leaf = [1, 2, 4]

# bootstrap
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

random_grid

{'n_estimators': [200, 400, 600, 800, 1000],
 'max_features': ['auto', 'sqrt'],
 'max_depth': [20, 40, 60, 80, 100, None],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4],
 'bootstrap': [True, False]}

In [98]:
# Grid Search: 

# Create the parameter grid based on the results of random search 
bootstrap = [False]
max_depth = [30, 40, 50]
max_features = ['sqrt']
min_samples_leaf = [1, 2, 4]
min_samples_split = [5, 10, 15]
n_estimators = [800]

param_grid = {
    'bootstrap': bootstrap,
    'max_depth': max_depth,
    'max_features': max_features,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'n_estimators': n_estimators
}

# Create a base model
rfc = RandomForestClassifier(random_state=8)

# Manually create the splits in CV in order to be able to fix a random_state (GridSearchCV doesn't have that argument)
cv_sets = ShuffleSplit(n_splits = 3, test_size = .22, random_state = 8)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rfc, 
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=cv_sets,
                           verbose=1)

# Fit the grid search to the data
grid_search.fit(tfidf_trainpos, Labels_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  81 out of  81 | elapsed:  8.8min finished


GridSearchCV(cv=ShuffleSplit(n_splits=3, random_state=8, test_size=0.22, train_size=None),
             error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_sampl...
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=8,
                                  

In [99]:
print("The best hyperparameters from Grid Search are:")
print(grid_search.best_params_)
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(grid_search.best_score_)

The best hyperparameters from Grid Search are:
{'bootstrap': False, 'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 15, 'n_estimators': 800}

The mean accuracy of a model with these hyperparameters is:
0.8244604316546763


In [102]:
best_rfc = grid_search.best_estimator_
best_rfc

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=40, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=15,
                       min_weight_fraction_leaf=0.0, n_estimators=800,
                       n_jobs=None, oob_score=False, random_state=8, verbose=0,
                       warm_start=False)

In [103]:
best_rfc.fit(tfidf_trainpos, Labels_train)

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=40, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=15,
                       min_weight_fraction_leaf=0.0, n_estimators=800,
                       n_jobs=None, oob_score=False, random_state=8, verbose=0,
                       warm_start=False)

In [104]:
rfc_pred = best_rfc.predict(tfidf_testpos)

In [105]:
# Training accuracy
print("The training accuracy is: ")
print(accuracy_score(Labels_train, best_rfc.predict(tfidf_trainpos)))

The training accuracy is: 
0.8584098828001268


In [106]:
# Test accuracy
print("The test accuracy is: ")
print(accuracy_score(Labels_test, rfc_pred))

The test accuracy is: 
0.6708860759493671


In [107]:
# Classification report
print("Classification report")
print(classification_report(Labels_test,rfc_pred))

Classification report
              precision    recall  f1-score   support

           0       0.75      0.83      0.79       585
           1       0.31      0.21      0.25       205

    accuracy                           0.67       790
   macro avg       0.53      0.52      0.52       790
weighted avg       0.64      0.67      0.65       790



In [109]:
d = {
     'Model': 'Random Forest',
     'Training Set Accuracy': accuracy_score(Labels_train, best_rfc.predict(tfidf_trainpos)),
     'Test Set Accuracy': accuracy_score(Labels_test, rfc_pred)
}

df_models_rfc = pd.DataFrame(d, index=[0])
df_models_rfc

Unnamed: 0,Model,Training Set Accuracy,Test Set Accuracy
0,Random Forest,0.85841,0.670886


In [110]:
with open('df_models_rfc.pickle', 'wb') as output:
    pickle.dump(df_models_rfc, output)

In [111]:
path_pickles = "/home/andrewpap22/Desktop/dataMining_MainProject/"

list_pickles = [
    "df_models_mnbc.pickle",
    "df_models_mnbc2.pickle",
    "df_models_svc.pickle",
    "df_models_rfc.pickle"
]

df_summary2 = pd.DataFrame()

for pickle_ in list_pickles:
    
    path = path_pickles + pickle_
    
    with open(path, 'rb') as data:
        df = pickle.load(data)

    df_summary2 = df_summary2.append(df)

df_summary2 = df_summary2.reset_index().drop('index', axis=1)

In [112]:
df_summary2

Unnamed: 0,Model,Training Set Accuracy,Test Set Accuracy
0,Basic Naïve Bayes,0.770035,0.649367
1,Improved Naïve Bayes,0.815014,0.659494
2,SVM,0.742794,0.73038
3,Random Forest,0.85841,0.670886


Sorting by: **Test Set Accuracy:**

In [113]:
df_summary2.sort_values('Test Set Accuracy', ascending=False)

Unnamed: 0,Model,Training Set Accuracy,Test Set Accuracy
2,SVM,0.742794,0.73038
3,Random Forest,0.85841,0.670886
1,Improved Naïve Bayes,0.815014,0.659494
0,Basic Naïve Bayes,0.770035,0.649367


# Summary: 

**F1-Scores:**

1. SVM: **0.84**
2. Random Forest: **0.79**
3. Improved Naive Bayes: **0.78**
4. Basic Naive Bayes: **0.73** 

So as we can see: 

**SVM** is the best model so far with the best performance and scores!!! 

The reason is, we gave it the best possible features and best possible cleaned data (best possible by what i could personally manage... not to mean the best possible created!) 
It contained the full cleaned data with the extra cleaning we did for the Improved Naive bayes + the combination of tfidf and pos tag features. So, that's a good reason why SVM has the best test set accuracy and f1-score! 

**-----------------------------------------------------------------------------------------------------------**

*__Now we'll try anything we can in order to get the best possible Test Set Accuracy and F1-Score__* 

i.e. We have to exceed the performance of SVM! 

# i) Multinomial Logistic Regression

In [136]:
#randomized search cross validation:

# C
C = [float(x) for x in np.linspace(start = 0.1, stop = 1, num = 10)]

# multi_class
multi_class = ['multinomial']

# solver
solver = ['newton-cg', 'sag', 'saga', 'lbfgs']
 
# class_weight
class_weight = ['balanced', None]

# penalty
penalty = ['l2']

# Create the random grid
random_grid = {'C': C,
               'multi_class': multi_class,
               'solver': solver,
               'class_weight': class_weight,
               'penalty': penalty}

random_grid

{'C': [0.1,
  0.2,
  0.30000000000000004,
  0.4,
  0.5,
  0.6,
  0.7000000000000001,
  0.8,
  0.9,
  1.0],
 'multi_class': ['multinomial'],
 'solver': ['newton-cg', 'sag', 'saga', 'lbfgs'],
 'class_weight': ['balanced', None],
 'penalty': ['l2']}

In [137]:
#The search:

# First create the base model to tune
lrc = LogisticRegression(random_state=8)

# Definition of the random search
random_search = RandomizedSearchCV(estimator=lrc,
                                   param_distributions=random_grid,
                                   n_iter=50,
                                   scoring='accuracy',
                                   cv=3, 
                                   verbose=1, 
                                   random_state=8)

# Fit the random search model
random_search.fit(tf_idf_train, labels_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:   36.0s finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=LogisticRegression(C=1.0, class_weight=None,
                                                dual=False, fit_intercept=True,
                                                intercept_scaling=1,
                                                l1_ratio=None, max_iter=100,
                                                multi_class='auto', n_jobs=None,
                                                penalty='l2', random_state=8,
                                                solver='lbfgs', tol=0.0001,
                                                verbose=0, warm_start=False),
                   iid='deprecated', n_iter=50, n_jobs=None,
                   param_distributions={'C': [0.1, 0.2, 0.30000000000000004,
                                              0.4, 0.5, 0.6, 0.7000000000000001,
                                              0.8, 0.9, 1.0],
                                        'class_weight': ['balanc

In [139]:
print("The best hyperparameters from Random Search are:")
print(random_search.best_params_)
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(random_search.best_score_)

The best hyperparameters from Random Search are:
{'solver': 'newton-cg', 'penalty': 'l2', 'multi_class': 'multinomial', 'class_weight': None, 'C': 0.7000000000000001}

The mean accuracy of a model with these hyperparameters is:
0.8175473660264535


In [140]:
best_lrc = random_search.best_estimator_
best_lrc

LogisticRegression(C=0.7000000000000001, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='multinomial', n_jobs=None,
                   penalty='l2', random_state=8, solver='newton-cg', tol=0.0001,
                   verbose=0, warm_start=False)

In [141]:
best_lrc.fit(tf_idf_train, labels_train)

LogisticRegression(C=0.7000000000000001, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='multinomial', n_jobs=None,
                   penalty='l2', random_state=8, solver='newton-cg', tol=0.0001,
                   verbose=0, warm_start=False)

In [142]:
lrc_pred = best_lrc.predict(tf_idf_test)

In [143]:
# Training accuracy
print("The training accuracy is: ")
print(accuracy_score(labels_train, best_lrc.predict(tf_idf_train)))

The training accuracy is: 
0.835920177383592


In [144]:
# Test accuracy
print("The test accuracy is: ")
print(accuracy_score(labels_test, lrc_pred))

The test accuracy is: 
0.8265822784810126


In [145]:
# Classification report
print("Classification report")
print(classification_report(labels_test,lrc_pred))

Classification report
              precision    recall  f1-score   support

           0       0.84      0.95      0.89       585
           1       0.76      0.49      0.59       205

    accuracy                           0.83       790
   macro avg       0.80      0.72      0.74       790
weighted avg       0.82      0.83      0.81       790



In [147]:
d = {
     'Model': 'Logistic Regression',
     'Training Set Accuracy': accuracy_score(labels_train, best_lrc.predict(tf_idf_train)),
     'Test Set Accuracy': accuracy_score(labels_test, lrc_pred)
}

df_models_lrc = pd.DataFrame(d, index=[0])
df_models_lrc

Unnamed: 0,Model,Training Set Accuracy,Test Set Accuracy
0,Logistic Regression,0.83592,0.826582


In [148]:
with open('df_models_lrc.pickle', 'wb') as output:
    pickle.dump(df_models_lrc, output)

# Final Results & Conclusion!!!

In [149]:
path_pickles = "/home/andrewpap22/Desktop/dataMining_MainProject/"

list_pickles = [
    "df_models_mnbc.pickle",
    "df_models_mnbc2.pickle",
    "df_models_svc.pickle",
    "df_models_rfc.pickle",
    "df_models_lrc.pickle"
]

df_summary_final = pd.DataFrame()

for pickle_ in list_pickles:
    
    path = path_pickles + pickle_
    
    with open(path, 'rb') as data:
        df = pickle.load(data)

    df_summary_final = df_summary_final.append(df)

df_summary_final = df_summary_final.reset_index().drop('index', axis=1)

In [150]:
df_summary_final

Unnamed: 0,Model,Training Set Accuracy,Test Set Accuracy
0,Basic Naïve Bayes,0.770035,0.649367
1,Improved Naïve Bayes,0.815014,0.659494
2,SVM,0.742794,0.73038
3,Random Forest,0.85841,0.670886
4,Logistic Regression,0.83592,0.826582


Sorting by: **Test Set Accuracy:**

In [222]:
df_summary_final.sort_values('Test Set Accuracy', ascending=False)

Unnamed: 0,Model,Training Set Accuracy,Test Set Accuracy
4,Logistic Regression,0.83592,0.826582
2,SVM,0.742794,0.73038
3,Random Forest,0.85841,0.670886
1,Improved Naïve Bayes,0.815014,0.659494
0,Basic Naïve Bayes,0.770035,0.649367


# Final Summary: 

**F1-Scores:**

1. *SVM*: **0.84**
2. *Random Forest*: **0.79**
3. *Improved Naive Bayes*: **0.78**
4. *Basic Naive Bayes*: **0.73** 
5. *Multinomial Logistic Regresion*: **0.89**

So as we can see: 

We got much better results on **Multinomial Logistic Regresion** compared to the previous best: *SVM* with a much higher **Test Set Accuracy** and slightly higher **F1-Score**! 

What we did on Logistic Regresion is: 

We transformed the firstly cleaned data (cl_data) into tf-idf features, we had already previously done, lemmatization, removed the stop words and we have used bigrams as well but we did not pass the pos tags as well, as we did with the previous models. We have just passed the tf-idf features alone and that resulted in much better results in the Logistic Regresion model, BUT when tf-idf features tested alone without the pos tags in the previous models, it did not result in better Test Set Accuracy nor F1-Score! 

And here the project is complete! 

**Thank you!** 

*Author:* **Andreas Pappas** 
*ID:* **1115201500201** 

Spring Semester 2020 
Data Mining Course!