# This notebook compares the Accuracy and AUC socre on Random Forest classifier using  different word embedding methods.

In [6]:
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from IPython.display import display, Markdown, Latex
import spacy
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [7]:
class word2vec:         
    def transform(self,text):
        l = [[j.lower() for j in word_tokenize(i)] for i in text]
        model = Word2Vec(l,size=300, \
            window=7, \
            min_count=0,\
            workers=1)
        model.train(l, total_examples=len(l), epochs=10)
        updated_vector = []
        for i in l:
            p=0
            for j in i:
                p+=model.wv[j]
            updated_vector.append(p)
        return(updated_vector)
    def transformSpacy(self,text):
        model = spacy.load("en_core_web_md")
      #  text2vec = [model(i).vector for i in text['comment'].fillna(" ").tolist()]
        #dimmension of vector = 300x1
        text2vec = [model(i).vector for i in text]
        return text2vec

In [8]:
#parameters:
#size ->  dimensionality of the word vector
#window -> the window size(maximum distance between the current and predicted word within a sentence)
#min_count -> ignores all words with total frequency lower than this
#workers -> faster training with multicore machine

df_train = pd.read_csv('../cleaned_dataset.csv')
df_test = pd.read_csv('../Preprocessed Data//cleaned_dataset_test.csv')
#df = pd.read_csv('../cleaned_dataset.csv')
#print(df_train["comment"].fillna(" ").tolist());

df_test=df_test.drop(df_test.index[[114,153,1944]]).reset_index(drop=True)

# Word embedding used - Contineous bag of words

In [9]:
df_train = df_train.fillna('')
df_test = df_test.fillna('')
X_train = df_train["comment"]
Y_train = df_train["insult"].tolist()

X_test = df_test["comment"]
Y_test = df_test["insult"].tolist()


In [10]:
vectorizer = CountVectorizer()
vectorizer.fit(X_train)

X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

In [13]:

clf = RandomForestClassifier()
clf.fit(X_train,Y_train);

In [14]:
predictions=clf.predict(X_test)

count = 0
for prediction in range(len(predictions)):
    if predictions[prediction] == Y_test[prediction]:
        count += 1
acc = count/len(predictions)
aoc = metrics.roc_auc_score(Y_test,predictions)
data = "**Accuracy** : " + str(acc) + "<br>" + "**AUC Score** : " + str(aoc) 
display(Markdown(data))

**Accuracy** : 0.8055975794251135<br>**AUC Score** : 0.6510140579848422

# Word embedding used - TF-IDF

In [15]:
df_train = df_train.fillna('')
df_test = df_test.fillna('')
X_train = df_train["comment"]
Y_train = df_train["insult"].tolist()

X_test = df_test["comment"]
Y_test = df_test["insult"].tolist()


In [16]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train)

X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

In [17]:
print (type(X_train),type(Y_train))

<class 'scipy.sparse.csr.csr_matrix'> <class 'list'>


In [18]:


clf = RandomForestClassifier()
clf.fit(X_train,Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [19]:
predictions=clf.predict(X_test)

count = 0
for prediction in range(len(predictions)):
    if predictions[prediction] == Y_test[prediction]:
        count += 1
acc = count/len(predictions)
aoc = metrics.roc_auc_score(Y_test,predictions)
data = "**Accuracy** : " + str(acc) + "<br>" + "**AUC Score** : " + str(aoc) 
display(Markdown(data))

**Accuracy** : 0.7965204236006052<br>**AUC Score** : 0.6378850376800147

# Word embedding used - Word2vec(Gensim)

In [20]:
df_train = df_train.fillna('')
df_test = df_test.fillna('')
X_train = df_train["comment"]
Y_train = df_train["insult"].tolist()

X_test = df_test["comment"]
Y_test = df_test["insult"].tolist()


In [21]:
vectorizer = word2vec()
X_train=vectorizer.transform(X_train)
X_test=vectorizer.transform(X_test)

In [24]:


clf = RandomForestClassifier()
clf.fit(X_train,Y_train);

In [25]:
predictions=clf.predict(X_test)

count = 0
for prediction in range(len(predictions)):
    if predictions[prediction] == Y_test[prediction]:
        count += 1
acc = count/len(predictions)
aoc = metrics.roc_auc_score(Y_test,predictions)
data = "**Accuracy** : " + str(acc) + "<br>" + "**AUC Score** : " + str(aoc) 
display(Markdown(data))

**Accuracy** : 0.6773827534039334<br>**AUC Score** : 0.5311047799515253

# Word embedding used - Word2vec(spacy)

In [26]:
df_train = df_train.fillna('')
df_test = df_test.fillna('')
X_train = df_train["comment"]
Y_train = df_train["insult"].tolist()

X_test = df_test["comment"]
Y_test = df_test["insult"].tolist()


In [27]:
vectorizer = word2vec()
X_train=vectorizer.transformSpacy(X_train)
X_test=vectorizer.transformSpacy(X_test)

In [28]:
print (type(X_train),type(Y_train))

<class 'list'> <class 'list'>


In [29]:

clf = RandomForestClassifier()
clf.fit(X_train,Y_train);

In [30]:
predictions=clf.predict(X_test)

count = 0
for prediction in range(len(predictions)):
    if predictions[prediction] == Y_test[prediction]:
        count += 1
acc = count/len(predictions)
aoc = metrics.roc_auc_score(Y_test,predictions)
data = "**Accuracy** : " + str(acc) + "<br>" + "**AUC Score** : " + str(aoc) 
display(Markdown(data))


**Accuracy** : 0.7995461422087746<br>**AUC Score** : 0.6599398096066471