# This notebook compares the Accuracy and AUC socre on k nearest neighbour classifier using two different word embedding methods.

In [43]:
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from IPython.display import display, Markdown, Latex
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [67]:
df_train = pd.read_csv("cleaned_dataset.csv")
df_test = pd.read_csv('Preprocessed Data/cleaned_dataset_test.csv')
df_test1=df_test.drop(df_test.index[[114,153,1944]]).reset_index(drop=True)


In [113]:
class word2vec:         
    def transform(self,text):
        l = [[j.lower() for j in word_tokenize(i)] for i in text]
        model = Word2Vec(l,size=300, \
            window=7, \
            min_count=0,\
            workers=1)
        model.train(l, total_examples=len(l), epochs=10)
        updated_vector = []
        for i in l:
            p=0
            for j in i:
                p+=model.wv[j]
            updated_vector.append(p)
        return(np.asarray(updated_vector))
    def transformSpacy(self,text):
        model = spacy.load("en_core_web_md")
      #  text2vec = [model(i).vector for i in text['comment'].fillna(" ").tolist()]
        #dimmension of vector = 300x1
        text2vec = [model(i).vector for i in text]
        return text2vec

In [None]:
#parameters:
#size ->  dimensionality of the word vector
#window -> the window size(maximum distance between the current and predicted word within a sentence)
#min_count -> ignores all words with total frequency lower than this
#workers -> faster training with multicore machine




# Word embedding used - Contineous bag of words

In [48]:
X_train = df_train['comment'].fillna(" ").tolist()
Y_train = df_train['insult'].tolist()

X_test = df_test['comment'].fillna(" ").tolist()
Y_test = df_test['insult'].tolist()

vectorizer = CountVectorizer()
vectorizer.fit(X_train)

X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

In [49]:
n_neighbors = 15
clf = KNeighborsClassifier(n_neighbors)
clf.fit(X_train,Y_train);

In [50]:
predictions=clf.predict(X_test)

count = 0
for prediction in range(len(predictions)):
    if predictions[prediction] == Y_test[prediction]:
        count += 1
acc = count/len(predictions)
aoc = metrics.roc_auc_score(Y_test,predictions)
data = "**Accuracy** : " + str(acc) + "<br>" + "**AUC Score** : " + str(aoc) 
display(Markdown(data))

**Accuracy** : 0.7831507366830374<br>**AUC Score** : 0.664081966027

# Word embedding used - TF-IDF

In [51]:
X_train = df_train['comment'].fillna(" ").tolist()
Y_train = df_train['insult'].tolist()

X_test = df_test['comment'].fillna(" ").tolist()
Y_test = df_test['insult'].tolist()

vectorizer = TfidfVectorizer()
vectorizer.fit(X_train)

X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

In [52]:
n_neighbors = 15
clf = KNeighborsClassifier(n_neighbors)
clf.fit(X_train,Y_train);

In [53]:
predictions=clf.predict(X_test)

count = 0
for prediction in range(len(predictions)):
    if predictions[prediction] == Y_test[prediction]:
        count += 1
acc = count/len(predictions)
aoc = metrics.roc_auc_score(Y_test,predictions)
data = "**Accuracy** : " + str(acc) + "<br>" + "**AUC Score** : " + str(aoc) 
display(Markdown(data))

**Accuracy** : 0.8175292784284095<br>**AUC Score** : 0.702732840911

# Word embedding used - Word2vec(Gensim)

In [114]:
X_train = df_train['comment'].tolist()
Y_train = df_train['insult'].tolist()

X_test = df_test1['comment'].tolist()
Y_test = df_test1['insult'].tolist()

vectorizer = word2vec()
X_train=vectorizer.transform(X_train)
X_test=vectorizer.transform(X_test)

In [115]:
n_neighbors = 15
clf = KNeighborsClassifier(n_neighbors)
clf.fit(X_train,Y_train);

In [116]:
predictions=clf.predict(X_test)

count = 0
for prediction in range(len(predictions)):
    if predictions[prediction] == Y_test[prediction]:
        count += 1
acc = count/len(predictions)
aoc = metrics.roc_auc_score(Y_test,predictions)
data = "**Accuracy** : " + str(acc) + "<br>" + "**AUC Score** : " + str(aoc) 
display(Markdown(data))


**Accuracy** : 0.7295763993948563<br>**AUC Score** : 0.52971872936

# Word embedding used - Word2vec(spacy)

In [98]:
X_train = df_train['comment'].tolist()
Y_train = df_train['insult'].tolist()

X_test = df_test1['comment'].tolist()
Y_test = df_test1['insult'].tolist()

vectorizer = word2vec()
X_train=vectorizer.transformSpacy(X_train)
X_test=vectorizer.transformSpacy(X_test)

In [None]:
#vectorizer = CountVectorizer()

In [99]:
n_neighbors = 15
clf = KNeighborsClassifier(n_neighbors)
clf.fit(X_train,Y_train);

In [100]:
predictions=clf.predict(X_test)

count = 0
for prediction in range(len(predictions)):
    if predictions[prediction] == Y_test[prediction]:
        count += 1
acc = count/len(predictions)
aoc = metrics.roc_auc_score(Y_test,predictions)
data = "**Accuracy** : " + str(acc) + "<br>" + "**AUC Score** : " + str(aoc) 
display(Markdown(data))


**Accuracy** : 0.806732223903177<br>**AUC Score** : 0.735522834703