<a href="https://colab.research.google.com/github/yrndurgun/emotion-analysis/blob/main/knn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from collections import defaultdict
import pandas as pd
import numpy as np
import io
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
# Fetch the NLTK resources non-interactively so that Restart & Run All does not
# stall on the downloader prompt (and does not pull the entire 'all' collection).
# 'punkt'/'punkt_tab' back word_tokenize, 'wordnet'/'omw-1.4' back WordNetLemmatizer,
# 'stopwords' and 'averaged_perceptron_tagger' back the stopwords / pos_tag imports below.
# An identifier unknown to the installed NLTK index just makes download() return False.
for _resource in ("punkt", "punkt_tab", "wordnet", "omw-1.4",
                  "stopwords", "averaged_perceptron_tagger"):
    nltk.download(_resource, quiet=True)
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split

from sklearn.metrics import precision_score, accuracy_score, f1_score, confusion_matrix, recall_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> all


    Downloading collection 'all'
       | 
       | Downloading package abc to /root/nltk_data...
       |   Unzipping corpora/abc.zip.
       | Downloading package alpino to /root/nltk_data...
       |   Unzipping corpora/alpino.zip.
       | Downloading package averaged_perceptron_tagger to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger.zip.
       | Downloading package averaged_perceptron_tagger_ru to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger_ru.zip.
       | Downloading package basque_grammars to /root/nltk_data...
       |   Unzipping grammars/basque_grammars.zip.
       | Downloading package biocreative_ppi to /root/nltk_data...
       |   Unzipping corpora/biocreative_ppi.zip.
       | Downloading package bllip_wsj_no_aux to /root/nltk_data...
       |   Unzipping models/bllip_wsj_no_aux.zip.
       | Downloading package book_grammars to /root/nltk_data...
       |   Unzipping grammars/book_gr


---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


In [None]:
# CSV files holding the three dataset splits, resolved against the working directory.
train_path, val_path, test_path = "training.csv", "validation.csv", "test.csv"

# Load each split into its own DataFrame.
train_df, val_df, test_df = map(pd.read_csv, (train_path, val_path, test_path))

In [None]:
def tokenize(column):
    """Split ``column`` into NLTK word tokens and keep only the alphabetic ones.

    Tokens containing digits, punctuation or any other non-letter character
    are discarded. Returns a list of strings.
    """
    return list(filter(str.isalpha, nltk.word_tokenize(column)))

In [None]:
# Tokenise the raw 'text' column of every split into a new 'tokenized' column.
# Series.map calls tokenize once per value, avoiding the per-row Series that
# DataFrame.apply(axis=1) builds, and it always yields a Series (even when a
# split is empty), so the column assignment is well-defined.
train_df['tokenized'] = train_df['text'].map(tokenize)
val_df['tokenized'] = val_df['text'].map(tokenize)
test_df['tokenized'] = test_df['text'].map(tokenize)

In [None]:
lmtzr = WordNetLemmatizer()


def _lemmatize_all(tokens):
    """Return the WordNet lemma of every token in ``tokens``, preserving order."""
    return list(map(lmtzr.lemmatize, tokens))


# Build a 'lemmatized' column (one list of lemmas per row) from 'tokenized' on each split.
train_df['lemmatized'] = train_df['tokenized'].apply(_lemmatize_all)
val_df['lemmatized'] = val_df['tokenized'].apply(_lemmatize_all)
test_df['lemmatized'] = test_df['tokenized'].apply(_lemmatize_all)


In [None]:
stemmer = SnowballStemmer("english")


def _stem_and_join(words):
    """Stem every token in ``words`` and join the stems with single spaces.

    A value that is already a string is returned unchanged. This cell
    overwrites 'lemmatized' in place (list -> string), so without this guard a
    second run would iterate over the characters of the joined string and
    stem them one by one, silently corrupting the column.
    """
    if isinstance(words, str):
        return words
    return ' '.join(stemmer.stem(word) for word in words)


# Replace each list of lemmas by one space-separated string of stems, which is
# the input form the vectoriser cell reads from the 'lemmatized' column.
train_df['lemmatized'] = train_df.lemmatized.map(_stem_and_join)
val_df['lemmatized'] = val_df.lemmatized.map(_stem_and_join)
test_df['lemmatized'] = test_df.lemmatized.map(_stem_and_join)

In [None]:
# Bag-of-words features: the vocabulary is learned from the training split only,
# then the same fitted vectoriser encodes the validation and test splits.
cv = CountVectorizer(stop_words='english')
train_ppd_df = cv.fit_transform(train_df.lemmatized)
val_ppd_df, test_ppd_df = map(cv.transform, (val_df.lemmatized, test_df.lemmatized))

In [None]:
# Copy the 'label' column of each split into a NumPy array.
train_labels, val_labels, test_labels = (
    np.array(frame['label']) for frame in (train_df, val_df, test_df)
)

In [None]:
# Fit a k-nearest-neighbours classifier (k = 3) on the training bag-of-words
# matrix and evaluate it on the test split.
# NOTE(review): the different k values in this notebook are compared on the test
# split, while val_ppd_df / val_labels appear unused in the cells shown —
# consider selecting k on the validation split and reporting test only once.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(train_ppd_df, train_labels)

prediction = knn.predict(test_ppd_df)

# accuracy_score is documented as (y_true, y_pred); pass by keyword so the
# argument order matches the confusion_matrix call below.
print("Accuracy Score -> ", accuracy_score(y_true=test_labels, y_pred=prediction) * 100)
confusion_matrix(y_true=test_labels, y_pred=prediction)

Accuracy Score ->  62.64999999999999


array([[458,  64,  17,  29,  11,   2],
       [138, 496,  34,  13,   5,   9],
       [ 47,  45,  56,   7,   3,   1],
       [ 92,  46,   8, 125,   3,   1],
       [ 66,  30,  10,  15,  98,   5],
       [ 22,  10,   4,   3,   7,  20]])

In [None]:
# Fit a k-nearest-neighbours classifier (k = 5) on the training bag-of-words
# matrix and evaluate it on the test split.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(train_ppd_df, train_labels)

prediction = knn.predict(test_ppd_df)

# accuracy_score is documented as (y_true, y_pred); pass by keyword so the
# argument order matches the confusion_matrix call below.
print("Accuracy Score -> ", accuracy_score(y_true=test_labels, y_pred=prediction) * 100)
confusion_matrix(y_true=test_labels, y_pred=prediction)

Accuracy Score ->  61.45


array([[446,  76,  16,  26,  13,   4],
       [128, 492,  37,  21,  10,   7],
       [ 43,  53,  49,   8,   4,   2],
       [ 79,  50,   6, 131,   7,   2],
       [ 58,  36,   8,  16,  99,   7],
       [ 21,  14,   3,   3,  13,  12]])

In [None]:
# Fit a k-nearest-neighbours classifier (k = 1) on the training bag-of-words
# matrix and evaluate it on the test split.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(train_ppd_df, train_labels)

prediction = knn.predict(test_ppd_df)

# accuracy_score is documented as (y_true, y_pred); pass by keyword so the
# argument order matches the confusion_matrix call below.
print("Accuracy Score -> ", accuracy_score(y_true=test_labels, y_pred=prediction) * 100)
confusion_matrix(y_true=test_labels, y_pred=prediction)

Accuracy Score ->  60.050000000000004


array([[383,  82,  33,  38,  35,  10],
       [ 79, 458,  72,  51,  20,  15],
       [ 24,  35,  75,  13,   9,   3],
       [ 51,  43,  13, 149,  12,   7],
       [ 39,  34,   8,  20, 113,  10],
       [ 11,   9,   5,   3,  15,  23]])

In [None]:
# Fit a k-nearest-neighbours classifier (k = 7) on the training bag-of-words
# matrix and evaluate it on the test split.
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(train_ppd_df, train_labels)

prediction = knn.predict(test_ppd_df)

# accuracy_score is documented as (y_true, y_pred); pass by keyword so the
# argument order matches the confusion_matrix call below.
print("Accuracy Score -> ", accuracy_score(y_true=test_labels, y_pred=prediction) * 100)
confusion_matrix(y_true=test_labels, y_pred=prediction)

Accuracy Score ->  60.650000000000006


array([[442,  80,  15,  29,  13,   2],
       [139, 491,  32,  20,   8,   5],
       [ 46,  56,  46,   9,   1,   1],
       [ 92,  42,   7, 124,   9,   1],
       [ 61,  38,   7,  16,  98,   4],
       [ 19,  19,   3,   4,   9,  12]])

In [None]:
# Fit a k-nearest-neighbours classifier (k = 9) on the training bag-of-words
# matrix and evaluate it on the test split.
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(train_ppd_df, train_labels)

prediction = knn.predict(test_ppd_df)

# accuracy_score is documented as (y_true, y_pred); pass by keyword so the
# argument order matches the confusion_matrix call below.
print("Accuracy Score -> ", accuracy_score(y_true=test_labels, y_pred=prediction) * 100)
confusion_matrix(y_true=test_labels, y_pred=prediction)

Accuracy Score ->  60.050000000000004


array([[448,  88,  10,  24,   8,   3],
       [147, 487,  29,  20,   7,   5],
       [ 53,  57,  42,   5,   1,   1],
       [105,  35,   6, 120,   7,   2],
       [ 66,  40,   6,  16,  91,   5],
       [ 22,  17,   2,   4,   8,  13]])

In [None]:
# Fit a k-nearest-neighbours classifier (k = 11) on the training bag-of-words
# matrix and evaluate it on the test split.
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(train_ppd_df, train_labels)

prediction = knn.predict(test_ppd_df)

# accuracy_score is documented as (y_true, y_pred); pass by keyword so the
# argument order matches the confusion_matrix call below.
print("Accuracy Score -> ", accuracy_score(y_true=test_labels, y_pred=prediction) * 100)
confusion_matrix(y_true=test_labels, y_pred=prediction)

Accuracy Score ->  60.3


array([[455,  83,  10,  27,   4,   2],
       [149, 491,  32,  15,   3,   5],
       [ 51,  54,  41,  11,   1,   1],
       [ 99,  45,   7, 117,   6,   1],
       [ 76,  39,   4,  13,  90,   2],
       [ 22,  15,   1,   4,  12,  12]])

In [None]:
# Fit a k-nearest-neighbours classifier (k = 13) on the training bag-of-words
# matrix and evaluate it on the test split.
knn = KNeighborsClassifier(n_neighbors=13)
knn.fit(train_ppd_df, train_labels)

prediction = knn.predict(test_ppd_df)

# accuracy_score is documented as (y_true, y_pred); pass by keyword so the
# argument order matches the confusion_matrix call below.
print("Accuracy Score -> ", accuracy_score(y_true=test_labels, y_pred=prediction) * 100)
confusion_matrix(y_true=test_labels, y_pred=prediction)

Accuracy Score ->  60.099999999999994


array([[457,  85,  10,  23,   3,   3],
       [156, 495,  26,  13,   0,   5],
       [ 48,  62,  38,   8,   2,   1],
       [102,  45,   6, 116,   5,   1],
       [ 72,  49,   4,  12,  86,   1],
       [ 25,  17,   1,   3,  10,  10]])