# Roles Classifier Alternative: K-Nearest Neighbor

Imports and downloading tokenizers from NLTK:

In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import genesis
from nltk.corpus import wordnet as wn
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('genesis')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
genesis_ic = wn.ic(genesis, False, 0.0)


[nltk_data] Downloading package genesis to /home/alexis/nltk_data...
[nltk_data]   Package genesis is already up-to-date!
[nltk_data] Downloading package wordnet to /home/alexis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/alexis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/alexis/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## K-NN Classifier Definition

In [8]:
class KnnClassifier():
    def __init__(self, k=1, distance_type='path'):
        self.k = k
        self.distance_type = distance_type

    # This function is used for training
    def fit(self, x_train, y_train):
        self.x_train = x_train
        self.y_train = y_train

    # This function runs the K(1) nearest neighbour algorithm and
    # returns the label with closest match.
    def predict(self, x_test):
        self.x_test = x_test
        y_predict = []

        for i in range(len(x_test)):
            max_sim = 0
            max_index = 0
            for j in range(self.x_train.shape[0]):
                temp = self.document_similarity(x_test[i], self.x_train[j])
                if temp > max_sim:
                    max_sim = temp
                    max_index = j
            y_predict.append(self.y_train[max_index])
        return y_predict

    def convert_tag(self, tag):
        """Convert the tag given by nltk.pos_tag to the tag used by wordnet.synsets"""
        tag_dict = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}
        try:
            return tag_dict[tag[0]]
        except KeyError:
            return None

    def doc_to_synsets(self, doc):
        """
            Returns a list of synsets in document.
            Tokenizes and tags the words in the document doc.
            Then finds the first synset for each word/tag combination.
            If a synset is not found for that combination it is skipped.

            Args:
                doc: string to be converted

            Returns:
                list of synsets
        """
        tokens = word_tokenize(doc + ' ')

        l = []
        tags = nltk.pos_tag([tokens[0] + ' ']) if len(tokens) == 1 else nltk.pos_tag(tokens)

        for token, tag in zip(tokens, tags):
            syntag = self.convert_tag(tag[1])
            syns = wn.synsets(token, syntag)
            if (len(syns) > 0):
                l.append(syns[0])
        return l

    def similarity_score(self, s1, s2, distance_type='path'):
        """
        Calculate the normalized similarity score of s1 onto s2
        For each synset in s1, finds the synset in s2 with the largest similarity value.
        Sum of all of the largest similarity values and normalize this value by dividing it by the
        number of largest similarity values found.

        Args:
            s1, s2: list of synsets from doc_to_synsets

        Returns:
            normalized similarity score of s1 onto s2
        """
        s1_largest_scores = []

        for i, s1_synset in enumerate(s1, 0):
            max_score = 0
            for s2_synset in s2:
                if distance_type == 'path':
                    score = s1_synset.path_similarity(s2_synset, simulate_root=False)
                else:
                    score = s1_synset.wup_similarity(s2_synset)
                if score != None:
                    if score > max_score:
                        max_score = score

            if max_score != 0:
                s1_largest_scores.append(max_score)

        mean_score = np.mean(s1_largest_scores)

        return mean_score

    def document_similarity(self, doc1, doc2):
        """Finds the symmetrical similarity between doc1 and doc2"""

        synsets1 = self.doc_to_synsets(doc1)
        synsets2 = self.doc_to_synsets(doc2)

        return (self.similarity_score(synsets1, synsets2) + self.similarity_score(synsets2, synsets1)) / 2

## Testing Several Files

In [None]:
file_size = [150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 645]
accuracy = []

s = stopwords.words('english')
ps = nltk.wordnet.WordNetLemmatizer()

for i in file_size:
    file_name = f'output/balanced_{i}.csv'
    roles = pd.read_csv(f'../{file_name}')
    mapping = {'Student': 0, 'Co-Facilitator': 1, 'Facilitator': 2}
    roles['Role'] = roles['Role'].apply(lambda x: mapping[x])

    for k in range(roles.shape[0]):
        review = roles.loc[k, 'Text']
        review = review.split()
        review = [ps.lemmatize(word) for word in review if not word in s]
        review = ' '.join(review)
        roles.loc[k, 'Text'] = review

    X = roles['Text']
    y = roles['Role']
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

    # Train the Classifier
    classifier = KnnClassifier(k=1, distance_type='path')
    classifier.fit(X_train.values, y_train.values)

    test_corpus = []
    for x_value in X_valid.values:
        review = x_value.split()

        review = [ps.lemmatize(word) for word in review if not word in s]
        review = ' '.join(review)
        test_corpus.append(review)


    def accuracy_score(y_pred, y_true):
        matched = 0

        for j in range(0, len(y_pred)):
            if y_pred[j] == y_true[j]:
                matched = matched + 1

        return float(matched) / float(len(y_true))


    error_count = 0
    predictions = classifier.predict(test_corpus)

    accuracy_partial = accuracy_score(list(map(int, predictions)), y_valid.values.tolist())
    accuracy.append(accuracy_partial)
    print(f'Accuracy for file_size {i}: {accuracy_partial}')


Accuracy for file_size 150: 0.3
Accuracy for file_size 200: 0.35
Accuracy for file_size 250: 0.34
Accuracy for file_size 300: 0.4166666666666667
Accuracy for file_size 350: 0.5142857142857142
Accuracy for file_size 400: 0.475
Accuracy for file_size 450: 0.4222222222222222
Accuracy for file_size 500: 0.45
Accuracy for file_size 550: 0.37272727272727274
Accuracy for file_size 600: 0.325


## Graphical Performance Analysis

In the following plots we can see the how the model behaves when it is trained with different amounts of data.

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt

plt.plot(file_size, accuracy)
plt.title('# of Rows vs. Accuracy')
plt.suptitle('K-Nearest Neighbor Roles Classifier')
plt.xlabel('# of Rows')
plt.ylabel('Accuracy')
plt.show()

## Conclusions

- The model doesn't show a good performance with the datasets, and we can see that the behavior is random, from which we
can conclude that KNN was not able to learn anything from the datasets.