My approach tokenizes each plot, removes stopwords, and lemmatizes the remaining tokens before vectorizing them with GloVe embeddings. I used 300-dimensional GloVe embeddings pre-trained on a 6B-token corpus with a 400K-word vocabulary drawn from the Wikipedia and Gigaword datasets (see more details here: https://nlp.stanford.edu/projects/glove/). The final representation of each input is the average of the GloVe embeddings of its words. Lastly, I trained a k-nearest-neighbor classifier using an 80/20 train-test split.

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import math
import tqdm
import gensim.downloader

In [2]:
# Read the plot/genre table, discard the "Unnamed: 0" column (presumably a
# saved row index -- confirm against the CSV), and keep the raw plot strings
# as a plain Python list for preprocessing.
data = pd.read_csv("movie-plots-student.csv").drop(columns="Unnamed: 0")
text = data["Plot"].tolist()

In [3]:
# Give every distinct genre an integer id, following the order in which
# np.unique returns the labels, and keep both lookup directions.
label_vals = np.unique(data["Genre"])
idx2label = dict(enumerate(label_vals))
label2idx = {label: idx for idx, label in idx2label.items()}

In [4]:
# Fetch the NLTK stopword corpus and load the English stopword list.
# NOTE(review): prepare_data loads the list again on its own, so this
# module-level `stoplist` is not read by the code shown in this notebook.
nltk.download("stopwords")
stoplist = stopwords.words("english")

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1108)>


In [5]:
# Load the pre-trained 'glove-wiki-gigaword-300' vectors through gensim's
# downloader. As the captured error below shows, this needs either network
# access or an already-populated local gensim-data cache.
glove_vectors = gensim.downloader.load('glove-wiki-gigaword-300')

ValueError: unable to read local cache '/Users/artemvysogorets/gensim-data/information.json' during fallback, connect to the Internet and retry

In [None]:
from nltk.stem import WordNetLemmatizer
# Fetch the WordNet corpus; presumably the data WordNetLemmatizer relies on.
nltk.download("wordnet")
def preprocess_sentences(sentences, stopwords):
    """
    Tokenize, filter, and lemmatize each sentence.

    Every sentence is split on whitespace. A token is kept only when it is
    purely alphabetic and its lowercase form is not a stopword; kept tokens
    are lowercased and then lemmatized with WordNetLemmatizer.

    Args:
        sentences: iterable of strings, one per input text.
        stopwords: iterable of words to drop; matched case-insensitively.

    Returns:
        A list holding, for each sentence, the list of its lemmatized
        lowercase tokens (possibly empty).
    """
    wnl = WordNetLemmatizer()
    # Normalise the stopwords once into a set: lookups are O(1), and the
    # comparison below is done in lowercase so that capitalised stopwords
    # such as "The" at the start of a sentence are filtered out as well.
    stop_set = {stopword.lower() for stopword in stopwords}
    processed = []
    for sentence in sentences:
        tokens = []
        for word in sentence.split():
            lowered = word.lower()
            if word.isalpha() and lowered not in stop_set:
                tokens.append(wnl.lemmatize(lowered))
        processed.append(tokens)
    return processed

In [None]:
def prepare_data(text, glove_vectors, skip_empty=True):
    """
    Return averaged (bag-of-words) GloVe embeddings for each sentence.

    Each sentence is preprocessed with preprocess_sentences using the English
    NLTK stopword list, and the vectors of its tokens that are present in
    `glove_vectors` are averaged into a single vector.

    Args:
        text: iterable of raw sentence strings.
        glove_vectors: embedding lookup supporting `word in glove_vectors`
            and `glove_vectors[word]`.
        skip_empty: what to do with a sentence that has no token in
            `glove_vectors`. If True the sentence is dropped from the output
            and its index is reported; if False a zero vector is emitted in
            its place so the output stays aligned with `text`.

    Returns:
        A tuple `(vectors, removed_idx)`: `vectors` is an array with one row
        per retained sentence, and `removed_idx` lists the positions in
        `text` that were actually dropped (always empty when `skip_empty` is
        False).
    """
    stoplist = stopwords.words("english")
    prepped = preprocess_sentences(text, stoplist)
    # Size the zero-vector fallback from the embedding model when it exposes
    # its dimensionality, instead of assuming 300-dimensional vectors.
    dim = getattr(glove_vectors, "vector_size", 300)
    bow_vecs = []
    removed_idx = []
    for idx, sentence in enumerate(prepped):
        embeddings = [glove_vectors[word] for word in sentence if word in glove_vectors]
        if embeddings:
            bow_vecs.append(np.mean(embeddings, axis=0))
        elif skip_empty:
            print("Skipping sentence due to no embeddings: " + " ".join(sentence))
            removed_idx.append(idx)
        else:
            # The row is kept, so it must not be reported as removed:
            # callers use removed_idx to filter their labels, and reporting
            # it here would misalign labels and vectors.
            print("Using zero vector for sentence with no embeddings: " + " ".join(sentence))
            bow_vecs.append(np.zeros(dim))
    return np.array(bow_vecs), removed_idx

In [None]:
from sklearn.model_selection import train_test_split

word_vecs, removed_idx = prepare_data(text, glove_vectors)

# Drop the labels of the sentences that prepare_data removed so that `y`
# stays row-aligned with `word_vecs`. The indices are put in a set so the
# membership test is O(1) per row rather than a scan of the whole list.
removed_set = set(removed_idx)
y = np.array([label2idx[label] for idx, label in enumerate(data["Genre"]) if idx not in removed_set])
# Fixed random_state keeps the 80/20 split reproducible across runs.
train_X, test_X, train_y, test_y = train_test_split(word_vecs, y, test_size=0.2, random_state=123)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

def train_model(train_X, train_y, test_X, test_y, n_neighbors):
    """
    Fit a k-nearest-neighbor classifier and measure its error rates.

    Args:
        train_X, train_y: features and labels used to fit the classifier.
        test_X, test_y: features and labels used only for evaluation.
        n_neighbors: number of neighbors passed to KNeighborsClassifier.

    Returns:
        A tuple `(model, train_err, test_err)` where the errors are the
        fractions of misclassified examples on each split.
    """
    classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
    classifier.fit(train_X, train_y)

    def error_rate(features, labels):
        # Mean of the mismatch mask == share of wrong predictions.
        return np.mean(classifier.predict(features) != labels)

    return classifier, error_rate(train_X, train_y), error_rate(test_X, test_y)

In [None]:
# Fit one k-NN model for each neighborhood size from 1 to 10, print its
# train/test error, and keep the fitted models in order, so models[k - 1] is
# the model trained with n_neighbors=k.
# NOTE(review): the reported test error comes from the held-out split, so
# choosing k from these numbers tunes the model on the test set.
models = []
for n_neighbors in range(1, 11):
    knn_model, train_err, test_err = train_model(train_X, train_y, test_X, test_y, n_neighbors)
    print(f"n={n_neighbors}, train_err={train_err:.4f}, test_err={test_err:.4f}")
    models.append(knn_model)

In [None]:
# Index 9 is the classifier fitted with n_neighbors=10 in the sweep above.
best_model = models[9]

In [None]:
# Requires that the preceding cells have already been run.
def test_model(test_data):
    """
    Predict a genre label for every plot in `test_data`.

    Relies on the notebook globals `glove_vectors`, `best_model`, and
    `idx2label`. Sentences without any known embedding are represented by a
    zero vector (skip_empty=False), so exactly one label is returned per
    input.

    Args:
        test_data: iterable of raw plot strings.

    Returns:
        A list of genre labels, in the same order as `test_data`.
    """
    test_data = list(test_data)
    # With no inputs prepare_data yields an empty 1-D array, which the
    # classifier's predict() rejects; there is nothing to predict anyway.
    if not test_data:
        return []
    word_vecs, _ = prepare_data(test_data, glove_vectors, skip_empty=False)
    preds = best_model.predict(word_vecs)
    return [idx2label[pred] for pred in preds]