## 6

In [3]:
import os
import warnings
from pathlib import Path

import spacy
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
#from gensim import downloader
import matplotlib.pyplot as plt
#from gensim.models import KeyedVectors
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import ConvergenceWarning
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, f1_score

We first load the data.

In [4]:
df = pd.read_csv('movie-plots-student.csv')
df

Unnamed: 0.1,Unnamed: 0,Genre,Plot
0,0,drama,A Bill of Divorcement describes a day in the l...
1,1,comedy,Dr. Clitterhouse (Edward G. Robinson) is a wea...
2,2,comedy,"Three young couples, all having financial stru..."
3,3,comedy,Hollywood studio mogul Joe Mulholland (Matthau...
4,4,drama,In a working class South London district lives...
...,...,...,...
10711,10711,comedy,"In a North of England training camp, lovestruc..."
10712,10712,comedy,Avijit Banerjee runs his own software company....
10713,10713,drama,Elangovan (Thankar Bachchan) is a school teach...
10714,10714,comedy,Victor Maynard (Bill Nighy) is an experienced ...


For preprocessing, we will tokenize, lemmatize, and remove any stopwords and non-alphanumeric characters from each document. Next we'll load our word embedding model to embed each movie plot as a vector. We'll use the pre-trained w2v model trained on the Google news vectors and embed each word in each document as a 300 dimensional vector, and use a simple average vector for each document.

In [5]:
# load vectors for training or create them
try:
    X_vectors = np.load('X_vectors.npy')
    print('Loaded X_vectors.')
except:
    # load/download embedding model
    model_name = 'word2vec-google-news-300'
    file = Path(f'{os.path.expanduser("~")}/gensim-data/{model_name}/{model_name}.gz')
    if file.is_file():
        print('Loading embedding model...')
        embedding_model = KeyedVectors.load_word2vec_format(str(file), binary=True)
    else:
        print('Downloading embedding model...')
        embedding_model = downloader.load(model_name)
    
    print('Preprocessing text...')
    # load english module
    nlp = spacy.load('en')

    # embed each document as average word vector
    # impute with zero vectors
    embedding_dim = 300
    documents = df.Plot
    vector = np.zeros((embedding_dim, ))
    vectors = np.empty((len(documents), embedding_dim))
    for i, doc in enumerate(tqdm(documents)):
        # tokenize, lemmatize, stopwords, non-alphanumeric characters
        clean_doc = [token.lemma_ for token in nlp(doc) 
                     if not token.is_stop and token.is_alpha]
        vector *= 0
        num_words = 0
        for word in clean_doc:
            if word in embedding_model.vocab:
                vector += embedding_model[word]
                num_words += 1
        if num_words > 0:
            vector /= num_words

        vectors[i] = vector

    np.save('X_vectors.npy', vectors)
    X_vectors = vectors
    print('X_vectors created.')

Downloading embedding model...


NameError: name 'downloader' is not defined

Next we create integer labels from our categories and check for class imbalance.

In [None]:
# get labels as integers
y_labels = df.Genre.astype('category').cat.codes

In [None]:
plt.hist(y_labels);
plt.title('Training class balance');

Now we train a multiclass logistic regression using multinomial loss across all classes. We'll use an elasticnet tuned with 5-fold cross validation to find the optimal C (regularization strength) and l1_ration (ratio between L1 and L2 penalty). We use sample weights inversely proportional to the class frequencies to account for class imbalance. For timing purposes, we'll keep the number of iterations low enough the fitting isn't too slow. We perform an initial train/test split just to get an idea of our performance before doing our cross validation process on all of the training data to get our final model.

In [None]:
# standardize features to enable regularization
X_vectors = StandardScaler().fit_transform(X_vectors)

# make train test split to get idea of test performance
X_train, X_test, y_train, y_test = train_test_split(X_vectors, y_labels, stratify=y_labels,
                                                    random_state=123, test_size=0.25)

# use multiclass logistic regression with multinomial loss across all classes 
# elasticnet with 5-fold cross validation to find optimal C (regularization strength) 
# and l1_ratio (ratio between L1 and L2 penalty)
# best model chosen based on f1-score
# use sample weights inversely proportional to the class frequencies to handle imbalance
# we'll keep max_iter low enough that fitting isn't too slow
parameters = {'Cs': [1e-4, 1e-2, 1, 1e2, 1e4], 'l1_ratios': [0, 0.5, 1]}
model = LogisticRegressionCV(random_state=123, multi_class='multinomial',
                             penalty='elasticnet', solver='saga', max_iter=200,
                             class_weight='balanced', verbose=1, n_jobs=-1,
                             **parameters)

# ignore max_iter warning since we're printing out convergence
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
model.fit(X_train, y_train)

In [None]:
# get a sense of test accuracy
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(f1_score(y_test, y_pred, average='macro'))

In [None]:
# now use the same process to train on all of the data with more iterations
model.fit(X_vectors, y_labels)

With our best model trained, we're ready to predict on the held out test data. We write a function to do the same preprocessing and prediction.

In [None]:
def test_model(test_data):
    # NOTE: this function assumes except case was triggered above
    # and variables used are thus available
    vector = np.zeros((embedding_dim, ))
    X_test = np.empty((len(test_data), embedding_dim))
    for i, doc in enumerate(tqdm(test_data)):
        # tokenize, lemmatize, stopwords, non-alphanumeric characters
        clean_doc = [token.lemma_ for token in nlp(doc) 
                     if not token.is_stop and token.is_alpha]
        vector *= 0
        num_words = 0
        for word in clean_doc:
            if word in embedding_model.vocab:
                vector += embedding_model[word]
                num_words += 1
        if num_words > 0:
            vector /= num_words

        X_test[i] = vector
    
    X_test = StandardScaler().fit_transform(X_test)
    y_pred = model.predict(X_test)
    
    return y_pred

In [None]:
test