In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import nltk
import math
import csv
import string

import seaborn as sn
import matplotlib.pyplot as plt

from collections import Counter

from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from nltk.stem import WordNetLemmatizer 
from sklearn.linear_model import LogisticRegression

pd.options.mode.chained_assignment = None  # default='warn'



In [13]:
###First we perform a naive pre-processing step to go from pandas to lists of data
###and relabel the genres as 0,1,2,3 to be used with sklearn
def preprocess_part1(filename='movie-plots-student.csv'):
    """Load the plot CSV and return the plot texts with integer genre labels.

    Rows containing any missing value are dropped. Genres are encoded as
    comedy -> 0, drama -> 1, horror -> 2; every other genre (expected to be
    action) -> 3. Carriage-return/line-feed pairs inside a plot are replaced
    by a single space.

    Args:
        filename: path of a CSV file with at least 'Genre' and 'Plot' columns.

    Returns:
        (data_X, train_y): data_X is a pandas Series of plot strings indexed
        0..n-1, train_y is a list of n integer labels in the same row order.
    """
    genre_to_label = {'comedy': 0, 'drama': 1, 'horror': 2}
    fallback_label = 3  # action, and anything else that is not listed above

    # Renumber the rows after dropping incomplete ones. Without this the index
    # keeps gaps, and any positional-looking lookup such as series[i] is
    # really a label lookup that raises KeyError on the first missing label.
    data = pd.read_csv(filename).dropna(axis=0).reset_index(drop=True)

    train_y = [genre_to_label.get(genre, fallback_label) for genre in data['Genre']]
    # Literal (non-regex) replacement, producing a new Series instead of
    # writing into a column of `data` row by row.
    data_X = data['Plot'].str.replace("\r\n", " ", regex=False)

    return data_X, train_y

#pre-processing step where we perform lemmatization and stemming; this occurs at the level of a given input
def preprocess_part2(text, lemma, stem):
    """Optionally lemmatize and/or stem every space-separated word of each document.

    Args:
        text: pandas Series of document strings.
        lemma: when truthy, run each word through WordNetLemmatizer.lemmatize.
        stem: when truthy, run each word through the English Snowball stemmer.
            Applied after lemmatization when both flags are set.

    Returns:
        The transformed Series, or `text` itself when both flags are falsy.
    """
    def _map_words(series, word_fn):
        # Split on single spaces (not arbitrary whitespace) and re-join, so
        # the spacing of each document is preserved exactly.
        return series.apply(
            lambda document: " ".join(word_fn(token) for token in document.split(" "))
        )

    if lemma:
        text = _map_words(text, WordNetLemmatizer().lemmatize)
    if stem:
        text = _map_words(text, nltk.stem.SnowballStemmer('english').stem)
    return text

def preprocess_part3(text, vocab, stopwords_bool,update=False):
    """Tokenize one document into lower-case alphabetic words.

    Args:
        text: the document string.
        vocab: object with an `update(iterable)` method (e.g. a Counter);
            only touched when `update` is truthy.
        stopwords_bool: when truthy, English stop-words are removed.
        update: when truthy, the returned words are fed to `vocab.update`.

    Returns:
        List of the surviving word strings, in document order.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)

    kept_words = []
    for token in word_tokenize(text.lower()):
        # Strip punctuation first, then keep only purely alphabetic leftovers
        # (this also discards tokens that became empty).
        cleaned = token.translate(punctuation_table)
        if cleaned.isalpha():
            kept_words.append(cleaned)

    if stopwords_bool:
        english_stopwords = set(stopwords.words('english'))
        kept_words = [word for word in kept_words if word not in english_stopwords]

    if update:
        vocab.update(kept_words)
    return kept_words

class TextClassifier():
    """Bag-of-words text classifier built around a scikit-learn estimator.

    On construction the CSV named by `filename` is loaded and cleaned with
    preprocess_part1 / preprocess_part2 / preprocess_part3, a vocabulary is
    collected, and the documents are split 90/10 into a training set and a
    hold-out set. `fit` may be called once per candidate CountVectorizer
    configuration; the configuration with the highest mean 5-fold
    cross-validation score is remembered and later used by `test_set_score`
    and `test_model`.
    """

    # constructor
    def __init__(self, model, filename, lemma=False, stem=False, stop_words=False,
                 min_occurence=0, random_state=None):
        """Load, preprocess and split the data.

        Args:
            model: estimator offering fit/predict, usable with cross_val_score.
            filename: CSV file passed to preprocess_part1.
            lemma: forwarded to preprocess_part2 (lemmatize words).
            stem: forwarded to preprocess_part2 (stem words).
            stop_words: forwarded to preprocess_part3 (remove stop-words).
            min_occurence: a token is kept in the vocabulary when its total
                count is >= this value.
            random_state: forwarded to train_test_split so the hold-out split
                can be reproduced; None keeps the previous unseeded behaviour.
        """
        self.model = model
        self.lemma = lemma
        self.stem = stem
        self.filename = filename
        self.min_occurence = min_occurence
        self.stop_words = stop_words
        self.random_state = random_state
        data_X, data_Y = preprocess_part1(self.filename)

        data_X = preprocess_part2(data_X, self.lemma, self.stem)
        # Tokenize every document and count each surviving token as we go.
        # NOTE(review): counts are taken over all rows, including the ones
        # that end up in the hold-out split below.
        token_counts = Counter()
        data_X = data_X.apply(
            lambda row: " ".join(
                preprocess_part3(row, token_counts, self.stop_words, update=True)))
        # only include words in vocabulary that appear >= min_occurence times
        self.vocab = [k for k, c in token_counts.items() if c >= min_occurence]
        X = data_X.to_numpy()
        Y = np.array(data_Y)

        # train-test split data (10 percent held out)
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(
            X, Y, test_size=0.1, random_state=random_state)
        # initialize class variables before fitting
        self.best_classifier = None
        self.best_score = float('-inf')
        self.best_params = None

    def _require_fitted(self):
        """Raise RuntimeError unless `fit` has stored a classifier and its params."""
        if self.best_classifier is None or self.best_params is None:
            raise RuntimeError(
                "No fitted classifier available; call fit(params) before scoring.")

    def vectorize(self, X, params):
        """Turn an iterable of document strings into a count matrix.

        `params` is a mapping of extra CountVectorizer keyword arguments.
        """
        # vectorize data according to vocabulary; because the vocabulary is
        # passed explicitly, the columns are defined by self.vocab, not by X
        vectorizer = CountVectorizer(vocabulary=self.vocab, **params)
        return vectorizer.fit_transform(X)

    def fit(self, params=None):
        """Cross-validate the model for one vectorizer configuration.

        Args:
            params: mapping of CountVectorizer keyword arguments
                (e.g. {"ngram_range": (1, 1)}); None means no extra arguments.

        If the mean 5-fold score beats the best seen so far, the model is
        refit on the whole training split and stored with its score and params.
        """
        # Copy so later changes to the caller's dict cannot alter best_params.
        params = dict(params) if params else {}
        vectorized_X = self.vectorize(self.X_train, params)
        score = cross_val_score(self.model, vectorized_X, self.Y_train, cv=5).mean()
        # update best score seen from 5-folds of cross validation
        if score > self.best_score:
            self.best_classifier = self.model.fit(vectorized_X, self.Y_train)
            self.best_score = score
            self.best_params = params

    # function to get accuracy of model
    def test_set_score(self):
        """Print the best classifier's accuracy on the 10 percent hold-out split.

        Raises:
            RuntimeError: if `fit` has not been called yet.
        """
        self._require_fitted()
        vectorized_X_test = self.vectorize(self.X_test, self.best_params)
        predictions = self.best_classifier.predict(vectorized_X_test)
        # print accuracy score of predictions
        print('Accuracy on 10 percent of hold-out data:', accuracy_score(self.Y_test, predictions))

    def test_model(self, test_data_name):
        """Print the best classifier's accuracy on another labelled CSV file.

        Args:
            test_data_name: the NAME of the csv file (same columns as the
                training file).

        Raises:
            RuntimeError: if `fit` has not been called yet.
        """
        # Fail before the expensive preprocessing if there is nothing to score with.
        self._require_fitted()
        # re-perform all preprocessing steps, but without updating the vocabulary!
        testdata_X, testdata_Y = preprocess_part1(test_data_name)
        testdata_X = preprocess_part2(testdata_X, self.lemma, self.stem)
        testdata_X = testdata_X.apply(
            lambda row: " ".join(
                preprocess_part3(row, self.vocab, self.stop_words, update=False)))

        testdataX_np = testdata_X.to_numpy()
        testdataY_np = np.array(testdata_Y)

        # vectorize the data
        testdataX_vec = self.vectorize(testdataX_np, self.best_params)
        # get predictions from the best classifier
        predictions_test = self.best_classifier.predict(testdataX_vec)

        print('Test set accuracy:', accuracy_score(testdataY_np, predictions_test))

In the interest of time (and the limited computing power of my laptop), I restricted model training to at most 500 iterations. I found that when stop-words are not removed, more iterations are needed to reach convergence, so stop-word removal is always enabled in the experiments below. Finally, there is another parameter, `min_occurence`, which drops infrequent words from the vocabulary; I did not get around to testing different values for it, so it is set to 0 in all of the experiments.

I tried various combinations of lemmatizing, stemming, and regularization strength with the Logistic Regression classifier.

In [10]:
###Logistic Regression Models

In [21]:
# Experiment: C=5, no lemmatization or stemming, stop-words removed.
lg = TextClassifier(
    LogisticRegression(C=5, max_iter=500),
    filename='movie-plots-student.csv',
    lemma=False,
    stem=False,
    stop_words=True,
    min_occurence=0,
)
lg.fit({"ngram_range": (1, 1)})
lg.test_set_score()
# This re-scores the same file the classifier was built from, so expect an optimistic figure.
lg.test_model('movie-plots-student.csv')


Accuracy on 10 percent of hold-out data: 0.7098880597014925
Test set accuracy: 0.9706980216498694


In [22]:
# Experiment: C=5, lemmatization and stemming on, stop-words removed.
lg = TextClassifier(
    LogisticRegression(C=5, max_iter=500),
    filename='movie-plots-student.csv',
    lemma=True,
    stem=True,
    stop_words=True,
    min_occurence=0,
)
lg.fit({"ngram_range": (1, 1)})
lg.test_set_score()
# This re-scores the same file the classifier was built from, so expect an optimistic figure.
lg.test_model('movie-plots-student.csv')

Accuracy on 10 percent of hold-out data: 0.710820895522388
Test set accuracy: 0.9704180664427025


In [23]:
# Experiment: C=0.01, no lemmatization or stemming, stop-words removed.
lg = TextClassifier(
    LogisticRegression(C=0.01, max_iter=500),
    filename='movie-plots-student.csv',
    lemma=False,
    stem=False,
    stop_words=True,
    min_occurence=0,
)
lg.fit({"ngram_range": (1, 1)})
lg.test_set_score()
# This re-scores the same file the classifier was built from, so expect an optimistic figure.
lg.test_model('movie-plots-student.csv')

Accuracy on 10 percent of hold-out data: 0.7042910447761194
Test set accuracy: 0.883631952220978


In [24]:
# Experiment: C=0.01, lemmatization and stemming on, stop-words removed.
lg = TextClassifier(
    LogisticRegression(C=0.01, max_iter=500),
    filename='movie-plots-student.csv',
    lemma=True,
    stem=True,
    stop_words=True,
    min_occurence=0,
)
lg.fit({"ngram_range": (1, 1)})
lg.test_set_score()
# This re-scores the same file the classifier was built from, so expect an optimistic figure.
lg.test_model('movie-plots-student.csv')

Accuracy on 10 percent of hold-out data: 0.7145522388059702
Test set accuracy: 0.8774729376633073


In [25]:
# Experiment: C=0.1, lemmatization and stemming on, stop-words removed.
lg = TextClassifier(
    LogisticRegression(C=0.1, max_iter=500),
    filename='movie-plots-student.csv',
    lemma=True,
    stem=True,
    stop_words=True,
    min_occurence=0,
)
lg.fit({"ngram_range": (1, 1)})
lg.test_set_score()
# This re-scores the same file the classifier was built from, so expect an optimistic figure.
lg.test_model('movie-plots-student.csv')

Accuracy on 10 percent of hold-out data: 0.7052238805970149
Test set accuracy: 0.954273982829414


In [26]:
# Experiment: C=0.1, no lemmatization or stemming, stop-words removed.
lg = TextClassifier(
    LogisticRegression(C=0.1, max_iter=500),
    filename='movie-plots-student.csv',
    lemma=False,
    stem=False,
    stop_words=True,
    min_occurence=0,
)
lg.fit({"ngram_range": (1, 1)})
lg.test_set_score()
# This re-scores the same file the classifier was built from, so expect an optimistic figure.
lg.test_model('movie-plots-student.csv')

Accuracy on 10 percent of hold-out data: 0.7257462686567164
Test set accuracy: 0.9606196341918626


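It appears that the best model is when lemmatizing is True, stemming is True, and the regularization parameter `C` is set to 5 (note that its scores are very close to the case where lemmatizing and stemming are both False). Keep in mind that the "Test set accuracy" figures above are computed on the same csv file the model was trained on, so they are optimistic; the hold-out accuracy is the more reliable estimate, and it was highest (0.7257) for `C=0.1` without lemmatizing or stemming. For the final test model, please execute the following cell, with 'insert name here' replaced by the name of the new csv file. Note that this final cell also sets `min_occurence=2`, unlike the experiments above.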

In [None]:
# Final model: C=5, lemmatization and stemming on, stop-words removed,
# vocabulary restricted with min_occurence=2.
lg = TextClassifier(
    LogisticRegression(C=5, max_iter=500),
    filename='movie-plots-student.csv',
    lemma=True,
    stem=True,
    stop_words=True,
    min_occurence=2,
)
lg.fit({"ngram_range": (1, 1)})
lg.test_set_score()
# Replace the placeholder below with the name of the unseen test csv file.
lg.test_model('insert name here')