In [31]:
import csv
from collections import defaultdict
from pathlib import Path

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC

from scripts.preprocess_data import preprocess_data
from utils.helpers import list_from_yaml
from utils.metrics import confusion_matrix, classifcation_metrics
from utils.serialization import load_json

# Constants

In [6]:
# Directory, relative to the notebook's working directory, that holds the data
# files read and written by the cells below.
DATA_PATH = Path('./data')

# Pre-process data

In [7]:
# Run the project's preprocessing script on the data directory. According to its
# log output it saves prepared_movie_data.csv, genre_mapping.json and
# plot_summaries.json.
preprocess_data(DATA_PATH)

Preprocessing movie data
Finding unique genres
Only keepnig genres that appear at least 200 times in 42202 movies
----------
Statistics
----------
42202 movies
Number of unique genres: 91
Word count per plot summary - min: 1, mean: 309.42, std: 317.49, median: 186.00, 90% : 738, 95% : 909, 99% : 1376, max: 4922
Genre count per movie - min: 0, mean: 3.35, std: 2.05, median: 3,  max: 15
----------
-- Saving --
Saving movie data as csv as : prepared_movie_data.csv
Saving unique genre to index mapping as: genre_mapping.json
Saving genre_mapping.json to: data
Saving movie plots as: plot_summaries.json
Saving plot_summaries.json to: data
----------
-- Saved --


# Load Pre-processed data

In [8]:
# Load the prepared movie table produced by the preprocessing step and preview it.
# NOTE(review): the preview shows an `Unnamed: 0` column, which looks like a row
# index that was written into the CSV - confirm, and consider `index_col=0`.
movie_data = pd.read_csv(DATA_PATH / 'prepared_movie_data.csv')
movie_data.head()

Unnamed: 0,movie_name,plot_summary,genres
0,Ghosts of Mars,Set in the second half of the 22nd century the...,"['Thriller', 'Adventure', 'Horror', 'Supernatu..."
1,White Of The Eye,A series of murders of rich young women throug...,"['Psychological thriller', 'Thriller']"
2,A Woman in Flames,Eva an upper class housewife becomes frustrate...,['Drama']
3,The Sorcerer's Apprentice,Every hundred years the evil Morgana returns t...,"['World cinema', 'Fantasy', 'Family Film', 'Ad..."
4,Little city,Adam a San Francisco-based artist who works as...,"['Romantic comedy', 'Comedy-drama', 'Comedy', ..."


# Continue Pre-processing

## Lower & Tokenize

In [9]:
# Lower-case every plot summary (overwrites the original column in place).
lowercased_summaries = movie_data['plot_summary'].map(lambda text: text.lower())
movie_data.loc[:, 'plot_summary'] = lowercased_summaries
# Split each lower-cased summary into a list of tokens with NLTK.
tokenized_summaries = movie_data['plot_summary'].map(word_tokenize)
movie_data.loc[:, 'tokenized_summary'] = tokenized_summaries

## Add 100 most frequent words to list of stopwords

In [10]:
# Count how often each token occurs across all tokenized summaries.
word_counts = defaultdict(int)
for tokens in movie_data['tokenized_summary']:
    for token in tokens:
        word_counts[token] += 1
# (token, count) pairs ordered from most to least frequent; ties keep first-seen order.
frequent_words = sorted(word_counts.items(), key=lambda pair: pair[1], reverse=True)

In [11]:
# The 100 most frequent corpus tokens as a (token, count) frame, shown for inspection.
top_words = pd.DataFrame(frequent_words).head(100)
print(top_words)
# Final stop-word set: the corpus-specific frequent tokens plus NLTK's English list.
# The NLTK corpus file id is lowercase ('english'); 'English' only resolves on
# case-insensitive file systems and raises on e.g. Linux.
most_frequent_words = set(top_words[0]).union(stopwords.words('english'))

        0       1
0     the  822296
1      to  479962
2     and  455497
3       a  375882
4      of  260660
..    ...     ...
95  tries   12035
96   help   12016
97      s   11738
98    n't   11717
99    now   11690

[100 rows x 2 columns]


## Remove stopwords from data

In [12]:
stemmer = SnowballStemmer('english')


def remove_stopwords(x, stopwords=most_frequent_words, stemmer=stemmer):
    """Drop stop-word tokens, stem the rest and join them into one string.

    Args:
        x: Iterable of tokens for a single plot summary.
        stopwords: Container of tokens to discard (membership is tested with ``in``).
        stemmer: Object exposing ``stem(token)``; applied to every kept token.

    Returns:
        str: The stemmed, kept tokens separated by single spaces.
    """
    kept_stems = []
    for token in x:
        # Filtering happens on the raw token, before stemming.
        if token in stopwords:
            continue
        kept_stems.append(stemmer.stem(token))
    return ' '.join(kept_stems)

In [13]:
# Build the model input text: stop words removed, remaining tokens stemmed and
# re-joined into a single space-separated string per movie.
movie_data.loc[:, 'cleaned_plot_summary'] = movie_data['tokenized_summary'].apply(remove_stopwords)

## Convert genres to multilabel one hot encoding

In [14]:
# Load the genre -> index mapping saved by the preprocessing step
# (genre_mapping.json inside DATA_PATH, per the loader's log message).
genre_mapping = load_json(DATA_PATH, 'genre_mapping')

Loaded file: genre_mapping.json successfully


In [20]:
def get_genre_index(x, mapping=genre_mapping):
    """Convert one serialized genre list into the matching genre indices.

    Args:
        x: Value of the ``genres`` column for one movie; it is parsed with
            ``list_from_yaml`` before the lookup.
        mapping: Lookup from genre name to index. Defaults to the
            ``genre_mapping`` loaded from ``genre_mapping.json``.

    Returns:
        list: ``mapping[genre]`` for every parsed genre, in parsed order.

    Raises:
        KeyError: If ``mapping`` is a dict and a parsed genre is not one of its keys.
    """
    # Look genres up in the `mapping` argument rather than the module-level
    # `genre_mapping`, so a caller-supplied mapping is actually honoured.
    return [mapping[genre] for genre in list_from_yaml(x)]
        

In [21]:
# Add a column holding, for each movie, the list of genre indices derived from
# its `genres` value.
movie_data.loc[:, 'genre_indices'] = movie_data['genres'].apply(get_genre_index)

## Generate Data

In [22]:
# Encoders used to build the model matrices, both with default settings:
# `mlb` is fitted on the per-movie genre index lists (targets) and `tfidf` on
# the cleaned plot summaries (features).
mlb = MultiLabelBinarizer()
tfidf = TfidfVectorizer()

In [23]:
# Targets: one indicator column per genre index. Features: TF-IDF weights of the
# cleaned summaries.
# NOTE(review): `tfidf` is fitted on every movie before the train/validation/test
# split, so held-out documents influence the vocabulary and IDF weights.
# Consider fitting on the training rows only and calling `transform` on the rest.
y = mlb.fit_transform(movie_data['genre_indices'].to_numpy())
X = tfidf.fit_transform(movie_data['cleaned_plot_summary'].to_numpy())

# Training

## Parameters

In [24]:
TRAIN_SPLIT = 0.8  # fraction of all movies used for training
VALIDATION_SPLIT = 0.5  # fraction of the non-training movies used for validation; the rest is the test set
RANDOM_STATE = 42 # seed passed to both train_test_split calls

## Train/Test split

In [25]:
# First carve off the training portion; everything else is held out.
train_X, holdout_X, train_y, holdout_y = train_test_split(
    X, y,
    train_size=TRAIN_SPLIT,
    shuffle=True,
    random_state=RANDOM_STATE,
)
# Then divide the held-out rows between validation and test.
val_X, test_X, val_y, test_y = train_test_split(
    holdout_X, holdout_y,
    train_size=VALIDATION_SPLIT,
    shuffle=True,
    random_state=RANDOM_STATE,
)
print(f'train data has {train_X.shape[0]} movies,'
      f' validation data has {val_X.shape[0]} movies,'
      f' test data has {test_X.shape[0]} movies,')

train data has 33761 movies, validation data has 4220 movies, test data has 4221 movies,


## Logistic Regression

In [26]:
# One binary logistic-regression classifier per genre column.
# The `sag` solver shuffles the samples internally, so the seed is pinned to keep
# the predictions (and the reported metrics) reproducible across kernel restarts.
ovr = OneVsRestClassifier(LogisticRegression(solver='sag', random_state=RANDOM_STATE))

# Fit on the training split and predict genre indicators for the test split.
pred_y = ovr.fit(train_X, train_y).predict(test_X)

In [30]:
# Confusion counts for the test-set predictions, then the project's summary
# metrics (precision, recall, accuracy and f1, per the displayed result).
# NOTE(review): the (predictions, targets) argument order is assumed to match
# utils.metrics.confusion_matrix - confirm against its definition.
TP, TN, FP, FN = confusion_matrix(pred_y, test_y)
test_metrics = classifcation_metrics(TP, TN, FP, FN)
test_metrics

{'precision': 0.7085714285714285,
 'recall': 0.18957609451007645,
 'accuracy': 0.966717953924777,
 'f1': 0.29912280701754385}

## SVM

In [32]:
# One linear SVM per genre column. LinearSVC's dual coordinate-descent solver
# shuffles the data, so the seed is pinned to make the fit reproducible.
# Note: this rebinds `ovr` and `pred_y` from the logistic-regression section.
ovr = OneVsRestClassifier(LinearSVC(random_state=RANDOM_STATE))

# Fit on the training split and predict genre indicators for the test split.
pred_y = ovr.fit(train_X, train_y).predict(test_X)

In [33]:
# Same evaluation as for logistic regression, now on the SVM predictions:
# confusion counts first, then the project's summary metrics.
# NOTE(review): the (predictions, targets) argument order is assumed to match
# utils.metrics.confusion_matrix - confirm against its definition.
TP, TN, FP, FN = confusion_matrix(pred_y, test_y)
test_metrics = classifcation_metrics(TP, TN, FP, FN)
test_metrics

{'precision': 0.6297309621523027,
 'recall': 0.2879082696316887,
 'accuracy': 0.9669808987506215,
 'f1': 0.39515475225332636}