In [92]:
import pickle
from pprint import pprint
from time import time

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the labelled movie plots; the first CSV column becomes the index.
train_data = pd.read_csv('movie-plots-student.csv', index_col=0)

# Features are the raw plot texts.
train_X = train_data['Plot']

# Targets are the genre labels mapped to integer class ids. The fitted
# encoder is kept in `lab_enc` so the same mapping can be reused later.
lab_enc = LabelEncoder()
train_y = lab_enc.fit_transform(train_data['Genre'])

In [89]:
# Text-classification pipeline: raw plot text -> token counts -> TF-IDF
# weights -> random forest.
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # Pin the forest's own random_state: seeding NumPy's global RNG in the
    # notebook process is not expected to carry over to the parallel worker
    # processes used by the grid search, so an unseeded forest would give
    # different scores on every run.
    ('clf', RandomForestClassifier(random_state=2020)),
])

# Hyperparameter grid; keys follow the '<step name>__<parameter>' convention
# so each value is routed to the matching pipeline step.
parameters = {
    'vect__max_features': (None, 1000, 5000),
    'vect__stop_words': (None, 'english'),
    # unigrams only; unigrams + bigrams; unigrams + bigrams + trigrams
    'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'clf__max_depth': (30, 50, 70, None),
    'clf__n_estimators': (100, 300),
}

In [90]:
# Seed NumPy's global random number generator before configuring the search.
np.random.seed(2020)

# Model selection metric: F1 averaged over classes with equal weight.
macro_f1 = make_scorer(f1_score, average='macro')

# Exhaustive search over `parameters` with 5-fold cross-validation,
# n_jobs=-1 for parallel fits and verbose progress logging.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring=macro_f1,
    cv=5,
    n_jobs=-1,
    verbose=10,
)


In [91]:
# Run the grid search, report the wall-clock time and the winning
# configuration, then persist the fitted label encoder and search object.
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(train_X, train_y)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
# Only report the parameters that were actually part of the search grid.
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

# Write through a context manager so the file is flushed and closed even if
# pickling fails part-way; a bare open() would leave the handle dangling.
with open('result_models.pkl', 'wb') as model_file:
    pickle.dump((lab_enc, grid_search), model_file)

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__max_depth': (10, 50),
 'vect__max_features': (1000, 5000),
 'vect__ngram_range': ((1, 1), (1, 3)),
 'vect__stop_words': (None, 'english')}
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   16.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   23.6s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done  74 out of  80 | elapsed:  5.2min remaining:   25.4s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  5.6min finished


done in 375.130s

Best score: 0.525
Best parameters set:
	clf__max_depth: 50
	vect__max_features: 5000
	vect__ngram_range: (1, 3)
	vect__stop_words: 'english'


I use 5-fold cross-validation for the train/validation split. To tokenize the plots, I lowercase the text, experiment with n-grams (up to 1, 2, or 3 words), try different maximum numbers of tokens, and optionally exclude stop words. I then use TF-IDF to featurize the tokens. I experiment with different classification models, such as Naive Bayes and random forest. The classification results are evaluated on the validation folds with macro F1 scores. I tune the hyperparameters by grid search with cross-validation over the tokenization settings mentioned above and the random forest hyperparameters, namely the number of trees and the maximum depth.

In [100]:
# Restore the fitted label encoder and grid search saved to disk, then show
# the cross-validated score of the best configuration.
# NOTE: unpickling can execute arbitrary code; only load a file that this
# notebook (or another trusted source) produced.
with open('result_models.pkl', 'rb') as model_file:
    lab_enc, grid_search = pickle.load(model_file)
print('Validation f1: %.4f' % grid_search.best_score_)

Validation f1: 0.5254


In [None]:
# Evaluate the selected model on the held-out test set.
# Change the file name to the test file
test_data = pd.read_csv('movie-plots-test.csv', index_col=0)
test_X, test_y = test_data['Plot'], test_data['Genre']
# Reuse the encoder fitted on the training labels so class ids line up.
test_y = lab_enc.transform(test_y)
# Predict with the refit best pipeline held by the grid search; the original
# cell referenced `preds` without ever computing it.
preds = grid_search.predict(test_X)
print(confusion_matrix(test_y, preds))
print(classification_report(test_y, preds))