In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

# Fraction of the samples assigned to the combined training/validation set;
# the remaining samples form the held-out test set.
trainval_ratio = 0.8

# Seeded generator so the shuffle below is the same on every run.
rng = np.random.RandomState(0)
data = pd.read_csv('movie-plots-student.csv', index_col=0)
x = data['Plot'].values
y = data['Genre'].values

# Shuffle plots and genres with one shared permutation, then cut the shuffled
# arrays at the trainval_ratio mark. Hyperparameters are tuned with
# cross-validation on the first part; the second part is only used to estimate
# generalization performance.
shuffled_idxs = rng.permutation(len(x))
x = x[shuffled_idxs]
y = y[shuffled_idxs]
trainval_idx = int(len(x) * trainval_ratio)
x_trainval, x_test = np.split(x, [trainval_idx])
y_trainval, y_test = np.split(y, [trainval_idx])

# Count words, then transform to a tf-idf representation. Classify using a linear model
# trained with SGD (hinge loss, i.e. a linear SVM, unless the grid search overrides it).
# random_state is fixed because SGDClassifier reshuffles the training data on every epoch;
# without a seed the selected hyperparameters and the reported F1 change from run to run.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', max_iter=5, tol=None, random_state=0))])

# Setup hyperparameter grid search. Log loss gives logistic regression and hinge loss gives
# a linear SVM; for more than two genres SGDClassifier fits one binary classifier per class
# (one-vs-all) in both cases. Alpha is the regularization strength of the classifier, and
# ngram_range allows using unigrams only or unigrams plus bigrams.
# NOTE: the loss name 'log' was renamed to 'log_loss' in scikit-learn 1.1 and removed in 1.3;
# switch the value below to 'log_loss' when running on a newer scikit-learn.
parameters = {
    'clf__loss': ('log', 'hinge'),
    'clf__alpha': (1e-4, 1e-5, 1e-6),
    'vect__ngram_range': [(1, 1), (1, 2)]
}
# Exhaustive grid search over `parameters`, scored by macro-averaged F1 under
# 5-fold cross-validation on the training/validation set (n_jobs=-1: run in parallel).
gs_clf = GridSearchCV(text_clf, parameters, scoring='f1_macro', cv=5, n_jobs=-1)
gs_clf.fit(x_trainval, y_trainval)

# Report the selected value of every tuned hyperparameter, in alphabetical order.
best_params = gs_clf.best_params_
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, best_params[param_name]))

# Estimate generalization performance: macro F1 of the search's predictions on the test set.
pred_test = gs_clf.predict(x_test)
f1 = f1_score(y_test, pred_test, average='macro')
print(f'F1: {f1:.3f}')

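The grid search selects the hyperparameters below; the last line is the macro F1 score of the resulting model on the held-out test set:

```
clf__alpha: 0.0001
clf__loss: 'hinge'
vect__ngram_range: (1, 2)
F1: 0.716
```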

For the competition, we will use these hyperparameters and train using the entire dataset.

In [None]:
def test_model(test_data, train_path='movie-plots-student.csv', random_state=0):
    """Train the tuned genre classifier on the full labelled set and predict genres.

    The pipeline (word counts over unigrams and bigrams -> tf-idf -> linear SVM
    trained with SGD, alpha=1e-4) is refit from scratch on every call, using
    every row of the training CSV.

    Parameters
    ----------
    test_data : iterable of str
        Raw plot texts to classify (for example the ``Plot`` column of a DataFrame).
    train_path : str, optional
        CSV file with ``Plot`` and ``Genre`` columns and an index in its first
        column. Defaults to the student training file.
    random_state : int, optional
        Seed for the SGD classifier. SGD reshuffles the training data each epoch,
        so a fixed seed is what makes repeated calls return the same predictions.

    Returns
    -------
    numpy.ndarray
        Predicted genre label for each element of ``test_data``, in order.
    """
    train = pd.read_csv(train_path, index_col=0)
    x, y = train.Plot.values, train.Genre.values
    text_clf = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 2))),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(loss='hinge', alpha=1e-4, max_iter=5, tol=None,
                              random_state=random_state))])
    text_clf.fit(x, y)
    return text_clf.predict(test_data)

In [None]:
from sklearn.metrics import classification_report as cr, confusion_matrix as cm

# Load the test CSV, predict a genre for every plot with test_model, and show the
# confusion matrix of true versus predicted genres as the cell output.
data = pd.read_csv("movie-plots-test.csv", index_col=0)
test_y = data["Genre"]
preds = test_model(data["Plot"])
cm(test_y, preds)

In [None]:
# Per-genre precision, recall and F1 for the test predictions. The report is a
# pre-formatted multi-line string, so it is printed rather than left as the cell value.
report = cr(test_y, preds)
print(report)