### *Question 6*

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import nltk

In [2]:
data=pd.read_csv("movie-plots-student.csv")
classes=["drama","comedy","horror","action"]
labels=data["Genre"].apply(lambda x: classes.index(x))

In [3]:
# Tokenize textual data:
tokens=[[token for token in nltk.tokenize.word_tokenize(text) if token.isalpha()] for text in data["Plot"]]

In [4]:
# Remove stopwords from nltk:
from nltk.corpus import stopwords
tokens=[[token for token in doc if token not in stopwords.words("english")] for doc in tokens]

In [5]:
# Apply stemming:
from nltk.stem import PorterStemmer
porter=PorterStemmer()
tokens=[[porter.stem(token) for token in doc] for doc in tokens]

In [7]:
# Train/validation 80/20 split:
train=np.random.choice(range(len(labels)),size=int(0.8*len(labels)),replace=False)
val=[i for i in range(len(labels)) if i not in train]
train_X=[tokens[i] for i in train]
train_y=[labels[i] for i in train]
val_X=[tokens[i] for i in val]
val_y=[labels[i] for i in val]

In [8]:
# Three different vectorizations:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizers={'binary':TfidfVectorizer(analyzer='word',binary=True),'bow':CountVectorizer(analyzer='word',binary=False),'tfidf':TfidfVectorizer(analyzer='word',binary=False)}
vec_train_X,vec_val_X={},{}
for name,vectorizer in vectorizers.items():
    vec_train_X[name]=vectorizer.fit_transform([" ".join(doc) for doc in train_X])
    vec_val_X[name]=vectorizer.transform([" ".join(doc) for doc in val_X])

In [9]:
# Naive Bayes models:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
models={'binary':BernoulliNB(),'bow':MultinomialNB(),'tfidf':MultinomialNB()}
predictions_nb={}
for name,model in models.items():
    model.fit(vec_train_X[name],train_y)
    predictions_nb[name]=model.predict(vec_val_X[name])

In [10]:
# Logistic Regression models:
from sklearn.linear_model import LogisticRegression
models={name:LogisticRegression(max_iter=10000) for name in models.keys()}
predictions_lr={}
for name,model in models.items():
    model.fit(vec_train_X[name],train_y)
    predictions_lr[name]=model.predict(vec_val_X[name])

In [11]:
# SVM models:
from sklearn.svm import SVC
models={name:SVC(kernel='linear') for name in models.keys()}
predictions_svm={}
for name,model in models.items():
    model.fit(vec_train_X[name],train_y)
    predictions_svm[name]=model.predict(vec_val_X[name])

In [12]:
# Generate confusion matrices:
from sklearn.metrics import confusion_matrix
cms_nb={name:pd.DataFrame(confusion_matrix(val_y,predictions_nb[name]).T,index=[f"predicted {classes[0]}",f"predicted {classes[1]}",f"predicted {classes[2]}",f"predicted {classes[3]}"],columns=[f"true {classes[0]}",f"true {classes[1]}",f"true {classes[2]}",f"true {classes[3]}"]) for name in predictions_nb.keys()}
cms_lr={name:pd.DataFrame(confusion_matrix(val_y,predictions_lr[name]).T,index=[f"predicted {classes[0]}",f"predicted {classes[1]}",f"predicted {classes[2]}",f"predicted {classes[3]}"],columns=[f"true {classes[0]}",f"true {classes[1]}",f"true {classes[2]}",f"true {classes[3]}"]) for name in predictions_lr.keys()}
cms_svm={name:pd.DataFrame(confusion_matrix(val_y,predictions_svm[name]).T,index=[f"predicted {classes[0]}",f"predicted {classes[1]}",f"predicted {classes[2]}",f"predicted {classes[3]}"],columns=[f"true {classes[0]}",f"true {classes[1]}",f"true {classes[2]}",f"true {classes[3]}"]) for name in predictions_svm.keys()}

In [13]:
from sklearn.metrics import *

In [14]:
# Binary vect. + Bernoulli NB:
print('accuracy: ',accuracy_score(val_y,predictions_nb['binary']))
print('balanced accuracy: ',balanced_accuracy_score(val_y,predictions_nb['binary']))
print('macro f1-score: ',f1_score(val_y,predictions_nb['binary'],average='macro'))
cms_nb['binary']

accuracy:  0.6408582089552238
balanced accuracy:  0.5562459866946986
macro f1-score:  0.5747159318288565


Unnamed: 0,true drama,true comedy,true horror,true action
predicted drama,837,354,69,109
predicted comedy,107,358,5,13
predicted horror,50,26,122,17
predicted action,13,7,0,57


In [15]:
# Bag-of-words vect. + Multinomial NB:
print('accuracy: ',accuracy_score(val_y,predictions_nb['bow']))
print('balanced accuracy: ',balanced_accuracy_score(val_y,predictions_nb['bow']))
print('macro f1-score: ',f1_score(val_y,predictions_nb['bow'],average='macro'))
cms_nb['bow']

accuracy:  0.6898320895522388
balanced accuracy:  0.6776360079498299
macro f1-score:  0.6810477462086035


Unnamed: 0,true drama,true comedy,true horror,true action
predicted drama,733,217,28,49
predicted comedy,211,485,15,26
predicted horror,15,20,151,11
predicted action,48,23,2,110


In [16]:
# TF-IDF vect. + Multinomial NB:
print('accuracy: ',accuracy_score(val_y,predictions_nb['tfidf']))
print('balanced accuracy: ',balanced_accuracy_score(val_y,predictions_nb['tfidf']))
print('macro f1-score: ',f1_score(val_y,predictions_nb['tfidf'],average='macro'))
cms_nb['tfidf']

accuracy:  0.5237873134328358
balanced accuracy:  0.28953733263131237
macro f1-score:  0.2356203928153876


Unnamed: 0,true drama,true comedy,true horror,true action
predicted drama,1000,622,190,195
predicted comedy,7,123,6,1
predicted horror,0,0,0,0
predicted action,0,0,0,0


In [17]:
# Binary vect. + Logistic Regression:
print('accuracy: ',accuracy_score(val_y,predictions_lr['binary']))
print('balanced accuracy: ',balanced_accuracy_score(val_y,predictions_lr['binary']))
print('macro f1-score: ',f1_score(val_y,predictions_lr['binary'],average='macro'))
cms_lr['binary']

accuracy:  0.7318097014925373
balanced accuracy:  0.6470196978906863
macro f1-score:  0.6943539693500478


Unnamed: 0,true drama,true comedy,true horror,true action
predicted drama,859,232,53,83
predicted comedy,133,502,17,24
predicted horror,6,7,125,6
predicted action,9,4,1,83


In [18]:
# Bag-of-words vect. + Logistic Regression:
print('accuracy: ',accuracy_score(val_y,predictions_lr['bow']))
print('balanced accuracy: ',balanced_accuracy_score(val_y,predictions_lr['bow']))
print('macro f1-score: ',f1_score(val_y,predictions_lr['bow'],average='macro'))
cms_lr['bow']

accuracy:  0.6791044776119403
balanced accuracy:  0.6304868233691376
macro f1-score:  0.6499647462813667


Unnamed: 0,true drama,true comedy,true horror,true action
predicted drama,754,227,48,66
predicted comedy,199,481,20,21
predicted horror,16,21,122,10
predicted action,38,16,6,99


In [19]:
# TF-IDF vect. + Logistic Regression:
print('accuracy: ',accuracy_score(val_y,predictions_lr['tfidf']))
print('balanced accuracy: ',balanced_accuracy_score(val_y,predictions_lr['tfidf']))
print('macro f1-score: ',f1_score(val_y,predictions_lr['tfidf'],average='macro'))
cms_lr['tfidf']

accuracy:  0.6907649253731343
balanced accuracy:  0.5600658644521903
macro f1-score:  0.6031296107524566


Unnamed: 0,true drama,true comedy,true horror,true action
predicted drama,860,258,71,122
predicted comedy,135,474,21,25
predicted horror,6,9,104,6
predicted action,6,4,0,43


In [20]:
print('accuracy: ',accuracy_score(val_y,predictions_svm['binary']))
print('balanced accuracy: ',balanced_accuracy_score(val_y,predictions_svm['binary']))
print('macro f1-score: ',f1_score(val_y,predictions_svm['binary'],average='macro'))
cms_svm['binary']

accuracy:  0.746268656716418
balanced accuracy:  0.7074547082217384
macro f1-score:  0.7339233094641257


Unnamed: 0,true drama,true comedy,true horror,true action
predicted drama,835,220,35,62
predicted comedy,140,506,7,21
predicted horror,11,12,152,6
predicted action,21,7,2,107


In [21]:
print('accuracy: ',accuracy_score(val_y,predictions_svm['bow']))
print('balanced accuracy: ',balanced_accuracy_score(val_y,predictions_svm['bow']))
print('macro f1-score: ',f1_score(val_y,predictions_svm['bow'],average='macro'))
cms_svm['bow']

accuracy:  0.6403917910447762
balanced accuracy:  0.6023874992570157
macro f1-score:  0.6127412209333266


Unnamed: 0,true drama,true comedy,true horror,true action
predicted drama,703,247,51,66
predicted comedy,223,454,17,25
predicted horror,24,20,122,11
predicted action,57,24,6,94


In [22]:
print('accuracy: ',accuracy_score(val_y,predictions_svm['tfidf']))
print('balanced accuracy: ',balanced_accuracy_score(val_y,predictions_svm['tfidf']))
print('macro f1-score: ',f1_score(val_y,predictions_svm['tfidf'],average='macro'))
cms_svm['tfidf']

accuracy:  0.7066231343283582
balanced accuracy:  0.6340760359581273
macro f1-score:  0.6653764269851274


Unnamed: 0,true drama,true comedy,true horror,true action
predicted drama,832,248,46,93
predicted comedy,139,472,15,19
predicted horror,12,19,134,7
predicted action,24,6,1,77


In [None]:
# Select hyperparameters for Bag-of-words + Logistic Regression:
fig,axes=plt.subplots(nrows=1,ncols=3,figsize=(18,5))
penalties=["l2","l1","none"]
solvers=['lbfgs','saga','lbfgs']
Cs=[1e-3,1e-2,1e-1,0.5,1,5,1e1,1e2,1e3]
for i,penalty in enumerate(penalties):
    acc,b_acc,f1=[],[],[]
    for c in Cs:
        print(penalty,c)
        model=LogisticRegression(penalty=penalty,solver=solvers[i],C=c,max_iter=10000)
        model.fit(vec_train_X['bow'],train_y)
        prediction=model.predict(vec_val_X['bow'])
        acc.append(accuracy_score(val_y,prediction))
        b_acc.append(balanced_accuracy_score(val_y,prediction))
        f1.append(f1_score(val_y,prediction,average='macro'))
    axes[i].plot(acc,color='purple',label='accuracy')
    axes[i].plot(b_acc,color='orange',label='balanced acc.')
    axes[i].plot(f1,color='salmon',label='macro-f1')
    axes[i].set_title(f"penalty: {penalty}")
    axes[i].set_xlabel("C")
    axes[i].set_xticks(ticks=range(len(Cs)))
    axes[i].set_xticklabels(labels=Cs)
    axes[i].legend()
    axes[i].grid()
plt.show()

l2 0.001
l2 0.01
l2 0.1
l2 0.5
l2 1
l2 5
l2 10.0
l2 100.0
l2 1000.0
l1 0.001
l1 0.01
l1 0.1
l1 0.5
