# Here I wanted to see if I could train a model to recognize MCU movie transcripts from the transcripts of similar movies

### The trained models did an ok job, not great. I could have done a better job at feature engineering and removing stop words. But what it seemed to struggle the most with was certain MCU movies that didn't have any of the other main characters from the Avengers, like 'Dr. Strange' and 'Captain Marvel', which is understandable, but I had somewhat hoped that the language in the transcripts would give enough of a signal. It seems like its mostly picking out the names of the most popular characters

### I'm not too worried that the models didn't do great, I was just doing this for practice and for funzies.

### This notebook isn't very well documented and explained, I'm pretty sure no one will see this or care, its just for me to return to in order to copy and paste code

In [11]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('cleaned_scripts_and_titles.csv', index_col=0)

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['scripts'],df['MCU'],random_state=131)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the CountVectorizer to the training data
vect = CountVectorizer().fit(X_train)

In [5]:
vect.get_feature_names()[::2000]

['00',
 'bat',
 'clogged',
 'disperses',
 'forrest',
 'imitative',
 'mania',
 'pair',
 'releases',
 'slogan',
 'throne',
 'wider']

In [6]:
len(vect.get_feature_names())

22553

In [7]:
# transform the documents in the training data to a document-term matrix
X_train_vectorized = vect.transform(X_train)

X_train_vectorized

<48x22553 sparse matrix of type '<class 'numpy.int64'>'
	with 102639 stored elements in Compressed Sparse Row format>

In [8]:
from sklearn.linear_model import LogisticRegression

# Train the model
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [9]:
from sklearn.metrics import roc_auc_score

# Predict the transformed test documents
predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.7


In [12]:
# get the feature names as numpy array
feature_names = np.array(vect.get_feature_names())

# Sort the coefficients from the model
sorted_coef_index = model.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1] 
# so the list returned is in order of largest to smallest
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['man' 'spider' 'new' 'black' 'they' 'one' 'nick' 'daisy' 'of' 'your']

Largest Coefs: 
['it' 'he' 'on' 'in' 'steve' 'banner' 'up' 'as' 'scott' 'bruce']


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TfidfVectorizer to the training data specifiying a minimum document frequency of 5
vect = TfidfVectorizer(min_df=5).fit(X_train)
len(vect.get_feature_names())

5343

In [14]:
X_train_vectorized = vect.transform(X_train)

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.7


In [15]:
feature_names = np.array(vect.get_feature_names())

sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

print('Smallest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

Smallest tfidf:
['groggy' 'slumped' 'stomps' 'leadership' 'stroll' 'generating'
 'collision' 'adjusts' 'sideways' 'shudders']

Largest tfidf: 
['logan' 'peter' 'spider' 'the' 'scott' 'panther' 'tony' 'edward' 'blade'
 'thor']


In [16]:
sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['spider' 'logan' 'man' 'edward' 'charles' 'swan' 'gwen' 'jacob' 'black'
 'jean']

Largest Coefs: 
['tony' 'thor' 'steve' 'stark' 'scott' 'peter' 'the' 'jane' 'rogers'
 'challa']


In [20]:
# Fit the CountVectorizer to the training data specifiying a minimum 
# document frequency of 5 and extracting 1-grams and 2-grams
vect = CountVectorizer(min_df=5, ngram_range=(1,4)).fit(X_train)

X_train_vectorized = vect.transform(X_train)

len(vect.get_feature_names())

21641

In [21]:
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.7


In [22]:
feature_names = np.array(vect.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['man' 'spider' 'spider man' 'black' 'new' 'they' 'but' 'one' 'your' 'of']

Largest Coefs: 
['it' 'he' 'on' 'steve' 'scott' 'tony' 'ross' 'challa' 'in' 'rocket']


In [23]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


print('Recall: {:.3f}'.format(recall_score(y_test, predictions)))
print('Precision: {:.3f}'.format(precision_score(y_test, predictions)))
print('Accuracy: {:.3f}'.format(accuracy_score(y_test, predictions)))
print('F1: {:.3f}'.format(f1_score(y_test, predictions)))
confusion = confusion_matrix(y_test, predictions)
print(confusion)

Recall: 0.400
Precision: 1.000
Accuracy: 0.812
F1: 0.571
[[11  0]
 [ 3  2]]


In [24]:
y_test

52    0
40    0
45    0
19    1
10    0
17    1
6     1
16    0
38    0
13    1
39    0
9     0
62    0
35    0
36    0
14    1
Name: MCU, dtype: int64

In [25]:
predictions

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0])

In [27]:
from sklearn import naive_bayes

clfnb = naive_bayes.MultinomialNB()

In [28]:
clfnb.fit(X_train_vectorized, y_train)

predictions = clfnb.predict(vect.transform(X_test))

In [29]:
print('Recall: {:.3f}'.format(recall_score(y_test, predictions)))
print('Precision: {:.3f}'.format(precision_score(y_test, predictions)))
print('Accuracy: {:.3f}'.format(accuracy_score(y_test, predictions)))
print('F1: {:.3f}'.format(f1_score(y_test, predictions)))
confusion = confusion_matrix(y_test, predictions)
print(confusion)

Recall: 0.600
Precision: 1.000
Accuracy: 0.875
F1: 0.750
[[11  0]
 [ 2  3]]


In [38]:
from sklearn import svm

clfsv = svm.SVC(kernel='rbf', C=0.001)
clfnb.fit(X_train_vectorized, y_train)

predictions = clfnb.predict(vect.transform(X_test))

In [39]:
print('Recall: {:.3f}'.format(recall_score(y_test, predictions)))
print('Precision: {:.3f}'.format(precision_score(y_test, predictions)))
print('Accuracy: {:.3f}'.format(accuracy_score(y_test, predictions)))
print('F1: {:.3f}'.format(f1_score(y_test, predictions)))
confusion = confusion_matrix(y_test, predictions)
print(confusion)

Recall: 0.600
Precision: 1.000
Accuracy: 0.875
F1: 0.750
[[11  0]
 [ 2  3]]


In [82]:
from sklearn.model_selection import cross_val_score

vect = TfidfVectorizer(min_df=5,max_df=0.5,ngram_range=(1,2)).fit(df['scripts'])
len(vect.get_feature_names())

21433

In [83]:
X_vectorized = vect.transform(df['scripts'])

In [84]:

clfsv = svm.SVC(kernel='linear', C=1.0)

cv_scores = cross_val_score(clfsv, X_vectorized, df['MCU'], cv=10)

print(cv_scores)
print(np.mean(cv_scores))

[0.75       1.         0.71428571 0.83333333 0.83333333 0.83333333
 1.         0.83333333 0.83333333 0.83333333]
0.8464285714285713
