# Summary so far
We have all the scripts downloaded and turned into a bag-of-words. We also have a list of movies with ratings. We need to match the ratings to the scripts

Import the ratings

In [1]:
import pandas as pd

ratings_df = pd.read_csv('filmsCountriesRatings.tsv', sep='\t')

In [2]:
ratings_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,startYear,genres,title,averageRating,numVotes
0,tt0005809,movie,The Governor,1915.0,Drama,The Governor,6.7,324.0
1,tt0008305,movie,Les Misérables,1917.0,Drama,Les Misérables,6.7,36.0
2,tt0009968,movie,Broken Blossoms,1919.0,"Drama,Romance",Broken Blossoms,7.2,10903.0
3,tt0018455,movie,Sunrise,1927.0,"Drama,Romance",Sunrise,8.1,53184.0
4,tt0020697,movie,The Blue Angel,1930.0,"Drama,Music",The Blue Angel,7.7,16068.0


In [3]:
ratings_df.shape

(1154, 8)

Rename a column so it doesn't get confused with one of the bag-of-words columns' titles

In [4]:
ratings_df2 = ratings_df.rename(columns={'title': 'movie_title'})

In [19]:
ratings_df2.head()

Unnamed: 0,tconst,titleType,primaryTitle,startYear,genres,movie_title,averageRating,numVotes
0,tt0005809,movie,The Governor,1915.0,Drama,The Governor,6.7,324.0
1,tt0008305,movie,Les Misérables,1917.0,Drama,Les Misérables,6.7,36.0
2,tt0009968,movie,Broken Blossoms,1919.0,"Drama,Romance",Broken Blossoms,7.2,10903.0
3,tt0018455,movie,Sunrise,1927.0,"Drama,Romance",Sunrise,8.1,53184.0
4,tt0020697,movie,The Blue Angel,1930.0,"Drama,Music",The Blue Angel,7.7,16068.0


Read in the bag of words from the scripts

In [5]:
scripts_df = pd.read_csv('scripts/bagOfWordsAndMovieTitle.csv')

In [8]:
scripts_df.head()

Unnamed: 0.1,Unnamed: 0,00,000,10,100,101,102,103,104,105,...,yourselves,youth,zero,zip,zips,zone,zoo,zoom,zooms,movie_name
0,0,0,0,0,0,0,0,0,0,0,...,0,14,0,0,0,0,0,0,0,The Maltese Falcon
1,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,Casablanca
2,2,1,1,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,The Big Sleep
3,3,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Strangers on a Train
4,4,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,1,2,From Here To Eternity


In [15]:
scripts_df.shape

(708, 6689)

We have a lot more ratings than scripts! That's fine; we'll match the scripts we have.

In [6]:
ratingsSorted = ratings_df2.sort_values('movie_title')


In [27]:
ratingsSorted.head()

Unnamed: 0,tconst,titleType,primaryTitle,startYear,genres,movie_title,averageRating,numVotes
1152,tt9850370,movie,#Anne Frank Parallel Stories,2019.0,"Documentary,Drama,History",#Anne Frank Parallel Stories,6.6,1734.0
854,tt3526286,movie,#Horror,2015.0,"Crime,Drama,Horror",#Horror,3.0,3903.0
467,tt11942126,movie,007: Shadows,2020.0,"Action,Thriller",007: Shadows,8.5,23.0
102,tt0072263,movie,Ten Little Indians,1974.0,"Crime,Mystery,Thriller",10 Little Indians,5.7,3445.0
685,tt1999067,movie,15: Inside the Mind of a Serial Killer,2011.0,"Horror,Thriller",15: An Exploration of Human Violence,4.9,64.0


In [7]:
scriptsSorted = scripts_df.sort_values('movie_name')

In [29]:
scriptsSorted.head()

Unnamed: 0.1,Unnamed: 0,00,000,10,100,101,102,103,104,105,...,yourselves,youth,zero,zip,zips,zone,zoo,zoom,zooms,movie_name
457,457,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,10 Things I Hate About You
155,155,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,12 And Holding
387,387,1,0,1,0,0,0,0,0,0,...,1,2,0,1,2,0,6,0,2,12 Monkeys
490,490,0,2,1,2,0,0,0,0,0,...,0,0,2,1,0,2,0,0,0,13 Days
94,94,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,2,13 Ghosts


The ratings movie titles and the scripts movie titles don't match up exactly, so we have to do fuzzy matching.

In [8]:
# https://www.datacamp.com/tutorial/fuzzy-string-python
from thefuzz import process
from thefuzz import fuzz

scripts_df["movie_name"] = scripts_df["movie_name"].apply(
  lambda x: process.extractOne(x, ratings_df2['movie_title'], scorer=fuzz.partial_ratio)[0]
)

Now we can merge the ratings and the scripts. We'll do a left merge with ratings as the left table, so we don't having any ratings without scripts

In [9]:
ratingsScripts = pd.merge(scripts_df,ratings_df2,how = 'left',left_on='movie_name', right_on='movie_title')

In [33]:
ratingsScripts.head()

Unnamed: 0.1,Unnamed: 0,00,000,10,100,101,102,103,104,105,...,zooms,movie_name,tconst,titleType,primaryTitle,startYear,genres,movie_title,averageRating,numVotes
0,0,0,0,0,0,0,0,0,0,0,...,0,The Image,tt0073589,movie,The Image,1975.0,"Adult,Drama",The Image,6.2,1885.0
1,1,0,0,0,1,0,1,0,0,0,...,0,Blackbear,tt4824256,movie,Blackbear,2019.0,"Drama,Sport",Blackbear,4.3,815.0
2,2,1,1,1,0,0,0,0,0,0,...,0,The Birds,tt0056869,movie,The Birds,1963.0,"Drama,Horror,Mystery",The Birds,7.6,201200.0
3,3,0,0,0,1,0,0,0,0,0,...,0,Stray,tt6294226,movie,Stray,2019.0,"Action,Crime,Fantasy",Stray,4.8,908.0
4,4,0,0,0,0,0,0,0,0,0,...,2,It,tt1396484,movie,It,2017.0,Horror,It,7.3,591697.0


In [10]:
ratingsScripts.shape

(722, 6697)

Set up the dataframe to do machine learning on, by dropping the unnecessary columns

In [11]:
ratingsScriptsML = ratingsScripts.drop(['movie_name','tconst','titleType','primaryTitle','startYear','genres','movie_title','numVotes'], axis=1)

In [40]:
ratingsScriptsML.head()

Unnamed: 0.1,Unnamed: 0,00,000,10,100,101,102,103,104,105,...,yourselves,youth,zero,zip,zips,zone,zoo,zoom,zooms,averageRating
0,0,0,0,0,0,0,0,0,0,0,...,0,14,0,0,0,0,0,0,0,6.2
1,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,4.3
2,2,1,1,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,7.6
3,3,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4.8
4,4,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,1,2,7.3


Separate out the features and the predicted variable 

In [12]:
X = ratingsScriptsML.drop('averageRating', axis= 1)
y = ratingsScriptsML['averageRating']

In [18]:
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(X,y,test_size = .2, random_state = 42)

We have too many features; let's reduce the number of features. We'll use SVD

In [19]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components = 27, random_state = 42)
svd.fit(X_tr)
X_tr_transformed = svd.transform(X_tr)
X_te_transformed = svd.transform(X_te)

In [20]:
X_tr_transformed.shape

(577, 27)

In [21]:
X_te_transformed.shape

(145, 27)

Save the original ratings and scripts dataframe.

In [43]:
ratingsScripts.to_csv('ratingsAndScriptsBagOfWords.csv')