In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer

In [57]:
# Show every column when a frame is displayed, then load the cleaned dataset.
pd.options.display.max_columns = None
df = pd.read_csv(filepath_or_buffer='fff/data/cleaned_data.csv')

In [58]:
# Preview the first five rows of the loaded data.
df.head(5)

Unnamed: 0,movie,year,timeMin,imdb,metascore,votes,grossMillions,genre,description,director,actors
0,Dark,2017,60,8.8,unknown,215184,unknown,Crime Drama Mystery,"A family saga with a supernatural twist, set i...",unknown,"Louis Hofmann, Karoline Eichhorn, Lisa Vicari,..."
1,The Sinner,2017,45,8.0,unknown,77161,unknown,Crime Drama Mystery,Anthology series that examines how and why ord...,unknown,"Bill Pullman, Dohn Norwood, Adam LeFevre, Jess..."
2,Game of Thrones,2011,57,9.3,unknown,1688494,unknown,Action Adventure Drama,Nine noble families fight for control over the...,unknown,"Emilia Clarke, Peter Dinklage, Kit Harington, ..."
3,Yellowstone,2018,60,8.4,unknown,17372,unknown,Drama Western,A ranching family in Montana faces off against...,unknown,"Kevin Costner, Luke Grimes, Kelly Reilly, Wes ..."
4,Knives Out,2019,130,7.9,82.0,341615,165.36,Comedy Crime Drama,A detective investigates the death of a patria...,Rian Johnson,"Daniel Craig, Chris Evans, Ana de Armas, Jamie..."


In [59]:
# Normalise the columns that are later concatenated into one text feature:
# cast to str, remove irrelevant characters and collapse whitespace.

# Titles: delete : . - / , then lower-case and trim.
df['movie'] = (
    df['movie']
    .astype(str)
    .str.translate(str.maketrans('', '', ':.-/,'))
    .str.lower()
    .str.strip()
)

# Actors: drop the spaces inside each name so "Daniel Craig" becomes the single
# token "DanielCraig", then separate the names with spaces instead of commas.
df['actors'] = (
    df['actors']
    .astype(str)
    .str.replace(' ', '', regex=False)
    .str.strip()
    .str.replace(',', ' ', regex=False)
)

# Director: remove all whitespace so each name is one token. Missing values are
# filled with the 'unknown' placeholder this column already uses, so a NaN
# cannot raise AttributeError on a float.
df['director'] = (
    df['director']
    .fillna('unknown')
    .astype(str)
    .str.split()
    .str.join('')
)

df['imdb'] = df['imdb'].astype(str)
df['year'] = df['year'].astype(str)

# Constant label column, prefixed to the rating to give e.g. "imdb score 8.8".
df['score'] = 'imdb score'
df['imdb_score'] = df['score'].str.cat(df['imdb'], sep=" ")

# Genre: trim and collapse every run of whitespace (not only runs of exactly
# five spaces) to a single space. NaN is filled with 'unknown' as for director.
df['genre'] = (
    df['genre']
    .fillna('unknown')
    .astype(str)
    .str.split()
    .str.join(' ')
)

# Punctuation and digits that are replaced by a space in descriptions.
spec_chars = ["!", '"', "#", "%", "&", "'", "(", ")",
              "*", "+", ",", "-", ".", "/", ":", ";", "<",
              "=", ">", "?", "@", "[", "\\", "]", "^", "_",
              "`", "{", "|", "}", "~", "–", '0', "1", "2", "3", "4", "5", '6', "7", "8", "9"]

# One literal translation pass replaces the per-character .str.replace loop:
# characters such as ( * + ? [ \ are never interpreted as regex syntax,
# whatever the pandas version's default for `regex` is.
space_out = str.maketrans({char: ' ' for char in spec_chars})

# Fill and cast before using the .str accessor so a missing description does
# not end up as the literal token 'nan'; then lower-case and collapse whitespace.
df['description'] = (
    df['description']
    .fillna('unknown')
    .astype(str)
    .str.translate(space_out)
    .str.lower()
    .str.split()
    .str.join(' ')
)

In [60]:
def combine_features(row):
    """Build the single text feature for one row.

    Joins the row's description, actors, director and genre values, in that
    order, with one space between consecutive values.
    """
    parts = (row['description'], row['actors'], row['director'], row['genre'])
    return ' '.join(parts)

# Add a column holding the four text features (description, actors, director,
# genre) combined into one string per row.
combined_text = df.apply(combine_features, axis=1)
df['combined_features'] = combined_text

In [61]:
# Remove the 'unknown' placeholder text from the combined string, then mark
# rows that are left empty or whitespace-only as NaN.
# Note: this is a plain substring replace, so the word "unknown" inside a
# description is removed as well.
without_placeholder = df['combined_features'].map(lambda text: text.replace('unknown', ''))
df['combined_features'] = without_placeholder
blank_as_nan = df['combined_features'].replace(r'^\s*$', np.nan, regex=True)
df['combined_features'] = blank_as_nan

In [62]:
# Preview the first five rows after cleaning and feature combination.
df.head(5)

Unnamed: 0,movie,year,timeMin,imdb,metascore,votes,grossMillions,genre,description,director,actors,score,imdb_score,combined_features
0,dark,2017,60,8.8,unknown,215184,unknown,Crime Drama Mystery,a family saga with a supernatural twist set in...,unknown,LouisHofmann KarolineEichhorn LisaVicari MajaS...,imdb score,imdb score 8.8,a family saga with a supernatural twist set in...
1,the sinner,2017,45,8.0,unknown,77161,unknown,Crime Drama Mystery,anthology series that examines how and why ord...,unknown,BillPullman DohnNorwood AdamLeFevre JessicaBiel,imdb score,imdb score 8.0,anthology series that examines how and why ord...
2,game of thrones,2011,57,9.3,unknown,1688494,unknown,Action Adventure Drama,nine noble families fight for control over the...,unknown,EmiliaClarke PeterDinklage KitHarington LenaHe...,imdb score,imdb score 9.3,nine noble families fight for control over the...
3,yellowstone,2018,60,8.4,unknown,17372,unknown,Drama Western,a ranching family in montana faces off against...,unknown,KevinCostner LukeGrimes KellyReilly WesBentley,imdb score,imdb score 8.4,a ranching family in montana faces off against...
4,knives out,2019,130,7.9,82.0,341615,165.36,Comedy Crime Drama,a detective investigates the death of a patria...,RianJohnson,DanielCraig ChrisEvans AnadeArmas JamieLeeCurtis,imdb score,imdb score 7.9,a detective investigates the death of a patria...


In [64]:
# Write the final frame to CSV without the pandas row index.
df.to_csv(path_or_buf='end_data.csv', index=False)

In [65]:
# Build the term-count matrix from the combined text: unigrams and bigrams,
# English stop words removed.
# min_df=1 keeps every term, exactly as min_df=0 did (each vocabulary term
# occurs in at least one document), but is also accepted by scikit-learn
# releases whose parameter validation rejects an integer min_df below 1.
cv = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
# Rows whose combined text was blank were set to NaN earlier; CountVectorizer
# raises on NaN documents, so pass those rows in as empty strings.
count_matrix = cv.fit_transform(df['combined_features'].fillna(''))

In [66]:
# Report the matrix dimensions as (documents, vocabulary terms).
print(str(count_matrix.shape))

(9625, 202312)


In [67]:
# Persist the count matrix to disk. Despite the file name, the pickled object
# is the CountVectorizer term-count matrix, not a cosine-similarity matrix.
# The context manager guarantees the file is flushed and closed.
with open('cosine_similarity.pickle', 'wb') as pickle_file:
    pickle.dump(count_matrix, pickle_file)