In [257]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [246]:
# Load cleaned_data.csv (resolved relative to the working directory) into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='cleaned_data.csv')

In [247]:
# Preview the first five rows as loaded.
df.head(n=5)

Unnamed: 0,movie,year,timeMin,imdb,metascore,votes,grossMillions,genre,description,director,actors
0,Dark,2017,60,8.8,unknown,215184,unknown,Crime Drama Mystery,"A family saga with a supernatural twist, set i...",unknown,"Louis Hofmann, Karoline Eichhorn, Lisa Vicari,..."
1,The Sinner,2017,45,8.0,unknown,77161,unknown,Crime Drama Mystery,Anthology series that examines how and why ord...,unknown,"Bill Pullman, Dohn Norwood, Adam LeFevre, Jess..."
2,Game of Thrones,2011,57,9.3,unknown,1688494,unknown,Action Adventure Drama,Nine noble families fight for control over the...,unknown,"Emilia Clarke, Peter Dinklage, Kit Harington, ..."
3,Yellowstone,2018,60,8.4,unknown,17372,unknown,Drama Western,A ranching family in Montana faces off against...,unknown,"Kevin Costner, Luke Grimes, Kelly Reilly, Wes ..."
4,Knives Out,2019,130,7.9,82.0,341615,165.36,Comedy Crime Drama,A detective investigates the death of a patria...,Rian Johnson,"Daniel Craig, Chris Evans, Ana de Armas, Jamie..."


In [248]:
# Normalise the text columns in place: titles become a canonical lookup key and
# the feature columns lose the punctuation that would otherwise split tokens.

# Deletion tables: every listed character is removed in a single pass per string.
_TITLE_PUNCT = str.maketrans('', '', ':.-/,')
_TEXT_PUNCT = str.maketrans('', '', ',.')

# Titles: strip punctuation, lowercase, trim surrounding whitespace.
df['movie'] = df['movie'].apply(lambda title: title.translate(_TITLE_PUNCT).lower().strip())

# Fill missing actors with '' before casting: astype(str) alone turns NaN into the
# literal string 'nan', which every actor-less row would then share as a feature token.
df['actors'] = df['actors'].fillna('').astype(str)

# Descriptions are lowercased; actor names keep their original case.
df['description'] = df['description'].apply(lambda text: text.translate(_TEXT_PUNCT).lower().strip())

df['actors'] = df['actors'].apply(lambda names: names.translate(_TEXT_PUNCT).strip())

df['genre'] = df['genre'].apply(lambda genres: genres.strip())

In [249]:
# create features

# Columns whose text is concatenated, in this order, into each row's feature string.
features = ['description', 'genre', 'actors']

# create column in df which combines all the features

def combine_features(row):
    """Join the text of every column named in ``features`` into one string.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``)
        Must provide a ``str`` value for each name in ``features``.

    Returns
    -------
    str
        The row's values in ``features`` order, separated by single spaces.

    Raises
    ------
    TypeError
        If any of the selected values is not a string (e.g. a missing value).
    """
    # Driven by the `features` list so it is the single place to add or remove columns.
    return ' '.join(row[name] for name in features)

# Build one feature string per row, then print a truncated preview of the new column.
df['combined_features'] = df.apply(func=combine_features, axis='columns')
print(df['combined_features'])

0       a family saga with a supernatural twist set in...
1       anthology series that examines how and why ord...
2       nine noble families fight for control over the...
3       a ranching family in montana faces off against...
4       a detective investigates the death of a patria...
                              ...                        
9620    when their best friends announce that they're ...
9621    two minor characters from the play 'hamlet' st...
9622    a depressed wealthy businessman finds his life...
9623    batman learns he has a violent unruly pre-teen...
9624    out-of-the-box simon roberts runs an ad agency...
Name: combined_features, Length: 9625, dtype: object


In [250]:
# Preview the first five rows again, now with the cleaned text and combined_features column.
df.head(n=5)

Unnamed: 0,movie,year,timeMin,imdb,metascore,votes,grossMillions,genre,description,director,actors,combined_features
0,dark,2017,60,8.8,unknown,215184,unknown,Crime Drama Mystery,a family saga with a supernatural twist set in...,unknown,Louis Hofmann Karoline Eichhorn Lisa Vicari Ma...,a family saga with a supernatural twist set in...
1,the sinner,2017,45,8.0,unknown,77161,unknown,Crime Drama Mystery,anthology series that examines how and why ord...,unknown,Bill Pullman Dohn Norwood Adam LeFevre Jessica...,anthology series that examines how and why ord...
2,game of thrones,2011,57,9.3,unknown,1688494,unknown,Action Adventure Drama,nine noble families fight for control over the...,unknown,Emilia Clarke Peter Dinklage Kit Harington Len...,nine noble families fight for control over the...
3,yellowstone,2018,60,8.4,unknown,17372,unknown,Drama Western,a ranching family in montana faces off against...,unknown,Kevin Costner Luke Grimes Kelly Reilly Wes Ben...,a ranching family in montana faces off against...
4,knives out,2019,130,7.9,82.0,341615,165.36,Comedy Crime Drama,a detective investigates the death of a patria...,Rian Johnson,Daniel Craig Chris Evans Ana de Armas Jamie Le...,a detective investigates the death of a patria...


In [251]:
# create count matrix from new combined column

# Word-level counts over unigrams and bigrams with English stop words removed.
# min_df=1 keeps every term (any vocabulary term occurs in at least one document),
# which is what the previous integer 0 meant; an integer 0 is outside the range
# accepted by scikit-learn's parameter validation in recent releases.
count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
count_matrix = count.fit_transform(df['combined_features'])

In [252]:
# compute the cosine similarity based on the count_matrix

# Pairwise similarity of every row against every other row; with a single
# argument cosine_similarity compares the matrix with itself.
cosine_sim = cosine_similarity(count_matrix)

In [253]:
# Renumber rows 0..n-1 so labels equal positions in cosine_sim. drop=True discards
# the old index instead of inserting it as an 'index' column: without it, each re-run
# of this cell adds another column ('index', then 'level_0') and the third run raises.
# NOTE(review): assumes the previous index carries no information worth keeping — confirm.
df = df.reset_index(drop=True)
titles = df['movie']
# Title -> row position lookup. Titles that occur more than once map to several positions.
indices = pd.Series(df.index, index=df['movie'])

In [254]:
# create a function that returns the 10 best-rated movies among those most similar to a title

def recommendations(title, n_candidates=24, top_n=10):
    """Return the best-rated movies among those most similar to ``title``.

    Parameters
    ----------
    title : str
        Title exactly as stored in ``df['movie']`` (the key used by ``indices``).
    n_candidates : int, default 24
        How many of the most similar rows to consider.
    top_n : int, default 10
        How many of those candidates to return after ranking by ``imdb``.

    Returns
    -------
    pandas.DataFrame
        Columns ``movie``, ``year``, ``imdb``, ``votes``; at most ``top_n`` rows,
        sorted by ``imdb`` in descending order.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once yields a Series of positions; use the first
    # so that cosine_sim[idx] below is a single row of scores.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx], dtype=float)

    # Descending similarity; the stable sort keeps equal scores in row order.
    ranked = np.argsort(-scores, kind='stable')

    # Exclude the query row explicitly: it is not guaranteed to sort first when
    # another row has an identical feature string (and therefore an equal score).
    candidates = ranked[ranked != idx][:n_candidates]

    # show best movies and sort them by imdb rating
    movies = df.iloc[candidates][['movie', 'year', 'imdb', 'votes']]
    return movies.sort_values('imdb', ascending=False).head(top_n)



In [265]:
# Example lookup; the title is given in the same lowercase form as df['movie'].
recommendations(title='prison break')

Unnamed: 0,movie,year,imdb,votes
5648,satya,1998,8.2,12746
8236,prison break the final break,2009,7.9,23132
6350,a prophet,2009,7.9,91588
2733,the life of david gale,2003,7.6,106347
1137,hell or high water,2016,7.6,195831
3206,the player,1992,7.5,52858
4506,dead man walking,1995,7.5,86553
6373,ghost dog the way of the samurai,1999,7.5,82799
8596,the innocent man,2018,7.3,4434
6259,lucky man,2016,7.2,5258
