In [382]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Load dataset

In [384]:
# Read the raw movie table; the path is relative to the notebook's working directory.
df_film = pd.read_csv('cinemas.csv') 

## Check format

In [387]:
df_film.columns

Index(['Unnamed: 0', 'color', 'director_name', 'num_critic_for_reviews',
       'duration', 'director_facebook_likes', 'actor_3_facebook_likes',
       'actor_2_name', 'actor_1_facebook_likes', 'gross', 'genres',
       'actor_1_name', 'movie_title', 'num_voted_users',
       'cast_total_facebook_likes', 'actor_3_name', 'facenumber_in_poster',
       'plot_keywords', 'movie_imdb_link', 'num_user_for_reviews', 'language',
       'country', 'content_rating', 'budget', 'title_year',
       'actor_2_facebook_likes', 'imdb_score', 'aspect_ratio',
       'movie_facebook_likes', 'Unnamed: 28', 'war_symb_title',
       'point_symb_title'],
      dtype='object')

In [389]:
# Keep only the text columns the recommender uses; .copy() detaches the
# working frame from df_film so later assignments never touch the raw data.
df_recom_sys = df_film.loc[
    :,
    ['movie_title', 'plot_keywords', 'genres', 'director_name'],
].copy()

In [391]:
# Discard every row that has a missing value in any of the selected columns.
df_recom_sys = df_recom_sys.dropna()

In [393]:
df_recom_sys

Unnamed: 0,movie_title,plot_keywords,genres,director_name
0,Avatar,avatar|future|marine|native|paraplegic,Action|Adventure|Fantasy|Sci-Fi,James Cameron
1,Pirates of the Caribbean: At World's End,goddess|marriage ceremony|marriage proposal|pi...,Action|Adventure|Fantasy,Gore Verbinski
2,Spectre,bomb|espionage|sequel|spy|terrorist,Action|Adventure|Thriller,Sam Mendes
3,The Dark Knight Rises,deception|imprisonment|lawlessness|police offi...,Action|Thriller,Christopher Nolan
5,John Carter,alien|american civil war|male nipple|mars|prin...,Action|Adventure|Sci-Fi,Andrew Stanton
...,...,...,...,...
5042,El Mariachi,assassin|death|guitar|gun|mariachi,Action|Crime|Drama|Romance|Thriller,Robert Rodriguez
5043,The Mongol King,jewell|mongol|nostradamus|stepnicka|vallone,Crime|Drama,Anthony Vallone
5044,Newlyweds,written and directed by cast member,Comedy|Drama,Edward Burns
5045,Signed Sealed Delivered,fraud|postal worker|prison|theft|trial,Comedy|Drama,Scott Smith


In [395]:
# '|' is the regex alternation operator, so request a literal replacement
# explicitly: the default of `regex` differs between pandas versions and the
# implicit form emits a FutureWarning on older ones.
df_recom_sys['plot_keywords'] = df_recom_sys['plot_keywords'].str.replace('|', ' ', regex=False)

  df_recom_sys['plot_keywords'] = df_recom_sys['plot_keywords'].str.replace('|', ' ')


In [397]:
df_recom_sys

Unnamed: 0,movie_title,plot_keywords,genres,director_name
0,Avatar,avatar future marine native paraplegic,Action|Adventure|Fantasy|Sci-Fi,James Cameron
1,Pirates of the Caribbean: At World's End,goddess marriage ceremony marriage proposal pi...,Action|Adventure|Fantasy,Gore Verbinski
2,Spectre,bomb espionage sequel spy terrorist,Action|Adventure|Thriller,Sam Mendes
3,The Dark Knight Rises,deception imprisonment lawlessness police offi...,Action|Thriller,Christopher Nolan
5,John Carter,alien american civil war male nipple mars prin...,Action|Adventure|Sci-Fi,Andrew Stanton
...,...,...,...,...
5042,El Mariachi,assassin death guitar gun mariachi,Action|Crime|Drama|Romance|Thriller,Robert Rodriguez
5043,The Mongol King,jewell mongol nostradamus stepnicka vallone,Crime|Drama,Anthony Vallone
5044,Newlyweds,written and directed by cast member,Comedy|Drama,Edward Burns
5045,Signed Sealed Delivered,fraud postal worker prison theft trial,Comedy|Drama,Scott Smith


In [399]:
# '|' is the regex alternation operator, so request a literal replacement
# explicitly: the default of `regex` differs between pandas versions and the
# implicit form emits a FutureWarning on older ones.
df_recom_sys['genres'] = df_recom_sys['genres'].str.replace('|', ' ', regex=False)

  df_recom_sys['genres'] = df_recom_sys['genres'].str.replace('|', ' ')


In [401]:
df_recom_sys['movie_title'].unique()

array(['Avatar\xa0', "Pirates of the Caribbean: At World's End\xa0",
       'Spectre\xa0', ..., 'Newlyweds\xa0', 'Signed Sealed Delivered\xa0',
       'My Date with Drew\xa0'], dtype=object)

In [403]:
# Titles carry a non-breaking space (\xa0), as the unique() output shows;
# remove it so titles compare equal to plain strings.
df_recom_sys = df_recom_sys.assign(
    movie_title=df_recom_sys['movie_title'].str.replace('\xa0', '')
)

In [405]:
df_recom_sys['movie_title'].unique()

array(['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre',
       ..., 'Newlyweds', 'Signed Sealed Delivered', 'My Date with Drew'],
      dtype=object)

In [407]:
df_recom_sys.shape

(4801, 4)

In [409]:
# Keep the first row for each title. The original index labels are retained,
# so they are no longer contiguous after this step.
df_recom_sys = df_recom_sys.drop_duplicates(subset=['movie_title'])

In [411]:
df_recom_sys

Unnamed: 0,movie_title,plot_keywords,genres,director_name
0,Avatar,avatar future marine native paraplegic,Action Adventure Fantasy Sci-Fi,James Cameron
1,Pirates of the Caribbean: At World's End,goddess marriage ceremony marriage proposal pi...,Action Adventure Fantasy,Gore Verbinski
2,Spectre,bomb espionage sequel spy terrorist,Action Adventure Thriller,Sam Mendes
3,The Dark Knight Rises,deception imprisonment lawlessness police offi...,Action Thriller,Christopher Nolan
5,John Carter,alien american civil war male nipple mars prin...,Action Adventure Sci-Fi,Andrew Stanton
...,...,...,...,...
5042,El Mariachi,assassin death guitar gun mariachi,Action Crime Drama Romance Thriller,Robert Rodriguez
5043,The Mongol King,jewell mongol nostradamus stepnicka vallone,Crime Drama,Anthony Vallone
5044,Newlyweds,written and directed by cast member,Comedy Drama,Edward Burns
5045,Signed Sealed Delivered,fraud postal worker prison theft trial,Comedy Drama,Scott Smith


In [413]:
# One lower-cased text field per movie: keywords, title, genres, director.
# The title has colons removed and hyphens turned into spaces before joining.
df_recom_sys['combined_info'] = (
    df_recom_sys['plot_keywords']
    + ' '
    + df_recom_sys['movie_title'].str.replace(':', '').str.replace('-', ' ')
    + ' '
    + df_recom_sys['genres']
    + ' '
    + df_recom_sys['director_name']
).str.lower()

In [415]:
# Whole-word match for the articles/prepositions to remove from the text.
stopwords = r'\b(?:a|an|the|in|on|at|by|for|with|about|against|between|into|through|during|before|after|above|below|to|from|up|down|over|under|again|further|then|once)\b'

# Remove the stop words, then collapse the whitespace runs they leave behind
# and trim both ends.
df_recom_sys['combined_info'] = (
    df_recom_sys['combined_info']
    .str.replace(stopwords, '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

In [417]:
df_recom_sys['combined_info']

0       avatar future marine native paraplegic avatar ...
1       goddess marriage ceremony marriage proposal pi...
2       bomb espionage sequel spy terrorist spectre ac...
3       deception imprisonment lawlessness police offi...
5       alien american civil war male nipple mars prin...
                              ...                        
5042    assassin death guitar gun mariachi el mariachi...
5043    jewell mongol nostradamus stepnicka vallone mo...
5044    written and directed cast member newlyweds com...
5045    fraud postal worker prison theft trial signed ...
5049    actress name title crush date four word title ...
Name: combined_info, Length: 4674, dtype: object

## Recommender system (TfidfVectorizer + cosine_similarity)

In [420]:
# ngram_range=(1, 2): the vocabulary holds single words and adjacent word pairs.
tfidf = TfidfVectorizer(ngram_range=(1, 2))

In [422]:
# Learn the vocabulary from combined_info and vectorise it; matrix rows follow
# the row order of df_recom_sys (positions, not index labels).
tfidf_matrix = tfidf.fit_transform(df_recom_sys['combined_info'])

In [424]:
# Pairwise similarity of every movie with every other movie; with a single
# argument, cosine_similarity compares the matrix against itself.
cosine_sim = cosine_similarity(tfidf_matrix)

In [425]:
def get_recommendation(df, title, simil):
    """Return the titles of up to ten movies most similar to ``title``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``movie_title`` column. Row *positions* must line up
        with the rows/columns of ``simil``; the index labels may be anything.
    title : str
        Exact title to look up in ``df['movie_title']``. If it occurs more
        than once, the first occurrence is used.
    simil : 2-D array-like
        Square similarity matrix where ``simil[i][j]`` is the similarity
        between the movies at positions ``i`` and ``j`` of ``df``.

    Returns
    -------
    pandas.Series
        Titles of the most similar movies, best match first, keeping the
        index labels of ``df``. The queried movie itself is never included.

    Raises
    ------
    IndexError
        If ``title`` is not present in ``df['movie_title']``.
    """
    # Look the movie up by position, not by index label: after rows have been
    # dropped from df the labels no longer match the rows of `simil`.
    matches = np.flatnonzero((df['movie_title'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie title not found: {title!r}")
    pos = int(matches[0])

    scores = np.asarray(simil[pos])
    # Stable descending sort, so ties keep their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the queried movie explicitly instead of assuming it sorts first
    # (another movie with identical text would tie at the top score).
    top = ranked[ranked != pos][:10]
    return df['movie_title'].iloc[top]

In [428]:
get_recommendation(df_recom_sys, 'Avatar', cosine_sim)

2487                        Aliens
3576                The Terminator
288     Terminator 2: Judgment Day
140             The Last Airbender
606                      The Abyss
4691                       Destiny
1702         Dragonball: Evolution
5027                    The Ridges
26                         Titanic
210                        X-Men 2
Name: movie_title, dtype: object