## TF-IDF Based Content Filtering
Recommend movies similar to a given title by comparing TF-IDF vectors of their plot overviews.

In [45]:
import numpy as np
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel

In [46]:
# Read the two TMDB 5000 CSV exports (credits and movies) from the Datasets
# directory, one DataFrame per file, in that order.
credits, movies = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Datasets/tmdb_5000_credits.csv",
        "../Datasets/tmdb_5000_movies.csv",
    )
)

In [132]:
# Align the join key: the credits frame calls it 'movie_id', the merge below
# joins on 'id'. Rebinding the name (rather than inplace=True) leaves the
# previously loaded object untouched and keeps the cell safe to re-run.
credits = credits.rename(columns={'movie_id': 'id'})
movies_merged_df = movies.merge(credits, on='id')

# Drop every column this notebook does not use. 'title_x' / 'title_y' are the
# suffixed 'title' columns produced by the merge.
movies_cleaned_df = movies_merged_df.drop(
    columns=[
        'crew', 'revenue', 'title_x', 'title_y', 'keywords',
        'spoken_languages', 'budget', 'homepage', 'id', 'original_language',
        'production_companies', 'production_countries', 'release_date',
        'runtime', 'status', 'tagline',
    ]
)

# Only the title and the free-text overview are needed for TF-IDF.
movies_summaries = movies_cleaned_df[['original_title', 'overview']]

# Add a synthetic "New Movie" row to query against the catalogue.
# DataFrame.append was removed in pandas 2.0, so the row is added with
# pd.concat; ignore_index=True gives the new row the next integer label.
new_movie = pd.DataFrame([{
    "original_title": "New Movie",
    "overview": "In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.",
}])
movies_summaries = pd.concat([movies_summaries, new_movie], ignore_index=True)

# Replace the placeholder overview with the text actually used for the demo.
# A single .loc[row, col] assignment writes into the frame itself; the chained
# form `iloc[-1]["overview"] = ...` may write to a temporary copy instead.
last_label = movies_summaries.index[-1]
movies_summaries.loc[last_label, "overview"] = "Batman"

# Display the stored overview of the appended row as a sanity check.
movies_summaries.loc[last_label, "overview"]

'Batman'

In [120]:
# Missing overviews become empty strings so that every row holds text.
movies_summaries['overview'] = movies_summaries['overview'].fillna(value='')

# TF-IDF vectoriser configuration:
#   * word-level tokens made of one or more word characters,
#   * accents stripped via the 'unicode' method,
#   * unigrams, bigrams and trigrams,
#   * English stop words removed,
#   * no cap on vocabulary size and no minimum document frequency beyond 1.
tfv = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    strip_accents='unicode',
    ngram_range=(1, 3),
    stop_words='english',
    min_df=1,
    max_features=None,
)

In [121]:
# Fit the vectoriser on the overview column and encode each overview as one
# row of the resulting TF-IDF matrix.
tfv_matrix = tfv.fit_transform(raw_documents=movies_summaries['overview'])

In [122]:
# Look up the overview stored for the synthetic 'New Movie' row by its title.
movies_summaries.set_index('original_title').loc['New Movie', 'overview']

'Batman'

In [123]:
# Pairwise sigmoid-kernel similarity between every pair of overview vectors;
# row i holds the scores of movie i against all movies.
sig = sigmoid_kernel(tfv_matrix, tfv_matrix)

# Reverse lookup: original_title -> row position in movies_summaries.
indices = pd.Series(movies_summaries.index, index=movies_summaries['original_title'])

# Keep only the first row for each title. Series.drop_duplicates() compares
# the *values* (the row positions, which are already unique), so it would
# leave repeated titles in the index and make `indices[title]` return a
# Series instead of a single position. Deduplicate on the index instead.
indices = indices[~indices.index.duplicated(keep='first')]

In [124]:
def give_rec(title, sig=sig, top_n=10):
    """Return the titles whose overviews score highest against ``title``.

    Parameters
    ----------
    title : str
        An ``original_title`` present in the module-level ``indices`` lookup.
    sig : array-like
        Pairwise similarity matrix, indexed by row position. The default is
        bound to the global ``sig`` at the moment this cell is executed, so
        re-run this cell after recomputing ``sig``.
    top_n : int, default 10
        Number of recommendations to return in addition to the best-scoring
        row.

    Returns
    -------
    pandas.Series
        Up to ``top_n + 1`` entries of ``movies_summaries['original_title']``
        ordered by descending score; ties keep ascending row order. The query
        movie's own row is not excluded, so it normally appears in the result
        (it is the first entry in the sample run below).

    Raises
    ------
    KeyError
        If ``title`` is not in ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")
    if title not in indices.index:
        raise KeyError(f"{title!r} is not a known original_title")

    # Several rows can share a title, in which case the lookup yields more
    # than one position; use the first so that `sig[idx]` is a single row.
    idx = int(np.ravel(indices[title])[0])

    # Pairwise similarity scores of this movie against every movie.
    scores = np.asarray(sig[idx]).ravel()

    # Row positions by descending score. A stable sort on the negated scores
    # keeps tied rows in ascending position order, matching
    # sorted(..., reverse=True) over enumerate().
    ranked = np.argsort(-scores, kind='stable')[: top_n + 1]

    return movies_summaries['original_title'].iloc[ranked]

In [125]:
# Rank the catalogue against the synthetic 'New Movie' row.
give_rec(title='New Movie')

4803                                   New Movie
428                               Batman Returns
3854     Batman: The Dark Knight Returns, Part 2
65                               The Dark Knight
3                          The Dark Knight Rises
299                               Batman Forever
119                                Batman Begins
210                               Batman & Robin
9             Batman v Superman: Dawn of Justice
0                                         Avatar
1       Pirates of the Caribbean: At World's End
Name: original_title, dtype: object