## TF-IDF Based Content Filtering
Recommend movies similar to a given title by comparing the TF-IDF vectors of their overviews.

In [12]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel

In [2]:
# Load the two TMDB 5000 CSV exports: the credits table and the movies table.
credits, movies = (
    pd.read_csv(csv_name)
    for csv_name in ("tmdb_5000_credits.csv", "tmdb_5000_movies.csv")
)

In [3]:
# Give the credits key the same name as the movies key so both frames join on 'id'.
credits.rename(columns={'movie_id': 'id'}, inplace=True)
movies_merged_df = pd.merge(movies, credits, on='id')

# Discard the columns this notebook does not use; what remains is genres,
# original_title, overview, popularity, vote_average, vote_count and cast.
movies_cleaned_df = movies_merged_df.drop(
    columns=[
        'crew',
        'revenue',
        'title_x',
        'title_y',
        'keywords',
        'spoken_languages',
        'budget',
        'homepage',
        'id',
        'original_language',
        'production_companies',
        'production_countries',
        'release_date',
        'runtime',
        'status',
        'tagline',
    ]
)
movies_cleaned_df.head()

Unnamed: 0,genres,original_title,overview,popularity,vote_average,vote_count,cast
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,7.2,11800,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""..."
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,6.9,4500,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa..."
2,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",Spectre,A cryptic message from Bond’s past sends him o...,107.376788,6.3,4466,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr..."
3,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,7.6,9106,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba..."
4,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",John Carter,"John Carter is a war-weary, former military ca...",43.926995,6.1,2124,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c..."


In [5]:
# Replace missing overviews with empty strings so the vectorizer only sees text.
movies_cleaned_df['overview'] = movies_cleaned_df['overview'].fillna('')

# TF-IDF over word 1- to 3-grams of the overview text.
tfv = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    strip_accents='unicode',
    stop_words='english',
    ngram_range=(1, 3),
    min_df=3,            # ignore terms that occur in fewer than 3 overviews
    max_features=None,   # no cap on vocabulary size
)

In [6]:
# Fit the vectorizer on the overviews and build the TF-IDF document-term
# matrix, one row per row of movies_cleaned_df.
tfv_matrix = tfv.fit_transform(movies_cleaned_df['overview'])

In [8]:
# Pairwise similarity between all overviews: sigmoid kernel over the TF-IDF rows.
# sig[i, j] is the similarity between the movies at row positions i and j.
sig = sigmoid_kernel(tfv_matrix, tfv_matrix)

# Reverse lookup: original_title -> row position in movies_cleaned_df (and in sig).
# Row positions are stored (not index labels) because sig is addressed by position.
indices = pd.Series(
    np.arange(len(movies_cleaned_df)),
    index=movies_cleaned_df['original_title'],
)
# Series.drop_duplicates() compares the *values* (the row numbers, which are
# always unique), so it never removed repeated titles. De-duplicate on the index
# instead, keeping the first movie for each title, so that indices[title] is
# always a single integer.
indices = indices[~indices.index.duplicated(keep='first')]

In [9]:
def give_rec(title, sig=sig, n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        ``original_title`` of the query movie; must match exactly.
    sig : array-like, optional
        Square pairwise-similarity matrix addressed by row position of
        ``movies_cleaned_df``. Defaults to the module-level ``sig``.
    n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        ``original_title`` of up to ``n`` movies, most similar first. The
        query movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not a key of ``indices``.
    """
    if title not in indices.index:
        raise KeyError(f"No movie with original_title {title!r}")

    # Get the row position corresponding to original_title
    idx = indices[title]
    # When several movies share a title the lookup yields a Series rather than
    # a scalar; fall back to the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Get the pairwise similarity scores of this movie against every movie
    scores = np.asarray(sig[idx]).ravel()

    # Rank all movies from most to least similar; the stable sort keeps tied
    # scores in row order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query movie explicitly instead of assuming it ranks first:
    # with tied scores (e.g. blank overviews) it may not be at position 0.
    ranked = ranked[ranked != idx][:max(n, 0)]

    # Titles of the most similar movies
    return movies_cleaned_df['original_title'].iloc[ranked]

In [11]:
# Example: recommendations for one movie, looked up by its exact original_title.
give_rec('Go for It!')

1363                                 Spy Kids
1155                  Spy Kids 3-D: Game Over
2638                               Metropolis
3793                     The Velocity of Gary
2305                              Indignation
3016                                  Sparkle
3796        The Last Time I Committed Suicide
1859    The Sisterhood of the Traveling Pants
3149                                   Volver
1302    Spy Kids 2: The Island of Lost Dreams
Name: original_title, dtype: object