# Data cleaning and preprocessing

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
# Load the three CSV files used by the content-based recommender.
# The reads happen in the order listed: movies, then credits, then keywords.
movies, credits, keywords = (
    pd.read_csv(csv_path)
    for csv_path in ("tmdb_5000_movies.csv", "tmdb_5000_credits.csv", "keywords.csv")
)

# The remaining datasets are not needed and are intentionally left unloaded:
# ratings = pd.read_csv("ratings.csv")
# links = pd.read_csv("links.csv")
# metadata = pd.read_csv("movies_metadata.csv")

In [3]:
# Examine the datasets
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line
# after every report.
print("Movies dataset:")
# print(movies.head())
movies.info()

print("\nCredits dataset:")
# print(credits.head())
credits.info()

print("\nKeywords dataset:")
# print(keywords.head())
keywords.info()

# The frames below are not loaded in this notebook; kept for reference only.
# print("\nRatings dataset:")
# print(ratings.head())
# ratings.info()

# print("\nLinks dataset:")
# print(links.head())
# links.info()

# print("\nMetadata dataset:")
# print(metadata.head())
# metadata.info()

Movies dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  statu

In [4]:
# Handle missing values
# Only released movies to consider (not rumored, post-production)

def _fill_missing_text(frame):
    """Return a copy of ``frame`` with NaN replaced by "" in non-numeric columns.

    Numeric columns are left untouched: filling them with "" would turn them
    into mixed-type object columns and break later numeric use.
    """
    text_columns = frame.select_dtypes(exclude="number").columns
    return frame.fillna({column: "" for column in text_columns})


# NOTE: this rebinds `movies` and drops "status", so re-running this cell
# without first re-running the loading cell raises KeyError on "status".
# Rows whose status is missing are not "Released" and are filtered out here,
# so filtering before filling gives the same rows as filling first.
movies = _fill_missing_text(
    movies[movies["status"] == "Released"]
    .drop(columns=["homepage", "overview", "tagline", "status"])
)

credits = _fill_missing_text(credits)
keywords = _fill_missing_text(keywords)
# ratings = _fill_missing_text(ratings)
# links = _fill_missing_text(links)
# metadata = _fill_missing_text(metadata)

In [5]:
# Function to get names from dicts of ids and names (applies to many columns in our data)
def extract_names(column_data, key='name'):
    """Extract one field from a stringified list of dicts.

    Parameters
    ----------
    column_data : str, list, None or float NaN
        Usually the Python-literal text of a list of dicts, e.g.
        ``'[{"id": 18, "name": "Drama"}]'``. An already-parsed list is
        accepted as-is, so applying the function twice is harmless.
        Empty/whitespace-only strings, ``None`` and NaN are treated as
        "no entries".
    key : str, default 'name'
        Dict key to collect from every entry; pass ``'id'`` to get ids instead.

    Returns
    -------
    list
        The ``key`` value of every dict, in their original order.

    Raises
    ------
    ValueError, SyntaxError
        If a non-empty string is not a valid Python literal.
    KeyError
        If an entry lacks ``key``.
    """
    # Missing cells arrive as None/NaN, or as "" once fillna("") has been applied.
    if column_data is None:
        return []
    if isinstance(column_data, float) and column_data != column_data:  # NaN
        return []
    if isinstance(column_data, str):
        if not column_data.strip():
            # ast.literal_eval("") raises SyntaxError, so short-circuit here.
            return []
        column_data = ast.literal_eval(column_data)
    return [data[key] for data in column_data]

In [6]:
# Prepares content_based_df
# Completes the following:
    # merges movies, credits, and keywords
    # cleans columns to get desired results

# An inner merge repeats a movie row once per matching row on the right side.
# De-duplicating keywords on "id" guarantees at most one row per movie, so a
# film can never be recommended as a neighbour of itself.
# NOTE(review): this assumes repeated ids in keywords.csv are redundant copies;
# the first occurrence is kept - confirm against the file.
keywords_unique = keywords.drop_duplicates(subset="id")

# Both merges are inner joins: movies without a credits row or a keywords row
# are dropped.
content_based_df = (
    movies
    .merge(credits, left_on="id", right_on="movie_id", suffixes=("_movies", "_credits"))
    .drop(columns=["title_credits", "movie_id"])
    # movies already has its own "keywords" column, so the incoming one is
    # suffixed to "keywords_keywords"; it then replaces the original.
    .merge(keywords_unique, on="id", suffixes=("", "_keywords"))
    .drop(columns=["keywords"])
    .rename(columns={"keywords_keywords": "keywords"})
)


columns_to_clean = [
    'genres', 'production_companies', 'production_countries',
    'spoken_languages', 'cast', 'crew', 'keywords'
]

# Each of these columns holds a stringified list of dicts; keep only the names.
for column in columns_to_clean:
    content_based_df[column] = content_based_df[column].apply(extract_names)

content_based_df['release_date'] = pd.to_datetime(content_based_df['release_date'])

# Produces df with following columns (all 100% ready for next steps):
# ['budget', 'genres', 'id', 'original_language', 'original_title',
#       'popularity', 'production_companies', 'production_countries',
#       'release_date', 'revenue', 'runtime', 'spoken_languages',
#       'title_movies', 'vote_average', 'vote_count', 'cast', 'crew',
#       'keywords']

In [7]:
# Sanity check: list the columns of the merged and cleaned frame.
content_based_df.columns

Index(['budget', 'genres', 'id', 'original_language', 'original_title',
       'popularity', 'production_companies', 'production_countries',
       'release_date', 'revenue', 'runtime', 'spoken_languages',
       'title_movies', 'vote_average', 'vote_count', 'cast', 'crew',
       'keywords'],
      dtype='object')

In [8]:
# Step 1: Pre-process and combine relevant features
def create_combined_feature(row):
    """Build the single text string describing one movie.

    Concatenates, separated by single spaces: every genre, every keyword,
    the first three cast names and the first three crew names of ``row``.
    An empty list contributes an empty segment, so separators are still
    emitted around it.
    """
    segments = (
        row['genres'],
        row['keywords'],
        row['cast'][:3],
        row['crew'][:3],
    )
    return ' '.join(' '.join(segment) for segment in segments)

# Build the text feature row by row (axis=1 passes each row to the function)
# and store it as a new 'combined_feature' column.
content_based_df['combined_feature'] = content_based_df.apply(create_combined_feature, axis=1)

In [20]:
# Step 2: Convert text-based feature to numerical format using TF-IDF vectorization
# English stop words are removed; each row of tfidf_matrix represents one movie,
# in the same order as the rows of content_based_df.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(content_based_df['combined_feature'])

# Step 3: Calculate cosine similarity between movies
# linear_kernel computes plain dot products. TfidfVectorizer L2-normalises each
# row by default (norm='l2'), so the dot product of two rows equals their
# cosine similarity. The result compares every movie with every other movie,
# so its size grows quadratically with the number of movies.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact value to look up in ``content_based_df['title_movies']``. If
        several rows share the title, the first one is used.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix whose row/column order matches the row
        order of ``content_based_df``. Defaults to the notebook-level matrix
        as it existed when this function was defined.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``title_movies``, ``vote_average`` and ``similarity_score``,
        ordered from most to least similar, with a 1-based index.

    Raises
    ------
    IndexError
        If no row has the requested title.
    """
    # Look the movie up by *position*, not index label: cosine_sim is
    # addressed positionally, so this stays correct for any DataFrame index.
    matches = np.flatnonzero((content_based_df['title_movies'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"No movie titled {title!r} found in content_based_df")
    idx = int(matches[0])

    # Rank every movie by similarity to the query. sorted() is stable, so ties
    # keep their original row order.
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Remove the query movie explicitly rather than assuming it is ranked
    # first: another row with identical features would tie at the top and
    # could push the query itself into the results.
    sim_scores = [pair for pair in ranked if pair[0] != idx][:top_n]
    movie_indices = [pair[0] for pair in sim_scores]

    # .assign() returns a new frame, so no column is set on a slice of
    # content_based_df (avoids SettingWithCopyWarning).
    recommendations = (
        content_based_df[['title_movies', 'vote_average']]
        .iloc[movie_indices]
        .assign(similarity_score=[pair[1] for pair in sim_scores])
        .reset_index(drop=True)
    )
    # Present ranks starting at 1 instead of 0.
    recommendations.index += 1

    return recommendations

# Example usage
# The title must match a value in 'title_movies' exactly.
print(get_recommendations('The Truman Show'))

                  title_movies  vote_average  similarity_score
1       The Life of David Gale           7.3          0.158781
2   Ace Ventura: Pet Detective           6.4          0.149705
3                      Weekend           7.4          0.139202
4          You Can Count on Me           6.8          0.137511
5               Bruce Almighty           6.4          0.128260
6            Quantum of Solace           6.1          0.122899
7              Man of the Year           5.8          0.117804
8       Fun with Dick and Jane           5.9          0.112782
9                   Waterworld           5.9          0.112480
10          The Missing Person           6.4          0.110108


In [18]:
# Inspect the source row(s) behind the example: every movie whose title contains "Truman".
content_based_df[content_based_df["title_movies"].str.contains("Truman")]

Unnamed: 0,budget,genres,id,original_language,original_title,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,title_movies,vote_average,vote_count,cast,crew,keywords,combined_feature
698,60000000,"[Comedy, Drama]",37165,en,The Truman Show,56.488027,"[Paramount Pictures, Scott Rudin Productions]",[United States of America],1998-06-04,264118201,103.0,[English],The Truman Show,7.8,4537,"[Jim Carrey, Laura Linney, Noah Emmerich, Nata...","[Nancy Haigh, Dennis Gassner, Howard Feuer, Pe...","[claustrophobia, hidden camera, dystopia, real...",Comedy Drama claustrophobia hidden camera dyst...
