In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
import ast

In [30]:
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")
ratings = pd.read_csv("ratings.csv")
keywords = pd.read_csv("keywords.csv")
links = pd.read_csv("links.csv")
metadata = pd.read_csv("movies_metadata.csv")

  metadata = pd.read_csv("movies_metadata.csv")


In [31]:
# Examine the datasets
print("Movies dataset:")
# print(movies.head())
print(movies.info())

print("\nCredits dataset:")
# print(credits.head())
print(credits.info())

print("\nRatings dataset:")
# print(ratings.head())
print(ratings.info())

print("\nKeywords dataset:")
# print(keywords.head())
print(keywords.info())

print("\nLinks dataset:")
# print(links.head())
print(links.info())

print("\nMetadata dataset:")
# print(metadata.head())
print(metadata.info())

Movies dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  statu

In [32]:
# Handle missing values
# Only released movies to consider (not rumored, post-production)

movies.fillna("", inplace=True)
movies = movies[movies["status"]=="Released"]
movies = movies.drop(columns=["homepage", "overview", "tagline", "status"])


credits.fillna("", inplace=True)
ratings.fillna("", inplace=True)
keywords.fillna("", inplace=True)
links.fillna("", inplace=True)
metadata.fillna("", inplace=True)

In [33]:
# Function to get names from dicts of ids and names (applies to many columns in our data)
def extract_names(column_data):
    column_data = ast.literal_eval(column_data)
    names = [data['name'] for data in column_data]
    return names

In [34]:
# Prepares content_based_df
# Completes the following:
    # merges movies, credits, and keywords
    # cleans columns to get desired results

    
content_based_df = movies.merge(credits, left_on="id", right_on="movie_id", suffixes=("_movies", "_credits"))
content_based_df.drop(columns=["title_credits", "movie_id"], inplace=True)

content_based_df = content_based_df.merge(keywords, on="id", suffixes=("", "_keywords"))
content_based_df.drop(columns=["keywords"], inplace=True)
content_based_df.rename(columns={"keywords_keywords": "keywords"}, inplace=True)


columns_to_clean = [
    'genres', 'production_companies', 'production_countries',
    'spoken_languages', 'cast', 'crew', 'keywords'
]

for column in columns_to_clean:
    content_based_df[column] = content_based_df[column].apply(extract_names)

content_based_df['release_date'] = pd.to_datetime(content_based_df['release_date'])

# Produces df with following columns (all 100% ready for next steps):
# ['budget', 'genres', 'id', 'original_language', 'original_title',
#       'popularity', 'production_companies', 'production_countries',
#       'release_date', 'revenue', 'runtime', 'spoken_languages',
#       'title_movies', 'vote_average', 'vote_count', 'cast', 'crew',
#       'keywords']

In [35]:
(content_based_df.head())

Unnamed: 0,budget,genres,id,original_language,original_title,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,title_movies,vote_average,vote_count,cast,crew,keywords
0,237000000,"[Action, Adventure, Fantasy, Science Fiction]",19995,en,Avatar,150.437577,"[Ingenious Film Partners, Twentieth Century Fo...","[United States of America, United Kingdom]",2009-12-10,2787965087,162.0,"[English, Español]",Avatar,7.2,11800,"[Sam Worthington, Zoe Saldana, Sigourney Weave...","[Stephen E. Rivkin, Rick Carter, Christopher B...","[culture clash, future, space war, space colon..."
1,300000000,"[Adventure, Fantasy, Action]",285,en,Pirates of the Caribbean: At World's End,139.082615,"[Walt Disney Pictures, Jerry Bruckheimer Films...",[United States of America],2007-05-19,961000000,169.0,[English],Pirates of the Caribbean: At World's End,6.9,4500,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...","[Dariusz Wolski, Gore Verbinski, Jerry Bruckhe...","[ocean, drug abuse, exotic island, east india ..."
2,245000000,"[Action, Adventure, Crime]",206647,en,Spectre,107.376788,"[Columbia Pictures, Danjaq, B24]","[United Kingdom, United States of America]",2015-10-26,880674609,148.0,"[Français, English, Español, Italiano, Deutsch]",Spectre,6.3,4466,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...","[Thomas Newman, Sam Mendes, Anna Pinnock, John...","[spy, based on novel, secret agent, sequel, mi..."
3,250000000,"[Action, Crime, Drama, Thriller]",49026,en,The Dark Knight Rises,112.31295,"[Legendary Pictures, Warner Bros., DC Entertai...",[United States of America],2012-07-16,1084939099,165.0,[English],The Dark Knight Rises,7.6,9106,"[Christian Bale, Michael Caine, Gary Oldman, A...","[Hans Zimmer, Charles Roven, Christopher Nolan...","[dc comics, crime fighter, terrorist, secret i..."
4,260000000,"[Action, Adventure, Science Fiction]",49529,en,John Carter,43.926995,[Walt Disney Pictures],[United States of America],2012-03-07,284139100,132.0,[English],John Carter,6.1,2124,"[Taylor Kitsch, Lynn Collins, Samantha Morton,...","[Andrew Stanton, Andrew Stanton, John Lasseter...","[based on novel, mars, medallion, space travel..."


In [17]:
# User-based and item-based collaborative filtering dataframes
# Assuming 'id' in metadata dataset is equivalent to 'tmdbId' in links dataset
collab_filter_df = pd.merge(left=links, right=metadata, left_on='tmdbId', right_on='id')
collab_filter_df = pd.merge(left=collab_filter_df, right=ratings, on='movieId')

In [None]:
# Drop unnecessary columns
content_based_df.drop(['imdb_id'], axis=1, inplace=True)
collab_filter_df.drop(['id', 'imdb_id'], axis=1, inplace=True)

print("\nContent-based filtering dataset:")
print(content_based_df.head())
print(content_based_df.info())

print("\nUser-based and Item-based Collaborative filtering dataset:")
print(collab_filter_df.head())
print(collab_filter_df.info())

In [None]:
collab_filter_df.info()