# Data cleaning and preprocessing

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import ast
from scikit_learn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

ModuleNotFoundError: No module named 'scikit_learn'

In [5]:
# Commented dfs are not needed
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")
keywords = pd.read_csv("keywords.csv")
# ratings = pd.read_csv("ratings.csv")
# links = pd.read_csv("links.csv")
# metadata = pd.read_csv("movies_metadata.csv")

In [6]:
# Examine the datasets
print("Movies dataset:")
# print(movies.head())
print(movies.info())

print("\nCredits dataset:")
# print(credits.head())
print(credits.info())

print("\nKeywords dataset:")
# print(keywords.head())
print(keywords.info())

# print("\nRatings dataset:")
# print(ratings.head())
# print(ratings.info())

# print("\nLinks dataset:")
# print(links.head())
# print(links.info())

# print("\nMetadata dataset:")
# print(metadata.head())
# print(metadata.info())

Movies dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  statu

In [7]:
# Handle missing values
# Only released movies to consider (not rumored, post-production)

movies.fillna("", inplace=True)
movies = movies[movies["status"]=="Released"]
movies = movies.drop(columns=["homepage", "overview", "tagline", "status"])


credits.fillna("", inplace=True)
keywords.fillna("", inplace=True)
# ratings.fillna("", inplace=True)
# links.fillna("", inplace=True)
# metadata.fillna("", inplace=True)

In [9]:
# Function to get names from dicts of ids and names (applies to many columns in our data)
    # change 'name' to 'id' if 'id' is wanted instead
def extract_names(column_data):
    column_data = ast.literal_eval(column_data)
    names = [data['name'] for data in column_data]
    return names

In [10]:
# Prepares content_based_df
# Completes the following:
    # merges movies, credits, and keywords
    # cleans columns to get desired results

content_based_df = movies.merge(credits, left_on="id", right_on="movie_id", suffixes=("_movies", "_credits"))
content_based_df.drop(columns=["title_credits", "movie_id"], inplace=True)

content_based_df = content_based_df.merge(keywords, on="id", suffixes=("", "_keywords"))
content_based_df.drop(columns=["keywords"], inplace=True)
content_based_df.rename(columns={"keywords_keywords": "keywords"}, inplace=True)


columns_to_clean = [
    'genres', 'production_companies', 'production_countries',
    'spoken_languages', 'cast', 'crew', 'keywords'
]

for column in columns_to_clean:
    content_based_df[column] = content_based_df[column].apply(extract_names)

content_based_df['release_date'] = pd.to_datetime(content_based_df['release_date'])

# Produces df with following columns (all 100% ready for next steps):
# ['budget', 'genres', 'id', 'original_language', 'original_title',
#       'popularity', 'production_companies', 'production_countries',
#       'release_date', 'revenue', 'runtime', 'spoken_languages',
#       'title_movies', 'vote_average', 'vote_count', 'cast', 'crew',
#       'keywords']

In [11]:
print(content_based_df.head())

      budget                                         genres      id  \
0  237000000  [Action, Adventure, Fantasy, Science Fiction]   19995   
1  300000000                   [Adventure, Fantasy, Action]     285   
2  245000000                     [Action, Adventure, Crime]  206647   
3  250000000               [Action, Crime, Drama, Thriller]   49026   
4  260000000           [Action, Adventure, Science Fiction]   49529   

  original_language                            original_title  popularity  \
0                en                                    Avatar  150.437577   
1                en  Pirates of the Caribbean: At World's End  139.082615   
2                en                                   Spectre  107.376788   
3                en                     The Dark Knight Rises  112.312950   
4                en                               John Carter   43.926995   

                                production_companies  \
0  [Ingenious Film Partners, Twentieth Century Fo...  