# Importing Libraries

In [467]:
import pandas as pd
from datetime import datetime as dt
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import requests
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity



# Importing dataset

In [468]:
movies_data = pd.read_csv('movies_metadata.csv')
original_movies = movies_data.copy()


  movies_data = pd.read_csv('movies_metadata.csv')


In [469]:
print(movies_data.columns)

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')


We don't need 'overview', 'poster_path', 'spoken_languages', 'status', 'production_companies', 'production_countries', or 'imdb_id'. Let's get rid of those

In [470]:
movies_data = movies_data.drop(columns = ['budget', 'popularity','overview', 'poster_path', 'spoken_languages', 'status', 'production_companies', 'production_countries', 'imdb_id', 'original_title','belongs_to_collection', 'tagline', 'homepage', 'release_date'])



# Handling Missing Values

Our movies dataset (movies_data) has 45,666 records. That's a lot!

In [471]:
mean_runtime = movies_data['runtime'].dropna().mean()
movies_data['runtime'].fillna(mean_runtime, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  movies_data['runtime'].fillna(mean_runtime, inplace=True)


In [472]:
movies_data.dropna(subset=[ 'original_language', 'revenue', 'title', 'video', 'vote_average', 'vote_count'], inplace=True)


We will impute runtime, original_language, popularity, and release_date

In [473]:
mode_language = movies_data['original_language'].mode()[0]
movies_data['original_language'].fillna(mode_language, inplace = True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  movies_data['original_language'].fillna(mode_language, inplace = True)


Now we've handled the missing values. Let us now view the dataset

In [474]:
movies_data.loc[0,:]

Unnamed: 0,0
adult,False
genres,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
id,862
original_language,en
revenue,373554033.0
runtime,81.0
title,Toy Story
video,False
vote_average,7.7
vote_count,5415.0


Now lets convert parse the genres column into a list of dictionaries


In [475]:
movies_data['genres'] = movies_data['genres'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)


In [476]:

movies_data['genre_names'] = movies_data['genres'].apply(lambda x: [genre['name'] for genre in x] )

In [477]:
movies_data.drop(columns = ['genres'], inplace = True)

In [478]:
# Step 3: Explode the 'genre_names' column
movies_data = movies_data.explode('genre_names')

# Step 4: One-hot encode the genres
genre_dummies = pd.get_dummies(movies_data['genre_names'], prefix='genre')

# Step 5: Aggregate back to unique movies by summing the one-hot encoded columns
movies_data = pd.concat([movies_data, genre_dummies], axis=1)
movies_data.drop(columns=['genre_names'], inplace=True)

movies_data = movies_data.groupby('title').agg({
    **{col: 'first' for col in movies_data.columns if col not in genre_dummies.columns},
    **{col: 'max' for col in genre_dummies.columns}
})
movies_data.reset_index(drop=True, inplace=True)


In [479]:
movies_data.iloc[:10,:]

Unnamed: 0,adult,id,original_language,revenue,runtime,title,video,vote_average,vote_count,genre_Action,...,genre_History,genre_Horror,genre_Music,genre_Mystery,genre_Romance,genre_Science Fiction,genre_TV Movie,genre_Thriller,genre_War,genre_Western
0,False,55245,en,0.0,83.0,!Women Art Revolution,False,4.3,2.0,False,...,False,False,False,False,False,False,False,False,False,False
1,False,41371,en,0.0,95.0,#1 Cheerleader Camp,False,3.4,23.0,False,...,False,False,False,False,False,False,False,False,False,False
2,False,301325,de,0.0,90.0,#Horror,False,3.4,53.0,False,...,False,True,False,True,False,False,False,True,False,False
3,False,267752,en,0.0,74.0,#chicagoGirl,False,7.0,1.0,False,...,False,False,False,False,False,False,False,False,False,False
4,False,143747,it,0.0,104.0,"$1,000 on the Black",False,6.0,2.0,False,...,False,False,False,False,False,False,False,False,False,True
5,False,158079,it,0.0,98.0,"$100,000 for Ringo",False,5.0,1.0,False,...,False,False,False,False,False,False,False,False,False,True
6,False,4204,en,0.0,98.0,$5 a Day,False,6.0,24.0,False,...,False,False,False,False,False,False,False,False,False,False
7,False,248268,en,0.0,90.0,$50K and a Call Girl: A Love Story,False,6.3,11.0,False,...,False,False,False,False,False,False,False,False,False,False
8,False,19311,en,0.0,78.0,$9.99,False,6.0,28.0,False,...,False,False,False,False,False,False,False,False,False,False
9,False,95383,en,0.0,89.0,$ellebrity,False,5.5,9.0,False,...,False,False,False,False,False,False,False,False,False,False


Let's try and reduce the dimensionality of the genre columns using PCA

In [480]:
genre_columns_pca = movies_data.filter(like='genre_').columns
desired_column_order_pca = movies_data.columns.tolist()

In [481]:
pca = PCA(n_components=5)
pca_features = pca.fit_transform(movies_data[genre_columns_pca])
movies_data_reduced = movies_data.copy()
pca_df = pd.DataFrame(pca_features, columns=[f'genre_{i+1}' for i in range(5)])
movies_data = movies_data_reduced.drop(columns=genre_columns_pca).join(pca_df)


Lets convert the bool datatypes to int. These include the 'adult' and 'video' columns

In [482]:
movies_data['video'] = movies_data['video'].astype(bool).astype(int)
movies_data['adult'] = movies_data['adult'].astype(bool).astype(int)

There are too many languages to create encoding column for all of them. Let's instead retain languages with more than 10000 occurences, and for the rest, create a separate language column labeled 'Rare Language'

In [483]:
# Create a binary column based on whether the language is English
movies_data['is_english'] = movies_data['original_language'].apply(lambda x: 1 if x == 'en' else 0)

# Drop the original language column if no longer needed
movies_data.drop(columns=['original_language'], inplace=True)

In [484]:
movies_data.iloc[0,:]

Unnamed: 0,0
adult,1
id,55245
revenue,0.0
runtime,83.0
title,!Women Art Revolution
video,0
vote_average,4.3
vote_count,2.0
genre_1,-0.400518
genre_2,-0.342073


Let us see the unique genres available

Let's now add a column to indicate if revenue is 0

In [485]:
movies_data['is_zero_revenue'] = movies_data['revenue'] == 0

In [486]:
numerical_columns = [ 'revenue', 'runtime', 'vote_average', 'vote_count']
bool_columns = movies_data.select_dtypes(bool).columns
movies_data[bool_columns] = movies_data[bool_columns].astype(int)
movies_data[numerical_columns] = movies_data[numerical_columns].astype(float)

scaler = StandardScaler()
movies_data[numerical_columns] = scaler.fit_transform(movies_data[numerical_columns])



In [487]:
user_ratings = [
    {"title": "Toy Story", "rating": 8},
    {"title": "Jumanji", "rating": 9},
    {"title": "Grumpier Old Men", "rating": 7},
    {"title": "Waiting to Exhale", "rating": 8},
    {"title": "Father of the Bride Part II", "rating": 9}
]
api_key = 'fab77af2'
base_url = "http://www.omdbapi.com/"

# Function to fetch movie data from OMDB API
def fetch_movie_details(title):
    formatted_title = title.replace(" ", "+")  # Format title for URL
    url = f"{base_url}?apikey={api_key}&t={formatted_title}"
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error fetching data for {title}")
        return None

# Fetch details for each movie in user_ratings
user_movie_data = []
for movie in user_ratings:
    movie_data = fetch_movie_details(movie["title"])
    if movie_data and movie_data['Response'] == 'True':
        movie_data['user_rating'] = movie['rating']  # add user rating to the fetched data
        user_movie_data.append(movie_data)

# Convert the list to a DataFrame
user_movie_df_cp = pd.DataFrame(user_movie_data)
user_movie_df_cp.iloc[:,:]

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,...,imdbRating,imdbVotes,imdbID,Type,DVD,BoxOffice,Production,Website,Response,user_rating
0,Toy Story,1995,G,22 Nov 1995,81 min,"Animation, Adventure, Comedy",John Lasseter,"John Lasseter, Pete Docter, Andrew Stanton","Tom Hanks, Tim Allen, Don Rickles",A cowboy doll is profoundly threatened and jea...,...,8.3,1089101,tt0114709,movie,,"$223,225,679",,,True,8
1,Jumanji,1995,PG,15 Dec 1995,104 min,"Adventure, Comedy, Family",Joe Johnston,"Jonathan Hensleigh, Greg Taylor, Jim Strain","Robin Williams, Kirsten Dunst, Bonnie Hunt",When two kids find and play a magical board ga...,...,7.1,386214,tt0113497,movie,,"$100,499,940",,,True,9
2,Grumpier Old Men,1995,PG-13,22 Dec 1995,101 min,"Comedy, Romance",Howard Deutch,Mark Steven Johnson,"Walter Matthau, Jack Lemmon, Ann-Margret",John and Max resolve to save their beloved bai...,...,6.7,30085,tt0113228,movie,,"$71,518,503",,,True,7
3,Waiting to Exhale,1995,R,22 Dec 1995,124 min,"Comedy, Drama, Romance",Forest Whitaker,"Terry McMillan, Ron Bass","Whitney Houston, Angela Bassett, Loretta Devine","Based on Terry McMillan's novel, this film fol...",...,6.0,12518,tt0114885,movie,,"$67,052,156",,,True,8
4,Father of the Bride Part II,1995,PG,08 Dec 1995,106 min,"Comedy, Family, Romance",Charles Shyer,"Albert Hackett, Frances Goodrich, Nancy Meyers","Steve Martin, Diane Keaton, Martin Short",George Banks must deal not only with his daugh...,...,6.1,42329,tt0113041,movie,,"$76,594,107",,,True,9


In [488]:
user_movie_df = user_movie_df_cp.copy()

user_movie_df.drop(columns = ['Year',  'Released', 'Director', 'Writer', 'Actors', 'Plot', 'Country', 'Awards', 'Poster', 'Ratings', 'Metascore',  'imdbID', 'Type', 'DVD', 'Production', 'Website', 'Response', ], inplace = True)

# Rename 'Rated' column to 'adult'
user_movie_df.rename(columns={'Rated': 'adult'}, inplace=True)

# Encode the 'adult' column: 1 if 'R', otherwise 0
user_movie_df['adult'] = user_movie_df['adult'].apply(lambda x: 1 if x == 'R' else 0)


# Rename columns
user_movie_df.rename(columns={'Language': 'is_english', 'BoxOffice': 'revenue',
                              'imdbRating': 'vote_average', 'imdbVotes': 'vote_count'}, inplace=True)

# Encode 'is_english': 1 if 'English', else 0
user_movie_df['is_english'] = user_movie_df['is_english'].apply(lambda x: 1 if 'English' in x else 0)

# Convert 'revenue' to a numeric format (remove commas and dollar signs)
user_movie_df['revenue'] = pd.to_numeric(user_movie_df['revenue'].replace('[\$,]', '', regex=True), errors='coerce').fillna(0)

# Scale 'revenue' to match the scaling in combined_data
scaler_revenue = MinMaxScaler()
combined_data_revenue = scaler_revenue.fit(movies_data[['revenue']])
user_movie_df['revenue'] = scaler_revenue.transform(user_movie_df[['revenue']])

# Convert 'vote_count' to a numeric format (remove commas)
user_movie_df['vote_count'] = pd.to_numeric(user_movie_df['vote_count'].replace(',', '', regex=True), errors='coerce').fillna(0)

# Scale 'vote_count' to match the scaling in combined_data
scaler_vote_count = MinMaxScaler()
combined_data_vote_count = scaler_vote_count.fit(movies_data[['vote_count']])
user_movie_df['vote_count'] = scaler_vote_count.transform(user_movie_df[['vote_count']])

# Check the updated user_movie_df

# Rename 'Runtime' to 'runtime'
user_movie_df.rename(columns={'Runtime': 'runtime', 'Title':'title'}, inplace=True)

# Remove 'min' text and convert to float
user_movie_df['runtime'] = user_movie_df['runtime'].str.replace(' min', '').astype(float)

# Check the result to confirm the changes

# Step 1: Convert the 'Genre' column from a string to a list of genres
user_movie_df['genre_names'] = user_movie_df['Genre'].apply(lambda x: x.split(', '))

# Step 2: Drop the original 'Genre' column
user_movie_df.drop(columns=['Genre'], inplace=True)

# Step 3: Explode the 'genre_names' column
user_movie_df = user_movie_df.explode('genre_names')

# Step 4: One-hot encode the genres
genre_dummies = pd.get_dummies(user_movie_df['genre_names'], prefix='genre')

# Step 5: Aggregate back to unique movies by summing the one-hot encoded columns
user_movie_df = pd.concat([user_movie_df, genre_dummies], axis=1)
user_movie_df.drop(columns=['genre_names'], inplace=True)

user_movie_df = user_movie_df.groupby('title').agg({
    **{col: 'first' for col in user_movie_df.columns if col not in genre_dummies.columns},
    **{col: 'max' for col in genre_dummies.columns}
})
user_movie_df.reset_index(drop=True, inplace=True)


In [489]:
genre_user_movies_df = ["genre_" + str(genre) for genre in genre_movies_df]
genre_user_movies_df.remove('genre_nan')
print(genre_user_movies_df)

['genre_Animation', 'genre_Comedy', 'genre_Family', 'genre_Adventure', 'genre_Fantasy', 'genre_Romance', 'genre_Drama', 'genre_Action', 'genre_Crime', 'genre_Thriller', 'genre_Horror', 'genre_History', 'genre_Science Fiction', 'genre_Mystery', 'genre_War', 'genre_Foreign', 'genre_Music', 'genre_Documentary', 'genre_Western', 'genre_TV Movie']


In [490]:
for genre in genre_user_movies_df:
    if genre not in user_movie_df.columns:
      user_movie_df[genre] = False  # Add missing genre columns with a default value of 0


In [491]:
bool_columns = user_movie_df.select_dtypes(bool).columns
user_movie_df[bool_columns] = user_movie_df[bool_columns].astype(int)

In [492]:
# 3. Now, apply PCA using the same model used on combined_data
pca_genre_features = pca.transform(user_movie_df[genre_columns_pca])


In [493]:

# 4. Convert PCA output to a DataFrame and merge back to user_movie_df
pca_genre_df = pd.DataFrame(pca_genre_features, columns=[f'genre_{i+1}' for i in range(5)])
user_movie_df = user_movie_df.drop(columns=genre_columns_pca).join(pca_genre_df)

In [494]:
user_movie_df['is_zero_revenue'] = user_movie_df['revenue'] == 0
user_movie_df['is_zero_revenue'] = user_movie_df['is_zero_revenue'].astype(int)

In [495]:
movies_data.drop(columns = [ 'id', 'video'], inplace = True)

In [496]:
movies_data_cols = movies_data.columns
user_movie_df = user_movie_df[movies_data_cols]

In [497]:
user_movie_df.iloc[:,:]

Unnamed: 0,adult,revenue,runtime,title,vote_average,vote_count,genre_1,genre_2,genre_3,genre_4,genre_5,is_english,is_zero_revenue
0,0,1750582.0,106.0,Father of the Bride Part II,6.1,1470.472564,-0.668166,0.808688,0.225161,-0.02228,0.594476,1,0
1,0,1634578.0,101.0,Grumpier Old Men,6.7,1045.128921,-0.600321,0.769899,0.231293,-0.190643,0.612327,1,0
2,0,2296957.0,104.0,Jumanji,7.1,13416.674677,-0.824878,0.364864,0.251607,0.44936,-0.181847,1,0
3,0,5101892.0,81.0,Toy Story,8.3,37834.227537,-0.8231,0.308105,0.239037,0.410107,-0.154202,1,0
4,1,1532498.0,124.0,Waiting to Exhale,6.0,434.869873,0.249773,1.112682,0.283241,-0.123834,0.404534,1,0


In [498]:
movies_data.iloc[:5, :]

Unnamed: 0,adult,revenue,runtime,title,vote_average,vote_count,genre_1,genre_2,genre_3,genre_4,genre_5,is_english,is_zero_revenue
0,1,-0.174957,-0.287214,!Women Art Revolution,-0.686889,-0.219268,-0.400518,-0.342073,-0.731971,-0.024532,-0.027896,1,1
1,1,-0.174957,0.031621,#1 Cheerleader Camp,-1.15325,-0.176319,0.134504,0.749953,0.181486,-0.061336,-0.467415,1,1
2,1,-0.174957,-0.101227,#Horror,-1.15325,-0.114963,0.731443,-0.607799,0.386027,-0.94441,0.010047,0,1
3,1,-0.174957,-0.526341,#chicagoGirl,0.712194,-0.221313,-0.400518,-0.342073,-0.731971,-0.024532,-0.027896,1,1
4,1,-0.174957,0.270748,"$1,000 on the Black",0.194015,-0.219268,-0.296589,-0.248713,-0.376211,0.040463,0.01714,0,1


In [499]:
numerical_columns = [ 'revenue', 'runtime', 'vote_average', 'vote_count']
user_movie_df[numerical_columns] = user_movie_df[numerical_columns].astype(float)
user_movie_df[numerical_columns] = scaler.transform(user_movie_df[numerical_columns])

In [465]:
user_movie_df.iloc[:,:]

Unnamed: 0,adult,revenue,runtime,title,vote_average,vote_count,genre_1,genre_2,genre_3,genre_4,genre_5,is_english,is_zero_revenue
0,0,-0.147484,0.323887,Father of the Bride Part II,0.245833,2.784047,-0.668166,0.808688,0.225161,-0.02228,0.594476,1,0
1,0,-0.149304,0.191039,Grumpier Old Men,0.55674,1.914135,-0.600321,0.769899,0.231293,-0.190643,0.612327,1,0
2,0,-0.138909,0.270748,Jumanji,0.764011,27.216373,-0.824878,0.364864,0.251607,0.44936,-0.181847,1,0
3,0,-0.094889,-0.340354,Toy Story,1.385826,77.155059,-0.8231,0.308105,0.239037,0.410107,-0.154202,1,0
4,1,-0.150906,0.802141,Waiting to Exhale,0.194015,0.666036,0.249773,1.112682,0.283241,-0.123834,0.404534,1,0


In [500]:
# 1. Merge user ratings with user_movie_df
user_ratings_df = pd.DataFrame(user_ratings)
user_movie_ratings = user_movie_df.merge(user_ratings_df, on="title")


In [501]:
user_movie_ratings.iloc[:,:]

Unnamed: 0,adult,revenue,runtime,title,vote_average,vote_count,genre_1,genre_2,genre_3,genre_4,genre_5,is_english,is_zero_revenue,rating
0,0,-0.147484,0.323887,Father of the Bride Part II,0.245833,2.784047,-0.668166,0.808688,0.225161,-0.02228,0.594476,1,0,9
1,0,-0.149304,0.191039,Grumpier Old Men,0.55674,1.914135,-0.600321,0.769899,0.231293,-0.190643,0.612327,1,0,7
2,0,-0.138909,0.270748,Jumanji,0.764011,27.216373,-0.824878,0.364864,0.251607,0.44936,-0.181847,1,0,9
3,0,-0.094889,-0.340354,Toy Story,1.385826,77.155059,-0.8231,0.308105,0.239037,0.410107,-0.154202,1,0,8
4,1,-0.150906,0.802141,Waiting to Exhale,0.194015,0.666036,0.249773,1.112682,0.283241,-0.123834,0.404534,1,0,8


In [502]:

# 2. Extract features and ratings
user_features = user_movie_ratings.drop(columns=["title", "rating"]).values
user_ratings_values = user_movie_ratings["rating"].values


In [504]:
user_features

array([[ 0.00000000e+00, -1.47483611e-01,  3.23887227e-01,
         2.45832683e-01,  2.78404656e+00, -6.68165682e-01,
         8.08687950e-01,  2.25161243e-01, -2.22798300e-02,
         5.94476115e-01,  1.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00, -1.49304152e-01,  1.91039062e-01,
         5.56739913e-01,  1.91413537e+00, -6.00321248e-01,
         7.69898737e-01,  2.31292903e-01, -1.90643070e-01,
         6.12327445e-01,  1.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00, -1.38908958e-01,  2.70747961e-01,
         7.64011399e-01,  2.72163734e+01, -8.24877722e-01,
         3.64863844e-01,  2.51606946e-01,  4.49360120e-01,
        -1.81846558e-01,  1.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00, -9.48891320e-02, -3.40353597e-01,
         1.38582586e+00,  7.71550590e+01, -8.23099705e-01,
         3.08104772e-01,  2.39036816e-01,  4.10106786e-01,
        -1.54201905e-01,  1.00000000e+00,  0.00000000e+00],
       [ 1.00000000e+00, -1.50906162e-01,  8.0214062

In [None]:

# 3. Compute similarities between each movie in movies_data and the user's rated movies
movie_features = movies_data.drop(columns=["title"]).values
similarities = cosine_similarity(movie_features, user_features)


In [None]:

# 4. Weight the similarities by user ratings
weighted_similarities = similarities.dot(user_ratings_values) / user_ratings_values.sum()


In [None]:

# 5. Add similarity scores to movies_data
movies_data["similarity"] = weighted_similarities


In [None]:
# 6. Exclude movies already rated by the user
unseen_movies = movies_data[~movies_data["title"].isin(user_ratings_df["title"])]

In [None]:
# 7. Get the top 5 most similar movies
top_5_movies = unseen_movies.sort_values(by="similarity", ascending=False).head(5)


In [466]:

# Display the top 5 movies
print(top_5_movies[["title", "similarity"]])


                                 title  similarity
19340                      Love, Rosie    0.846759
22                (500) Days of Summer    0.845337
13355                    Groundhog Day    0.839380
14150                              Her    0.831577
35683  The Perks of Being a Wallflower    0.831402
