In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer

In [2]:
df_netflix = pd.read_csv(r'Netflix_titles.csv')
df_HBO = pd.read_csv(r'HBO.csv')
df_amazon = pd.read_csv(r'Amazon.csv')

In [3]:
df_netflix.head()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts300399,Five Came Back: The Reference Films,SHOW,This collection includes 12 World War II-era p...,1945,TV-MA,51,['documentation'],['US'],1.0,,,,0.6,
1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",['US'],,tt0075314,8.2,808582.0,40.965,8.179
2,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,109,"['drama', 'action', 'thriller', 'european']",['US'],,tt0068473,7.7,107673.0,10.01,7.3
3,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,"['fantasy', 'action', 'comedy']",['GB'],,tt0071853,8.2,534486.0,15.461,7.811
4,tm120801,The Dirty Dozen,MOVIE,12 American military prisoners in World War II...,1967,,150,"['war', 'action']","['GB', 'US']",,tt0061578,7.7,72662.0,20.398,7.6


In [4]:
df = pd.concat([df_netflix, df_HBO, df_amazon], ignore_index=True)
df.shape

(19015, 15)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19015 entries, 0 to 19014
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    19015 non-null  object 
 1   title                 19014 non-null  object 
 2   type                  19015 non-null  object 
 3   description           18866 non-null  object 
 4   release_year          19015 non-null  int64  
 5   age_certification     8701 non-null   object 
 6   runtime               19015 non-null  int64  
 7   genres                19015 non-null  object 
 8   production_countries  19015 non-null  object 
 9   seasons               4219 non-null   float64
 10  imdb_id               17619 non-null  object 
 11  imdb_score            17140 non-null  float64
 12  imdb_votes            17103 non-null  float64
 13  tmdb_popularity       18344 non-null  float64
 14  tmdb_score            16354 non-null  float64
dtypes: float64(5), int6

In [6]:
df_movies = df.drop_duplicates()
df_movies.drop(['description', 'age_certification'], axis=1, inplace=True)


In [7]:
df_movies['production_countries'] = df_movies['production_countries'].str.replace(r"\[", '', regex=True).str.replace(r"'", '', regex=True).str.replace(r"\]", '', regex=True)

df_movies['lead_prod_country'] = df_movies['production_countries'].str.split(',').str[0]

df_movies['prod_countries_cnt'] = df_movies['production_countries'].str.split(',').str.len()

df_movies['lead_prod_country'] = df_movies['lead_prod_country'].replace('', np.nan)
df.shape

(19015, 15)

In [8]:
df_movies['lead_prod_country']

0         US
1         US
2         US
3         GB
4         GB
        ... 
19010     US
19011     US
19012     IN
19013    NaN
19014    NaN
Name: lead_prod_country, Length: 18980, dtype: object

In [9]:
df_movies['genres']

0                                  ['documentation']
1                                 ['drama', 'crime']
2        ['drama', 'action', 'thriller', 'european']
3                    ['fantasy', 'action', 'comedy']
4                                  ['war', 'action']
                            ...                     
19010                                      ['drama']
19011                                     ['comedy']
19012                                      ['crime']
19013                            ['family', 'drama']
19014                                      ['drama']
Name: genres, Length: 18980, dtype: object

In [10]:
df_movies['genres'] = df_movies['genres'].str.replace(r"\[", '', regex=True).str.replace(r"'", '', regex=True).str.replace(r"\]", '', regex=True)

df_movies['main_genre'] = df_movies['genres'].str.split(',').str[0]

df_movies['main_genre'] = df_movies['main_genre'].replace('', np.nan)

In [11]:
cleaned_string = df['genres'].str.strip('[]').str.replace("'", "").str.split(',', expand=True).stack().reset_index(level=1, drop=True)
df = df.drop('genres', axis=1).join(cleaned_string.rename('genres'))
df['genres'] = df['genres'].str.strip()
df = df[df['genres'] != '']
df['genres'].reset_index(drop=True, inplace=True)
df.shape

(45479, 15)

In [12]:
df_movies['main_genre']


0        documentation
1                drama
2                drama
3              fantasy
4                  war
             ...      
19010            drama
19011           comedy
19012            crime
19013           family
19014            drama
Name: main_genre, Length: 18980, dtype: object

In [13]:
df_movies.drop(['genres', 'production_countries'], axis=1, inplace=True)


In [14]:
df_movies.dropna(inplace=True)

df_movies.set_index('title', inplace=True)

df_movies.drop(['id', 'imdb_id'], axis=1, inplace=True)

In [15]:
df.isnull().sum()

id                          0
title                       0
type                        0
description               144
release_year                0
age_certification       21271
runtime                     0
production_countries        0
seasons                 34374
imdb_id                  1644
imdb_score               2254
imdb_votes               2293
tmdb_popularity           901
tmdb_score               4086
genres                      0
dtype: int64

In [16]:
dummies = pd.get_dummies(df_movies[['type', 'lead_prod_country', 'main_genre']], drop_first=True)

df_movies_dum = pd.concat([df_movies, dummies], axis=1)

df_movies_dum.drop(['type', 'lead_prod_country', 'main_genre'], axis=1, inplace=True)

In [17]:
df_movies.columns

Index(['type', 'release_year', 'runtime', 'seasons', 'imdb_score',
       'imdb_votes', 'tmdb_popularity', 'tmdb_score', 'lead_prod_country',
       'prod_countries_cnt', 'main_genre'],
      dtype='object')

In [18]:
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df_movies_dum)
df_scaled = pd.DataFrame(df_scaled, columns=df_movies_dum.columns)


df_scaled

Unnamed: 0,release_year,runtime,seasons,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,prod_countries_cnt,lead_prod_country_AR,lead_prod_country_AT,...,main_genre_history,main_genre_horror,main_genre_music,main_genre_reality,main_genre_romance,main_genre_scifi,main_genre_sport,main_genre_thriller,main_genre_war,main_genre_western
0,0.397727,0.168539,0.058824,0.9125,0.037009,0.007913,0.815870,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.625000,0.134831,0.156863,0.9250,0.155671,0.058490,0.815326,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.545455,0.286517,0.058824,0.6750,0.017194,0.022579,0.728261,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.568182,0.056180,0.450980,0.6250,0.002570,0.018954,0.619565,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.625000,0.129213,0.078431,0.7000,0.017658,0.008919,0.782609,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3289,1.000000,0.162921,0.000000,0.5875,0.000194,0.000629,0.782609,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3290,0.988636,0.258427,0.000000,0.7500,0.000021,0.008224,0.728261,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3291,0.988636,0.264045,0.000000,0.8250,0.000012,0.002811,0.673913,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3292,0.988636,0.146067,0.000000,0.2750,0.000490,0.000628,0.271739,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
eps_array = [0.2, 0.5, 1] 
min_samples_array = [5, 10, 30] 
for eps in eps_array:
    for min_samples in min_samples_array:
        clusterer = DBSCAN(eps=eps, min_samples=min_samples).fit(df_scaled)
        
        cluster_labels = clusterer.labels_
        
        if len(set(cluster_labels)) == 1:
            continue 
        
        silhouette_avg = silhouette_score(df_scaled, cluster_labels)
        
        print("For eps =", eps,
              "For min_samples =", min_samples,
              "Count clusters =", len(set(cluster_labels)),
              "The average silhouette_score is :", silhouette_avg)

For eps = 0.2 For min_samples = 5 Count clusters = 75 The average silhouette_score is : 0.4378840737098286
For eps = 0.2 For min_samples = 10 Count clusters = 37 The average silhouette_score is : 0.36601440046646755
For eps = 0.2 For min_samples = 30 Count clusters = 17 The average silhouette_score is : 0.23106054247198202
For eps = 0.5 For min_samples = 5 Count clusters = 91 The average silhouette_score is : 0.601956050174035
For eps = 0.5 For min_samples = 10 Count clusters = 56 The average silhouette_score is : 0.5303679432698051
For eps = 0.5 For min_samples = 30 Count clusters = 21 The average silhouette_score is : 0.3622860416170049
For eps = 1 For min_samples = 5 Count clusters = 93 The average silhouette_score is : 0.6091664186394288
For eps = 1 For min_samples = 10 Count clusters = 57 The average silhouette_score is : 0.5362809971937993
For eps = 1 For min_samples = 30 Count clusters = 22 The average silhouette_score is : 0.3712130038803752


In [20]:
clusterer = DBSCAN(eps=0.5, min_samples=10).fit(df_scaled)

In [21]:
df_movies['cluster'] = clusterer.labels_

In [22]:
df_movies

Unnamed: 0_level_0,type,release_year,runtime,seasons,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,lead_prod_country,prod_countries_cnt,main_genre,cluster
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Monty Python's Flying Circus,SHOW,1969,30,4.0,8.8,73424.0,17.617,8.306,GB,1,comedy,0
Seinfeld,SHOW,1989,24,9.0,8.9,308824.0,130.213,8.301,US,1,comedy,1
Knight Rider,SHOW,1982,51,4.0,6.9,34115.0,50.267,7.500,US,1,scifi,2
Thomas & Friends,SHOW,1984,10,24.0,6.5,5104.0,42.196,6.500,GB,1,animation,4
Saved by the Bell,SHOW,1989,23,5.0,7.1,35034.0,19.855,8.000,US,1,family,3
...,...,...,...,...,...,...,...,...,...,...,...,...
Putham Pudhu Kaalai: Vidiyaadha,SHOW,2022,29,1.0,6.2,389.0,1.400,8.000,IN,1,drama,37
Chivas: El Rebaño Sagrado,SHOW,2021,46,1.0,7.5,46.0,18.308,7.500,MX,1,sport,-1
Be Yourself,SHOW,2021,47,1.0,8.1,29.0,6.259,7.000,CN,1,drama,31
LOL - Hasse Toh Phasse,SHOW,2021,26,1.0,3.7,978.0,1.399,3.300,IN,1,comedy,53


In [25]:
import random

def recommend_movie(movie_name: str):
    movie_name = movie_name.lower()

    df_movies['name'] = df_movies.index.str.lower()

    movie = df_movies[df_movies['name'].str.contains(movie_name, na=False)]

    if not movie.empty:
        cluster = movie['cluster'].values[0]

        cluster_movies = df_movies[df_movies['cluster'] == cluster]

        if len(cluster_movies) >= 5:
            recommended_movies = random.sample(list(cluster_movies.index), 5)
        else:
            recommended_movies = list(cluster_movies.index)

        print('--- We can recommend you these movies ---')
        for m in recommended_movies:
            print(m)
    else:
        print('Movie not found in the database.')

In [30]:
s = input('Input movie name: ')

print("\n\n")
recommend_movie(s)




--- We can recommend you these movies ---
Mortal Kombat: Conquest
From
Rick and Morty
Ben 10: Alien Force
Undone


In [33]:
df_movies

Unnamed: 0_level_0,type,release_year,runtime,seasons,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,lead_prod_country,prod_countries_cnt,main_genre,cluster,name
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Monty Python's Flying Circus,SHOW,1969,30,4.0,8.8,73424.0,17.617,8.306,GB,1,comedy,0,monty python's flying circus
Seinfeld,SHOW,1989,24,9.0,8.9,308824.0,130.213,8.301,US,1,comedy,1,seinfeld
Knight Rider,SHOW,1982,51,4.0,6.9,34115.0,50.267,7.500,US,1,scifi,2,knight rider
Thomas & Friends,SHOW,1984,10,24.0,6.5,5104.0,42.196,6.500,GB,1,animation,4,thomas & friends
Saved by the Bell,SHOW,1989,23,5.0,7.1,35034.0,19.855,8.000,US,1,family,3,saved by the bell
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Putham Pudhu Kaalai: Vidiyaadha,SHOW,2022,29,1.0,6.2,389.0,1.400,8.000,IN,1,drama,37,putham pudhu kaalai: vidiyaadha
Chivas: El Rebaño Sagrado,SHOW,2021,46,1.0,7.5,46.0,18.308,7.500,MX,1,sport,-1,chivas: el rebaño sagrado
Be Yourself,SHOW,2021,47,1.0,8.1,29.0,6.259,7.000,CN,1,drama,31,be yourself
LOL - Hasse Toh Phasse,SHOW,2021,26,1.0,3.7,978.0,1.399,3.300,IN,1,comedy,53,lol - hasse toh phasse


In [34]:
df_movies.to_csv('movies_with_clusters.csv')