In [1]:
import numpy as np
import pandas as pd

In [2]:
# Reading the csv file
df=pd.read_csv('netflix_titles.csv')

In [3]:
# Filling the null values with mode for the numeric column and then the columns with string is filled with NULL
df['country'] = df['country'].fillna(df['country'].mode()[0])
df['date_added'] = df['date_added'].fillna(df['date_added'].mode()[0])
df['rating'] = df['rating'].fillna(df['country'].mode()[0])
df['duration'] = df['duration'].fillna('0 min')
df['cast'] = df['cast'].fillna('NULL')
df['director'] = df['director'].fillna('NULL')


In [4]:
# Checking is any Missing data in each column of the dataset.
df.isna().sum()

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

In [5]:
movie_df = df[df['type'] == 'Movie']
tv_df = df[df['type'] == 'TV Show']

In [6]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",United States,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,United States,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

In [8]:
#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df['description'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(8807, 18895)

In [9]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [10]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(df.index, index=df['title']).drop_duplicates()
indices

title
Dick Johnson Is Dead        0
Blood & Water               1
Ganglands                   2
Jailbirds New Orleans       3
Kota Factory                4
                         ... 
Zodiac                   8802
Zombie Dumb              8803
Zombieland               8804
Zoom                     8805
Zubaan                   8806
Length: 8807, dtype: int64

In [11]:
features=['title', 'director','cast','listed_in', 'description']
df_features = df[features]

In [12]:
def combined_features(x):
    return x['title'] + ' ' + x['director'] + ' ' + x['cast'] + ' ' +x['listed_in']+' '+ x['description']

In [13]:
df_features['combined_features'] = df_features.apply(combined_features, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_features['combined_features'] = df_features.apply(combined_features, axis=1)


In [14]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df_features['combined_features'])

In [15]:
# Compute the Cosine Similarity matrix based on the count_matrix
# 0 means no similarity, where as 1 means that both the items are 100% similar.
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix)

In [16]:
#Construct a reverse map of indices and movie titles
df_features = df_features.reset_index()
indices = pd.Series(df_features.index, index=df_features['title']).drop_duplicates()
indices

title
Dick Johnson Is Dead        0
Blood & Water               1
Ganglands                   2
Jailbirds New Orleans       3
Kota Factory                4
                         ... 
Zodiac                   8802
Zombie Dumb              8803
Zombieland               8804
Zoom                     8805
Zubaan                   8806
Length: 8807, dtype: int64

In [17]:
def recommendations_title(title, cosine_sim=cosine_sim):
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]

In [18]:
recommendations_title('The Conjuring', cosine_sim2)

1284                                  The Conjuring 2
1118                                        Insidious
3450                                In the Tall Grass
5903                                            Creep
5359                                  Raising the Bar
5737    I Am the Pretty Thing That Lives in the House
4872                                     Family Blood
7168                                           Kanika
5110                                          Creep 2
5042                                       The Ritual
Name: title, dtype: object

In [19]:
recommendations_title('#Alive', cosine_sim2)

4211               Nang Nak
5317        Berlin Syndrome
2729                   Kaal
2589                 Psycho
7280                 Lechmi
1783    The Day of the Lord
1898            The Binding
7168                 Kanika
3698           Inhuman Kiss
5042             The Ritual
Name: title, dtype: object

In [20]:
recommendations_title('High & Low The Movie', cosine_sim2)

1969                                  Road To High & Low
1965                 High & Low The Movie 2 / End of Sky
1966              High & Low The Movie 3 / Final Mission
1968                                High & Low The Worst
1967                             High & Low The Red Rain
61      Naruto the Movie 2: Legend of the Stone of Gelel
56              Naruto Shippuden the Movie: Blood Prison
3842                           Gatao 2: Rise of the King
4559                                              Anjaan
5293        Berserk: The Golden Age Arc III - The Advent
Name: title, dtype: object

In [21]:
recommendations_title('Squid Game', cosine_sim2)

5065                    Prison Playbook
3288                     Chief of Staff
731     Love (ft. Marriage and Divorce)
686                   Hospital Playlist
3928                            Persona
5665                     Color of Woman
4132            Romance is a bonus book
1528                             Run On
3610                          Let’s Eat
3473                             The K2
Name: title, dtype: object

In [22]:
recommendations_title('Stranger Things', cosine_sim2)

5200         Beyond Stranger Things
2190           The Umbrella Academy
8803                    Zombie Dumb
2303                    Warrior Nun
880                         Haunted
1127               Prank Encounters
3187                    Nightflyers
6167    Anjaan: Special Crimes Unit
241                        Manifest
1335                     The Sinner
Name: title, dtype: object

In [23]:
recommendations_title('13 Reasons Why', cosine_sim2)

3561    13 Reasons Why: Beyond the Reasons
3604                              Sintonia
6841                            Get Shorty
7991                       Shadow of Truth
1335                            The Sinner
3789                        Killer Ratings
4508                         Stunt Science
5457                             Mind Game
5038                               Re:Mind
3937                             Imposters
Name: title, dtype: object

In [24]:
recommendations_title('PK', cosine_sim2)

1114            3 Idiots
3131      Dil Chahta Hai
1022    Taare Zameen Par
4507               Sanju
7119              Janaan
5718     Mumbai Cha Raja
7590            No Entry
1441       Hello Brother
1019              Lagaan
3132     Dil Dhadakne Do
Name: title, dtype: object