In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
pd.set_option('display.notebook_repr_html', True)

In [2]:
# Importing dataset: read the Netflix titles CSV from the working directory
shows = pd.read_csv('netflix_titles.csv')
# Preview the first 10 rows
shows.head(10)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...
5,s6,TV Show,46,Serdar Akar,"Erdal Beşikçioğlu, Yasemin Allen, Melis Birkan...",Turkey,"July 1, 2017",2016,TV-MA,1 Season,"International TV Shows, TV Dramas, TV Mysteries",A genetics professor experiments with a treatm...
6,s7,Movie,122,Yasir Al Yasiri,"Amina Khalil, Ahmed Dawood, Tarek Lotfy, Ahmed...",Egypt,"June 1, 2020",2019,TV-MA,95 min,"Horror Movies, International Movies","After an awful accident, a couple admitted to ..."
7,s8,Movie,187,Kevin Reynolds,"Samuel L. Jackson, John Heard, Kelly Rowan, Cl...",United States,"November 1, 2019",1997,R,119 min,Dramas,After one of his high school students attacks ...
8,s9,Movie,706,Shravan Kumar,"Divya Dutta, Atul Kulkarni, Mohan Agashe, Anup...",India,"April 1, 2019",2019,TV-14,118 min,"Horror Movies, International Movies","When a doctor goes missing, his psychiatrist w..."
9,s10,Movie,1920,Vikram Bhatt,"Rajneesh Duggal, Adah Sharma, Indraneil Sengup...",India,"December 15, 2017",2008,TV-MA,143 min,"Horror Movies, International Movies, Thrillers",An architect and his wife move into a castle t...


In [3]:
# Column dtypes and non-null counts, to spot columns with missing values
shows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7787 entries, 0 to 7786
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       7787 non-null   object
 1   type          7787 non-null   object
 2   title         7787 non-null   object
 3   director      5398 non-null   object
 4   cast          7069 non-null   object
 5   country       7280 non-null   object
 6   date_added    7777 non-null   object
 7   release_year  7787 non-null   int64 
 8   rating        7780 non-null   object
 9   duration      7787 non-null   object
 10  listed_in     7787 non-null   object
 11  description   7787 non-null   object
dtypes: int64(1), object(11)
memory usage: 730.2+ KB


In [4]:
# Number of missing values per column
shows.isna().sum()

show_id            0
type               0
title              0
director        2389
cast             718
country          507
date_added        10
release_year       0
rating             7
duration           0
listed_in          0
description        0
dtype: int64

In [5]:
# Replace every missing value with an empty string so the text columns can be
# concatenated later without producing NaN
shows = shows.fillna('')

In [6]:
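# One-time setup: uncomment and run these two lines to download the NLTK data
# (stop-word corpus and tokenizer models) that the next cell relies on.
#import nltk
#nltk.download()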

In [7]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string


# English stop words, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))


def stop_words_filter(d):
    """Return `d` with punctuation and English stop words removed.

    Every character in `string.punctuation` is deleted first, the remaining
    text is split with `word_tokenize`, tokens whose lowercase form is in
    `stop_words` are dropped, and the surviving tokens (original casing kept)
    are joined with single spaces.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    stripped = d.translate(punctuation_table)
    kept = (word for word in word_tokenize(stripped) if word.lower() not in stop_words)
    return ' '.join(kept)

# Clean every description with stop_words_filter and keep the result in a new column
shows['tokens'] = shows['description'].map(stop_words_filter)

shows['tokens'].head()

0    future elite inhabit island paradise far crowd...
1    devastating earthquake hits Mexico City trappe...
2    army recruit found dead fellow soldiers forced...
3    postapocalyptic world ragdoll robots hide fear...
4    brilliant group students become cardcounting e...
Name: tokens, dtype: object

In [8]:
# Strip the commas from listed_in so the genre names become plain space-separated text
shows['genres'] = shows['listed_in'].str.replace(',', '', regex=False)
shows['genres'].head()

0    International TV Shows TV Dramas TV Sci-Fi & F...
1                          Dramas International Movies
2                   Horror Movies International Movies
3    Action & Adventure Independent Movies Sci-Fi &...
4                                               Dramas
Name: genres, dtype: object

In [9]:
# Columns whose text is concatenated into one feature string per title
features = ['director', 'tokens', 'genres']

# Combining the feature columns
def combine(df):
    """Join the columns named in `features` with single spaces, in list order.

    `df` may be a single row or the whole DataFrame: the `+` concatenation
    used here works on individual strings and on whole columns alike, so the
    function can be applied row-wise or called once on the full frame.
    """
    combined = df[features[0]]
    for column in features[1:]:
        combined = combined + ' ' + df[column]
    return combined

# Called once on the whole frame (column-wise concatenation) instead of once per
# row via apply(axis=1); the resulting column is the same.
shows['features'] = combine(shows)
shows['features'].head(5)

0     future elite inhabit island paradise far crow...
1    Jorge Michel Grau devastating earthquake hits ...
2    Gilbert Chan army recruit found dead fellow so...
3    Shane Acker postapocalyptic world ragdoll robo...
4    Robert Luketic brilliant group students become...
Name: features, dtype: object

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
# Imported under an alias: the result below keeps the name `cosine_similarity`
# (later cells index into it), and the alias keeps the function itself reachable
# instead of being overwritten by its own result.
from sklearn.metrics.pairwise import cosine_similarity as pairwise_cosine_similarity

# Word-count vector for each title's combined feature string
cv = CountVectorizer()
count_matrix = cv.fit_transform(shows['features'])
# Calculating cosine similarity between the shows (one row and one column per title)
cosine_similarity = pairwise_cosine_similarity(count_matrix)

In [11]:
# Draw a single title at random to serve as the query for the recommender
random = shows.sample(n=1)
with pd.option_context('display.max_colwidth', None):
    # Positional columns 1, 2, 5, 10, 11; untruncated text thanks to max_colwidth=None
    display(random.take([1, 2, 5, 10, 11], axis=1))

Unnamed: 0,type,title,country,listed_in,description
5375,Movie,Sandy Wexler,United States,Comedies,"When a hapless but dedicated talent manager signs his first client who actually has talent, his career finally starts to take off."


In [12]:
# Pair every title's row position with its similarity to the sampled title
recommended_movies = [
    (position, score)
    for position, score in enumerate(cosine_similarity[random.index[0]])
]
# Order the pairs from most to least similar (stable sort, so ties keep row order)
sorted_rec_mov = list(recommended_movies)
sorted_rec_mov.sort(key=lambda pair: pair[1], reverse=True)

In [13]:
# Displaying the top 10 recommended movies according to the random movie generated above
num = 10
# Exclude the query title by its index rather than assuming it is ranked first:
# another title tied at the top score could otherwise push the query itself
# into the recommendations.
query_id = random.index[0]
top_ids = [idx for idx, _ in sorted_rec_mov if idx != query_id][:num]
# One label-based selection replaces the row-by-row DataFrame.append loop
# (DataFrame.append was removed in pandas 2.0). Like the original index comparison,
# this relies on `shows` having its default 0..n-1 index.
rec = shows.loc[top_ids]
with pd.option_context('display.max_colwidth', None):
    display(rec.iloc[:, [1,2,5,10,11]])

Unnamed: 0,type,title,country,listed_in,description
6563,Movie,The Last Laugh,United States,"Comedies, Dramas","After moving to a retirement home, restless talent manager Al reconnects with long-ago client Buddy and coaxes him back out on the comedy circuit."
5914,Movie,Superstar,United States,Comedies,A socially awkward Catholic schoolgirl vows to win a dreamy classmate’s affections by taking first prize at an upcoming talent show.
6307,Movie,The Do-Over,United States,"Action & Adventure, Comedies",The life of a bank manager is turned upside down when a friend from his past manipulates him into faking his own death and taking off on an adventure.
6848,Movie,The Sapphires,Australia,"Comedies, Dramas, Independent Movies","Sisters Gail, Cynthia and Julie have talent, but their career takes off when promoter Dave convinces them to tour for U.S. troops in Vietnam."
3597,Movie,Lembi 8 Giga,,"Comedies, International Movies","To upgrade his life, a simple man fakes his credentials as a lawyer when sudden extraordinary abilities give him the talent to thrive."
4760,TV Show,Paquita Salas,Spain,"International TV Shows, Spanish-Language TV Shows, TV Comedies","One of Spain's best talent agents in the '90s, Paquita now finds herself searching desperately for new stars after suddenly losing her biggest client."
1067,Movie,Boy Bye,United States,"Comedies, Romantic Movies","Single entrepreneur Veronica finally starts to believe in love after meeting sexy Lance, but she soon discovers he's hiding something from her."
1675,Movie,Death to 2020,United States,Comedies,"As the year we all want to end finally does, take a look back at 2020's mad glory in this comedic retrospective from the creators of ""Black Mirror."""
2122,Movie,Fat Ballerina - David A. Arnold,United States,Stand-Up Comedy,"Finally comfortable in his skin, seasoned comic David A. Arnold shares his talent for doing nothing, how he's petty and why divorce saves marriages."
2255,Movie,Frances Ha,"United States, Brazil","Comedies, Dramas, Independent Movies","Determined to make it as a modern dancer in New York, a young woman pursues her unlikely goal with more enthusiasm than natural talent."
