In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import nltk
import re
from nltk.corpus import stopwords
import string
nltk.download('stopwords')
stemmer = nltk.SnowballStemmer("english")
stopword=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Yehia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text

In [4]:
data = pd.read_csv("netflixData.csv")
data.head()

Unnamed: 0,Show Id,Title,Description,Director,Genres,Cast,Production Country,Release Date,Rating,Duration,Imdb Score,Content Type,Date Added
0,cc1b6ed9-cf9e-4057-8303-34577fb54477,(Un)Well,This docuseries takes a deep dive into the luc...,,Reality TV,,United States,2020.0,TV-MA,1 Season,6.6/10,TV Show,
1,e2ef4e91-fb25-42ab-b485-be8e3b23dedb,#Alive,"As a grisly virus rampages a city, a lone man ...",Cho Il,"Horror Movies, International Movies, Thrillers","Yoo Ah-in, Park Shin-hye",South Korea,2020.0,TV-MA,99 min,6.2/10,Movie,"September 8, 2020"
2,b01b73b7-81f6-47a7-86d8-acb63080d525,#AnneFrank - Parallel Stories,"Through her diary, Anne Frank's story is retol...","Sabina Fedeli, Anna Migotto","Documentaries, International Movies","Helen Mirren, Gengher Gatti",Italy,2019.0,TV-14,95 min,6.4/10,Movie,"July 1, 2020"
3,b6611af0-f53c-4a08-9ffa-9716dc57eb9c,#blackAF,Kenya Barris and his family navigate relations...,,TV Comedies,"Kenya Barris, Rashida Jones, Iman Benson, Genn...",United States,2020.0,TV-MA,1 Season,6.6/10,TV Show,
4,7f2d4170-bab8-4d75-adc2-197f7124c070,#cats_the_mewvie,This pawesome documentary explores how our fel...,Michael Margolis,"Documentaries, International Movies",,Canada,2020.0,TV-14,90 min,5.1/10,Movie,"February 5, 2020"


In [5]:
data.isnull().sum()

Show Id                  0
Title                    0
Description              0
Director              2064
Genres                   0
Cast                   530
Production Country     559
Release Date             3
Rating                   4
Duration                 3
Imdb Score             608
Content Type             0
Date Added            1335
dtype: int64

In [6]:
df=data[["Title","Description","Genres","Content Type"]]

In [7]:
df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


In [8]:
df.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace=True)


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5958 entries, 0 to 5966
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Title         5958 non-null   object
 1   Description   5958 non-null   object
 2   Genres        5958 non-null   object
 3   Content Type  5958 non-null   object
dtypes: object(4)
memory usage: 232.7+ KB


In [10]:
df.head()

Unnamed: 0,Title,Description,Genres,Content Type
0,(Un)Well,This docuseries takes a deep dive into the luc...,Reality TV,TV Show
1,#Alive,"As a grisly virus rampages a city, a lone man ...","Horror Movies, International Movies, Thrillers",Movie
2,#AnneFrank - Parallel Stories,"Through her diary, Anne Frank's story is retol...","Documentaries, International Movies",Movie
3,#blackAF,Kenya Barris and his family navigate relations...,TV Comedies,TV Show
4,#cats_the_mewvie,This pawesome documentary explores how our fel...,"Documentaries, International Movies",Movie


In [11]:
df["Title"] = df["Title"].apply(clean)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Title"] = df["Title"].apply(clean)


In [20]:
df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


In [13]:
df.reset_index(inplace=True,drop=True)

In [21]:
df.head(50)

Unnamed: 0,Title,Description,Genres,Content Type
0,unwel,This docuseries takes a deep dive into the luc...,Reality TV,TV Show
1,aliv,"As a grisly virus rampages a city, a lone man ...","Horror Movies, International Movies, Thrillers",Movie
2,annefrank parallel stori,"Through her diary, Anne Frank's story is retol...","Documentaries, International Movies",Movie
3,blackaf,Kenya Barris and his family navigate relations...,TV Comedies,TV Show
4,catsthemewvi,This pawesome documentary explores how our fel...,"Documentaries, International Movies",Movie
5,friendbutmarri,"Pining for his high school crush for years, a ...","Dramas, International Movies, Romantic Movies",Movie
6,friendbutmarri,As Ayu and Ditto finally transition from best ...,"Dramas, International Movies, Romantic Movies",Movie
7,realityhigh,When nerdy high schooler Dani finally attracts...,Comedies,Movie
8,,This documentary celebrates the 50th anniversa...,"Documentaries, Sports Movies",Movie
9,selfi,"Two days before their final exams, three teen ...","Comedies, Dramas, International Movies",Movie


In [14]:
feature = df["Genres"].tolist()
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(feature)
similarity = cosine_similarity(X)

In [15]:
indices = pd.Series(df.index, 
                    index=df['Title']).drop_duplicates()

In [22]:
def netFlix_recommendation(title, similarity = similarity,number_of_movies=10):
    index = indices[title]
    similarity_scores = list(enumerate(similarity[index]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similarity_scores = similarity_scores[0:number_of_movies]
    movieindices = [i[0] for i in similarity_scores]
    titles=df.iloc[movieindices].reset_index(drop=True)
    #for i in range(len(movieindices)):
        #print(titles.index[i],titles["Title"][i])
        
    return titles
    
netFlix_recommendation("blackaf",number_of_movies=10)

Unnamed: 0,Title,Description,Genres,Content Type
0,blackaf,Kenya Barris and his family navigate relations...,TV Comedies,TV Show
1,washington,Hip-hop icon MC Joe Speed retires from showbiz...,TV Comedies,TV Show
2,arrest develop,It's the Emmy-winning story of a wealthy famil...,TV Comedies,TV Show
3,astronomi club sketch show,With unique individual perspectives that conve...,TV Comedies,TV Show
4,aunti donna big ol hous fun,Comedy trio Aunty Donna showcase their uniquel...,TV Comedies,TV Show
5,big mouth,Teenage friends find their lives upended by th...,TV Comedies,TV Show
6,bojack horseman,Meet the most beloved sitcom horse of the '90s...,TV Comedies,TV Show
7,brew brother,Two rival brothers must work together to keep ...,TV Comedies,TV Show
8,champion,"Years after getting his girlfriend pregnant, w...",TV Comedies,TV Show
9,chappell show,The brilliant Dave Chappelle performs blisteri...,TV Comedies,TV Show
