In [1]:
# Import library

import numpy as np
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Default size of the figures rendered in this notebook.
import matplotlib.pyplot as plt  # plotting library

plt.rc("figure", figsize=(20, 6))

In [3]:
# Load the dataset from a CSV file; the path is relative to the working directory.
data = pd.read_csv("dataset/netflixData.csv")


In [4]:
# Count the missing values in each column.
data.isna().sum()

Show Id                  0
Title                    0
Description              0
Director              2064
Genres                   0
Cast                   530
Production Country     559
Release Date             3
Rating                   4
Duration                 3
Imdb Score             608
Content Type             0
Date Added            1335
dtype: int64

+ Dataset berisi nilai null, namun sebelum menghapus nilai null, mari pilih kolom yang dapat kita gunakan untuk membangun sistem rekomendasi Netflix:


In [5]:
# Print each column's name, non-null count and dtype.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5967 entries, 0 to 5966
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Show Id             5967 non-null   object 
 1   Title               5967 non-null   object 
 2   Description         5967 non-null   object 
 3   Director            3903 non-null   object 
 4   Genres              5967 non-null   object 
 5   Cast                5437 non-null   object 
 6   Production Country  5408 non-null   object 
 7   Release Date        5964 non-null   float64
 8   Rating              5963 non-null   object 
 9   Duration            5964 non-null   object 
 10  Imdb Score          5359 non-null   object 
 11  Content Type        5967 non-null   object 
 12  Date Added          4632 non-null   object 
dtypes: float64(1), object(12)
memory usage: 606.1+ KB


In [6]:
# Preview the first rows of the dataset.
data.head()

Unnamed: 0,Show Id,Title,Description,Director,Genres,Cast,Production Country,Release Date,Rating,Duration,Imdb Score,Content Type,Date Added
0,cc1b6ed9-cf9e-4057-8303-34577fb54477,(Un)Well,This docuseries takes a deep dive into the luc...,,Reality TV,,United States,2020.0,TV-MA,1 Season,6.6/10,TV Show,
1,e2ef4e91-fb25-42ab-b485-be8e3b23dedb,#Alive,"As a grisly virus rampages a city, a lone man ...",Cho Il,"Horror Movies, International Movies, Thrillers","Yoo Ah-in, Park Shin-hye",South Korea,2020.0,TV-MA,99 min,6.2/10,Movie,"September 8, 2020"
2,b01b73b7-81f6-47a7-86d8-acb63080d525,#AnneFrank - Parallel Stories,"Through her diary, Anne Frank's story is retol...","Sabina Fedeli, Anna Migotto","Documentaries, International Movies","Helen Mirren, Gengher Gatti",Italy,2019.0,TV-14,95 min,6.4/10,Movie,"July 1, 2020"
3,b6611af0-f53c-4a08-9ffa-9716dc57eb9c,#blackAF,Kenya Barris and his family navigate relations...,,TV Comedies,"Kenya Barris, Rashida Jones, Iman Benson, Genn...",United States,2020.0,TV-MA,1 Season,6.6/10,TV Show,
4,7f2d4170-bab8-4d75-adc2-197f7124c070,#cats_the_mewvie,This pawesome documentary explores how our fel...,Michael Margolis,"Documentaries, International Movies",,Canada,2020.0,TV-14,90 min,5.1/10,Movie,"February 5, 2020"


+ Dari kesan pertama terhadap kumpulan data, terlihat bahwa kolom Title perlu dibersihkan karena beberapa judul diawali karakter # sebelum nama film atau acara TV.

In [8]:
# Keep only the columns used to build the Netflix recommendation system.
data = data.loc[
    :,
    ["Title", "Description", "Content Type", "Genres"],
]
data.head()

Unnamed: 0,Title,Description,Content Type,Genres
0,(Un)Well,This docuseries takes a deep dive into the luc...,TV Show,Reality TV
1,#Alive,"As a grisly virus rampages a city, a lone man ...",Movie,"Horror Movies, International Movies, Thrillers"
2,#AnneFrank - Parallel Stories,"Through her diary, Anne Frank's story is retol...",Movie,"Documentaries, International Movies"
3,#blackAF,Kenya Barris and his family navigate relations...,TV Show,TV Comedies
4,#cats_the_mewvie,This pawesome documentary explores how our fel...,Movie,"Documentaries, International Movies"


### Now let’s drop the rows containing null values and move further:

In [9]:
# Drop rows with missing values, then renumber the index from 0 so that row
# labels coincide with row positions. The recommendation step addresses rows of
# the (positional) similarity matrix through this index, so gaps left by dropped
# rows would point at the wrong titles.
data = data.dropna().reset_index(drop=True)

+ Sekarang saya akan membersihkan kolom Title karena kolom ini memerlukan beberapa langkah persiapan data:

In [10]:
# Text-processing dependencies used to clean the titles.
import re
import string

import nltk
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus.
nltk.download("stopwords")

# English stop-word set and Snowball stemmer used by the cleaning step.
stopword = set(stopwords.words("english"))
stemmer = nltk.SnowballStemmer(language="english")


[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [11]:
def clean(text):
    """Normalize a piece of text for title matching.

    Steps, in order: lower-case; remove bracketed segments, URLs and HTML-like
    tags; remove punctuation and newlines; remove every token that contains a
    digit; drop English stop words; stem the remaining words.

    Parameters
    ----------
    text : object
        Value to clean; it is converted with ``str()`` first.

    Returns
    -------
    str
        The stemmed words joined by single spaces. The result can be an empty
        string, e.g. for a title made only of digits or stop words.

    Notes
    -----
    The parameter name shadows the imported ``text`` module inside this
    function only.
    """
    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being read as
    # invalid Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # split() without an argument discards the empty tokens left behind by the
    # removals above, so the result never contains doubled spaces.
    words = [stemmer.stem(word) for word in text.split() if word not in stopword]
    return " ".join(words)
data["Title"] = data["Title"].apply(clean)

In [13]:
# Look at a few cleaned titles before moving on; the fixed seed keeps the
# sample identical across kernel restarts.
data["Title"].sample(10, random_state=42)

5267              umbrella academi
382          anjelah johnson fanci
1815                ginni  georgia
4705              cat hat know lot
1024               citi last thing
1112    craig ferguson tickl fight
306                    alonetogeth
3831                    racket boy
4513                   sweet  sour
5492      trevor noah son patricia
Name: Title, dtype: object

+ Sekarang saya akan menggunakan kolom Genres sebagai fitur untuk merekomendasikan konten serupa kepada pengguna.
+ Saya akan menggunakan konsep kesamaan kosinus (cosine similarity).

In [14]:
# Turn each title's genre string into a TF-IDF vector and compare every pair
# of titles with cosine similarity.
feature = data["Genres"].tolist()
# `input` is left at its default ("content"): the parameter names how the
# documents are supplied ("filename", "file" or "content"); it is not the
# documents themselves, which go to fit_transform().
tfidf = text.TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(feature)
# Square matrix: entry [i, j] is the similarity between rows i and j of `data`.
similarity = cosine_similarity(tfidf_matrix)

+ Sekarang saya akan mengatur kolom Judul sebagai indeks sehingga kita dapat menemukan konten serupa dengan memberikan judul film atau acara TV sebagai input:

In [15]:
# Map each cleaned title to the position of its row, which is what the
# similarity matrix and .iloc expect (index labels can differ from positions).
indices = pd.Series(range(len(data)), index=data['Title'])
# Series.drop_duplicates() would compare the values (unique row positions) and
# remove nothing; de-duplicate on the title index instead, keeping the first
# row of each title so that a lookup always yields a single position.
indices = indices[~indices.index.duplicated(keep='first')]

In [16]:
# Recommend movies and TV shows that are most similar to a given title.
def netFlix_recommendation(title, similarity = similarity):
    """Return up to ten titles most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in ``indices``. It must be written the way titles are
        stored in ``data['Title']``, i.e. after cleaning (lower-cased, stemmed).
    similarity : array-like, optional
        Square matrix of pairwise similarity scores; defaults to the matrix
        that exists when this function is defined.

    Returns
    -------
    pandas.Series
        The most similar titles, best match first. The queried title itself is
        excluded.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    if title not in indices.index:
        raise KeyError(
            "Title {!r} not found; pass the cleaned form of the title "
            "(lower-cased, stemmed) as stored in data['Title'].".format(title)
        )
    index = indices[title]
    # A duplicated title makes the lookup return a Series; use its first row.
    if isinstance(index, pd.Series):
        index = index.iloc[0]
    # Skip the queried row so a title is never recommended for itself.
    similarity_scores = [
        (position, score)
        for position, score in enumerate(similarity[index])
        if position != index
    ]
    # Stable sort: rows with equal scores keep their original order.
    similarity_scores.sort(key=lambda pair: pair[1], reverse=True)
    movieindices = [position for position, _ in similarity_scores[:10]]
    return data['Title'].iloc[movieindices]

In [17]:
# Example query; the title is given in its cleaned form, as stored in data['Title'].
netFlix_recommendation("girlfriend")

3                          blackaf
285                     washington
417                 arrest develop
434     astronomi club sketch show
451    aunti donna big ol hous fun
656                      big mouth
752                bojack horseman
805                   brew brother
935                       champion
937                  chappell show
Name: Title, dtype: object

+ Sistem rekomendasi Netflix memprediksi katalog yang dipersonalisasi untuk Anda berdasarkan faktor-faktor seperti riwayat tontonan Anda, riwayat tontonan pengguna lain dengan selera dan preferensi yang sama, serta genre, kategori, deskripsi, dan informasi lebih lanjut tentang konten yang Anda tonton. Notebook ini hanya menerapkan sebagian kecil dari gagasan tersebut, yaitu rekomendasi berdasarkan kemiripan genre.