In [1]:
import numpy as np
import pandas as pd

In [2]:
# Reading the csv file
df=pd.read_csv('netflix_titles.csv')

In [3]:
# Filling the null values with mode for the numeric column and then the columns with string is filled with NULL
df['country'] = df['country'].fillna(df['country'].mode()[0])
df['date_added'] = df['date_added'].fillna(df['date_added'].mode()[0])
df['rating'] = df['rating'].fillna(df['country'].mode()[0])
df['duration'] = df['duration'].fillna('0 min')
df['cast'] = df['cast'].fillna('NULL')
df['director'] = df['director'].fillna('NULL')

In [4]:
# Checking is any Missing data in each column of the dataset.
df.isna().sum()

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

Mini-batch K-means clustering algorithm

The Mini-batch K-means clustering algorithm is a version of the standard K-means algorithm in machine learning. It uses small, random, fixed-size batches of data to store in memory, and then with each iteration, a random sample of the data is collected and used to update the clusters.

Each iteration a new random sample from the dataset is obtained and used to update the clusters and this is repeated until convergence.

In [5]:
df1=df
df1["date_added"] = pd.to_datetime(df['date_added'])
df1['directors'] = df['director'].apply(lambda l: [] if pd.isna(l) else [i.strip() for i in l.split(",")])
df1['categories'] = df['listed_in'].apply(lambda l: [] if pd.isna(l) else [i.strip() for i in l.split(",")])
df1['actors'] = df['cast'].apply(lambda l: [] if pd.isna(l) else [i.strip() for i in l.split(",")])
df1['countries'] = df['country'].apply(lambda l: [] if pd.isna(l) else [i.strip() for i in l.split(",")])

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

In [7]:
#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df['description'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(8807, 18895)

In [8]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [9]:
def find_similar(tfidf_matrix, index, top_n = 5):
    cosine_similarities = linear_kernel(tfidf_matrix[index:index+1], tfidf_matrix).flatten()
    related_docs_indices = [i for i in cosine_similarities.argsort()[::-1] if i != index]
    return [index for index in related_docs_indices][0:top_n] 

In [10]:
# Build the tfidf matrix with the descriptions
import time
start_time = time.time()
text_content = df1['description']
vector = TfidfVectorizer(max_df=0.4,         # drop words that occur in more than X percent of documents
                             min_df=1,      # only use words that appear at least X times
                             stop_words='english', # remove stop words
                             lowercase=True, # Convert everything to lower case 
                             use_idf=True,   # Use idf
                             norm=u'l2',     # Normalization
                             smooth_idf=True # Prevents divide-by-zero errors
                            )
tfidf = vector.fit_transform(text_content)

In [11]:
from sklearn.cluster import MiniBatchKMeans# Clustering  Kmeans
k = 200
kmeans = MiniBatchKMeans(n_clusters = k)
kmeans.fit(tfidf)
centers = kmeans.cluster_centers_.argsort()[:,::-1]
terms = vector.get_feature_names()
request_transform = vector.transform(df1['description'])
# new column cluster based on the description
df1['cluster'] = kmeans.predict(request_transform) 
df1['cluster'].value_counts().head()



190    7601
137     521
26      211
75      168
115     109
Name: cluster, dtype: int64

In [12]:
import networkx as nx
P = nx.Graph(label="MOVIE")
start_time = time.time()
for i, rowi in df1.iterrows():
    if (i%1000==0):
        print(" iter {} -- {} seconds --".format(i,time.time() - start_time))
    P.add_node(rowi['title'],key=rowi['show_id'],label="MOVIE",mtype=rowi['type'],rating=rowi['rating'])
    P.add_node(rowi['cluster'],label="CLUSTER")
    P.add_edge(rowi['title'], rowi['cluster'], label="DESCRIPTION")
    for element in rowi['actors']:
        P.add_node(element,label="PERSON")
        P.add_edge(rowi['title'], element, label="ACTED_IN")
    for element in rowi['categories']:
        P.add_node(element,label="CAT")
        P.add_edge(rowi['title'], element, label="CAT_IN")
    for element in rowi['directors']:
        P.add_node(element,label="PERSON")
        P.add_edge(rowi['title'], element, label="DIRECTED")
    for element in rowi['countries']:
        P.add_node(element,label="COU")
        P.add_edge(rowi['title'], element, label="COU_IN")
    indices = find_similar(tfidf, i, top_n = 5)
    snode="Sim("+rowi['title'][:15].strip()+")"        
    P.add_node(snode,label="SIMILAR")
    P.add_edge(rowi['title'], snode, label="SIMILARITY")
    for element in indices:
        P.add_edge(snode, df1['title'].loc[element], label="SIMILARITY")
print(" finish -- {} seconds --".format(time.time() - start_time))    

 iter 0 -- 0.04700303077697754 seconds --
 iter 1000 -- 7.165312767028809 seconds --
 iter 2000 -- 13.336365699768066 seconds --
 iter 3000 -- 21.201383352279663 seconds --
 iter 4000 -- 27.7522075176239 seconds --
 iter 5000 -- 33.37062931060791 seconds --
 iter 6000 -- 39.01405191421509 seconds --
 iter 7000 -- 44.622472524642944 seconds --
 iter 8000 -- 50.33790040016174 seconds --
 finish -- 54.98725199699402 seconds --


In [26]:
import math
def get_recommendation(root):
    commons_dict = {}
    for h in P.neighbors(root):
        for h2 in P.neighbors(h):
            if h2==root:
                continue
            if P.nodes[h2]['label']=="MOVIE":
                commons = commons_dict.get(h2)
                if commons==None:
                    commons_dict.update({h2 : [h]})
                else:
                    commons.append(h)
                    commons_dict.update({h2 : commons})
    movies=[]
    weight=[]
    for key, values in commons_dict.items():
        w=0.0
        for h in values:
            w=w+1/math.log(P.degree(h))
        movies.append(key) 
        weight.append(w)
    
    final = pd.Series(data=np.array(weight),index=movies)
    final.sort_values(inplace=True,ascending=False)        
    return final;


In [27]:
final1 = get_recommendation("PK")
final2 = get_recommendation("Ocean's Thirteen")
final3 = get_recommendation("The Devil Inside")
final4 = get_recommendation("Stranger Things")
print("*"*40+"\n Recommendation for 'PK\n"+"*"*40)
print(final1.head())
print("*"*40+"\n Recommendation for 'Ocean's Thirteen'\n"+"*"*40)
print(final2.head())

print("*"*40+"\n Recommendation for 'Stranger Things'\n"+"*"*40)
print(final4.head())

****************************************
 Recommendation for 'PK
****************************************
3 Idiots                 2.107430
Sanju                    1.722554
Merku Thodarchi Malai    1.625123
Drive                    1.594257
Chance Pe Dance          1.553858
dtype: float64
****************************************
 Recommendation for 'Ocean's Thirteen'
****************************************
Ocean's Twelve       6.207969
The Departed         2.210179
Ocean's Eleven       2.075885
Hostel: Part III     1.793146
Brooklyn's Finest    1.446086
dtype: float64
****************************************
 Recommendation for 'Stranger Things'
****************************************
Beyond Stranger Things     11.419323
Rowdy Rathore               2.688934
Safe Haven                  2.461772
Big Stone Gap               1.903662
The Autopsy of Jane Doe     1.903662
dtype: float64
