# Recommendation System using Word2vec


## Preprocessing of the data


In [1]:
import pandas as pd
import warnings

# Silence every warning category for the rest of the notebook session.
warnings.filterwarnings("ignore")


In [2]:
# Load the catalogue and discard the metadata columns listed below in a single
# chained expression, then preview the first three rows.
df_netflix = pd.read_csv("./netflix_titles.csv").drop(
    columns=[
        "director",
        "cast",
        "country",
        "date_added",
        "release_year",
        "rating",
        "duration",
        "type",
    ]
)
df_netflix.head(3)


Unnamed: 0,show_id,title,listed_in,description
0,s1,Dick Johnson Is Dead,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,Blood & Water,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,Ganglands,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...


## Removing Punctuation and Stopwords


In [3]:
from nltk.tokenize import word_tokenize

# The raw `title` column is kept as-is; its tokens are stored in `title_list`.
df_netflix["title_list"] = df_netflix["title"]

# Lower-case each text column, then split it into NLTK word tokens.
for text_column in ("title_list", "listed_in", "description"):
    df_netflix[text_column] = df_netflix[text_column].str.lower().apply(word_tokenize)


In [4]:
from nltk.corpus import stopwords
from string import punctuation

# English stop words plus every single punctuation character.
list_stopwords = set(stopwords.words("english")) | set(punctuation)

# Drop tokens that are stop words or stand-alone punctuation marks.
for text_column in ("title_list", "listed_in", "description"):
    df_netflix[text_column] = df_netflix[text_column].apply(
        lambda tokens: [token for token in tokens if token not in list_stopwords]
    )


In [5]:
import string

# Translation table that deletes every punctuation character. It is built once
# here rather than once per token.
_strip_punctuation = str.maketrans("", "", string.punctuation)

# Strip punctuation characters from inside each description token and drop the
# tokens that become empty, in a single pass over the column.
df_netflix["description"] = df_netflix["description"].apply(
    lambda tokens: [
        cleaned
        for cleaned in (token.translate(_strip_punctuation) for token in tokens)
        if len(cleaned) > 0
    ]
)


In [6]:
# Remove repeated tokens from each list. dict.fromkeys keeps the first-seen
# order, so the resulting lists are identical on every run; iterating a set of
# strings can yield a different order in each interpreter session.
for text_column in ("title_list", "listed_in", "description"):
    df_netflix[text_column] = df_netflix[text_column].apply(
        lambda tokens: list(dict.fromkeys(tokens))
    )


In [7]:
! wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
! gunzip GoogleNews-vectors-negative300.bin.gz

In [8]:
import gensim

# Load the extracted GoogleNews vectors (binary word2vec format) into `wv`.
wv = gensim.models.KeyedVectors.load_word2vec_format(
    fname="./GoogleNews-vectors-negative300.bin",
    binary=True,
)


## Finding Similarities Among Shows using Title, Genres, Description


In [9]:
# Rows come out of to_numpy() as [show_id, title, listed_in, description,
# title_list]; positions 2-4 hold the token lists. Keep only the tokens that
# exist in the word2vec vocabulary.
matrix_netflix_vocab = []
for row in df_netflix.to_numpy():
    for token_column in (2, 3, 4):
        row[token_column] = [
            token for token in row[token_column] if token in wv.key_to_index
        ]
    matrix_netflix_vocab.append(row)

# Same data as a DataFrame, with the original column names.
df_netflix_vocab = pd.DataFrame(matrix_netflix_vocab, columns=df_netflix.columns)


In [10]:
from tqdm import tqdm


def recommendation(title, title_weight=0.5, min_category_score=0.85, top_n=10):
    """Rank catalogue entries by word2vec similarity to the show named ``title``.

    Every row of ``matrix_netflix_vocab`` is compared with the row(s) of
    ``df_netflix`` whose ``title`` column equals the argument. Three
    ``wv.n_similarity`` scores are computed per pair: genres (``listed_in``),
    description tokens and title tokens.

    Args:
        title: Exact value of the ``title`` column to find recommendations for.
        title_weight: Factor applied to the title similarity before it is
            added to the final score (0.5 halves it).
        min_category_score: A candidate is kept only when its genre similarity
            is strictly greater than this value.
        top_n: Maximum number of rows returned.

    Returns:
        DataFrame with the columns ``recommendation``, ``title``,
        ``score_title``, ``score_category``, ``score_description`` and
        ``final_score`` (sum of the three scores), sorted in descending order.
        It is empty when no row carries the requested title or no candidate
        passes the genre threshold.
    """
    # Positions inside the rows produced by DataFrame.to_numpy().
    col_title, col_genres, col_description, col_title_tokens = 1, 2, 3, 4

    def _similarity(tokens_a, tokens_b):
        # n_similarity cannot average an empty token list; score such pairs as
        # 0.0 instead of letting the whole ranking fail.
        if len(tokens_a) == 0 or len(tokens_b) == 0:
            return 0.0
        return wv.n_similarity(tokens_a, tokens_b)

    # Rows for the requested title, restricted to in-vocabulary tokens.
    query_rows = []
    for row in df_netflix[df_netflix["title"] == title].to_numpy():
        for token_column in (col_genres, col_description, col_title_tokens):
            row[token_column] = [
                word for word in row[token_column] if word in wv.key_to_index
            ]
        query_rows.append(row)

    matrix_similarity = []
    # Iterating over the tqdm wrapper advances the bar; no manual update needed.
    for candidate in tqdm(matrix_netflix_vocab):
        for query in query_rows:
            if candidate[col_title] == query[col_title]:
                continue
            score_catg = _similarity(candidate[col_genres], query[col_genres])
            if not score_catg > min_category_score:
                continue
            score_desc = _similarity(
                candidate[col_description], query[col_description]
            )
            score_title = (
                _similarity(candidate[col_title_tokens], query[col_title_tokens])
                * title_weight
            )
            matrix_similarity.append(
                [
                    candidate[col_title],
                    query[col_title],
                    score_title,
                    score_catg,
                    score_desc,
                ]
            )

    df_netflix_similarity = pd.DataFrame(
        matrix_similarity,
        columns=[
            "recommendation",
            "title",
            "score_title",
            "score_category",
            "score_description",
        ],
    )
    df_netflix_similarity["final_score"] = (
        df_netflix_similarity["score_title"]
        + df_netflix_similarity["score_category"]
        + df_netflix_similarity["score_description"]
    )
    return df_netflix_similarity.sort_values(
        by=["final_score", "score_category", "score_description", "score_title"],
        ascending=False,
    ).head(top_n)


## Movie Recommender using word2vec tool


In [11]:
# Highest-scoring word2vec recommendations for "The Conjuring".
recommendation("The Conjuring")


100%|██████████| 8807/8807 [00:05<00:00, 1692.76it/s]


Unnamed: 0,recommendation,title,score_title,score_category,score_description,final_score
383,Conjuring Spirit,The Conjuring,0.376218,0.964287,0.624534,1.965039
95,The Conjuring 2,The Conjuring,0.40848,0.913295,0.595724,1.917499
391,Delirium,The Conjuring,0.11187,1.0,0.687572,1.799442
86,Insidious,The Conjuring,0.093044,1.0,0.687981,1.781025
513,The Diabolical,The Conjuring,0.156948,0.953485,0.669266,1.779699
64,The Strange House,The Conjuring,0.086246,0.964287,0.713428,1.76396
522,The Haunting of Molly Hartley,The Conjuring,0.173505,1.0,0.572111,1.745615
355,All Light Will End,The Conjuring,0.09736,1.0,0.64138,1.73874
273,Malevolent,The Conjuring,0.147643,1.0,0.585509,1.733152
42,The Strangers,The Conjuring,0.073289,1.0,0.645084,1.718374


In [12]:
# Highest-scoring word2vec recommendations for "Insidious".
recommendation("Insidious")


100%|██████████| 8807/8807 [00:04<00:00, 1792.22it/s]


Unnamed: 0,recommendation,title,score_title,score_category,score_description,final_score
84,What Lies Below,Insidious,0.112808,1.0,0.807122,1.91993
513,The Diabolical,Insidious,0.260919,0.953485,0.690771,1.905174
273,Malevolent,Insidious,0.247639,1.0,0.624875,1.872514
512,The Devil Inside,Insidious,0.141939,1.0,0.695405,1.837343
522,The Haunting of Molly Hartley,Insidious,0.146591,1.0,0.677317,1.823909
72,Things Heard & Seen,Insidious,0.082905,1.0,0.726047,1.808952
370,Bhoot,Insidious,0.084453,0.964287,0.753416,1.802156
64,The Strange House,Insidious,0.105459,0.964287,0.728778,1.798524
391,Delirium,Insidious,0.050422,1.0,0.745601,1.796024
223,Sinister 2,Insidious,0.223022,0.913295,0.656998,1.793315


# Recommendation System using Node2vec


In [13]:
import networkx as nx  # create and store graph
from node2vec import Node2Vec  # To run node2vec algorithm


In [14]:
# Reload the catalogue for the node2vec section. dropna() runs before the
# column drop, so a row with a missing value in ANY original column
# (including the ones discarded below) is removed.
df_node2vec = (
    pd.read_csv("./netflix_titles.csv")
    .dropna()
    .drop(
        columns=[
            "director",
            "cast",
            "country",
            "date_added",
            "release_year",
            "rating",
            "duration",
            "type",
        ]
    )
)


## Creating and analyzing Graph

Now we'll use networkx to create a graph with movie titles and genres as nodes. Two helper functions do the work:

- addToGraph(movie name, graph): Adds an edge to the graph with the title and genres as nodes.
- createGraph(): This function calls addToGraph for each movie title in order to generate a complete graph.


In [15]:
def addToGraph(movie_name, graph):
    """Connect ``movie_name`` to each of its genres in ``graph``.

    The genres are read from the ``listed_in`` value of the first row of
    ``df_node2vec`` whose title equals ``movie_name``; the value is
    right-stripped, lower-cased and split on ", ". The title node is added
    with surrounding whitespace stripped. The same graph object is returned.
    """
    listed_in = df_node2vec.loc[df_node2vec["title"] == movie_name, "listed_in"]
    title_node = movie_name.strip()
    for genre in listed_in.values[0].rstrip().lower().split(", "):
        graph.add_edge(title_node, genre)
    return graph


def createGraph():
    """Build an undirected title/genre graph from every title in ``df_node2vec``."""
    title_genre_graph = nx.Graph()
    for current_title in df_node2vec["title"]:
        # addToGraph returns the graph it was given, with the new edges added.
        title_genre_graph = addToGraph(current_title, title_genre_graph)
    return title_genre_graph


In [16]:
# Build the title/genre graph for every title in df_node2vec.
graph = createGraph()


In [17]:
# Expected degree 2: this title is associated with two genres.
print(graph.degree("Norm of the North: King Sized Adventure"))
# Expected degree 1: this title is associated with a single genre.
print(graph.degree("#realityhigh"))


2
1


## Running Node2Vec

Node2vec’s sampling strategy accepts the following arguments:

- Number of walks: Number of random walks to be generated from each node in the graph
- dimensions : Embedding dimensions
- Walk length: How many nodes are in each random walk
- P: Return hyperparameter
- Q: In-out hyperparameter


In [18]:
# Prepare node2vec on the title/genre graph: 20-dimensional embeddings and
# 10 walks of length 16 (num_walks / walk_length as described above).
# P and Q are not passed here — presumably the library defaults apply; confirm.
node2vec = Node2Vec(graph, dimensions=20, walk_length=16, num_walks=10)


Computing transition probabilities: 100%|██████████| 5373/5373 [01:30<00:00, 59.42it/s] 
Generating walks (CPU: 1): 100%|██████████| 10/10 [01:31<00:00,  9.14s/it]


In [19]:
# Fit the embedding model on the generated walks (window=5, min_count=1).
model = node2vec.fit(window=5, min_count=1)


## See Embeddings

Let's take a look at the values in embeddings.


In [20]:
# Embedding vector learned for the "The Conjuring" node.
model.wv.get_vector("The Conjuring")


array([ 0.3976648 ,  0.23961946, -0.3167309 , -0.12880409,  0.64077073,
        0.32492435,  0.4883481 ,  0.818047  , -0.685554  ,  0.16029502,
        0.5909224 , -0.2599521 ,  0.72134477,  0.5065936 ,  0.04392775,
        0.5673693 ,  0.9184569 ,  0.13751492, -1.3321538 , -0.5747292 ],
      dtype=float32)

In [21]:
# Embedding vector learned for the "Insidious" node.
model.wv.get_vector("Insidious")


array([ 0.2840743 ,  0.38807532, -0.4549778 , -0.06669122,  0.5805604 ,
        0.37350893,  0.47478074,  0.99944   , -0.41174024,  0.16872013,
        0.6267749 , -0.44709772,  0.80438036,  0.6019174 , -0.09264741,
        0.3447448 ,  0.7417309 ,  0.13352576, -1.203107  , -0.8766482 ],
      dtype=float32)

## Using Node2Vec Embeddings

We will use the generated embeddings to recommend similar genres and movies.


In [22]:
def node2vec_recommender(name):
    """Print the nodes whose embeddings are most similar to ``name``.

    ``name`` may be a title or a genre node. The lookup reads the
    notebook-level ``model`` and prints one node name per line; the similarity
    values returned alongside the names are not shown.
    """
    for match in model.wv.most_similar(name):
        print(match[0])


## Movie Recommender using node2vec tool


In [23]:
# Nodes closest to "Insidious" in the node2vec embedding space.
node2vec_recommender("Insidious")


The Vatican Tapes
Incarnate
The Silence
Green Room
Gothika
Malevolent
Gehenna: Where Death Lives
The Strangers
Stonehearst Asylum
Havenhurst


In [24]:
# Nodes closest to "The Conjuring" in the node2vec embedding space.
node2vec_recommender("The Conjuring")


The Bye Bye Man
Clinical
Cabin Fever
The Ring
Stonehearst Asylum
What Lies Beneath
Death Note
The Charnel House
In the Tall Grass
The Craft


# Recommendation System using Sentence Transformer


## Load csv into Pandas Dataframe


In [25]:
# Fresh copy of the catalogue, with all columns, for the sentence-transformer section.
sent_df = pd.read_csv("./netflix_titles.csv")


In [26]:
# This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search.

from sentence_transformers import SentenceTransformer

# NOTE(review): this rebinds the notebook-level name `model`, which until this
# cell referred to the fitted node2vec model that node2vec_recommender() reads.
# Running node2vec_recommender() after this cell will use the wrong object.
model = SentenceTransformer("paraphrase-distilroberta-base-v1")


## Find Embeddings for all show descriptions in dataset.


In [27]:
descriptions = sent_df["description"].tolist()
# Encode the whole list in one call so the library can batch the inputs.
# The result holds one embedding row per description, in the same order.
des_embeddings = model.encode(descriptions)


## For a query show description, let's find the top ten shows with the highest cosine similarity.


In [None]:
import torch
from sentence_transformers import util


def recommend(query):
    """Return positions of the descriptions most similar to ``query``.

    ``query`` is encoded with the notebook-level ``model`` and compared with
    ``des_embeddings`` by cosine similarity. Positions are ordered from most
    to least similar; the best match (rank 0) is skipped and ranks 1-10 are
    returned as a list of integers.
    """
    query_vector = model.encode(query)
    similarity_row = util.pytorch_cos_sim(query_vector, des_embeddings)
    ranked_positions = torch.argsort(similarity_row, dim=-1, descending=True)
    best_first = ranked_positions.tolist()[0]
    return best_first[1:11]


## Movie Recommender using sentence transformers


In [None]:
title = "The Conjuring"
query_show_des = sent_df.loc[sent_df["title"] == title, "description"].to_list()[0]

# recommend() returns row positions into sent_df / des_embeddings.
recommended_positions = recommend(query_show_des)
# 1-based numbers (position + 1), kept under the original variable name.
recommended_results = [x + 1 for x in recommended_positions]

# Look the titles up by position and print just the title text, rather than
# rebuilding show_id strings and printing whole Series objects.
for recommended_title in sent_df["title"].iloc[recommended_positions]:
    print(recommended_title)


# Recommendation System using Cosine Similarity and TF-IDF


In [None]:
# Same relative path as the other sections of this notebook.
df_cosine = pd.read_csv("./netflix_titles.csv")


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing descriptions become empty strings so the vectoriser can process them.
df_cosine["description"] = df_cosine["description"].fillna("")

# TF-IDF vectoriser that ignores English stop words.
tfidf = TfidfVectorizer(stop_words="english")

# Learn the vocabulary and build the document-term matrix in one step.
tfidf_matrix = tfidf.fit_transform(df_cosine["description"])

# (number of descriptions, vocabulary size)
tfidf_matrix.shape


In [None]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Title -> row label lookup. Duplicates are removed on the title index (first
# occurrence wins); drop_duplicates() would compare the row labels, which are
# already unique, and therefore remove nothing.
indices = pd.Series(df_cosine.index, index=df_cosine["title"])
indices = indices[~indices.index.duplicated(keep="first")]

filledna = df_cosine.fillna("")
filledna.head(2)


Cleaning the data - making all the words lower case and removing the spaces between them


In [None]:
def clean_data(x):
    """Return ``x`` with every space removed and all letters lower-cased."""
    without_spaces = x.replace(" ", "")
    return without_spaces.lower()


Identifying features on which the model is to be filtered.


In [None]:
features = ["title", "director", "cast", "listed_in", "description"]

# Keep only the selected columns and run clean_data over every value.
# NOTE(review): clean_data removes all spaces, so each description ends up as
# one long token — confirm that this is intended for the "description" column.
filledna = filledna[features].apply(lambda column: column.apply(clean_data))

filledna.head(2)


Creating a "soup" or a "bag of words" for all rows.


In [None]:
def create_soup(x):
    """Join the title, director, cast, listed_in and description fields of
    row ``x`` into one string, separated by single spaces."""
    ordered_fields = ("title", "director", "cast", "listed_in", "description")
    return " ".join(x[field] for field in ordered_fields)


# Build the combined text for every row (axis=1 hands create_soup one row at a time).
filledna["soup"] = filledna.apply(create_soup, axis=1)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Raw term counts of the soup text, ignoring English stop words.
count = CountVectorizer(stop_words="english")
count_matrix = count.fit_transform(filledna["soup"])

# Pairwise cosine similarity of every row with every other row; with a single
# argument the matrix is compared against itself.
cosine_sim2 = cosine_similarity(count_matrix)


In [None]:
# drop=True renumbers the rows without inserting the old index as a column,
# so this cell can be re-run safely.
filledna = filledna.reset_index(drop=True)
# Cleaned title -> row number lookup used by get_recommendations_new().
indices = pd.Series(filledna.index, index=filledna["title"])


In [None]:
def get_recommendations_new(title, cosine_sim=cosine_sim):
    """Return the titles of the ten shows most similar to ``title``.

    Args:
        title: Show title. Spaces are removed and the text is lower-cased
            before it is looked up in the notebook-level ``indices`` Series.
        cosine_sim: Square similarity matrix whose rows and columns follow the
            row order of ``df_cosine``.

    Returns:
        Series of up to ten titles taken from ``df_cosine``, most similar first.

    Raises:
        KeyError: If the cleaned title is not present in ``indices``.
    """
    title = title.replace(" ", "").lower()
    idx = indices[title]
    # A title that occurs more than once yields a Series of row numbers;
    # use the first occurrence so that a single matrix row is selected.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Get the pairwise similarity scores of all movies with that movie,
    # leaving the queried movie itself out explicitly.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[:10]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df_cosine["title"].iloc[movie_indices]


In [None]:
# Use the count-vectoriser similarity matrix (cosine_sim2) instead of the default.
get_recommendations_new("The Conjuring", cosine_sim2)


# Recommendation System using MiniBatchKMeans


In [None]:
#Import necessary libraries
import numpy as np # linear algebra
import matplotlib.pyplot as plt
%matplotlib inline
import networkx as nx
import time
import math

In [None]:
# Load the input dataframe (same relative path as the other sections of this notebook)
df = pd.read_csv("./netflix_titles.csv")


In [None]:
def _split_multi_value(cell):
    """Turn a comma-separated cell into a list of stripped items whose inner
    spaces are replaced by underscores; a missing cell gives an empty list."""
    if pd.isna(cell):
        return []
    return [item.strip().replace(" ", "_") for item in cell.split(",")]


# Map each multi-value column to a list-typed column.
for list_column, source_column in (
    ("directors", "director"),
    ("genres", "listed_in"),
    ("actors", "cast"),
    ("countries", "country"),
):
    df[list_column] = df[source_column].apply(_split_multi_value)

# Concatenate the four lists of each row and join them into one
# space-separated string.
df["all_features"] = (
    df["directors"] + df["genres"] + df["actors"] + df["countries"]
).apply(lambda items: " ".join(items))


In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [None]:
def text_cleanser(text):
    """Normalise ``text`` into a list of lemmatised, lower-case words.

    Characters that are neither letters nor whitespace are removed, the text
    is split on whitespace, English stop words are dropped and each remaining
    word is passed through ``WordNetLemmatizer.lemmatize``.

    Args:
        text: The string to clean.

    Returns:
        List of cleaned words in their original order.
    """
    lemmatizer = WordNetLemmatizer()
    # Build the stop-word set once per call; a set also makes each membership
    # test constant-time instead of a scan over the whole list.
    stop_words = set(stopwords.words("english"))
    letters_only = "".join(
        char for char in text if char.isalpha() or char.isspace()
    )
    lowered = (word.lower() for word in letters_only.split())
    return [lemmatizer.lemmatize(word) for word in lowered if word not in stop_words]


In [None]:
# Descriptions indexed by show_id, plus their cleaned word lists.
df_desc = df.loc[:, ["show_id", "description"]].set_index("show_id")
df_desc["trunc_desc"] = df_desc["description"].apply(text_cleanser)


In [None]:
# Count how often each cleaned word occurs across all descriptions.
vocabulary = {}
for token_list in df_desc["trunc_desc"]:
    for token in token_list:
        vocabulary[token] = vocabulary.get(token, 0) + 1

# One row per distinct word with its total count.
df_voc = pd.DataFrame(list(vocabulary.items()), columns=["text", "count"])


#### Clustering :-


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Bag-of-words model that tokenises each description with text_cleanser.
bow_transformer = CountVectorizer(analyzer=text_cleanser)
bow_transformer.fit(df_desc["description"])
desc_bow = bow_transformer.transform(df_desc["description"])
print("Shape of the generated matrix : ", desc_bow.shape)

# NOTE(review): the value below is the percentage of NON-zero entries
# (nnz / total cells), although the printed label calls it sparsity.
n_rows, n_cols = desc_bow.shape
sparsity = desc_bow.nnz / (n_rows * n_cols) * 100.0
print("Sparsity of the generated matrix", round(sparsity, 4))


In [None]:
# Finding TF and IDF metrics
# max_df is given as the float 1.0 (a proportion: keep terms found in up to
# 100% of the rows). The integer 1 would be read as an absolute document count
# and would discard every term that occurs in more than one row.
vector = TfidfVectorizer(
    max_df=1.0,
    min_df=1,
    strip_accents="ascii",
    stop_words="english",
    lowercase=True,
    use_idf=True,
    norm="l2",
    smooth_idf=True,
)
# TF-IDF matrix of the combined director/genre/actor/country tokens.
tfidf = vector.fit_transform(df.all_features)


In [None]:
# Cluster the description data using MiniBatchKMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.pairwise import linear_kernel

# setting the no. of resulting clusters for kmeans
k = 400
# A fixed random_state makes the cluster assignment reproducible across runs.
kmeans = MiniBatchKMeans(n_clusters=k, init="k-means++", random_state=42)
kmeans.fit(tfidf)
# Feature positions of every cluster centre, ordered from largest to smallest weight.
centers = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vector.get_feature_names_out()
request_transform = vector.transform(df["all_features"])
# new column: cluster predicted from the all_features TF-IDF vector of each row
df["cluster"] = kmeans.predict(request_transform)


In [None]:
def find_similar_movies(tfidf_matrix, index, top_n=5):
    """Return the row positions most similar to row ``index``.

    Similarity is the linear kernel between row ``index`` and every row of
    ``tfidf_matrix``. Positions are ordered from most to least similar,
    ``index`` itself is left out, and at most ``top_n`` entries are returned.
    """
    query_row = tfidf_matrix[index : index + 1]
    scores = linear_kernel(query_row, tfidf_matrix).flatten()
    ranked = scores.argsort()[::-1]
    return list(ranked[ranked != index][0:top_n])


## Graph preparation

- Insert each movie and its notable attributes (cast, genres, directors and countries) as nodes in the graph.
- Use the find_similar_movies() function to fetch the n most similar movies for a given movie.
- Connect each movie to its similar movies through a 'SIMILAR' node, using edges labelled 'SIMILARITY'.


In [None]:
G = nx.Graph(label="MOVIE")
start_time = time.time()
for i, rowi in df.iterrows():
    # One node per title, carrying its id, type and rating.
    G.add_node(
        rowi["title"],
        key=rowi["show_id"],
        label="MOVIE",
        mtype=rowi["type"],
        rating=rowi["rating"],
    )
    for element in rowi["actors"]:
        G.add_node(element, label="PERSON")
        G.add_edge(rowi["title"], element, label="ACTED_IN")
    for element in rowi["genres"]:
        G.add_node(element, label="GENRE")
        G.add_edge(rowi["title"], element, label="GENRE_IN")
    for element in rowi["directors"]:
        G.add_node(element, label="PERSON")
        G.add_edge(rowi["title"], element, label="DIRECTED")
    for element in rowi["countries"]:
        G.add_node(element, label="COU")
        G.add_edge(rowi["title"], element, label="COU_IN")

    # Stored under its own name so the notebook-level `indices` lookup used by
    # get_recommendations_new() is not overwritten.
    # `i` doubles as the row position; this holds for the default integer
    # index that read_csv produces.
    similar_indices = find_similar_movies(tfidf, i, top_n=3)
    snode = "Sim(" + rowi["title"][:15].strip() + ")"
    G.add_node(snode, label="SIMILAR")
    G.add_edge(rowi["title"], snode, label="SIMILARITY")
    for element in similar_indices:
        # find_similar_movies returns row positions, hence iloc.
        G.add_edge(snode, df["title"].iloc[element], label="SIMILARITY")
print(" finish -- {} seconds --".format(time.time() - start_time))


In [None]:
def get_all_adj_nodes(list_in):
    """Return the nodes in ``list_in`` together with all their direct
    neighbours in ``G``, as a list without duplicates."""
    collected = set()
    for node in list_in:
        collected.add(node)
        collected.update(G.neighbors(node))
    return list(collected)


def draw_sub_graph(sub_graph, title=""):
    """Draw the sub-graph of ``G`` induced by the nodes in ``sub_graph``.

    Each node is coloured and sized according to its ``label`` attribute;
    nodes whose label is not in the style table below get no colour/size
    entry. ``title`` is appended to the figure heading.
    """
    std_size = 500
    # label -> (fill colour, marker size)
    node_styles = {
        "MOVIE": ("#b3042c", std_size * 5),
        "PERSON": ("#047d59", std_size * 2.5),
        "GENRE": ("#3a018a", std_size),
        "COU": ("#bd3102", std_size * 0.9),
        "SIMILAR": ("#b87906", std_size),
        "CLUSTER": ("#cdf7e9", std_size * 0.9),
    }

    subgraph = G.subgraph(sub_graph)
    colors = []
    sizes = []
    for node in subgraph.nodes():
        style = node_styles.get(G.nodes[node]["label"])
        if style is None:
            continue
        node_color, node_size = style
        colors.append(node_color)
        sizes.append(node_size)

    fig, ax = plt.subplots(figsize=(18, 18))
    nx.draw(
        subgraph,
        with_labels=True,
        alpha=0.8,
        node_shape="o",
        node_size=sizes,
        cmap="Accent",
        edge_color="white",
        font_color="white",
        font_weight="bold",
        node_color=colors,
    )
    ax.set_title("Recommendations for Movie " + title + " :-", color="white", size=30)
    ax.axis("off")
    fig.set_facecolor("#44495c")
    plt.tight_layout()
    plt.gcf().set_dpi(400)
    plt.show()


In [None]:
def get_recommendation(root):
    """Score the movies that share at least one neighbour with ``root`` in ``G``.

    For every node adjacent to ``root``, each other adjacent node labelled
    "MOVIE" is recorded together with the shared neighbour. A movie's weight
    is the sum of ``1 / log(degree)`` over its shared neighbours.

    Returns:
        Series of weights indexed by movie name, sorted in descending order.
    """
    shared_neighbours = {}
    for neighbour in G.neighbors(root):
        for candidate in G.neighbors(neighbour):
            if candidate != root and G.nodes[candidate]["label"] == "MOVIE":
                shared_neighbours.setdefault(candidate, []).append(neighbour)

    movies = list(shared_neighbours)
    weight = []
    for linking_nodes in shared_neighbours.values():
        total = 0.0
        for node in linking_nodes:
            total = total + 1 / math.log(G.degree(node))
        weight.append(total)

    ranking = pd.Series(data=np.array(weight), index=movies)
    return ranking.sort_values(ascending=False)


In [None]:
Movie = "Dark"
# Stored under a name that does not shadow the recommendation() function
# defined in the word2vec section of this notebook.
movie_recommendations = get_recommendation(Movie)
# Draw the movie, its two highest-weighted recommendations and all their neighbours.
rel_graph = get_all_adj_nodes([Movie] + list(movie_recommendations.index[:2]))
draw_sub_graph(rel_graph, Movie)
