# Recommendation System using Word2vec


## Preprocessing of the data


In [1]:
import pandas as pd
import warnings

# Silence every warning issued through the `warnings` module for the rest of
# the notebook (deprecation notices included); remove this line when debugging.
warnings.filterwarnings("ignore")


In [2]:
# Load the catalogue and discard the metadata columns listed below, leaving
# show_id, title, listed_in and description.
unused_columns = [
    "director",
    "cast",
    "country",
    "date_added",
    "release_year",
    "rating",
    "duration",
    "type",
]
df_netflix = pd.read_csv("../netflix_titles.csv").drop(columns=unused_columns)
df_netflix.head(3)


Unnamed: 0,show_id,title,listed_in,description
0,s1,Dick Johnson Is Dead,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,Blood & Water,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,Ganglands,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...


## Removing Punctuation and Stopwords


In [3]:
from nltk.tokenize import word_tokenize

# The original "title" column is kept untouched (it is used later for exact
# lookups); its lower-cased tokens go into the new "title_list" column.
df_netflix["title_list"] = df_netflix["title"].str.lower().apply(word_tokenize)

# Genres and descriptions are replaced in place by their lower-cased tokens.
for column in ("listed_in", "description"):
    df_netflix[column] = df_netflix[column].str.lower().apply(word_tokenize)


In [4]:
from nltk.corpus import stopwords
from string import punctuation

# English stopwords plus every single punctuation character; despite the name
# this is a set, so the membership tests below are constant-time.
list_stopwords = set(stopwords.words("english") + list(punctuation))

# Apply the same filter to each tokenised column.
for column in ("title_list", "listed_in", "description"):
    df_netflix[column] = df_netflix[column].apply(
        lambda tokens: [token for token in tokens if token not in list_stopwords]
    )


In [5]:
import string

# Translation table that deletes every ASCII punctuation character; built once
# and reused for all tokens.
punctuation_table = str.maketrans("", "", string.punctuation)

# Strip punctuation from inside each description token and drop the tokens
# that end up empty.
df_netflix["description"] = df_netflix["description"].apply(
    lambda tokens: [
        stripped
        for stripped in (token.translate(punctuation_table) for token in tokens)
        if stripped
    ]
)


In [6]:
# Drop repeated tokens while keeping first-seen order. dict.fromkeys is used
# rather than set() because the iteration order of a set of strings changes
# between interpreter sessions (string hashing is randomized per process),
# which made the per-row token order differ from one kernel restart to the next.
for column in ("title_list", "listed_in", "description"):
    df_netflix[column] = df_netflix[column].apply(
        lambda tokens: list(dict.fromkeys(tokens))
    )


In [7]:
! wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
! gunzip GoogleNews-vectors-negative300.bin.gz

In [8]:
import gensim

# Load the unpacked word2vec binary from the working directory into `wv`.
word2vec_path = "./GoogleNews-vectors-negative300.bin"
wv = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)


## Finding Similarities Among Shows using Title, Genres, Description


In [9]:
# Keep only the tokens the word2vec model has a vector for. Positions 2, 3 and
# 4 of each row hold the listed_in, description and title_list token lists.
known_words = wv.key_to_index
matrix_netflix_vocab = []
for row in df_netflix.to_numpy():
    for position in (2, 3, 4):
        row[position] = [token for token in row[position] if token in known_words]
    matrix_netflix_vocab.append(row)

# Same rows as a DataFrame, reusing the original column names.
df_netflix_vocab = pd.DataFrame(matrix_netflix_vocab, columns=df_netflix.columns)


In [10]:
from tqdm import tqdm


def recommendation(title, top_n=10, min_category_score=0.85):
    """Recommend shows similar to ``title`` using word2vec token similarity.

    Relies on three notebook globals: ``df_netflix`` (the tokenised catalogue),
    ``matrix_netflix_vocab`` (catalogue rows already restricted to the model
    vocabulary) and ``wv`` (the loaded word vectors).

    Parameters
    ----------
    title : str
        Exact value of the ``title`` column identifying the query show.
    top_n : int, default 10
        Maximum number of recommendations to return.
    min_category_score : float, default 0.85
        A candidate is kept only when its category (``listed_in``) similarity
        to the query is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        Columns ``recommendation``, ``title``, ``score_title`` (title
        similarity halved), ``score_category``, ``score_description`` and
        ``final_score`` (sum of the three scores), sorted from best to worst.
        Empty when ``title`` is not in the catalogue or no candidate passes
        the category threshold.
    """
    vocabulary = wv.key_to_index

    def known_tokens(tokens):
        # Restrict a token list to words the model has a vector for.
        return [token for token in tokens if token in vocabulary]

    def similarity(tokens_a, tokens_b):
        # n_similarity raises ZeroDivisionError when either list is empty;
        # score such pairs as 0 instead of hiding the error behind a bare except.
        if not tokens_a or not tokens_b:
            return 0.0
        return wv.n_similarity(tokens_a, tokens_b)

    # Row layout: 1 = title, 2 = listed_in tokens, 3 = description tokens,
    # 4 = title tokens.
    query_rows = [
        (row[1], known_tokens(row[2]), known_tokens(row[3]), known_tokens(row[4]))
        for row in df_netflix[df_netflix["title"] == title].to_numpy()
    ]

    matrix_similarity = []
    # Iterating the tqdm wrapper advances and closes the bar on its own, so no
    # manual update()/close() calls are needed.
    for candidate in tqdm(matrix_netflix_vocab):
        for query_title, query_catg, query_desc, query_words in query_rows:
            # Never recommend the query show itself.
            if candidate[1] == query_title:
                continue
            score_catg = similarity(candidate[2], query_catg)
            # Skip the remaining similarity computations for rows that will be
            # filtered out anyway.
            if not score_catg > min_category_score:
                continue
            score_desc = similarity(candidate[3], query_desc)
            # Title similarity counts for half as much as the other two scores.
            score_title = similarity(candidate[4], query_words) / 2
            matrix_similarity.append(
                [candidate[1], query_title, score_title, score_catg, score_desc]
            )

    df_netflix_similarity = pd.DataFrame(
        matrix_similarity,
        columns=[
            "recommendation",
            "title",
            "score_title",
            "score_category",
            "score_description",
        ],
    )
    df_netflix_similarity["final_score"] = (
        df_netflix_similarity["score_title"]
        + df_netflix_similarity["score_category"]
        + df_netflix_similarity["score_description"]
    )
    return df_netflix_similarity.sort_values(
        by=["final_score", "score_category", "score_description", "score_title"],
        ascending=False,
    ).head(top_n)


## Movie Recommender using word2vec tool


In [11]:
# Show the best-scoring recommendations (up to ten) for "The Conjuring".
recommendation("The Conjuring")


100%|██████████| 8807/8807 [00:05<00:00, 1607.47it/s]


Unnamed: 0,recommendation,title,score_title,score_category,score_description,final_score
383,Conjuring Spirit,The Conjuring,0.376218,0.964287,0.624534,1.965039
95,The Conjuring 2,The Conjuring,0.40848,0.913295,0.595724,1.917499
391,Delirium,The Conjuring,0.11187,1.0,0.687572,1.799442
86,Insidious,The Conjuring,0.093044,1.0,0.687981,1.781025
513,The Diabolical,The Conjuring,0.156948,0.953485,0.669266,1.779699
64,The Strange House,The Conjuring,0.086246,0.964287,0.713428,1.76396
522,The Haunting of Molly Hartley,The Conjuring,0.173505,1.0,0.572111,1.745615
355,All Light Will End,The Conjuring,0.09736,1.0,0.64138,1.73874
273,Malevolent,The Conjuring,0.147643,1.0,0.585509,1.733152
42,The Strangers,The Conjuring,0.073289,1.0,0.645084,1.718374


In [12]:
# Show the best-scoring recommendations (up to ten) for "Insidious".
recommendation("Insidious")


100%|██████████| 8807/8807 [00:04<00:00, 1812.88it/s]


Unnamed: 0,recommendation,title,score_title,score_category,score_description,final_score
84,What Lies Below,Insidious,0.112808,1.0,0.807122,1.91993
513,The Diabolical,Insidious,0.260919,0.953485,0.690771,1.905174
273,Malevolent,Insidious,0.247639,1.0,0.624875,1.872514
512,The Devil Inside,Insidious,0.141939,1.0,0.695405,1.837343
522,The Haunting of Molly Hartley,Insidious,0.146591,1.0,0.677317,1.823909
72,Things Heard & Seen,Insidious,0.082905,1.0,0.726047,1.808952
370,Bhoot,Insidious,0.084453,0.964287,0.753416,1.802156
64,The Strange House,Insidious,0.105459,0.964287,0.728778,1.798524
391,Delirium,Insidious,0.050422,1.0,0.745601,1.796024
223,Sinister 2,Insidious,0.223022,0.913295,0.656998,1.793315
