# Netflix Movies and TV shows Recommendation system

Algorithm : Content Based Filtering - word2vec for word embedding of text features

Notebook summary

1. Import necessary libraries

2. Import required dataset

3. Data preprocessing for baseline model

4. Install necessary dependencies to leverage the word2vec word embedding model from gensim

5. Model 1 ( baseline model ) : Considering equal weights for all features ( limited feature )

6. Model 2 : Considering different weights for different features

7. Model 3 : Considering wider set of features with differentiated weighting


# 1. Import necessary libraries

In [5]:
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

# 2. Import required dataset

In [6]:
df_netflix = pd.read_csv("netflix_titles.csv")
df_netflix.drop(
    columns=[
        "director",
        "cast",
        "country",
        "date_added",
        "release_year",
        "rating",
        "duration",
        "type",
    ],
    inplace=True,
)
df_netflix.head(3)

Unnamed: 0,show_id,title,listed_in,description
0,s1,Dick Johnson Is Dead,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,Blood & Water,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,Ganglands,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...


# 3. Data preprocessing for baseline model

In [7]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [8]:
from nltk.tokenize import word_tokenize

df_netflix["title_list"] = df_netflix["title"].str.lower()
df_netflix["listed_in"] = df_netflix["listed_in"].str.lower()
df_netflix["description"] = df_netflix["description"].str.lower()

df_netflix["title_list"] = df_netflix["title_list"].apply(word_tokenize)
df_netflix["listed_in"] = df_netflix["listed_in"].apply(word_tokenize)
df_netflix["description"] = df_netflix["description"].apply(word_tokenize)

In [9]:
df_netflix.head(3)

Unnamed: 0,show_id,title,listed_in,description,title_list
0,s1,Dick Johnson Is Dead,[documentaries],"[as, her, father, nears, the, end, of, his, li...","[dick, johnson, is, dead]"
1,s2,Blood & Water,"[international, tv, shows, ,, tv, dramas, ,, t...","[after, crossing, paths, at, a, party, ,, a, c...","[blood, &, water]"
2,s3,Ganglands,"[crime, tv, shows, ,, international, tv, shows...","[to, protect, his, family, from, a, powerful, ...",[ganglands]


In [10]:
import string

df_netflix["description"] = df_netflix["description"].apply(
    lambda x: [word.translate(str.maketrans("", "", string.punctuation)) for word in x]
)
df_netflix["description"] = df_netflix["description"].apply(
    lambda x: [word for word in x if len(word) > 0]
)

In [11]:
df_netflix.head(3)

Unnamed: 0,show_id,title,listed_in,description,title_list
0,s1,Dick Johnson Is Dead,[documentaries],"[as, her, father, nears, the, end, of, his, li...","[dick, johnson, is, dead]"
1,s2,Blood & Water,"[international, tv, shows, ,, tv, dramas, ,, t...","[after, crossing, paths, at, a, party, a, cape...","[blood, &, water]"
2,s3,Ganglands,"[crime, tv, shows, ,, international, tv, shows...","[to, protect, his, family, from, a, powerful, ...",[ganglands]


In [12]:
df_netflix["title_list"] = df_netflix["title_list"].apply(lambda x: list(set(x)))
df_netflix["listed_in"] = df_netflix["listed_in"].apply(lambda x: list(set(x)))
df_netflix["description"] = df_netflix["description"].apply(lambda x: list(set(x)))

In [13]:
df_netflix.head(3)

Unnamed: 0,show_id,title,listed_in,description,title_list
0,s1,Dick Johnson Is Dead,[documentaries],"[end, them, his, to, in, of, johnson, and, bot...","[is, dick, dead, johnson]"
1,s2,Blood & Water,"[,, shows, international, mysteries, dramas, tv]","[swimming, out, to, star, birth, privateschool...","[water, blood, &]"
2,s3,Ganglands,"[action, &, ,, shows, adventure, international...","[deadly, robbers, his, to, and, of, a, war, dr...",[ganglands]


# 4. Install necessary dependencies to leverage the word2vec word embedding model from gensim

In [14]:
!pip install gensim



In [15]:
import gensim

In [15]:
from gensim.models import Word2Vec, KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')



In [17]:
matrix_netflix_vocab = []
for list_ in df_netflix.to_numpy():
    list_[2] = [word for word in list_[2] if word in wv.key_to_index]
    list_[3] = [word for word in list_[3] if word in wv.key_to_index]
    list_[4] = [word for word in list_[4] if word in wv.key_to_index]
    matrix_netflix_vocab.append(list_)
df_netflix_vocab = pd.DataFrame(matrix_netflix_vocab, columns=df_netflix.columns)

In [18]:
len(matrix_netflix_vocab)

8807

In [19]:
matrix_netflix_vocab[0]

array(['s1', 'Dick Johnson Is Dead', list(['documentaries']),
       list(['end', 'them', 'his', 'in', 'johnson', 'both', 'ways', 'as', 'stages', 'inventive', 'death', 'life', 'filmmaker', 'face', 'the', 'comical', 'father', 'inevitable', 'her', 'nears', 'kirsten', 'help']),
       list(['is', 'dick', 'dead', 'johnson'])], dtype=object)

In [20]:
df_netflix_vocab.head(3)

Unnamed: 0,show_id,title,listed_in,description,title_list
0,s1,Dick Johnson Is Dead,[documentaries],"[end, them, his, in, johnson, both, ways, as, ...","[is, dick, dead, johnson]"
1,s2,Blood & Water,"[shows, international, mysteries, dramas, tv]","[swimming, out, star, birth, whether, party, t...","[water, blood, &]"
2,s3,Ganglands,"[action, &, shows, adventure, international, c...","[deadly, robbers, his, war, drug, into, violen...",[ganglands]


# 5. Model 1 ( baseline model ) :  Considering equal weights for all features ( limited feature )

### Define the recommendation function

In [21]:
from tqdm import tqdm


def recommendation(title):
    matrix_netflix_title_vocab = []
    for list_ in df_netflix[df_netflix["title"] == title].to_numpy():
        list_[2] = [word for word in list_[2] if word in wv.key_to_index]
        list_[3] = [word for word in list_[3] if word in wv.key_to_index]
        list_[4] = [word for word in list_[4] if word in wv.key_to_index]
        matrix_netflix_title_vocab.append(list_)

    matrix_similarity = []
    pbar = tqdm(matrix_netflix_vocab)
    for list1 in pbar:
        for list2 in matrix_netflix_title_vocab:
            score_catg = wv.n_similarity(list1[2], list2[2])
            score_desc = wv.n_similarity(list1[3], list2[3])
            try:
                score_title = wv.n_similarity(list1[4], list2[4]) / 2
            except:
                score_title = 0
            if (list1[1] != list2[1]) & (score_catg > 0.85):
                matrix_similarity.append(
                    [list1[1], list2[1], score_title, score_catg, score_desc]
                )
        pbar.update()
    pbar.close()
    df_netflix_similarity = pd.DataFrame(
        matrix_similarity,
        columns=[
            "recommendation",
            "title",
            "score_title",
            "score_category",
            "score_description",
        ],
    )
    df_netflix_similarity["final_score"] = (
        df_netflix_similarity["score_title"]
        + df_netflix_similarity["score_category"]
        + df_netflix_similarity["score_description"]
    )
    return df_netflix_similarity.sort_values(
        by=["final_score", "score_category", "score_description", "score_title"],
        ascending=False,
    ).head(10)

### Example 1 : Execution of recommendation function

In [22]:
recommendation("Grown Ups")

100%|██████████| 8807/8807 [00:07<00:00, 1247.12it/s]


Unnamed: 0,recommendation,title,score_title,score_category,score_description,final_score
183,American Pie 9: Girls' Rules,Grown Ups,0.137693,1.0,0.735336,1.873028
205,Work It,Grown Ups,0.067882,1.0,0.730738,1.798619
713,Little Nicky,Grown Ups,0.110633,1.0,0.684654,1.795287
475,Father of the Year,Grown Ups,0.103823,1.0,0.681855,1.785678
690,I Don't Know How She Does It,Grown Ups,0.095467,1.0,0.682575,1.778041
143,Mean Girls 2,Grown Ups,0.092655,1.0,0.685147,1.777803
624,Blue Mountain State: The Rise of Thadland,Grown Ups,0.090477,1.0,0.681724,1.772202
799,The First Wives Club,Grown Ups,0.103501,1.0,0.665402,1.768903
31,Poms,Grown Ups,0.093836,1.0,0.671998,1.765834
615,Bebe's Kids,Grown Ups,0.090393,1.0,0.675039,1.765433


In [25]:
movies_to_check = ['Grown Ups',
"American Pie 9: Girls' Rules",
'Work It',
'Little Nicky',
'Father of the Year',
"I Don't Know How She Does It",
'Mean Girls 2',
'Blue Mountain State: The Rise of Thadland',
'The First Wives Club',
'Poms',
"Bebe's Kids"]
df_netflix[df_netflix.title.isin(movies_to_check)]

Unnamed: 0,show_id,title,listed_in,description,title_list
27,s28,Grown Ups,[comedies],"[loss, lake, their, basketball, junior, being,...","[grown, ups]"
348,s349,Poms,[comedies],"[funloving, moving, youngatheart, retirement, ...",[poms]
1459,s1460,Mean Girls 2,[comedies],"[school, daughter, way, his, to, of, pariah, s...","[girls, mean, 2]"
1881,s1882,American Pie 9: Girls' Rules,[comedies],"[school, their, to, of, student, plans, a, hom...","[girls, pie, :, american, ', 9, rules]"
2159,s2160,Work It,[comedies],"[school, mater, brilliant, to, senior, ’, and,...","[work, it]"
4761,s4762,Father of the Year,[comedies],"[seriously, dads, take, their, to, in, a, drun...","[father, year, of, the]"
6267,s6268,Bebe's Kids,[comedies],"[and, s, man, surprises, robin, with, but, beb...","[bebe, kids, 's]"
6340,s6341,Blue Mountain State: The Rise of Thadland,[comedies],"[school, teammates, his, star, in, and, party,...","[state, rise, :, thadland, blue, mountain, of,..."
7044,s7045,I Don't Know How She Does It,[comedies],"[personal, in, and, kate, mom, a, wife, yuppie...","[how, n't, it, does, i, do, know, she]"
7316,s7317,Little Nicky,[comedies],"[his, to, devil, siblings, power, plans, dear,...","[little, nicky]"


### Example 2 : Execution of recommendation function

In [26]:
recommendation("The Conjuring")

100%|██████████| 8807/8807 [00:03<00:00, 2336.64it/s]


Unnamed: 0,recommendation,title,score_title,score_category,score_description,final_score
373,Conjuring Spirit,The Conjuring,0.370828,0.964287,0.691602,2.026717
95,The Conjuring 2,The Conjuring,0.42371,0.913295,0.686553,2.023558
510,The Haunting of Molly Hartley,The Conjuring,0.214101,1.0,0.712108,1.926209
64,The Strange House,The Conjuring,0.158805,0.964287,0.78324,1.906332
381,Delirium,The Conjuring,0.124619,1.0,0.776508,1.901127
42,The Strangers,The Conjuring,0.138922,1.0,0.759049,1.897971
86,Insidious,The Conjuring,0.100749,1.0,0.783389,1.884138
163,The Darkness,The Conjuring,0.126599,1.0,0.74238,1.86898
501,The Devil Inside,The Conjuring,0.156047,1.0,0.700452,1.856499
502,The Diabolical,The Conjuring,0.218079,0.924792,0.708371,1.851242


In [27]:
movies_to_check = ['The Conjuring',
"Conjuring Spirit",
'The Conjuring 2',
'The Haunting of Molly Hartley',
'The Strange House',
"Delirium",
'The Strangers',
'Insidious',
'The Darkness',
'The Devil Inside',
"The Diabolica"]
df_netflix[df_netflix.title.isin(movies_to_check)]

Unnamed: 0,show_id,title,listed_in,description,title_list
607,s608,The Strangers,"[horror, ,, thrillers, movies]","[stay, they, for, a, remote, intruders, under,...","[strangers, the]"
887,s888,The Strange House,"[,, horror, thrillers, international, movies]","[moves, haunts, their, that, to, and, town, tr...","[strange, house, the]"
1118,s1119,Insidious,"[horror, ,, thrillers, movies]","[moves, events, their, where, begin, to, in, a...",[insidious]
1283,s1284,The Conjuring,"[horror, ,, thrillers, movies]","[rhode, pair, farmhouse, help, of, island, sup...","[conjuring, the]"
1284,s1285,The Conjuring 2,"[horror, movies]","[daughter, releases, spirit, their, woman, to,...","[2, conjuring, the]"
2382,s2383,The Darkness,"[horror, ,, thrillers, movies]","[boy, grand, that, to, his, of, canyon, a, anc...","[darkness, the]"
6527,s6528,Conjuring Spirit,"[,, horror, thrillers, international, movies]","[betrayal, encounters, moves, spirit, receives...","[conjuring, spirit]"
6590,s6591,Delirium,"[horror, ,, thrillers, movies]","[moves, his, to, of, and, be, haunted, man, a,...",[delirium]
8276,s8277,The Devil Inside,"[horror, ,, thrillers, movies]","[woman, to, and, s, a, once, mother, behind, e...","[devil, inside, the]"
8338,s8339,The Haunting of Molly Hartley,"[horror, ,, thrillers, movies]","[school, to, up, a, with, but, girl, destined,...","[haunting, hartley, of, molly, the]"


# 6. Model 2 : Considering different weights for different features

### Defining the recommendation function with different weights to be considered for different features

In [29]:
def recommendation(title, weight_title=0.5, weight_category=1.0, weight_description=1.0):
    matrix_netflix_title_vocab = []
    for list_ in df_netflix[df_netflix["title"] == title].to_numpy():
        list_[2] = [word for word in list_[2] if word in wv.key_to_index]
        list_[3] = [word for word in list_[3] if word in wv.key_to_index]
        list_[4] = [word for word in list_[4] if word in wv.key_to_index]
        matrix_netflix_title_vocab.append(list_)

    matrix_similarity = []
    pbar = tqdm(matrix_netflix_vocab)
    for list1 in pbar:
        for list2 in matrix_netflix_title_vocab:
            score_catg = wv.n_similarity(list1[2], list2[2])
            score_desc = wv.n_similarity(list1[3], list2[3])
            try:
                score_title = wv.n_similarity(list1[4], list2[4])
            except:
                score_title = 0
            if (list1[1] != list2[1]) & (score_catg > 0.85):
                matrix_similarity.append(
                    [list1[1], list2[1], score_title, score_catg, score_desc]
                )
        pbar.update()
    pbar.close()
    df_netflix_similarity = pd.DataFrame(
        matrix_similarity,
        columns=[
            "recommendation",
            "title",
            "score_title",
            "score_category",
            "score_description",
        ],
    )

    # Calculate the final score incorporating weights
    df_netflix_similarity["final_score"] = (
        weight_title * df_netflix_similarity["score_title"]
        + weight_category * df_netflix_similarity["score_category"]
        + weight_description * df_netflix_similarity["score_description"]
    )

    return df_netflix_similarity.sort_values(
        by=["final_score", "score_category", "score_description", "score_title"],
        ascending=False,
    ).head(10)

### example 1

In [30]:
recommendation("Grown Ups", weight_title=0.3, weight_category=1.2, weight_description=0.8)

100%|██████████| 8807/8807 [00:05<00:00, 1700.86it/s]


Unnamed: 0,recommendation,title,score_title,score_category,score_description,final_score
183,American Pie 9: Girls' Rules,Grown Ups,0.275386,1.0,0.735336,1.870884
205,Work It,Grown Ups,0.135764,1.0,0.730738,1.825319
713,Little Nicky,Grown Ups,0.221265,1.0,0.684654,1.814103
475,Father of the Year,Grown Ups,0.207646,1.0,0.681855,1.807777
143,Mean Girls 2,Grown Ups,0.185311,1.0,0.685147,1.803711
690,I Don't Know How She Does It,Grown Ups,0.190933,1.0,0.682575,1.80334
624,Blue Mountain State: The Rise of Thadland,Grown Ups,0.180955,1.0,0.681724,1.799666
688,How to Be a Player,Grown Ups,0.132778,1.0,0.694481,1.795418
799,The First Wives Club,Grown Ups,0.207002,1.0,0.665402,1.794422
615,Bebe's Kids,Grown Ups,0.180786,1.0,0.675039,1.794267


### Example 2

In [31]:
recommendation("Jeans", weight_title=0.3, weight_category=1.2, weight_description=0.8)

100%|██████████| 8807/8807 [00:06<00:00, 1453.40it/s]


Unnamed: 0,recommendation,title,score_title,score_category,score_description,final_score
541,Shimla Mirchi,Jeans,0.202888,1.0,0.83865,1.931786
435,Lock Your Girls In,Jeans,0.21844,1.0,0.831262,1.930541
822,Tere Naal Love Ho Gaya,Jeans,0.201543,1.0,0.804745,1.904259
59,Flower Girl,Jeans,0.219998,1.0,0.797348,1.903878
313,A Chaster Marriage,Jeans,0.080294,1.0,0.84908,1.903352
114,Kambili: The Whole 30 Yards,Jeans,0.084734,1.0,0.839707,1.897186
453,Zaki Chan,Jeans,0.10154,1.0,0.828308,1.893109
440,The Married Couples,Jeans,0.190842,1.0,0.790251,1.889453
223,Hire a Woman,Jeans,0.204355,1.0,0.782599,1.887386
465,Rich in Love,Jeans,0.135829,1.0,0.806073,1.885607


# 7. Model 3 : Considering wider set of features with differentiated weighting

Features considered for content based filtering are title , director , cast , country , genre(listed_in) , description

basic weights are applied to different features to explain the possibility , generally weights are allocated in conjunction with the functional expert

### Load the dataset

In [60]:
# Load data
df_netflix = pd.read_csv("netflix_titles.csv")


In [61]:
df_netflix.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


### Remove features not to be considered for recommendation engine

In [62]:
df_netflix.drop(
    columns=[
        "date_added",
        "release_year",
        "rating",
        "duration",
        "type",
    ],
    inplace=True,
)
df_netflix.head(3)

Unnamed: 0,show_id,title,director,cast,country,listed_in,description
0,s1,Dick Johnson Is Dead,Kirsten Johnson,,United States,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...


In [63]:
df_netflix.columns

Index(['show_id', 'title', 'director', 'cast', 'country', 'listed_in',
       'description'],
      dtype='object')

### Data preprocessing

In [65]:
from nltk.tokenize import word_tokenize

df_netflix["title_list"] = df_netflix["title"].str.lower()
df_netflix["director"] = df_netflix["director"].str.lower()
df_netflix["cast"] = df_netflix["cast"].str.lower()
df_netflix["country"] = df_netflix["country"].str.lower()
df_netflix["listed_in"] = df_netflix["listed_in"].str.lower()
df_netflix["description"] = df_netflix["description"].str.lower()

# Fill NaN values with empty strings and convert lists to strings
for col in ["director", "cast", "country"]:
    df_netflix[col] = df_netflix[col].fillna('').astype(str).apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

df_netflix["title_list"] = df_netflix["title_list"].apply(word_tokenize)
df_netflix["director"] = df_netflix["director"].apply(word_tokenize)
df_netflix["cast"] = df_netflix["cast"].apply(word_tokenize)
df_netflix["country"] = df_netflix["country"].apply(word_tokenize)
df_netflix["listed_in"] = df_netflix["listed_in"].apply(word_tokenize)
df_netflix["description"] = df_netflix["description"].apply(word_tokenize)

In [66]:
df_netflix.head(3)

Unnamed: 0,show_id,title,director,cast,country,listed_in,description,title_list
0,s1,Dick Johnson Is Dead,"[kirsten, johnson]",[],"[united, states]",[documentaries],"[as, her, father, nears, the, end, of, his, li...","[dick, johnson, is, dead]"
1,s2,Blood & Water,[],"[ama, qamata, ,, khosi, ngema, ,, gail, mabala...","[south, africa]","[international, tv, shows, ,, tv, dramas, ,, t...","[after, crossing, paths, at, a, party, ,, a, c...","[blood, &, water]"
2,s3,Ganglands,"[julien, leclercq]","[sami, bouajila, ,, tracy, gotoas, ,, samuel, ...",[],"[crime, tv, shows, ,, international, tv, shows...","[to, protect, his, family, from, a, powerful, ...",[ganglands]


In [67]:
import string

df_netflix["description"] = df_netflix["description"].apply(
    lambda x: [word.translate(str.maketrans("", "", string.punctuation)) for word in x]
)
df_netflix["description"] = df_netflix["description"].apply(
    lambda x: [word for word in x if len(word) > 0]
)

In [68]:
df_netflix["title_list"] = df_netflix["title_list"].apply(lambda x: list(set(x)))
df_netflix["director"] = df_netflix["director"].apply(lambda x: list(set(x)))
df_netflix["cast"] = df_netflix["cast"].apply(lambda x: list(set(x)))
df_netflix["country"] = df_netflix["country"].apply(lambda x: list(set(x)))
df_netflix["listed_in"] = df_netflix["listed_in"].apply(lambda x: list(set(x)))
df_netflix["description"] = df_netflix["description"].apply(lambda x: list(set(x)))

In [70]:
df_netflix.head(2)

Unnamed: 0,show_id,title,director,cast,country,listed_in,description,title_list
0,s1,Dick Johnson Is Dead,"[kirsten, johnson]",[],"[states, united]",[documentaries],"[end, them, his, to, in, of, johnson, and, bot...","[is, dick, dead, johnson]"
1,s2,Blood & Water,[],"[greeff, ryle, gwanya, fincham, molaba, ,, sit...","[africa, south]","[,, shows, international, mysteries, dramas, tv]","[swimming, out, to, star, birth, privateschool...","[water, blood, &]"


In [71]:
matrix_netflix_vocab = []
for list_ in df_netflix.to_numpy():
    list_[2] = [word for word in list_[2] if word in wv.key_to_index]
    list_[3] = [word for word in list_[3] if word in wv.key_to_index]
    list_[4] = [word for word in list_[4] if word in wv.key_to_index]
    list_[5] = [word for word in list_[5] if word in wv.key_to_index]
    list_[6] = [word for word in list_[6] if word in wv.key_to_index]
    list_[7] = [word for word in list_[7] if word in wv.key_to_index]
    matrix_netflix_vocab.append(list_)
df_netflix_vocab = pd.DataFrame(matrix_netflix_vocab, columns=df_netflix.columns)

In [74]:
matrix_netflix_vocab[0]

array(['s1', 'Dick Johnson Is Dead', list(['kirsten', 'johnson']),
       list([]), list(['states', 'united']), list(['documentaries']),
       list(['end', 'them', 'his', 'in', 'johnson', 'both', 'ways', 'as', 'stages', 'inventive', 'death', 'life', 'filmmaker', 'face', 'the', 'comical', 'father', 'inevitable', 'her', 'nears', 'kirsten', 'help']),
       list(['is', 'dick', 'dead', 'johnson'])], dtype=object)

### Defining the recommendation engine function

In [82]:
def recommendation(title, weight_title=1.0, weight_director=1.0, weight_cast=1.0,
                   weight_country=1.0, weight_genre=1.0, weight_description=1.0):

    # Extract the selected movie's features based on title
    matrix_netflix_title_vocab = []
    target_movies = df_netflix[df_netflix["title"] == title]

    if target_movies.empty:
        print(f"'{title}' not found in the dataset.")
        return pd.DataFrame()

    for list_ in target_movies.to_numpy():
        list_[2] = [word for word in list_[2] if word in wv.key_to_index]
        list_[3] = [word for word in list_[3] if word in wv.key_to_index]
        list_[4] = [word for word in list_[4] if word in wv.key_to_index]
        list_[5] = [word for word in list_[5] if word in wv.key_to_index]
        list_[6] = [word for word in list_[6] if word in wv.key_to_index]
        list_[7] = [word for word in list_[7] if word in wv.key_to_index]
        matrix_netflix_title_vocab.append(list_)

    matrix_similarity = []
    pbar = tqdm(df_netflix.to_numpy(), total=len(df_netflix))
    for list1 in pbar:
        for list2 in matrix_netflix_title_vocab:

            # Calculate similarity scores for each feature, handling empty lists
            score_director = wv.n_similarity(list1[2], list2[2]) if list1[2] and list2[2] else 0  # Handle empty lists
            score_cast = wv.n_similarity(list1[3], list2[3]) if list1[3] and list2[3] else 0  # Handle empty lists
            score_country = wv.n_similarity(list1[4], list2[4]) if list1[4] and list2[4] else 0  # Handle empty lists
            score_catg = wv.n_similarity(list1[5], list2[5]) if list1[5] and list2[5] else 0  # Handle empty lists
            score_desc = wv.n_similarity(list1[6], list2[6]) if list1[6] and list2[6] else 0  # Handle empty lists

            # Handle title similarity calculation with try-except
            try:
                score_title = wv.n_similarity(list1[7], list2[7])
            except KeyError:
                score_title = 0

            # If the titles are different and genre similarity is significant
            if list1[1] != list2[1] and score_catg > 0.85:
                # Calculate the weighted final score
                final_score = (
                    weight_title * score_title +
                    weight_director * score_director +
                    weight_cast * score_cast +
                    weight_country * score_country +
                    weight_genre * score_catg +
                    weight_description * score_desc
                )

                # Append all relevant scores for the DataFrame
                matrix_similarity.append([list1[1], score_title, score_director, score_country, score_catg, score_desc, final_score])

    pbar.close()

    # Create DataFrame of recommendations and sort by final score
    df_netflix_similarity = pd.DataFrame(matrix_similarity, columns=["recommendation", "score_title" , "score_director" ,"score_country","score_genre" , "score_description" , "final_score"])
    return df_netflix_similarity.sort_values(by="final_score", ascending=False).head(10)

### Example 1 : execution of the recommendation engine

In [83]:
recommendation("Jeans", weight_title=0.5, weight_director=1.0,
                                 weight_cast=0.8, weight_country=0.3,
                                 weight_genre=1.2, weight_description=0.7)

100%|██████████| 8807/8807 [00:06<00:00, 1320.84it/s]


Unnamed: 0,recommendation,score_title,score_director,score_country,score_genre,score_description,final_score
1319,Story of an Egg,0.14499,0.487648,1.0,1.0,0.765636,3.26104
541,Shimla Mirchi,0.202888,0.444767,1.0,1.0,0.83865,3.227988
673,Shabd,0.0,0.546363,1.0,0.927278,0.724035,3.200233
822,Tere Naal Love Ho Gaya,0.201543,0.487452,1.0,1.0,0.804745,3.18912
746,Rajma Chawal,0.23981,0.546363,1.0,0.905684,0.79401,3.186901
639,Marriage Palace,0.072713,0.542512,1.0,0.93558,0.836553,3.172426
852,Running Shaadi,0.148171,0.496246,1.0,1.0,0.721104,3.164321
501,Kuch Kuch Hota Hai,0.157871,0.528753,1.0,0.905684,0.829632,3.15447
817,Fitoor,0.0,0.609172,1.0,0.927278,0.743982,3.153724
709,Fatso,0.199852,0.549578,1.0,0.905684,0.721141,3.152828


In [84]:
movies_to_check = ['Jeans',
"Story of an Egg",
'Shimla Mirchi',
'Shabd',
'Tere Naal Love Ho Gaya',
"Rajma Chawa",
'Marriage Palace',
'Running Shaadi',
'Kuch Kuch Hota Hai',
'Fitoor',
"Fatso"]
df_netflix[df_netflix.title.isin(movies_to_check)]

Unnamed: 0,show_id,title,director,cast,country,listed_in,description,title_list
24,s25,Jeans,"[shankar, s.]","[rai, sri, ,, prashanth, aishwarya, bachchan, ...",[india],"[romantic, comedies, ,, international, movies]","[marry, woman, that, his, of, be, man, a, conv...",[jeans]
2734,s2735,Kuch Kuch Hota Hai,"[karan, johar]","[saeed, ,, rukh, jalal, shah, farida, khan, mu...",[india],"[comedies, ,, international, dramas, movies]","[out, loved, woman, to, ’, s, a, with, reunite...","[hai, hota, kuch]"
2985,s2986,Shimla Mirchi,"[sippy, ramesh]","[rao, preet, hema, shakti, rakul, gill, ,, kap...",[india],"[romantic, comedies, ,, international, movies]","[letter, his, to, in, tonguetied, of, up, s, m...","[shimla, mirchi]"
3564,s3565,Marriage Palace,"[sunil, thakur]","[., sharma, sangha, roopi, ,, sharry, nirmal, ...",[india],"[movies, comedies, international, ,]","[on, marry, bride, their, his, to, swap, man, ...","[marriage, palace]"
3917,s3918,Shabd,"[leena, yadav]","[siddiqui, rai, ,, kala, kamini, sadiya, zayed...",[india],"[romantic, ,, international, dramas, movies]","[line, grows, his, to, and, a, seeks, wife, ur...",[shabd]
4103,s4104,Fatso,"[rajat, kapoor]","[atul, ,, bakshi, kala, achrekar, panag, gul, ...",[india],"[comedies, ,, international, dramas, movies]","[error, on, his, to, in, of, portly, man, a, w...",[fatso]
4722,s4723,Fitoor,"[abhishek, kapoor]","[kaif, tabu, rahul, bhatt, rao, hydari, ,, kat...",[india],"[romantic, ,, international, dramas, movies]","[trained, woman, in, of, bitter, for, a, mothe...",[fitoor]
4735,s4736,Tere Naal Love Ho Gaya,"[mandeep, kumar]","[cheema, ,, jaykar, om, riteish, anand, prabha...",[india],"[romantic, comedies, ,, international, movies]","[rich, own, to, and, s, man, for, a, with, who...","[naal, ho, gaya, tere, love]"
4904,s4905,Running Shaadi,"[amit, roy]","[arsh, amit, ,, bhullar, kala, jha, sadh, mohi...",[india],"[romantic, comedies, ,, international, movies]","[daughter, his, migrant, to, elope, of, a, for...","[running, shaadi]"
8098,s8099,Story of an Egg,"[b., raj, shetty]","[rai, ,, shailashree, prakash, amrutha, usha, ...",[india],"[romantic, comedies, ,, international, movies]","[out, bride, to, being, year, a, alone, gives,...","[egg, an, of, story]"


### Example 2 : execution of the recommendation engine

In [85]:
recommendation("Dark", weight_title=0.5, weight_director=1.0,
                                 weight_cast=0.8, weight_country=0.3,
                                 weight_genre=1.2, weight_description=0.7)

100%|██████████| 8807/8807 [00:07<00:00, 1215.72it/s]


Unnamed: 0,recommendation,score_title,score_director,score_country,score_genre,score_description,final_score
182,In The Dark,0.821786,0,0.815271,0.96196,0.606756,2.926603
184,Dark Desire,0.718159,0,0.462665,0.922875,0.695216,2.775437
193,The Woods,0.378318,0,0.887903,1.0,0.674035,2.709436
350,When They See Us,0.215027,0,0.815271,0.96196,0.665618,2.698001
108,Invisible City,0.337146,0,0.404001,1.0,0.717482,2.684233
495,The Frozen Dead,0.290684,0,0.460444,1.0,0.628069,2.63668
431,Dogs of Berlin,0.096403,0,0.747093,1.0,0.646366,2.631375
339,Black Spot,0.350655,0,0.503075,1.0,0.571254,2.62747
624,Deep Water,0.282871,0,0.443549,1.0,0.602855,2.619751
430,Travelers,0.052719,0,0.81204,1.0,0.663855,2.605853


In [86]:
movies_to_check = ['Dark',
"In The Dark",
'Dark Desire',
'The Woods',
'When They See Us',
"Invisible City",
'The Frozen Dead',
'Dogs of Berlin',
'Black Spot',
'Deep Water',
"Travelers"]
df_netflix[df_netflix.title.isin(movies_to_check)]

Unnamed: 0,show_id,title,director,cast,country,listed_in,description,title_list
1339,s1340,Invisible City,[],"[jéssica, manu, ,, marco, lago, córes, fábio, ...",[brazil],"[,, shows, international, crime, dramas, tv]","[his, to, and, man, they, a, realizes, living,...","[invisible, city]"
2229,s2230,In The Dark,[],"[brooke, rich, york, ,, krantz, keston, perry,...","[states, united]","[,, shows, crime, dramas, tv]","[woman, in, of, up, a, with, murder, investiga...","[in, dark, the]"
2235,s2236,Dark Desire,[],"[leticia, merlo, ,, orozco, pineda, alejandro,...",[mexico],"[spanish-language, ,, shows, international, cr...","[those, passion, that, to, in, away, and, a, e...","[desire, dark]"
2327,s2328,Dark,[],"[winkler, ,, sebastian, mendl, michael, schöne...","[germany, states, united, ,]","[,, shows, international, crime, dramas, tv]","[on, child, that, they, for, a, hunt, answers,...",[dark]
2401,s2402,The Woods,[],"[,, martyna, adam, hubert, grzegorz, jacek, by...","[states, ,, united, poland]","[,, shows, international, crime, dramas, tv]","[on, earlier, that, his, in, of, be, prosecuto...","[woods, the]"
3746,s3747,Black Spot,[],"[,, naidra, hubert, renaud, daviot, suliane, o...","[france, belgium, ,]","[,, shows, international, crime, dramas, tv]","[edge, in, and, of, prosecutor, town, a, isola...","[black, spot]"
3792,s3793,When They See Us,[],"[,, caleel, farmiga, jerome, harris, michael, ...","[states, united]","[,, shows, crime, dramas, tv]","[on, central, accused, harlem, in, of, they, a...","[when, us, they, see]"
4310,s4311,Travelers,[],"[reilly, eric, patrick, paul, ,, gilmore, coop...","[states, canada, united, ,]","[,, shows, international, crime, dramas, tv]","[future, to, suddenly, s, a, agent, possess, d...",[travelers]
4316,s4317,Dogs of Berlin,[],"[,, sebastian, schüttler, bennent, anna, krame...",[germany],"[,, shows, international, crime, dramas, tv]","[cops, case, them, that, in, of, a, famous, bu...","[dogs, berlin, of]"
5099,s5100,The Frozen Dead,[],"[lubna, ,, plagnol, ny, le, meurisse, charles,...",[france],"[,, shows, international, crime, dramas, tv]","[investigator, in, a, with, into, martin, this...","[frozen, dead, the]"
