# Netflix Movies and TV shows Recommendation system

Algorithm : Content Based Filtering - fasttext for word embedding of text features

Notebook summary

1. Import necessary libraries

2. Import required dataset

3. Data preprocessing for baseline model

4. Install necessary dependencies to leverage the fasttext word embedding model from gensim

5. Model 1 ( baseline model ) : Considering equal weights for all features ( limited feature )

6. Model 2 : Considering different weights for different features

7. Model 3 : Considering wider set of features with differentiated weighting


# 1. Import necessary libraries

In [45]:
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

# 2. Import required dataset

In [46]:
df_netflix = pd.read_csv("netflix_titles.csv")
df_netflix.drop(
    columns=[
        "director",
        "cast",
        "country",
        "date_added",
        "release_year",
        "rating",
        "duration",
        "type",
    ],
    inplace=True,
)
df_netflix.head(3)

Unnamed: 0,show_id,title,listed_in,description
0,s1,Dick Johnson Is Dead,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,Blood & Water,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,Ganglands,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...


# 3. Data preprocessing for baseline model

In [47]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [48]:
from nltk.tokenize import word_tokenize

df_netflix["title_list"] = df_netflix["title"].str.lower()
df_netflix["listed_in"] = df_netflix["listed_in"].str.lower()
df_netflix["description"] = df_netflix["description"].str.lower()

df_netflix["title_list"] = df_netflix["title_list"].apply(word_tokenize)
df_netflix["listed_in"] = df_netflix["listed_in"].apply(word_tokenize)
df_netflix["description"] = df_netflix["description"].apply(word_tokenize)

In [49]:
df_netflix.head(3)

Unnamed: 0,show_id,title,listed_in,description,title_list
0,s1,Dick Johnson Is Dead,[documentaries],"[as, her, father, nears, the, end, of, his, li...","[dick, johnson, is, dead]"
1,s2,Blood & Water,"[international, tv, shows, ,, tv, dramas, ,, t...","[after, crossing, paths, at, a, party, ,, a, c...","[blood, &, water]"
2,s3,Ganglands,"[crime, tv, shows, ,, international, tv, shows...","[to, protect, his, family, from, a, powerful, ...",[ganglands]


In [50]:
import string

df_netflix["description"] = df_netflix["description"].apply(
    lambda x: [word.translate(str.maketrans("", "", string.punctuation)) for word in x]
)
df_netflix["description"] = df_netflix["description"].apply(
    lambda x: [word for word in x if len(word) > 0]
)

In [51]:
df_netflix.head(3)

Unnamed: 0,show_id,title,listed_in,description,title_list
0,s1,Dick Johnson Is Dead,[documentaries],"[as, her, father, nears, the, end, of, his, li...","[dick, johnson, is, dead]"
1,s2,Blood & Water,"[international, tv, shows, ,, tv, dramas, ,, t...","[after, crossing, paths, at, a, party, a, cape...","[blood, &, water]"
2,s3,Ganglands,"[crime, tv, shows, ,, international, tv, shows...","[to, protect, his, family, from, a, powerful, ...",[ganglands]


In [52]:
df_netflix["title_list"] = df_netflix["title_list"].apply(lambda x: list(set(x)))
df_netflix["listed_in"] = df_netflix["listed_in"].apply(lambda x: list(set(x)))
df_netflix["description"] = df_netflix["description"].apply(lambda x: list(set(x)))

In [53]:
df_netflix.head(3)

Unnamed: 0,show_id,title,listed_in,description,title_list
0,s1,Dick Johnson Is Dead,[documentaries],"[johnson, kirsten, inventive, death, life, and...","[is, dick, johnson, dead]"
1,s2,Blood & Water,"[mysteries, dramas, shows, ,, tv, international]","[party, birth, her, sister, paths, swimming, w...","[blood, water, &]"
2,s3,Ganglands,"[crime, adventure, shows, action, ,, tv, inter...","[deadly, protect, and, expert, from, powerful,...",[ganglands]


# 4. Install necessary dependencies to leverage the fasttext word embedding model from gensim

In [9]:
!pip install gensim



In [54]:
import gensim

In [55]:
import gensim.downloader as api

FT = api.load('fasttext-wiki-news-subwords-300')



In [56]:
matrix_netflix_vocab = []
for list_ in df_netflix.to_numpy():
    list_[2] = [word for word in list_[2] if word in FT.key_to_index]
    list_[3] = [word for word in list_[3] if word in FT.key_to_index]
    list_[4] = [word for word in list_[4] if word in FT.key_to_index]
    matrix_netflix_vocab.append(list_)
df_netflix_vocab = pd.DataFrame(matrix_netflix_vocab, columns=df_netflix.columns)

In [57]:
len(matrix_netflix_vocab)

8807

In [58]:
matrix_netflix_vocab[0]

array(['s1', 'Dick Johnson Is Dead', list(['documentaries']),
       list(['johnson', 'kirsten', 'inventive', 'death', 'life', 'and', 'in', 'her', 'face', 'help', 'filmmaker', 'them', 'to', 'of', 'end', 'both', 'nears', 'comical', 'as', 'stages', 'the', 'father', 'inevitable', 'his', 'ways']),
       list(['is', 'dick', 'johnson', 'dead'])], dtype=object)

In [59]:
df_netflix_vocab.head(3)

Unnamed: 0,show_id,title,listed_in,description,title_list
0,s1,Dick Johnson Is Dead,[documentaries],"[johnson, kirsten, inventive, death, life, and...","[is, dick, johnson, dead]"
1,s2,Blood & Water,"[mysteries, dramas, shows, ,, tv, international]","[party, birth, her, sister, paths, swimming, w...","[blood, water, &]"
2,s3,Ganglands,"[crime, adventure, shows, action, ,, tv, inter...","[deadly, protect, and, expert, from, powerful,...",[]


# 5. Model 1 ( baseline model ) :  Considering equal weights for all features ( limited feature )

### Define the recommendation function

In [60]:
from tqdm import tqdm


def recommendation(title):
    matrix_netflix_title_vocab = []
    for list_ in df_netflix[df_netflix["title"] == title].to_numpy():
        list_[2] = [word for word in list_[2] if word in FT.key_to_index]
        list_[3] = [word for word in list_[3] if word in FT.key_to_index]
        list_[4] = [word for word in list_[4] if word in FT.key_to_index]
        matrix_netflix_title_vocab.append(list_)

    matrix_similarity = []
    pbar = tqdm(matrix_netflix_vocab)
    for list1 in pbar:
        for list2 in matrix_netflix_title_vocab:
            # Check if lists are empty before calculating similarity
            if list1[2] and list2[2]:
                score_catg = FT.n_similarity(list1[2], list2[2])
            else:
                score_catg = 0  # or a suitable default value

            if list1[3] and list2[3]:
                score_desc = FT.n_similarity(list1[3], list2[3])
            else:
                score_desc = 0  # or a suitable default value

            try:
                if list1[4] and list2[4]:
                    score_title = FT.n_similarity(list1[4], list2[4]) / 2
                else:
                    score_title = 0
            except:
                score_title = 0
            if (list1[1] != list2[1]) & (score_catg > 0.85):
                matrix_similarity.append(
                    [list1[1], list2[1], score_title, score_catg, score_desc]
                )
        pbar.update()
    pbar.close()
    df_netflix_similarity = pd.DataFrame(
        matrix_similarity,
        columns=[
            "recommendation",
            "title",
            "score_title",
            "score_category",
            "score_description",
        ],
    )
    df_netflix_similarity["final_score"] = (
        df_netflix_similarity["score_title"]
        + df_netflix_similarity["score_category"]
        + df_netflix_similarity["score_description"]
    )
    return df_netflix_similarity.sort_values(
        by=["final_score", "score_category", "score_description", "score_title"],
        ascending=False,
    ).head(10)

### Example 1 : Execution of recommendation function

In [61]:
recommendation("Grown Ups")

100%|██████████| 8807/8807 [00:06<00:00, 1362.54it/s]


Unnamed: 0,recommendation,title,score_title,score_category,score_description,final_score
84,Fall Girls,Grown Ups,0.264962,1.0,0.908869,2.17383
75,Blue Mountain State: The Rise of Thadland,Grown Ups,0.255071,1.0,0.913453,2.168524
76,Bring It On: Worldwide Showdown,Grown Ups,0.26072,1.0,0.901196,2.161916
59,Wild Oats,Grown Ups,0.238926,1.0,0.921691,2.160617
91,Little Nicky,Grown Ups,0.259115,1.0,0.900927,2.160042
58,The Outcasts,Grown Ups,0.239393,1.0,0.920426,2.159819
73,Big Bear,Grown Ups,0.259489,1.0,0.898815,2.158304
53,Father of the Year,Grown Ups,0.244703,1.0,0.912112,2.156815
70,Adventures in Public School,Grown Ups,0.250181,1.0,0.905719,2.155899
13,Bad Trip,Grown Ups,0.243564,1.0,0.911599,2.155163


In [62]:
movies_to_check = ['Grown Ups',
"Fall Girls",
'Blue Mountain State: The Rise of Thadland',
'Bring It On: Worldwide Showdown',
'Wild Oats',
"Little Nicky",
'The Outcasts',
'Big Bear',
'Father of the Year',
'Adventures in Public Schoo',
"Bad Trip"]
df_netflix[df_netflix.title.isin(movies_to_check)]

Unnamed: 0,show_id,title,listed_in,description,title_list
27,s28,Grown Ups,[comedies],"[beloved, basketball, coach, loss, their, and,...","[grown, ups]"
1157,s1158,Bad Trip,[comedies],"[people, comedy, real, pull, on, this, their, ...","[trip, bad]"
4761,s4762,Father of the Year,[comedies],"[win, debate, mayhem, whose, in, their, when, ...","[the, of, father, year]"
5328,s5329,The Outcasts,[comedies],"[hierarchy, turns, on, nerd, s, uprising, thei...","[the, outcasts]"
5487,s5488,Wild Oats,[comedies],"[pal, canary, insurance, and, their, her, reti...","[wild, oats]"
6302,s6303,Big Bear,[comedies],"[party, fiancée, jinks, illfated, s, new, when...","[bear, big]"
6340,s6341,Blue Mountain State: The Rise of Thadland,[comedies],"[party, beloved, raunchiest, it, and, in, new,...","[:, the, blue, state, of, rise, thadland, moun..."
6377,s6378,Bring It On: Worldwide Showdown,[comedies],"[squad, on, cheer, in, new, teams, when, from,...","[:, worldwide, on, bring, it, showdown]"
6734,s6735,Fall Girls,[comedies],"[left, closing, it, try, their, in, following,...","[girls, fall]"
7316,s7317,Little Nicky,[comedies],"[dad, nicky, dear, back, from, oust, earth, so...","[nicky, little]"


### Example 2 : Execution of recommendation function

In [63]:
recommendation("The Conjuring")

100%|██████████| 8807/8807 [00:05<00:00, 1530.29it/s]


Unnamed: 0,recommendation,title,score_title,score_category,score_description,final_score
827,The Darkness,The Conjuring,0.42548,1.0,0.951606,2.377087
202,The Strangers,The Conjuring,0.428531,1.0,0.946117,2.374648
2474,House at the End of the Street,The Conjuring,0.412842,1.0,0.951542,2.364383
296,The Strange House,The Conjuring,0.417601,0.987606,0.958616,2.363822
3012,The Haunting of Molly Hartley,The Conjuring,0.425382,1.0,0.932987,2.35837
1366,The Silence,The Conjuring,0.412425,1.0,0.942992,2.355417
2988,The Devil Inside,The Conjuring,0.41833,1.0,0.930568,2.348898
2948,The Basement,The Conjuring,0.413454,1.0,0.92882,2.342274
3073,The Rezort,The Conjuring,0.447428,0.963551,0.930551,2.34153
2976,The Craft,The Conjuring,0.411635,1.0,0.928963,2.340598


In [64]:
movies_to_check = ['The Conjuring',
"The Darkness",
'The Strangers',
'House at the End of the Street',
'The Strange House',
"The Haunting of Molly Hartley",
'The Silence',
'The Devil Inside',
'The Basemen',
'The Rezort',
"The Craf"]
df_netflix[df_netflix.title.isin(movies_to_check)]

Unnamed: 0,show_id,title,listed_in,description,title_list
607,s608,The Strangers,"[movies, thrillers, horror, ,]","[home, attack, three, when, quiet, vacation, b...","[strangers, the]"
887,s888,The Strange House,"[movies, thrillers, horror, ,, international]","[home, brothers, moves, bigcity, try, and, the...","[strange, the, house]"
1283,s1284,The Conjuring,"[movies, thrillers, horror, ,]","[supernatural, noted, farmhouse, pair, demonol...","[conjuring, the]"
2382,s2383,The Darkness,"[movies, thrillers, horror, ,]","[home, unleashes, begins, ancient, set, bringi...","[the, darkness]"
3930,s3931,The Silence,"[movies, thrillers, horror, ,]","[deadly, creatures, attack, city, and, with, h...","[silence, the]"
7009,s7010,House at the End of the Street,"[movies, thrillers, horror, ,]","[she, new, when, was, house, site, town, more,...","[house, the, street, of, at, end]"
8276,s8277,The Devil Inside,"[movies, thrillers, horror, ,]","[people, this, s, and, her, behind, three, gri...","[devil, the, inside]"
8338,s8339,The Haunting of Molly Hartley,"[movies, thrillers, horror, ,]","[troubled, catch, with, new, her, but, start, ...","[the, hartley, molly, of, haunting]"
8483,s8484,The Rezort,"[movies, horror, international, ,]","[luxury, tourists, fails, hunted, on, global, ...","[rezort, the]"


# 6. Model 2 : Considering different weights for different features

### Defining the recommendation function with different weights to be considered for different features

In [65]:
from tqdm import tqdm


def recommendation(title, weight_title=0.5, weight_category=1.0, weight_description=1.0):
    matrix_netflix_title_vocab = []
    for list_ in df_netflix[df_netflix["title"] == title].to_numpy():
        list_[2] = [word for word in list_[2] if word in FT.key_to_index]
        list_[3] = [word for word in list_[3] if word in FT.key_to_index]
        list_[4] = [word for word in list_[4] if word in FT.key_to_index]
        matrix_netflix_title_vocab.append(list_)

    matrix_similarity = []
    pbar = tqdm(matrix_netflix_vocab)
    for list1 in pbar:
        for list2 in matrix_netflix_title_vocab:
            # Check if lists are empty before calculating similarity
            if list1[2] and list2[2]:  # Ensure both lists are not empty
                score_catg = FT.n_similarity(list1[2], list2[2])
            else:
                score_catg = 0  # or a suitable default value

            if list1[3] and list2[3]:  # Ensure both lists are not empty
                score_desc = FT.n_similarity(list1[3], list2[3])
            else:
                score_desc = 0  # or a suitable default value

            try:
                if list1[4] and list2[4]:  # Ensure both lists are not empty
                    score_title = FT.n_similarity(list1[4], list2[4]) / 2
                else:
                    score_title = 0
            except:
                score_title = 0
            if (list1[1] != list2[1]) & (score_catg > 0.85):
                matrix_similarity.append(
                    [list1[1], list2[1], score_title, score_catg, score_desc]
                )
        pbar.update()
    pbar.close()
    df_netflix_similarity = pd.DataFrame(
        matrix_similarity,
        columns=[
            "recommendation",
            "title",
            "score_title",
            "score_category",
            "score_description",
        ],
    )
    df_netflix_similarity["final_score"] = (
        weight_title * df_netflix_similarity["score_title"]
        + weight_category * df_netflix_similarity["score_category"]
        + weight_description * df_netflix_similarity["score_description"]
    )
    return df_netflix_similarity.sort_values(
        by=["final_score", "score_category", "score_description", "score_title"],
        ascending=False,
    ).head(10)

### example 1

In [66]:
recommendation("Grown Ups", weight_title=0.3, weight_category=1.2, weight_description=0.8)

100%|██████████| 8807/8807 [00:06<00:00, 1458.47it/s]


Unnamed: 0,recommendation,title,score_title,score_category,score_description,final_score
88,How to Be a Player,Grown Ups,0.212058,1.0,0.933468,2.010392
59,Wild Oats,Grown Ups,0.238926,1.0,0.921691,2.009031
58,The Outcasts,Grown Ups,0.239393,1.0,0.920426,2.008159
75,Blue Mountain State: The Rise of Thadland,Grown Ups,0.255071,1.0,0.913453,2.007283
84,Fall Girls,Grown Ups,0.264962,1.0,0.908869,2.006584
53,Father of the Year,Grown Ups,0.244703,1.0,0.912112,2.0031
13,Bad Trip,Grown Ups,0.243564,1.0,0.911599,2.002349
93,Mad Money,Grown Ups,0.23401,1.0,0.913897,2.001321
70,Adventures in Public School,Grown Ups,0.250181,1.0,0.905719,1.999629
63,Take the 10,Grown Ups,0.231953,1.0,0.912374,1.999485


### Example 2

In [67]:
recommendation("Jeans", weight_title=0.3, weight_category=1.2, weight_description=0.8)

100%|██████████| 8807/8807 [00:06<00:00, 1353.48it/s]


Unnamed: 0,recommendation,title,score_title,score_category,score_description,final_score
1080,Lock Your Girls In,Jeans,0.187516,1.0,0.947592,2.014328
2096,Tere Naal Love Ho Gaya,Jeans,0.169272,1.0,0.945369,2.007077
779,Ginny Weds Sunny,Jeans,0.208882,1.0,0.927564,2.004716
3297,Maza Pati Karodpati,Jeans,0.152755,1.0,0.945174,2.001965
1139,Rich in Love,Jeans,0.164621,1.0,0.932938,1.995736
1503,Brother in Love,Jeans,0.156901,1.0,0.934849,1.99495
3277,Man Up,Jeans,0.164388,1.0,0.931239,1.994308
156,Flower Girl,Jeans,0.196663,1.0,0.917342,1.992872
557,Squared Love,Jeans,0.163682,1.0,0.929007,1.99231
1101,Zaki Chan,Jeans,0.131871,1.0,0.940293,1.991796


# 7. Model 3 : Considering wider set of features with differentiated weighting

Features considered for content based filtering are title , director , cast , country , genre(listed_in) , description

basic weights are applied to different features to explain the possibility , generally weights are allocated in conjunction with the functional expert

### Load the dataset

In [68]:
# Load data
df_netflix = pd.read_csv("netflix_titles.csv")


In [69]:
df_netflix.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


### Remove features not to be considered for recommendation engine

In [70]:
df_netflix.drop(
    columns=[
        "date_added",
        "release_year",
        "rating",
        "duration",
        "type",
    ],
    inplace=True,
)
df_netflix.head(3)

Unnamed: 0,show_id,title,director,cast,country,listed_in,description
0,s1,Dick Johnson Is Dead,Kirsten Johnson,,United States,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...


In [71]:
df_netflix.columns

Index(['show_id', 'title', 'director', 'cast', 'country', 'listed_in',
       'description'],
      dtype='object')

### Data preprocessing

In [72]:
from nltk.tokenize import word_tokenize

df_netflix["title_list"] = df_netflix["title"].str.lower()
df_netflix["director"] = df_netflix["director"].str.lower()
df_netflix["cast"] = df_netflix["cast"].str.lower()
df_netflix["country"] = df_netflix["country"].str.lower()
df_netflix["listed_in"] = df_netflix["listed_in"].str.lower()
df_netflix["description"] = df_netflix["description"].str.lower()

# Fill NaN values with empty strings and convert lists to strings
for col in ["director", "cast", "country"]:
    df_netflix[col] = df_netflix[col].fillna('').astype(str).apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

df_netflix["title_list"] = df_netflix["title_list"].apply(word_tokenize)
df_netflix["director"] = df_netflix["director"].apply(word_tokenize)
df_netflix["cast"] = df_netflix["cast"].apply(word_tokenize)
df_netflix["country"] = df_netflix["country"].apply(word_tokenize)
df_netflix["listed_in"] = df_netflix["listed_in"].apply(word_tokenize)
df_netflix["description"] = df_netflix["description"].apply(word_tokenize)

In [73]:
df_netflix.head(3)

Unnamed: 0,show_id,title,director,cast,country,listed_in,description,title_list
0,s1,Dick Johnson Is Dead,"[kirsten, johnson]",[],"[united, states]",[documentaries],"[as, her, father, nears, the, end, of, his, li...","[dick, johnson, is, dead]"
1,s2,Blood & Water,[],"[ama, qamata, ,, khosi, ngema, ,, gail, mabala...","[south, africa]","[international, tv, shows, ,, tv, dramas, ,, t...","[after, crossing, paths, at, a, party, ,, a, c...","[blood, &, water]"
2,s3,Ganglands,"[julien, leclercq]","[sami, bouajila, ,, tracy, gotoas, ,, samuel, ...",[],"[crime, tv, shows, ,, international, tv, shows...","[to, protect, his, family, from, a, powerful, ...",[ganglands]


In [74]:
import string

df_netflix["description"] = df_netflix["description"].apply(
    lambda x: [word.translate(str.maketrans("", "", string.punctuation)) for word in x]
)
df_netflix["description"] = df_netflix["description"].apply(
    lambda x: [word for word in x if len(word) > 0]
)

In [75]:
df_netflix["title_list"] = df_netflix["title_list"].apply(lambda x: list(set(x)))
df_netflix["director"] = df_netflix["director"].apply(lambda x: list(set(x)))
df_netflix["cast"] = df_netflix["cast"].apply(lambda x: list(set(x)))
df_netflix["country"] = df_netflix["country"].apply(lambda x: list(set(x)))
df_netflix["listed_in"] = df_netflix["listed_in"].apply(lambda x: list(set(x)))
df_netflix["description"] = df_netflix["description"].apply(lambda x: list(set(x)))

In [76]:
df_netflix.head(2)

Unnamed: 0,show_id,title,director,cast,country,listed_in,description,title_list
0,s1,Dick Johnson Is Dead,"[johnson, kirsten]",[],"[states, united]",[documentaries],"[johnson, kirsten, inventive, death, life, and...","[is, dick, johnson, dead]"
1,s2,Blood & Water,[],"[morny, mahlangu, mabalane, qamata, odwa, sham...","[africa, south]","[mysteries, dramas, shows, ,, tv, international]","[party, birth, her, sister, paths, swimming, w...","[blood, water, &]"


In [77]:
matrix_netflix_vocab = []
for list_ in df_netflix.to_numpy():
    list_[2] = [word for word in list_[2] if word in FT.key_to_index]
    list_[3] = [word for word in list_[3] if word in FT.key_to_index]
    list_[4] = [word for word in list_[4] if word in FT.key_to_index]
    list_[5] = [word for word in list_[5] if word in FT.key_to_index]
    list_[6] = [word for word in list_[6] if word in FT.key_to_index]
    list_[7] = [word for word in list_[7] if word in FT.key_to_index]
    matrix_netflix_vocab.append(list_)
df_netflix_vocab = pd.DataFrame(matrix_netflix_vocab, columns=df_netflix.columns)

In [78]:
matrix_netflix_vocab[0]

array(['s1', 'Dick Johnson Is Dead', list(['johnson', 'kirsten']),
       list([]), list(['states', 'united']), list(['documentaries']),
       list(['johnson', 'kirsten', 'inventive', 'death', 'life', 'and', 'in', 'her', 'face', 'help', 'filmmaker', 'them', 'to', 'of', 'end', 'both', 'nears', 'comical', 'as', 'stages', 'the', 'father', 'inevitable', 'his', 'ways']),
       list(['is', 'dick', 'johnson', 'dead'])], dtype=object)

### Defining the recommendation engine function

In [79]:
def recommendation(title, weight_title=1.0, weight_director=1.0, weight_cast=1.0,
                   weight_country=1.0, weight_genre=1.0, weight_description=1.0):

    # Extract the selected movie's features based on title
    matrix_netflix_title_vocab = []
    target_movies = df_netflix[df_netflix["title"] == title]

    if target_movies.empty:
        print(f"'{title}' not found in the dataset.")
        return pd.DataFrame()

    for list_ in target_movies.to_numpy():
        list_[2] = [word for word in list_[2] if word in FT.key_to_index]
        list_[3] = [word for word in list_[3] if word in FT.key_to_index]
        list_[4] = [word for word in list_[4] if word in FT.key_to_index]
        list_[5] = [word for word in list_[5] if word in FT.key_to_index]
        list_[6] = [word for word in list_[6] if word in FT.key_to_index]
        list_[7] = [word for word in list_[7] if word in FT.key_to_index]
        matrix_netflix_title_vocab.append(list_)

    matrix_similarity = []
    pbar = tqdm(df_netflix.to_numpy(), total=len(df_netflix))
    for list1 in pbar:
        for list2 in matrix_netflix_title_vocab:

            # Calculate similarity scores for each feature, handling empty lists
            score_director = FT.n_similarity(list1[2], list2[2]) if list1[2] and list2[2] else 0  # Handle empty lists
            score_cast = FT.n_similarity(list1[3], list2[3]) if list1[3] and list2[3] else 0  # Handle empty lists
            score_country = FT.n_similarity(list1[4], list2[4]) if list1[4] and list2[4] else 0  # Handle empty lists
            score_catg = FT.n_similarity(list1[5], list2[5]) if list1[5] and list2[5] else 0  # Handle empty lists
            score_desc = FT.n_similarity(list1[6], list2[6]) if list1[6] and list2[6] else 0  # Handle empty lists

            # Handle title similarity calculation with try-except
            try:
                score_title = FT.n_similarity(list1[7], list2[7])
            except KeyError:
                score_title = 0

            # If the titles are different and genre similarity is significant
            if list1[1] != list2[1] and score_catg > 0.85:
                # Calculate the weighted final score
                final_score = (
                    weight_title * score_title +
                    weight_director * score_director +
                    weight_cast * score_cast +
                    weight_country * score_country +
                    weight_genre * score_catg +
                    weight_description * score_desc
                )

                # Append all relevant scores for the DataFrame
                matrix_similarity.append([list1[1], score_title, score_director, score_country, score_catg, score_desc, final_score])

    pbar.close()

    # Create DataFrame of recommendations and sort by final score
    df_netflix_similarity = pd.DataFrame(matrix_similarity, columns=["recommendation", "score_title" , "score_director" ,"score_country","score_genre" , "score_description" , "final_score"])
    return df_netflix_similarity.sort_values(by="final_score", ascending=False).head(10)

### Example 1 : execution of the recommendation engine

In [80]:
recommendation("Jeans", weight_title=0.5, weight_director=1.0,
                                 weight_cast=0.8, weight_country=0.3,
                                 weight_genre=1.2, weight_description=0.7)

100%|██████████| 8807/8807 [00:07<00:00, 1208.08it/s]


Unnamed: 0,recommendation,score_title,score_director,score_country,score_genre,score_description,final_score
3614,Story of an Egg,0.248979,0.724073,1.0,1.0,0.929319,3.689187
1402,Baby Dolls,0.42379,0.635492,1.0,1.0,0.909741,3.655913
2066,Pad Man,0.39095,0.696254,1.0,0.974175,0.862578,3.655251
2096,Tere Naal Love Ho Gaya,0.338545,0.626213,1.0,1.0,0.945369,3.631695
1538,Watchman,0.139439,0.799672,1.0,0.959184,0.86265,3.616319
1310,Hum Aapke Hain Koun,0.305975,0.696254,1.0,0.949942,0.933632,3.608734
3632,Super Nani,0.308808,0.642367,1.0,0.959184,0.91838,3.583727
2533,X: Past Is Present,0.196331,0.735206,1.0,0.937818,0.893038,3.576176
713,One More Try,0.268944,0.921026,0.560115,0.974175,0.919138,3.573227
2163,Running Shaadi,0.351045,0.568379,1.0,1.0,0.909852,3.573179


In [81]:
movies_to_check = ['Jeans',
"Story of an Egg",
'Baby Dolls',
'Pad Man',
'Tere Naal Love Ho Gaya',
"Watchman",
'Hum Aapke Hain Koun',
'Super Nani',
'X: Past Is Present',
'One More Try',
"Running Shaadi"]
df_netflix[df_netflix.title.isin(movies_to_check)]

Unnamed: 0,show_id,title,director,cast,country,listed_in,description,title_list
24,s25,Jeans,"[shankar, s.]","[bachchan, prashanth, aishwarya, lakshmi, sri,...",[india],"[comedies, movies, romantic, ,, international]","[too, she, man, when, twin, convincing, marry,...",[jeans]
1735,s1736,One More Try,"[s., ruel, bayani]","[angelica, angel, vergara, gina, dingdong, dan...",[philippines],"[movies, dramas, romantic, ,, international]","[severe, on, s, their, with, her, reconnects, ...","[try, more, one]"
2957,s2958,Hum Aapke Hain Koun,"[r., barjatya, sooraj]","[lagoo, khan, bahl, anupam, nath, bindu, renuk...",[india],"[movies, dramas, classic, ,, international]","[falls, good, other, agrees, she, their, but, ...","[hum, koun, aapke, hain]"
3189,s3190,Baby Dolls,"[arora, vijay, kumar]","[nirmal, tania, gurnam, chahal, sonam, sukhwin...",[india],"[comedies, movies, romantic, ,, international]","[their, with, mend, punjab, when, from, grandm...","[dolls, baby]"
3537,s3538,Watchman,"[vijay, a., l.]","[g.v, suman, yogi, munishkanth, hegde, samyukt...",[india],"[comedies, movies, dramas, ,, international]","[shark, or, breaks, man, trouble, –, off, bung...",[watchman]
4693,s4694,Pad Man,"[r., balki]","[singh, subhash, suneel, chaturvedi, jyoti, ra...",[india],"[movies, dramas, romantic, ,, international]","[resistance, s, sanitary, and, entrepreneur, p...","[pad, man]"
4735,s4736,Tere Naal Love Ho Gaya,"[mandeep, kumar]","[kartar, tinnu, riteish, navin, jaykar, cheema...",[india],"[comedies, movies, romantic, ,, international]","[works, been, she, s, man, suitor, and, her, w...","[ho, love, tere, gaya, naal]"
4904,s4905,Running Shaadi,"[amit, roy]","[amit, pannu, brijendra, pankaj, neeta, jha, k...",[india],"[comedies, movies, romantic, ,, international]","[couples, help, daughter, former, to, of, a, r...","[running, shaadi]"
5837,s5838,X: Past Is Present,"[mukherjee, d., tiwari, menon, hemant, pratim,...","[rajat, swara, radhika, pooja, ruparel, jha, a...",[india],"[movies, dramas, ,, independent, international]","[party, s, stranger, middleaged, when, meets, ...","[:, present, past, is, x]"
8098,s8099,Story of an Egg,"[shetty, b., raj]","[amrutha, b., usha, naik, bhandary, prakash, r...",[india],"[comedies, movies, romantic, ,, international]","[or, obstacles, mate, face, when, being, to, o...","[egg, of, story, an]"


### Example 2 : execution of the recommendation engine

In [82]:
recommendation("Dark", weight_title=0.5, weight_director=1.0,
                                 weight_cast=0.8, weight_country=0.3,
                                 weight_genre=1.2, weight_description=0.7)

100%|██████████| 8807/8807 [00:07<00:00, 1219.59it/s]


Unnamed: 0,recommendation,score_title,score_director,score_country,score_genre,score_description,final_score
709,In The Dark,0.761273,0,0.68367,0.994008,0.922426,3.138837
1298,Black Spot,0.63447,0,0.785425,1.0,0.879154,3.103439
712,Dark Desire,0.864873,0,0.40285,0.98137,0.925155,3.088954
2034,Spotless,0.506177,0,0.882893,1.0,0.888883,3.082857
1910,Criminal Minds,0.506244,0,0.929293,0.9837,0.913958,3.07223
823,White Lines,0.605029,0,0.863576,0.965369,0.879927,3.06891
1834,The Mist,0.620031,0,0.929293,0.931463,0.902659,3.067594
834,The Eddy,0.504896,0,0.93414,0.970948,0.912671,3.060617
844,Reckoning,0.403519,0,0.943984,0.9837,0.929391,3.059445
2551,Ripper Street,0.475139,0,0.860319,0.985908,0.914269,3.054307


In [83]:
movies_to_check = ['Dark',
"In The Dark",
'Black Spot',
'Dark Desire',
'Spotless',
"Criminal Minds",
'White Lines',
'The Mist',
'The Eddy',
'Reckoning',
"Ripper Street"]
df_netflix[df_netflix.title.isin(movies_to_check)]

Unnamed: 0,show_id,title,director,cast,country,listed_in,description,title_list
2229,s2230,In The Dark,[],"[casey, mpumlwana, krantz, sengbloh, morgan, s...","[states, united]","[dramas, shows, ,, tv, crime]","[turns, blind, in, her, with, when, best, vice...","[the, dark, in]"
2235,s2236,Dark Desire,[],"[maite, carlos, huijara, hayser, regina, erik,...",[mexico],"[crime, shows, spanish-language, ,, tv, intern...","[home, alma, and, in, her, fateful, weekend, f...","[desire, dark]"
2327,s2328,Dark,[],"[maja, kampwirth, mendl, michael, anatole, seb...","[states, united, germany, ,]","[crime, dramas, shows, ,, tv, international]","[on, three, families, missing, generations, my...",[dark]
2530,s2531,White Lines,[],"[belén, tom, fox, diego, laurence, nuno, casab...","[kingdom, spain, united, ,]","[international, shows, british, ,, tv, crime]","[heads, she, life, s, quickly, in, down, behin...","[lines, white]"
2553,s2554,The Eddy,"[marrakchi, poul, alan, chazelle, laïla, houda...","[elyes, stenberg, george, karyo, randy, biolay...","[kingdom, germany, states, united, ,, france]","[dramas, shows, ,, tv, international]","[band, club, fights, protect, paris, and, with...","[the, eddy]"
2590,s2591,Reckoning,[],"[trammell, gloria, garayua, mitzi, finn, oxenb...","[australia, states, united, ,]","[mysteries, dramas, shows, ,, tv, crime]","[suburban, local, contend, on, their, in, with...",[reckoning]
3746,s3747,Black Spot,[],"[samuel, herzberg, jouy, tiphaine, naidra, rut...","[france, belgium, ,]","[crime, dramas, shows, ,, tv, international]","[crimes, edge, and, in, new, forest, grisly, e...","[spot, black]"
5205,s5206,The Mist,[],"[sutherland, danica, butler, gus, luke, france...","[states, canada, united, ,]","[mysteries, dramas, horror, ,, tv]","[stephen, creatures, on, it, s, new, by, based...","[the, mist]"
5412,s5413,Criminal Minds,[],"[kirsten, gray, hewitt, vangsness, lola, jeann...","[states, canada, united, ,]","[mysteries, dramas, shows, ,, tv, crime]","[fbi, getting, this, their, of, a, days, spend...","[minds, criminal]"
5752,s5753,Spotless,[],"[marc-andré, brendan, lucy, denis, grondin, ga...","[france, kingdom, united, ,]","[crime, dramas, shows, ,, tv, international]","[mob, with, powerful, to, of, is, owner, a, cr...",[spotless]
