# Netflix Movies and TV shows Recommendation system

Algorithm : Content Based Filtering - Glove for word embedding of text features

Notebook summary

1. Import necessary libraries

2. Import required dataset

3. Data preprocessing for baseline model

4. Install necessary dependencies to leverage the Glove word embedding model from gensim

5. Model 1 ( baseline model ) : Considering equal weights for all features ( limited feature )

6. Model 2 : Considering different weights for different features

7. Model 3 : Considering wider set of features with differentiated weighting


# 1. Import necessary libraries

In [1]:
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

# 2. Import required dataset

In [2]:
df_netflix = pd.read_csv("netflix_titles.csv")
df_netflix.drop(
    columns=[
        "director",
        "cast",
        "country",
        "date_added",
        "release_year",
        "rating",
        "duration",
        "type",
    ],
    inplace=True,
)
df_netflix.head(3)

Unnamed: 0,show_id,title,listed_in,description
0,s1,Dick Johnson Is Dead,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,Blood & Water,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,Ganglands,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...


# 3. Data preprocessing for baseline model

In [3]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
from nltk.tokenize import word_tokenize

df_netflix["title_list"] = df_netflix["title"].str.lower()
df_netflix["listed_in"] = df_netflix["listed_in"].str.lower()
df_netflix["description"] = df_netflix["description"].str.lower()

df_netflix["title_list"] = df_netflix["title_list"].apply(word_tokenize)
df_netflix["listed_in"] = df_netflix["listed_in"].apply(word_tokenize)
df_netflix["description"] = df_netflix["description"].apply(word_tokenize)

In [5]:
df_netflix.head(3)

Unnamed: 0,show_id,title,listed_in,description,title_list
0,s1,Dick Johnson Is Dead,[documentaries],"[as, her, father, nears, the, end, of, his, li...","[dick, johnson, is, dead]"
1,s2,Blood & Water,"[international, tv, shows, ,, tv, dramas, ,, t...","[after, crossing, paths, at, a, party, ,, a, c...","[blood, &, water]"
2,s3,Ganglands,"[crime, tv, shows, ,, international, tv, shows...","[to, protect, his, family, from, a, powerful, ...",[ganglands]


In [6]:
import string

df_netflix["description"] = df_netflix["description"].apply(
    lambda x: [word.translate(str.maketrans("", "", string.punctuation)) for word in x]
)
df_netflix["description"] = df_netflix["description"].apply(
    lambda x: [word for word in x if len(word) > 0]
)

In [7]:
df_netflix.head(3)

Unnamed: 0,show_id,title,listed_in,description,title_list
0,s1,Dick Johnson Is Dead,[documentaries],"[as, her, father, nears, the, end, of, his, li...","[dick, johnson, is, dead]"
1,s2,Blood & Water,"[international, tv, shows, ,, tv, dramas, ,, t...","[after, crossing, paths, at, a, party, a, cape...","[blood, &, water]"
2,s3,Ganglands,"[crime, tv, shows, ,, international, tv, shows...","[to, protect, his, family, from, a, powerful, ...",[ganglands]


In [8]:
df_netflix["title_list"] = df_netflix["title_list"].apply(lambda x: list(set(x)))
df_netflix["listed_in"] = df_netflix["listed_in"].apply(lambda x: list(set(x)))
df_netflix["description"] = df_netflix["description"].apply(lambda x: list(set(x)))

In [None]:
df_netflix.head(3)

Unnamed: 0,show_id,title,listed_in,description,title_list
0,s1,Dick Johnson Is Dead,[documentaries],"[end, them, his, to, in, of, johnson, and, bot...","[is, dick, dead, johnson]"
1,s2,Blood & Water,"[,, shows, international, mysteries, dramas, tv]","[swimming, out, to, star, birth, privateschool...","[water, blood, &]"
2,s3,Ganglands,"[action, &, ,, shows, adventure, international...","[deadly, robbers, his, to, and, of, a, war, dr...",[ganglands]


# 4. Install necessary dependencies to leverage the Glove word embedding model from gensim

In [9]:
!pip install gensim



In [10]:
import gensim

In [11]:
import gensim.downloader as api

gv = api.load('glove-wiki-gigaword-300')



In [12]:
matrix_netflix_vocab = []
for list_ in df_netflix.to_numpy():
    list_[2] = [word for word in list_[2] if word in gv.key_to_index]
    list_[3] = [word for word in list_[3] if word in gv.key_to_index]
    list_[4] = [word for word in list_[4] if word in gv.key_to_index]
    matrix_netflix_vocab.append(list_)
df_netflix_vocab = pd.DataFrame(matrix_netflix_vocab, columns=df_netflix.columns)

In [13]:
len(matrix_netflix_vocab)

8807

In [14]:
matrix_netflix_vocab[0]

array(['s1', 'Dick Johnson Is Dead', list(['documentaries']),
       list(['johnson', 'kirsten', 'inventive', 'death', 'life', 'and', 'in', 'her', 'face', 'help', 'filmmaker', 'them', 'to', 'of', 'end', 'both', 'nears', 'comical', 'as', 'stages', 'the', 'father', 'inevitable', 'his', 'ways']),
       list(['is', 'dick', 'johnson', 'dead'])], dtype=object)

In [15]:
df_netflix_vocab.head(3)

Unnamed: 0,show_id,title,listed_in,description,title_list
0,s1,Dick Johnson Is Dead,[documentaries],"[johnson, kirsten, inventive, death, life, and...","[is, dick, johnson, dead]"
1,s2,Blood & Water,"[mysteries, dramas, shows, ,, tv, international]","[party, birth, her, sister, paths, swimming, w...","[blood, water, &]"
2,s3,Ganglands,"[crime, adventure, shows, action, ,, tv, inter...","[deadly, protect, and, expert, from, powerful,...",[]


# 5. Model 1 ( baseline model ) :  Considering equal weights for all features ( limited feature )

### Define the recommendation function

In [18]:
from tqdm import tqdm


def recommendation(title):
    matrix_netflix_title_vocab = []
    for list_ in df_netflix[df_netflix["title"] == title].to_numpy():
        list_[2] = [word for word in list_[2] if word in gv.key_to_index]
        list_[3] = [word for word in list_[3] if word in gv.key_to_index]
        list_[4] = [word for word in list_[4] if word in gv.key_to_index]
        matrix_netflix_title_vocab.append(list_)

    matrix_similarity = []
    pbar = tqdm(matrix_netflix_vocab)
    for list1 in pbar:
        for list2 in matrix_netflix_title_vocab:
            # Check if lists are empty before calculating similarity
            if list1[2] and list2[2]:
                score_catg = gv.n_similarity(list1[2], list2[2])
            else:
                score_catg = 0  # or a suitable default value

            if list1[3] and list2[3]:
                score_desc = gv.n_similarity(list1[3], list2[3])
            else:
                score_desc = 0  # or a suitable default value

            try:
                if list1[4] and list2[4]:
                    score_title = gv.n_similarity(list1[4], list2[4]) / 2
                else:
                    score_title = 0
            except:
                score_title = 0
            if (list1[1] != list2[1]) & (score_catg > 0.85):
                matrix_similarity.append(
                    [list1[1], list2[1], score_title, score_catg, score_desc]
                )
        pbar.update()
    pbar.close()
    df_netflix_similarity = pd.DataFrame(
        matrix_similarity,
        columns=[
            "recommendation",
            "title",
            "score_title",
            "score_category",
            "score_description",
        ],
    )
    df_netflix_similarity["final_score"] = (
        df_netflix_similarity["score_title"]
        + df_netflix_similarity["score_category"]
        + df_netflix_similarity["score_description"]
    )
    return df_netflix_similarity.sort_values(
        by=["final_score", "score_category", "score_description", "score_title"],
        ascending=False,
    ).head(10)

### Example 1 : Execution of recommendation function

In [19]:
recommendation("Grown Ups")

100%|██████████| 8807/8807 [00:04<00:00, 1829.11it/s]


Unnamed: 0,recommendation,title,score_title,score_category,score_description,final_score
36,American Pie 9: Girls' Rules,Grown Ups,0.179184,1.0,0.875891,2.055075
39,Work It,Grown Ups,0.188743,1.0,0.859949,2.048693
116,How to Be a Player,Grown Ups,0.190303,1.0,0.851863,2.042166
103,Bring It On: Worldwide Showdown,Grown Ups,0.200693,1.0,0.829971,2.030664
73,Father of the Year,Grown Ups,0.18794,1.0,0.842052,2.029992
27,Mean Girls 2,Grown Ups,0.171069,1.0,0.856893,2.027962
138,The Bad Education Movie,Grown Ups,0.188562,1.0,0.838282,2.026843
112,Fall Girls,Grown Ups,0.189624,1.0,0.835608,2.025232
94,A Bad Moms Christmas,Grown Ups,0.21274,1.0,0.808177,2.020917
144,"The Goods: Live Hard, Sell Hard",Grown Ups,0.223016,1.0,0.793356,2.016372


In [20]:
movies_to_check = ['Grown Ups',
"American Pie 9: Girls' Rules",
'Work It',
'How to Be a Player',
'Bring It On: Worldwide Showdown',
"Father of the Year",
'Mean Girls 2',
'The Bad Education Movie',
'Fall Girls',
'A Bad Moms Christmas',
"The Goods: Live Hard, Sell Hard"]
df_netflix[df_netflix.title.isin(movies_to_check)]

Unnamed: 0,show_id,title,listed_in,description,title_list
27,s28,Grown Ups,[comedies],"[beloved, basketball, coach, loss, their, and,...","[grown, ups]"
1459,s1460,Mean Girls 2,[comedies],"[if, pariah, she, through, s, with, student, d...","[mean, girls, 2]"
1881,s1882,American Pie 9: Girls' Rules,[comedies],"[around, seniors, muddles, their, new, student...","[:, american, rules, pie, girls, ', 9]"
2159,s2160,Work It,[comedies],"[’, squad, s, and, her, senior, but, clumsy, b...","[it, work]"
4761,s4762,Father of the Year,[comedies],"[win, debate, mayhem, whose, in, their, when, ...","[the, of, father, year]"
6027,s6028,A Bad Moms Christmas,[comedies],"[and, their, carla, looming, amy, back, when, ...","[christmas, a, moms, bad]"
6377,s6378,Bring It On: Worldwide Showdown,[comedies],"[squad, on, cheer, in, new, teams, when, from,...","[:, worldwide, on, bring, it, showdown]"
6734,s6735,Fall Girls,[comedies],"[left, closing, it, try, their, in, following,...","[girls, fall]"
7015,s7016,How to Be a Player,[comedies],"[life, and, sister, philandering, to, woman, o...","[player, be, to, a, how]"
8201,s8202,The Bad Education Movie,[comedies],"[class, britain, s, and, trip, but, –, caring,...","[the, movie, education, bad]"


### Example 2 : Execution of recommendation function

In [21]:
recommendation("The Conjuring")

100%|██████████| 8807/8807 [00:04<00:00, 1945.49it/s]


Unnamed: 0,recommendation,title,score_title,score_category,score_description,final_score
136,The Darkness,The Conjuring,0.254167,1.0,0.902806,2.156973
466,The Haunting of Molly Hartley,The Conjuring,0.269575,1.0,0.887063,2.156638
379,House at the End of the Street,The Conjuring,0.248824,1.0,0.906028,2.154853
78,The Conjuring 2,The Conjuring,0.375097,0.921947,0.846441,2.143485
33,The Strangers,The Conjuring,0.255073,1.0,0.883061,2.138134
342,Conjuring Spirit,The Conjuring,0.32395,0.937428,0.871832,2.13321
52,The Strange House,The Conjuring,0.268225,0.937428,0.918477,2.12413
265,The Ritual,The Conjuring,0.285342,0.937428,0.897559,2.120329
456,The Devil Inside,The Conjuring,0.252811,1.0,0.866727,2.119538
123,We Summon the Darkness,The Conjuring,0.258835,1.0,0.853419,2.112254


In [22]:
movies_to_check = ['The Conjuring',
"The Darkness",
'The Haunting of Molly Hartley',
'House at the End of the Street',
'The Conjuring 2',
"The Strangers",
'Conjuring Spirit',
'The Strange House',
'The Ritual',
'The Devil Inside',
"We Summon the Darkness"]
df_netflix[df_netflix.title.isin(movies_to_check)]

Unnamed: 0,show_id,title,listed_in,description,title_list
607,s608,The Strangers,"[movies, thrillers, horror, ,]","[home, attack, three, when, quiet, vacation, b...","[strangers, the]"
887,s888,The Strange House,"[movies, thrillers, horror, ,, international]","[home, brothers, moves, bigcity, try, and, the...","[strange, the, house]"
1283,s1284,The Conjuring,"[movies, thrillers, horror, ,]","[supernatural, noted, farmhouse, pair, demonol...","[conjuring, the]"
1284,s1285,The Conjuring 2,"[movies, horror]","[confront, releases, spirit, enlists, london, ...","[conjuring, the, 2]"
2149,s2150,We Summon the Darkness,"[movies, thrillers, horror, ,]","[themselves, in, new, when, hits, concert, gri...","[we, summon, darkness, the]"
2382,s2383,The Darkness,"[movies, thrillers, horror, ,]","[home, unleashes, begins, ancient, set, bringi...","[the, darkness]"
5042,s5043,The Ritual,"[movies, thrillers, horror, ,, international]","[with, trip, but, from, –, take, strained, wil...","[the, ritual]"
6527,s6528,Conjuring Spirit,"[movies, thrillers, horror, ,, international]","[spirit, moves, novelist, she, s, and, in, her...","[conjuring, spirit]"
7009,s7010,House at the End of the Street,"[movies, thrillers, horror, ,]","[she, new, when, was, house, site, town, more,...","[house, the, street, of, at, end]"
8276,s8277,The Devil Inside,"[movies, thrillers, horror, ,]","[people, this, s, and, her, behind, three, gri...","[devil, the, inside]"


# 6. Model 2 : Considering different weights for different features

### Defining the recommendation function with different weights to be considered for different features

In [25]:
from tqdm import tqdm


def recommendation(title, weight_title=0.5, weight_category=1.0, weight_description=1.0):
    matrix_netflix_title_vocab = []
    for list_ in df_netflix[df_netflix["title"] == title].to_numpy():
        list_[2] = [word for word in list_[2] if word in gv.key_to_index]
        list_[3] = [word for word in list_[3] if word in gv.key_to_index]
        list_[4] = [word for word in list_[4] if word in gv.key_to_index]
        matrix_netflix_title_vocab.append(list_)

    matrix_similarity = []
    pbar = tqdm(matrix_netflix_vocab)
    for list1 in pbar:
        for list2 in matrix_netflix_title_vocab:
            # Check if lists are empty before calculating similarity
            if list1[2] and list2[2]:  # Ensure both lists are not empty
                score_catg = gv.n_similarity(list1[2], list2[2])
            else:
                score_catg = 0  # or a suitable default value

            if list1[3] and list2[3]:  # Ensure both lists are not empty
                score_desc = gv.n_similarity(list1[3], list2[3])
            else:
                score_desc = 0  # or a suitable default value

            try:
                if list1[4] and list2[4]:  # Ensure both lists are not empty
                    score_title = gv.n_similarity(list1[4], list2[4]) / 2
                else:
                    score_title = 0
            except:
                score_title = 0
            if (list1[1] != list2[1]) & (score_catg > 0.85):
                matrix_similarity.append(
                    [list1[1], list2[1], score_title, score_catg, score_desc]
                )
        pbar.update()
    pbar.close()
    df_netflix_similarity = pd.DataFrame(
        matrix_similarity,
        columns=[
            "recommendation",
            "title",
            "score_title",
            "score_category",
            "score_description",
        ],
    )
    df_netflix_similarity["final_score"] = (
        weight_title * df_netflix_similarity["score_title"]
        + weight_category * df_netflix_similarity["score_category"]
        + weight_description * df_netflix_similarity["score_description"]
    )
    return df_netflix_similarity.sort_values(
        by=["final_score", "score_category", "score_description", "score_title"],
        ascending=False,
    ).head(10)

### example 1

In [26]:
recommendation("Grown Ups", weight_title=0.3, weight_category=1.2, weight_description=0.8)

100%|██████████| 8807/8807 [00:04<00:00, 2013.70it/s]


Unnamed: 0,recommendation,title,score_title,score_category,score_description,final_score
36,American Pie 9: Girls' Rules,Grown Ups,0.179184,1.0,0.875891,1.954468
39,Work It,Grown Ups,0.188743,1.0,0.859949,1.944583
116,How to Be a Player,Grown Ups,0.190303,1.0,0.851863,1.938582
27,Mean Girls 2,Grown Ups,0.171069,1.0,0.856893,1.936835
73,Father of the Year,Grown Ups,0.18794,1.0,0.842052,1.930024
111,Dumb and Dumberer: When Harry Met Lloyd,Grown Ups,0.163759,1.0,0.849449,1.928687
138,The Bad Education Movie,Grown Ups,0.188562,1.0,0.838282,1.927194
112,Fall Girls,Grown Ups,0.189624,1.0,0.835608,1.925374
103,Bring It On: Worldwide Showdown,Grown Ups,0.200693,1.0,0.829971,1.924185
21,Bad Trip,Grown Ups,0.173631,1.0,0.835535,1.920517


### Example 2

In [27]:
recommendation("Jeans", weight_title=0.3, weight_category=1.2, weight_description=0.8)

100%|██████████| 8807/8807 [00:05<00:00, 1701.72it/s]


Unnamed: 0,recommendation,title,score_title,score_category,score_description,final_score
793,Lock Your Girls In,Jeans,0.121569,1.0,0.917724,1.97065
837,Rich in Love,Jeans,0.072769,1.0,0.924072,1.961088
101,Flower Girl,Jeans,0.109514,1.0,0.904719,1.956629
562,A Chaster Marriage,Jeans,0.045104,1.0,0.926697,1.954889
2727,We Need to Talk,Jeans,0.060617,1.0,0.919157,1.953511
569,You've Got This,Jeans,0.082126,1.0,0.909601,1.952318
195,Kambili: The Whole 30 Yards,Jeans,0.043745,1.0,0.918973,1.948302
769,Mr. Romantic,Jeans,0.073811,1.0,0.907146,1.94786
798,The Married Couples,Jeans,0.054506,1.0,0.907658,1.942479
817,"I love you, stupid",Jeans,0.105102,1.0,0.888149,1.94205


# 7. Model 3 : Considering wider set of features with differentiated weighting

Features considered for content based filtering are title , director , cast , country , genre(listed_in) , description

basic weights are applied to different features to explain the possibility , generally weights are allocated in conjunction with the functional expert

### Load the dataset

In [28]:
# Load data
df_netflix = pd.read_csv("netflix_titles.csv")


In [29]:
df_netflix.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


### Remove features not to be considered for recommendation engine

In [30]:
df_netflix.drop(
    columns=[
        "date_added",
        "release_year",
        "rating",
        "duration",
        "type",
    ],
    inplace=True,
)
df_netflix.head(3)

Unnamed: 0,show_id,title,director,cast,country,listed_in,description
0,s1,Dick Johnson Is Dead,Kirsten Johnson,,United States,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...


In [31]:
df_netflix.columns

Index(['show_id', 'title', 'director', 'cast', 'country', 'listed_in',
       'description'],
      dtype='object')

### Data preprocessing

In [32]:
from nltk.tokenize import word_tokenize

df_netflix["title_list"] = df_netflix["title"].str.lower()
df_netflix["director"] = df_netflix["director"].str.lower()
df_netflix["cast"] = df_netflix["cast"].str.lower()
df_netflix["country"] = df_netflix["country"].str.lower()
df_netflix["listed_in"] = df_netflix["listed_in"].str.lower()
df_netflix["description"] = df_netflix["description"].str.lower()

# Fill NaN values with empty strings and convert lists to strings
for col in ["director", "cast", "country"]:
    df_netflix[col] = df_netflix[col].fillna('').astype(str).apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

df_netflix["title_list"] = df_netflix["title_list"].apply(word_tokenize)
df_netflix["director"] = df_netflix["director"].apply(word_tokenize)
df_netflix["cast"] = df_netflix["cast"].apply(word_tokenize)
df_netflix["country"] = df_netflix["country"].apply(word_tokenize)
df_netflix["listed_in"] = df_netflix["listed_in"].apply(word_tokenize)
df_netflix["description"] = df_netflix["description"].apply(word_tokenize)

In [33]:
df_netflix.head(3)

Unnamed: 0,show_id,title,director,cast,country,listed_in,description,title_list
0,s1,Dick Johnson Is Dead,"[kirsten, johnson]",[],"[united, states]",[documentaries],"[as, her, father, nears, the, end, of, his, li...","[dick, johnson, is, dead]"
1,s2,Blood & Water,[],"[ama, qamata, ,, khosi, ngema, ,, gail, mabala...","[south, africa]","[international, tv, shows, ,, tv, dramas, ,, t...","[after, crossing, paths, at, a, party, ,, a, c...","[blood, &, water]"
2,s3,Ganglands,"[julien, leclercq]","[sami, bouajila, ,, tracy, gotoas, ,, samuel, ...",[],"[crime, tv, shows, ,, international, tv, shows...","[to, protect, his, family, from, a, powerful, ...",[ganglands]


In [34]:
import string

df_netflix["description"] = df_netflix["description"].apply(
    lambda x: [word.translate(str.maketrans("", "", string.punctuation)) for word in x]
)
df_netflix["description"] = df_netflix["description"].apply(
    lambda x: [word for word in x if len(word) > 0]
)

In [35]:
df_netflix["title_list"] = df_netflix["title_list"].apply(lambda x: list(set(x)))
df_netflix["director"] = df_netflix["director"].apply(lambda x: list(set(x)))
df_netflix["cast"] = df_netflix["cast"].apply(lambda x: list(set(x)))
df_netflix["country"] = df_netflix["country"].apply(lambda x: list(set(x)))
df_netflix["listed_in"] = df_netflix["listed_in"].apply(lambda x: list(set(x)))
df_netflix["description"] = df_netflix["description"].apply(lambda x: list(set(x)))

In [36]:
df_netflix.head(2)

Unnamed: 0,show_id,title,director,cast,country,listed_in,description,title_list
0,s1,Dick Johnson Is Dead,"[johnson, kirsten]",[],"[states, united]",[documentaries],"[johnson, kirsten, inventive, death, life, and...","[is, dick, johnson, dead]"
1,s2,Blood & Water,[],"[morny, mahlangu, mabalane, qamata, odwa, sham...","[africa, south]","[mysteries, dramas, shows, ,, tv, international]","[party, birth, her, sister, paths, swimming, w...","[blood, water, &]"


In [37]:
matrix_netflix_vocab = []
for list_ in df_netflix.to_numpy():
    list_[2] = [word for word in list_[2] if word in gv.key_to_index]
    list_[3] = [word for word in list_[3] if word in gv.key_to_index]
    list_[4] = [word for word in list_[4] if word in gv.key_to_index]
    list_[5] = [word for word in list_[5] if word in gv.key_to_index]
    list_[6] = [word for word in list_[6] if word in gv.key_to_index]
    list_[7] = [word for word in list_[7] if word in gv.key_to_index]
    matrix_netflix_vocab.append(list_)
df_netflix_vocab = pd.DataFrame(matrix_netflix_vocab, columns=df_netflix.columns)

In [38]:
matrix_netflix_vocab[0]

array(['s1', 'Dick Johnson Is Dead', list(['johnson', 'kirsten']),
       list([]), list(['states', 'united']), list(['documentaries']),
       list(['johnson', 'kirsten', 'inventive', 'death', 'life', 'and', 'in', 'her', 'face', 'help', 'filmmaker', 'them', 'to', 'of', 'end', 'both', 'nears', 'comical', 'as', 'stages', 'the', 'father', 'inevitable', 'his', 'ways']),
       list(['is', 'dick', 'johnson', 'dead'])], dtype=object)

### Defining the recommendation engine function

In [40]:
def recommendation(title, weight_title=1.0, weight_director=1.0, weight_cast=1.0,
                   weight_country=1.0, weight_genre=1.0, weight_description=1.0):

    # Extract the selected movie's features based on title
    matrix_netflix_title_vocab = []
    target_movies = df_netflix[df_netflix["title"] == title]

    if target_movies.empty:
        print(f"'{title}' not found in the dataset.")
        return pd.DataFrame()

    for list_ in target_movies.to_numpy():
        list_[2] = [word for word in list_[2] if word in gv.key_to_index]
        list_[3] = [word for word in list_[3] if word in gv.key_to_index]
        list_[4] = [word for word in list_[4] if word in gv.key_to_index]
        list_[5] = [word for word in list_[5] if word in gv.key_to_index]
        list_[6] = [word for word in list_[6] if word in gv.key_to_index]
        list_[7] = [word for word in list_[7] if word in gv.key_to_index]
        matrix_netflix_title_vocab.append(list_)

    matrix_similarity = []
    pbar = tqdm(df_netflix.to_numpy(), total=len(df_netflix))
    for list1 in pbar:
        for list2 in matrix_netflix_title_vocab:

            # Calculate similarity scores for each feature, handling empty lists
            score_director = gv.n_similarity(list1[2], list2[2]) if list1[2] and list2[2] else 0  # Handle empty lists
            score_cast = gv.n_similarity(list1[3], list2[3]) if list1[3] and list2[3] else 0  # Handle empty lists
            score_country = gv.n_similarity(list1[4], list2[4]) if list1[4] and list2[4] else 0  # Handle empty lists
            score_catg = gv.n_similarity(list1[5], list2[5]) if list1[5] and list2[5] else 0  # Handle empty lists
            score_desc = gv.n_similarity(list1[6], list2[6]) if list1[6] and list2[6] else 0  # Handle empty lists

            # Handle title similarity calculation with try-except
            try:
                score_title = gv.n_similarity(list1[7], list2[7])
            except KeyError:
                score_title = 0

            # If the titles are different and genre similarity is significant
            if list1[1] != list2[1] and score_catg > 0.85:
                # Calculate the weighted final score
                final_score = (
                    weight_title * score_title +
                    weight_director * score_director +
                    weight_cast * score_cast +
                    weight_country * score_country +
                    weight_genre * score_catg +
                    weight_description * score_desc
                )

                # Append all relevant scores for the DataFrame
                matrix_similarity.append([list1[1], score_title, score_director, score_country, score_catg, score_desc, final_score])

    pbar.close()

    # Create DataFrame of recommendations and sort by final score
    df_netflix_similarity = pd.DataFrame(matrix_similarity, columns=["recommendation", "score_title" , "score_director" ,"score_country","score_genre" , "score_description" , "final_score"])
    return df_netflix_similarity.sort_values(by="final_score", ascending=False).head(10)

### Example 1 : execution of the recommendation engine

In [41]:
recommendation("Jeans", weight_title=0.5, weight_director=1.0,
                                 weight_cast=0.8, weight_country=0.3,
                                 weight_genre=1.2, weight_description=0.7)

100%|██████████| 8807/8807 [00:08<00:00, 1004.70it/s]


Unnamed: 0,recommendation,score_title,score_director,score_country,score_genre,score_description,final_score
1120,Oh! Baby,0.20947,0.547239,1.0,0.890147,0.868717,3.218636
2562,Story of an Egg,0.012047,0.473245,1.0,1.0,0.902408,3.123998
1542,Mom,0.26506,0.53127,1.0,0.886717,0.876154,3.107324
957,Thottappan,0.0,0.571753,1.0,0.943261,0.894483,3.052695
1500,Luv Shuv Tey Chicken Khurana,0.123488,0.494641,1.0,1.0,0.764547,3.048975
1055,Evvarikee Cheppoddu,0.0,0.581202,1.0,1.0,0.881492,3.045599
2242,Ishqedarriyaan,0.0,0.618009,1.0,0.969476,0.918113,3.044096
1835,Pink,0.352745,0.389992,1.0,0.909108,0.798607,3.043638
1649,Guru,0.024545,0.439359,1.0,0.909108,0.839286,3.033859
993,Aaviri,0.0,0.547487,1.0,0.855623,0.88905,3.031763


In [42]:
movies_to_check = ['Jeans',
"Oh! Baby",
'Story of an Egg',
'Mom',
'Thottappan',
"Luv Shuv Tey Chicken Khurana",
'Evvarikee Cheppoddu',
'Ishqedarriyaan',
'Pink',
'Guru',
"Aaviri"]
df_netflix[df_netflix.title.isin(movies_to_check)]

Unnamed: 0,show_id,title,director,cast,country,listed_in,description,title_list
24,s25,Jeans,"[shankar, s.]","[bachchan, prashanth, aishwarya, lakshmi, sri,...",[india],"[comedies, movies, romantic, ,, international]","[too, she, man, when, twin, convincing, marry,...",[jeans]
2935,s2936,Thottappan,"[shanavas, bavakutty, k.]","[dileesh, roshan, lal, k., sunitha, krishnan, ...",[india],"[comedies, movies, dramas, ,, international]","[partner, ’, left, life, s, himself, in, behin...",[thottappan]
3077,s3078,Aaviri,"[babu, ravi]","[shankar, muktha, neha, khan, bharani, sri, ra...",[india],"[movies, thrillers, horror, ,, international]","[home, spirit, moves, their, in, daughter, eer...",[aaviri]
3224,s3225,Evvarikee Cheppoddu,"[shankar, eeday, basava]","[ghani, p, vamsi, raj, k, prasanna, nekkanti, ...",[india],"[comedies, movies, romantic, ,, international]","[wrench, otherwise, ’, s, their, when, must, t...","[cheppoddu, evvarikee]"
3516,s3517,Oh! Baby,"[v., b., reddy, nandini]","[ruth, sajja, jagapathi, babu, aishwarya, laks...",[india],"[comedies, movies, musicals, ,, music, interna...","[life, her, snapped, off, another, after, magi...","[!, oh, baby]"
4582,s4583,Luv Shuv Tey Chicken Khurana,"[sharma, sameer]","[rajesh, tangri, bagga, dolly, munish, vinod, ...",[india],"[comedies, movies, romantic, ,, international]","[native, omi, london, himself, in, lawyer, ret...","[chicken, shuv, khurana, tey, luv]"
4692,s4693,Mom,"[udyawar, ravi]","[singh, vikas, sajal, sridevi, siddiqui, adnan...",[india],"[thrillers, movies, dramas, ,, international]","[party, furious, destroy, sexually, her, free,...",[mom]
5008,s5009,Guru,"[ratnam, mani]","[roshan, abhishek, mithun, balan, dhritiman, b...",[india],"[dramas, movies, international, ,]","[desai, ambitious, father, turkey, village, le...",[guru]
5629,s5630,Pink,"[chowdhury, roy, aniruddha]","[pannu, bedi, dhritiman, kulhari, bachchan, pa...",[india],"[dramas, movies, international, ,]","[assault, rape, s, lawyer, her, retired, attem...",[pink]
7099,s7100,Ishqedarriyaan,"[prakash, v., k.]","[evelyn, suhasini, mulay, dave, ,, kavin, sharma]",[india],"[movies, dramas, romantic, ,, international]","[happy, if, do, it, her, someone, willing, hel...",[ishqedarriyaan]


### Example 2 : execution of the recommendation engine

In [43]:
recommendation("Dark", weight_title=0.5, weight_director=1.0,
                                 weight_cast=0.8, weight_country=0.3,
                                 weight_genre=1.2, weight_description=0.7)

100%|██████████| 8807/8807 [00:08<00:00, 979.12it/s] 


Unnamed: 0,recommendation,score_title,score_director,score_country,score_genre,score_description,final_score
533,In The Dark,0.722748,0,0.911336,0.95908,0.829338,2.734899
984,Black Spot,0.577885,0,0.684016,1.0,0.764338,2.718938
518,The Rain,0.368472,0,0.942079,0.940377,0.891177,2.687038
1185,Dogs of Berlin,0.293526,0,0.736394,1.0,0.831437,2.678881
1134,Black Earth Rising,0.572252,0,0.806567,0.932584,0.769729,2.669526
1917,NSU German History X,0.190514,0,0.736394,1.0,0.783561,2.669373
816,Holiday Secrets,0.301575,0,0.736394,0.967966,0.822792,2.652093
911,13 Reasons Why: Beyond the Reasons,0.343255,0,0.911336,0.941553,0.79394,2.642866
2043,The Fear,0.362412,0,0.806567,0.937441,0.807292,2.641884
1585,Old Money,0.210149,0,0.911336,0.939714,0.818331,2.64148


In [44]:
movies_to_check = ['Dark',
"In The Dark",
'Black Spot',
'The Rain',
'Dogs of Berlin',
"Black Earth Rising",
'NSU German History X',
'Holiday Secrets',
'13 Reasons Why: Beyond the Reasons',
'The Fear',
"Old Money"]
df_netflix[df_netflix.title.isin(movies_to_check)]

Unnamed: 0,show_id,title,director,cast,country,listed_in,description,title_list
2162,s2163,The Rain,[],"[lindberg, lars, tønnesen, mikkel, simonsen, a...","[states, united, denmark, ,]","[mysteries, dramas, shows, ,, tv, international]","[band, seeking, s, and, –, population, join, a...","[the, rain]"
2229,s2230,In The Dark,[],"[casey, mpumlwana, krantz, sengbloh, morgan, s...","[states, united]","[dramas, shows, ,, tv, crime]","[turns, blind, in, her, with, when, best, vice...","[the, dark, in]"
2327,s2328,Dark,[],"[maja, kampwirth, mendl, michael, anatole, seb...","[states, united, germany, ,]","[crime, dramas, shows, ,, tv, international]","[on, three, families, missing, generations, my...",[dark]
3252,s3253,Holiday Secrets,[],"[emilie, harfouch, rose, zu, svenja, neumeiste...",[germany],"[dramas, shows, ,, tv, international]","[this, s, in, series, to, of, family, past, a,...","[holiday, secrets]"
3561,s3562,13 Reasons Why: Beyond the Reasons,[],"[dylan, marissa, butler, prentice, selena, tom...","[states, united]","[docuseries, shows, ,, tv, crime]","[cast, why, members, and, in, mental, themes, ...","[beyond, :, the, 13, why, reasons]"
3746,s3747,Black Spot,[],"[samuel, herzberg, jouy, tiphaine, naidra, rut...","[france, belgium, ,]","[crime, dramas, shows, ,, tv, international]","[crimes, edge, and, in, new, forest, grisly, e...","[spot, black]"
4166,s4167,Black Earth Rising,[],"[goodman, imani, abena, tunie, michaela, ayivo...","[kingdom, united]","[dramas, shows, british, ,, tv, international]","[rwandan, cases, crimes, she, on, her, when, b...","[rising, earth, black]"
4316,s4317,Dogs of Berlin,[],"[urs, yardim, saß, sinan, anna, maria, fahri, ...",[germany],"[crime, dramas, shows, ,, tv, international]","[connections, player, case, in, but, soccer, c...","[berlin, of, dogs]"
5774,s5775,Old Money,"[david, schalko]","[melles, stipsits, strauss, krisch, schwarz, v...","[states, united]","[comedies, dramas, shows, ,, tv, international]","[consume, costs, backstabbing, liver, and, new...","[old, money]"
7623,s7624,NSU German History X,[],"[schuch, nina, stetter, urzendowsky, anna, tom...",[germany],"[crime, dramas, shows, ,, tv, international]","[neonazi, begins, catch, uphill, national, bat...","[nsu, x, history, german]"
