In [67]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [129]:
# Load the titles CSV; the path is relative to the current working directory.
csv_path = "netflix_titles.csv"
df = pd.read_csv(csv_path)

In [130]:
# Exact `country` values to retain. Matching is on the whole string, so any
# value not listed here (including other multi-country combinations) is dropped.
keep_countries = [
    "United States",
    "United Kingdom",
    "Canada",
    "Australia",
    "United Kingdom, United States",
    "United States, Canada",
    "Sweden",
]

country_mask = df["country"].isin(keep_countries)
df = df[country_mask]


In [131]:
# Row count per retained `country` value, to confirm the filter above.
df["country"].value_counts()

country
United States                    2818
United Kingdom                    419
Canada                            181
Australia                          87
United Kingdom, United States      75
United States, Canada              73
Sweden                             13
Name: count, dtype: int64

In [132]:
# Preview the first five rows of the filtered frame.
df.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
8,s9,TV Show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho...",United Kingdom,"September 24, 2021",2021,TV-14,9 Seasons,"British TV Shows, Reality TV",A talented batch of amateur bakers face off in...
9,s10,Movie,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,"September 24, 2021",2021,PG-13,104 min,"Comedies, Dramas",A woman adjusting to life after a loss contend...
15,s16,TV Show,Dear White People,,"Logan Browning, Brandon P. Bell, DeRon Horton,...",United States,"September 22, 2021",2021,TV-MA,4 Seasons,"TV Comedies, TV Dramas",Students of color navigate the daily slights a...
25,s26,TV Show,Love on the Spectrum,,Brooke Satchwell,Australia,"September 21, 2021",2021,TV-14,2 Seasons,"Docuseries, International TV Shows, Reality TV",Finding love can be hard for anyone. For young...


In [133]:
# Column dtypes and non-null counts before any missing-value handling.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3666 entries, 0 to 8805
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       3666 non-null   object
 1   type          3666 non-null   object
 2   title         3666 non-null   object
 3   director      2537 non-null   object
 4   cast          3234 non-null   object
 5   country       3666 non-null   object
 6   date_added    3658 non-null   object
 7   release_year  3666 non-null   int64 
 8   rating        3665 non-null   object
 9   duration      3663 non-null   object
 10  listed_in     3666 non-null   object
 11  description   3666 non-null   object
dtypes: int64(1), object(11)
memory usage: 372.3+ KB


In [134]:
# Summary statistics; in the output below only `release_year` is reported.
df.describe()

Unnamed: 0,release_year
count,3666.0
mean,2013.702128
std,9.733933
min,1942.0
25%,2013.0
50%,2017.0
75%,2019.0
max,2021.0


In [136]:
# Replace every missing value with an empty string so the text columns can be
# concatenated later without producing NaN.
# NOTE(review): the saved output under this cell looks stale (an assignment
# displays nothing) — re-run the notebook to refresh it.
df = df.fillna("")

show_id         s5s25s40s51s67s70s106s115s117s119s127s191s193s...
type            TV ShowMovieTV ShowTV ShowTV ShowTV ShowMovieM...
title           Kota FactoryJeansChhota BheemDharmakshetraRaja...
director        S. ShankarLijo Jose PellisseryRahul RawailNage...
cast            Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...
country         IndiaIndiaIndiaIndiaIndiaIndiaIndiaIndiaIndiaI...
date_added      September 24, 2021September 21, 2021September ...
release_year    2021199820212014201420152017199420152017202020...
rating          TV-MATV-14TV-Y7TV-PGTV-GTV-PGTV-14TV-14TV-PGTV...
duration        2 Seasons166 min3 Seasons1 Season1 Season1 Sea...
listed_in       International TV Shows, Romantic TV Shows, TV ...
description     In a city of coaching centers known to train I...
dtype: object

In [137]:
# Re-check non-null counts after the fillna("") step above.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3666 entries, 0 to 8805
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       3666 non-null   object
 1   type          3666 non-null   object
 2   title         3666 non-null   object
 3   director      3666 non-null   object
 4   cast          3666 non-null   object
 5   country       3666 non-null   object
 6   date_added    3666 non-null   object
 7   release_year  3666 non-null   int64 
 8   rating        3666 non-null   object
 9   duration      3666 non-null   object
 10  listed_in     3666 non-null   object
 11  description   3666 non-null   object
dtypes: int64(1), object(11)
memory usage: 372.3+ KB


In [138]:
# Inspect the third row by position (iloc is positional, not label-based).
df.iloc[2]

show_id                                                       s10
type                                                        Movie
title                                                The Starling
director                                           Theodore Melfi
cast            Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...
country                                             United States
date_added                                     September 24, 2021
release_year                                                 2021
rating                                                      PG-13
duration                                                  104 min
listed_in                                        Comedies, Dramas
description     A woman adjusting to life after a loss contend...
Name: 9, dtype: object

In [139]:
# Convert `release_year` to text so it can be concatenated with the other
# string columns.
#
# Values are coerced to numbers first: after the earlier fillna("") a missing
# year would be "" (not NaN), and int("") raises ValueError. Anything that is
# not a valid number becomes an empty string, so the combined text never
# contains NaN. For fully populated integer data the result is unchanged
# (e.g. 2021 -> "2021"), and re-running the cell is safe.
release_year_numeric = pd.to_numeric(df["release_year"], errors="coerce")
df["release_year"] = release_year_numeric.map(
    lambda year: "" if pd.isna(year) else str(int(year))
)


In [140]:
# Build one free-text field per title by joining the listed columns with
# single spaces, in this order. All columns must already hold strings.
text_columns = [
    "title",
    "director",
    "cast",
    "listed_in",
    "release_year",
    "description",
]

combined = df[text_columns[0]]
for column in text_columns[1:]:
    combined = combined + " " + df[column]

df["combined_text"] = combined

In [141]:
# Fit a TF-IDF vectorizer on the combined text (English stop words removed,
# max_features=10000) and keep the resulting document-term matrix.
corpus = df["combined_text"]
vectorizer = TfidfVectorizer(max_features=10000, stop_words="english")
tfid_matrix = vectorizer.fit_transform(corpus)

In [142]:
# Nearest-neighbour index over the TF-IDF matrix: brute-force search with
# cosine distance, 10 neighbours per query by default.
knn_params = {"n_neighbors": 10, "metric": "cosine", "algorithm": "brute"}
nn_model = NearestNeighbors(**knn_params)
nn_model.fit(tfid_matrix)

In [143]:
# Persist the fitted vectorizer and nearest-neighbour model.
# Create the output directory first: open(..., "wb") raises FileNotFoundError
# when "models/" does not exist (e.g. on a fresh checkout).
os.makedirs("models", exist_ok=True)

with open("models/vectorizer_west.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

with open("models/knn_model_west.pkl", "wb") as f:
    pickle.dump(nn_model, f)



In [144]:
# Save the processed frame (including `combined_text`) alongside the models.
pd.to_pickle(df, "models/netflix_data_west.pkl")