In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from imblearn.over_sampling import RandomOverSampler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
import tensorflow.keras as keras
from keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dot, Dropout
from keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import plot_model
from tqdm import tqdm

In [None]:
# Load the dataset
df = pd.read_csv('/content/netflix_titles.csv')

In [None]:
df.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [None]:
df.isnull().sum()

Unnamed: 0,0
show_id,0
type,0
title,0
director,2634
cast,825
country,831
date_added,10
release_year,0
rating,4
duration,3


# Data Preprocessing

In [None]:
# convert date_added to pandas datetime type
df['date_added'] = pd.to_datetime(df['date_added'],format="mixed")

In [None]:
# fill null values in director, cast and country columns to be set to 'unknown', and for duration to be 'Not added'
df["director"]=df["director"].fillna("Unknown")
df["cast"]=df["cast"].fillna("Unknown")
df["country"]=df["country"].fillna("Unknown")
df['duration'].replace(np.nan,'Not Added', inplace= True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['duration'].replace(np.nan,'Not Added', inplace= True)


In [None]:
# fill the rest of null values to previous and next values
df = df.ffill().bfill()

In [None]:
# drop rows have 'duration' == 'Not Added'
idx = df[df.duration == 'Not Added'].index
df.drop(idx, inplace= True)
df.reset_index(drop= True, inplace= True)

In [None]:
# Create features year_added, month_added & month_name
df['year_added'] = df['date_added'].dt.year.astype(int)
df['month_added'] = df['date_added'].dt.month
df['month_name'] = df['date_added'].dt.month_name()

In [None]:
# create 'season_count' col for TV Shows & 'duration' col for movies
df['season_count'] = df.duration.apply(lambda x: x.split(' ')[0] if 'Season' in x else np.nan)
df['duration'] = df.duration.apply(lambda x: x.split(' ')[0] if 'Season' not in x else np.nan)

In [None]:
# convert ['duration', 'release_year', 'season_count'] to numeric format
cols = ['duration', 'release_year', 'season_count']
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')

In [None]:
df.isnull().sum()

Unnamed: 0,0
show_id,0
type,0
title,0
director,0
cast,0
country,0
date_added,0
release_year,0
rating,0
duration,2676


In [None]:
print(df.columns)

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description',
       'year_added', 'month_added', 'month_name', 'season_count'],
      dtype='object')


# Data Analysis

In [None]:
type_counts = df['type'].value_counts().reset_index()
type_counts.columns = ['type', 'count']

fig = px.bar(type_counts, x='count', y='type', orientation='h', color='type',
             color_discrete_sequence=px.colors.qualitative.Dark2)

fig.show()

In [None]:
# Calculate the top 10 countries with the most content
top_countries = df['country'].value_counts().nlargest(10).reset_index()
top_countries.columns = ['country', 'count']

fig = px.bar(top_countries, x='count', y='country', orientation='h',
             title='Top 10 countries with most content',
             labels={'count': 'Count', 'country': 'Country'},
             color='country',  # Differentiate colors by country
             color_discrete_sequence=px.colors.qualitative.Safe)  # Use a qualitative color sequence

fig.update_layout(
    yaxis=dict(title='Country'),
    xaxis=dict(title='Count'),
    template='plotly_white'
)
fig.show()


In [None]:
# Split the DataFrame into movies and TV shows
df_movies = df[df['type'] == 'Movie']
df_shows = df[df['type'] == 'TV Show']

In [None]:
# How many Horror Movies and Tv Shows on Netflix?
print('no. of Horror movies on Netflix: ',df_movies["listed_in"].str.contains("Horror").sum())
print('no. of Horror series on TV shows: ', df_shows["listed_in"].str.contains("Horror").sum())

no. of Horror movies on Netflix:  357
no. of Horror series on TV shows:  75


In [None]:
# What is the average duration of the movie?
df_movies.duration.median()

98.0

In [None]:
# Create a summary statstics table for duration and season count features using dedicated pandas function
df_movies.duration.describe()

Unnamed: 0,duration
count,6128.0
mean,99.577187
std,28.290593
min,3.0
25%,87.0
50%,98.0
75%,114.0
max,312.0


In [None]:
df_shows.season_count.describe()

Unnamed: 0,season_count
count,2676.0
mean,1.764948
std,1.582752
min,1.0
25%,1.0
50%,1.0
75%,2.0
max,17.0


In [None]:
# longest movie ever
df_movies[df_movies.duration == df_movies.duration.max()]

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,year_added,month_added,month_name,season_count
4253,s4254,Movie,Black Mirror: Bandersnatch,Unknown,"Fionn Whitehead, Will Poulter, Craig Parkinson...",United States,2018-12-28,2018,TV-MA,312.0,"Dramas, International Movies, Sci-Fi & Fantasy","In 1984, a young programmer begins to question...",2018,12,December,


In [None]:
# top 10 countries in Movie releases
df_grouped = df_movies.groupby("country")[["show_id"]].count().sort_values(by="show_id", ascending=False).head(10).reset_index()
df_grouped = df_grouped.rename(columns= {'show_id': 'count'})
fig = px.bar(df_grouped, x= 'country', y= 'count', color= 'count', color_continuous_scale= 'purp')
fig.show()

In [None]:
# top 10 countries in Tv-Show releases
df_grouped = df_shows.groupby("country")[["show_id"]].count().sort_values(by="show_id", ascending=False).head(10).reset_index()
df_grouped = df_grouped.rename(columns= {'show_id': 'count'})
fig = px.bar(df_grouped, x= 'country', y= 'count', color= 'count', color_continuous_scale= 'inferno')
fig.show()

In [None]:
# What are movies for Tom Cruise
df_movies[df_movies["cast"].str.contains("Tom Cruise")]

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,year_added,month_added,month_name,season_count
341,s342,Movie,Magnolia,Paul Thomas Anderson,"John C. Reilly, Philip Baker Hall, Tom Cruise,...",United States,2021-08-01,1999,R,189.0,"Dramas, Independent Movies","Through chance, history and divine interventio...",2021,8,August,
1254,s1255,Movie,Rain Man,Barry Levinson,"Dustin Hoffman, Tom Cruise, Valeria Golino, Ge...",United States,2021-03-01,1988,R,134.0,"Classic Movies, Dramas","Motivated by money, a selfish workaholic seeki...",2021,3,March,


In [None]:
# create line chart showing for each release_year the nu of movies or series added to netflix
df_grouped_movies = df_movies.groupby('release_year')['show_id'].count().reset_index().sort_values('release_year')
df_grouped_movies.rename(columns= {'show_id': 'count'}, inplace= True)
df_grouped_movies

Unnamed: 0,release_year,count
0,1942,2
1,1943,3
2,1944,3
3,1945,3
4,1946,1
...,...,...
68,2017,766
69,2018,767
70,2019,633
71,2020,517


In [None]:
df_grouped_series = df_shows.groupby('release_year')['show_id'].count().reset_index().sort_values('release_year')
df_grouped_series.rename(columns= {'show_id': 'count'}, inplace= True)
df_grouped_series

Unnamed: 0,release_year,count
0,1925,1
1,1945,1
2,1946,1
3,1963,1
4,1967,1
5,1972,1
6,1974,1
7,1977,1
8,1979,1
9,1981,1


In [None]:
# what's the longest movie
longest_movie = df_movies.loc[df_movies.duration.idxmax(), 'title']
longest_movie

'Black Mirror: Bandersnatch'

In [None]:
# show histogram for movies duration with annotation indicates the longest movie
fig = px.histogram(df_movies, x= 'duration')
fig.update_layout(annotations= [{'showarrow': True, 'arrowhead': 3, 'x': 312, 'y': 1, 'text': longest_movie}])
fig.show()

In [None]:
# show below table
df_grouped = df.type.value_counts().reset_index()
df_grouped

Unnamed: 0,type,count
0,Movie,6128
1,TV Show,2676


In [None]:
# show pie chart for df.type
fig = px.pie(df_grouped, values= 'count', names= 'type', hole= 0.3)
fig.update_traces(textinfo='percent+label')
fig.update_layout({'title': {'text': 'Movies & TV Shows Percentages', 'x': 0.5, 'y': 0.95}})
fig.show()

In [None]:
# Genre Distribution
genre_counts = df['listed_in'].str.split(', ').explode().value_counts().reset_index()
genre_counts.columns = ['Genre', 'Count']
fig = px.bar(genre_counts, x='Genre', y='Count', title='Genre Distribution on Netflix')
fig.show()

# Classification Models

In [None]:
# Separate features and target variable
X = df.drop(columns=['type'])  # Features
y = df['type']  # Target variable

# Apply oversampling to address class imbalance
oversampler = RandomOverSampler()
X_resampled, y_resampled = oversampler.fit_resample(X, y)

# Convert back to DataFrame if needed
df_resampled = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.DataFrame(y_resampled, columns=['type'])], axis=1)

# Encode categorical variables
label_encoder = LabelEncoder()
X_resampled_encoded = X_resampled.apply(label_encoder.fit_transform)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled_encoded, y_resampled, test_size=0.2, random_state=42)

# Initializing classifiers
rf_classifier = RandomForestClassifier()
logistic_classifier = LogisticRegression()

# Fit the models on the training set
rf_classifier.fit(X_train, y_train)
logistic_classifier.fit(X_train, y_train)

# Making predictions on both the training and test sets for both classifiers
# Random Forest
rf_train_pred = rf_classifier.predict(X_train)
rf_test_pred = rf_classifier.predict(X_test)

# Logistic Regression
logistic_train_pred = logistic_classifier.predict(X_train)
logistic_test_pred = logistic_classifier.predict(X_test)

# Evaluating RandomForestClassifier (train and test accuracy)
rf_train_accuracy = accuracy_score(y_train, rf_train_pred)
rf_test_accuracy = accuracy_score(y_test, rf_test_pred)

# Evaluating Logistic Regression (train and test accuracy)
logistic_train_accuracy = accuracy_score(y_train, logistic_train_pred)
logistic_test_accuracy = accuracy_score(y_test, logistic_test_pred)

# Print the train and test accuracies
print("RandomForestClassifier Train Accuracy:", rf_train_accuracy)
print("RandomForestClassifier Test Accuracy:", rf_test_accuracy)

print("Logistic Regression Train Accuracy:", logistic_train_accuracy)
print("Logistic Regression Test Accuracy:", logistic_test_accuracy)


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



RandomForestClassifier Train Accuracy: 1.0
RandomForestClassifier Test Accuracy: 1.0
Logistic Regression Train Accuracy: 0.9883720930232558
Logistic Regression Test Accuracy: 0.9877650897226754


# Content-Based Filtering

In [None]:
# Create a TF-IDF Vectorizer for the 'description' column
tfidf = TfidfVectorizer(stop_words='english')
df['description'] = df['description'].fillna('')
tfidf_matrix = tfidf.fit_transform(df['description'])

In [None]:
# Compute cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [None]:
# Function to get recommendations based on cosine similarity
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = df[df['title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return df['title'].iloc[movie_indices]

# Test the recommendation system
print(get_recommendations('Avengers: Infinity War'))

8523                              The Talented Mr. Ripley
2382                                         The Darkness
4139              LEGO Marvel Super Heroes: Black Panther
6110                               Aliens Ate My Homework
1473                       Chilling Adventures of Sabrina
8681                                              Vroomiz
4936                                            Orbiter 9
143                                         Green Lantern
4140    LEGO Marvel Super Heroes: Guardians of the Galaxy
752                                       Vampire Academy
Name: title, dtype: object


# KNN Classification

In [None]:
# Initialize K-Nearest Neighbors classifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)

# Train the KNN model
knn_classifier.fit(X_train, y_train)

# Make predictions with KNN
y_train_pred = knn_classifier.predict(X_train)
y_test_pred = knn_classifier.predict(X_test)

# Calculate the training and test accuracies
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Print the train and test accuracies
print(f"K-Nearest Neighbors Classifier Train Accuracy: {train_accuracy * 100:.2f}%")
print(f"K-Nearest Neighbors Classifier Test Accuracy: {test_accuracy * 100:.2f}%")

K-Nearest Neighbors Classifier Train Accuracy: 88.69%
K-Nearest Neighbors Classifier Test Accuracy: 82.87%


In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train) # Create and assign value to X_train_scaled

knn_classifier = KNeighborsClassifier(n_neighbors=10)
scores = cross_val_score(knn_classifier, X_train_scaled, y_train, cv=5)
print(f"Cross-Validation Accuracy: {scores.mean() * 100:.2f}%")

Cross-Validation Accuracy: 99.89%


# Combined Voting Classifier

In [None]:
# Step 1: Preprocess the data
# Fill missing values with an empty string
df['director'] = df['director'].fillna('')
df['cast'] = df['cast'].fillna('')
df['listed_in'] = df['listed_in'].fillna('')
df['description'] = df['description'].fillna('')
df['duration'] = df['duration'].fillna('')

# Handle the duration column
def process_duration(duration):
  duration = str(duration)
  if 'min' in duration:
         return int(duration.replace(' min', ''))  # Extract minutes
  elif 'Season' in duration:
         return int(duration.replace(' Seasons', '').replace(' Season', '')) * 60  # Convert seasons to hours (as a proxy)
  else:
         return 0

df['duration'] = df['duration'].fillna('0').apply(process_duration)

# Step 2: Combine features including description for similarity computation
df['combined_features'] = df['type'] + ' ' + df['director'] + ' ' + df['cast'] + ' ' + df['listed_in'] + ' ' + df['description']

# Step 3: Vectorize the text data using TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_features'])

# Step 4: Compute cosine similarity between all movies/shows
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Step 5: Build a function that recommends shows or movies based on similarity score
def get_recommendations(title, cosine_sim=cosine_sim):
    # Try to get the index of the movie that matches the title
    try:
        idx = df[df['title'].str.contains(title, case=False)].index[0]
    except IndexError:
        return "Sorry, the title you entered was not found in the dataset."

    # Get the pairwise similarity scores of all shows/movies with that title
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the shows/movies based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar shows/movies
    sim_scores = sim_scores[1:11]

    # Get the show/movie indices
    show_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar shows/movies
    return df['title'].iloc[show_indices]

# Step 6: Test the recommendation function with error handling
print(get_recommendations('Avengers: Infinity War'))


8577                       Thor: Ragnarok
7402     Mark Gatiss: A Study in Sherlock
6322                        Black Panther
6955                                  Her
8693                            War Horse
8392                    The Little Prince
1027                         Crimson Peak
1612                                 Chef
969                  August: Osage County
1406    Penguins of Madagascar: The Movie
Name: title, dtype: object


In [None]:
import joblib

# Save the TF-IDF vectorizer
tfidf_path = 'tfidf_vectorizer.joblib'
joblib.dump(tfidf, tfidf_path)

# Save the cosine similarity matrix
cosine_sim_path = 'cosine_similarity_matrix.joblib'
joblib.dump(cosine_sim, cosine_sim_path)

# Save the DataFrame (if needed)
df_path = 'movies_dataframe.joblib'
joblib.dump(df, df_path)


['movies_dataframe.joblib']

In [None]:
# Combining text features for content-based similarity
df['combined_features'] = df['type'] + ' ' + df['director'] + ' ' + df['cast'] + ' ' + df['listed_in'] + ' ' + df['description']

# Step 1: Content-Based Model (TF-IDF + Cosine Similarity)
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_features'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# KNN Preparation
# Step 2: KNN Model (for classification)

# Separate features and target variable for KNN
X = df.drop(columns=['type'])  # Features
y = df['type']  # Target variable

# Apply oversampling to address class imbalance
oversampler = RandomOverSampler()
X_resampled, y_resampled = oversampler.fit_resample(X, y)

# Encode categorical variables for KNN
label_encoder = LabelEncoder()
X_resampled_encoded = X_resampled.apply(label_encoder.fit_transform)

# Feature scaling (KNN benefits from feature scaling)
scaler = StandardScaler()
X_resampled_scaled = scaler.fit_transform(X_resampled_encoded)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled_scaled, y_resampled, test_size=0.2, random_state=42)

# Initialize K-Nearest Neighbors classifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)

# Train the KNN model
knn_classifier.fit(X_train, y_train)

# Function to get recommendations from content-based model
def get_content_recommendations(title, cosine_sim=cosine_sim):
    try:
        idx = df[df['title'].str.contains(title, case=False)].index[0]
    except IndexError:
        return []

    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    show_indices = [i[0] for i in sim_scores]

    return df['title'].iloc[show_indices]

# Voting Classifier: Combine KNN and Content-Based Model
def voting_classifier(title, cosine_sim=cosine_sim, knn_model=knn_classifier, weight_knn=0.5, weight_content=0.5):
    # Step 1: Get content-based recommendations
    content_based_recommendations = get_content_recommendations(title)

    # Step 2: Get KNN predictions
    # Note: We use KNN to predict the "type" (TV Show/Movie) based on numeric and categorical features
    # For simplicity, we simulate a scenario where we want to predict the type of the given title.
    try:
        idx = df[df['title'].str.contains(title, case=False)].index[0]
        knn_pred = knn_model.predict([X_resampled_scaled[idx]])[0]  # KNN predicts the type
    except IndexError:
        return "Title not found in the dataset."

    # Step 3: Combine content-based and KNN results
    if len(content_based_recommendations) > 0:
        return {
            "KNN_Prediction": knn_pred,
            "Content-Based_Recommendations": content_based_recommendations,
        }
    else:
        return "No content-based recommendations found."

# Test the combined voting classifier with a specific title
result = voting_classifier('Avengers: Infinity War')
print(result)

{'KNN_Prediction': 'Movie', 'Content-Based_Recommendations': 8577                       Thor: Ragnarok
7402     Mark Gatiss: A Study in Sherlock
6322                        Black Panther
6955                                  Her
8693                            War Horse
8392                    The Little Prince
1027                         Crimson Peak
1612                                 Chef
969                  August: Osage County
1406    Penguins of Madagascar: The Movie
Name: title, dtype: object}


In [None]:
# Load the saved models and necessary components
df = joblib.load(df_path)

# Print columns for debugging
st.write("DataFrame columns:", df.columns.tolist())


In [None]:
# Test the recommendation system with another movie or TV show
result = voting_classifier('Dumb And Dumber')
print(result)

{'KNN_Prediction': 'Movie', 'Content-Based_Recommendations': 7576                                      New York Minute
6726                                           F the Prom
677                                        Schitt's Creek
1887    Best Wishes, Warmest Regards: A Schitt's Creek...
7100                                         It Takes Two
584                                Not Another Teen Movie
6879                                                 Goon
4864                                               Spivak
155                                             Labyrinth
218                                        Titletown High
Name: title, dtype: object}


In [None]:
# Test the recommendation system with another movie or TV show
result = voting_classifier('Black Mirror')
print(result)

{'KNN_Prediction': 'TV Show', 'Content-Based_Recommendations': 4262                         Watership Down
3551    The Dark Crystal: Age of Resistance
5097                               Lovesick
6837                              Get Santa
6919                           Happy Valley
3306           The End of the F***ing World
6968                             Hinterland
1991                           Criminal: UK
4095                              Black Sea
1301                        Behind Her Eyes
Name: title, dtype: object}


In [None]:
import joblib

# Assuming 'knn_classifier' is your trained KNN model
knn_path = 'knn_model.joblib'
joblib.dump(knn_classifier, knn_path)

# Save the label encoder
label_encoder_path = 'label_encoder.joblib'
joblib.dump(label_encoder, label_encoder_path)

# Save the scaler
scaler_path = 'scaler.joblib'
joblib.dump(scaler, scaler_path)

# Save the TF-IDF vectorizer
tfidf_path = 'tfidf_vectorizer.joblib'
joblib.dump(tfidf, tfidf_path)

# Save the cosine similarity matrix
cosine_sim_path = 'cosine_similarity_matrix.joblib'
joblib.dump(cosine_sim, cosine_sim_path)

# Save the DataFrame (if needed)
df_path = 'movies_dataframe.joblib'
joblib.dump(df, df_path)


['movies_dataframe.joblib']

In [None]:
print(df.columns)


Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description',
       'year_added', 'month_added', 'month_name', 'season_count',
       'combined_features'],
      dtype='object')


In [None]:
df.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,year_added,month_added,month_name,season_count,combined_features
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,2021-09-25,2020,PG-13,0,Documentaries,"As her father nears the end of his life, filmm...",2021,9,September,,Movie Kirsten Johnson Unknown Documentaries As...
1,s2,TV Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,0,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2021,9,September,2.0,"TV Show Unknown Ama Qamata, Khosi Ngema, Gail ..."


# Deep learning model

TF-IDF uses word counts and document frequency to measure the importance of words, whereas word embeddings like GloVe use pre-trained vectors that capture semantic meanings.

1.   Load GloVe embeddings: Download the GloVe embeddings file and load it into a dictionary.
2.   Preprocess your text data: Tokenize the descriptions and map each word to its corresponding GloVe vector.
3.   Generate document embeddings: Instead of calculating TF-IDF, compute the document embedding as the average (or weighted average) of the GloVe embeddings for the words in the document.

In [None]:
!curl --header 'Host: storage.googleapis.com' --user-agent 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:130.0) Gecko/20100101 Firefox/130.0' --header 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8' --header 'Accept-Language: en-US,en;q=0.5' --header 'Upgrade-Insecure-Requests: 1' --header 'Sec-Fetch-Dest: document' --header 'Sec-Fetch-Mode: navigate' --header 'Sec-Fetch-Site: none' --header 'Sec-Fetch-User: ?1' 'https://storage.googleapis.com/kaggle-data-sets/715814/1246668/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20240928%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20240928T221902Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=0ef866cc29837053951c53711e12ac68b6125a13fb09c9ff71b39c8e30de68481d27580976dea537673649f2072cee1621f5fe5dd1c26df71e74e78a925f26f8173a09052a97780f026be2ee3678abdba5779eb14622733e8360c70dd2c0274f50910bf99c72e41195ad74311c78c5f3aacbeca31b0d19f1181c231e2fa15e1b604245c2ff39aa50888691e3d8232f2cdc709a745a1a5b092eae6d157c192bcf5064fa693cdec9ebec3ef56e2000ce455e725a04654c877fe68bc049da1aa0c742dac4aceddee7bde6cbd643e91f9e6943fb1b8842694cc7380cd7b6e8abc3d61842c9c0f4b8b055e4865b777e53616e00351ad4024e5d57f0823590a91542e3' --output 'archive.zip'

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   220  100   220    0     0   1105      0 --:--:-- --:--:-- --:--:--  1111


In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip -d glove

--2024-10-06 13:42:39--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-10-06 13:42:39--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-10-06 13:42:40--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
!unzip archive.zip

Archive:  archive.zip
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of archive.zip or
        archive.zip.zip, and cannot find archive.zip.ZIP, period.


In [None]:
# 1. Load GloVe embeddings
def load_glove_embeddings(glove_file):
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            embedding = np.array(values[1:], dtype='float32')
            embeddings_index[word] = embedding
    return embeddings_index

# Path to your GloVe file
glove_file = 'glove/glove.6B.100d.txt'
glove_embeddings = load_glove_embeddings(glove_file)

# 2. Tokenize and preprocess text
def preprocess(text):
    return text.lower().split()

# 3. Get GloVe embedding for each word in the text
def get_document_embedding(text, glove_embeddings, embedding_dim=100):
    tokens = preprocess(text)
    valid_embeddings = []

    for token in tokens:
        if token in glove_embeddings:
            valid_embeddings.append(glove_embeddings[token])

    if valid_embeddings:
        # Average the embeddings (you can also use weighted averages)
        doc_embedding = np.mean(valid_embeddings, axis=0)
    else:
        # If no valid embeddings are found, return a zero vector
        doc_embedding = np.zeros(embedding_dim)

    return doc_embedding

# Apply this to your dataframe
df['description'] = df['description'].fillna('')
embedding_dim = 100  # Adjust according to the GloVe version you're using

# Create document embeddings for each description
embeddings = np.array([get_document_embedding(desc, glove_embeddings, embedding_dim) for desc in tqdm(df['description'])])

# embeddings now contains GloVe-based vector representations for each document

100%|██████████| 8804/8804 [00:00<00:00, 14376.23it/s]


In [None]:
# 1. Function to compute cosine similarity between document embeddings
def compute_cosine_similarity(embeddings):
    return cosine_similarity(embeddings)

# Compute the cosine similarity matrix using GloVe embeddings
cosine_sim_glove = compute_cosine_similarity(embeddings)

# 2. Function to get recommendations based on GloVe cosine similarity
def get_glove_recommendations(title, cosine_sim=cosine_sim_glove):
    # Get the index of the movie that matches the title
    idx = df[df['title'] == title].index[0]

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]  # Skipping the first movie (itself)

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]

# Test the recommendation system with GloVe embeddings
print(get_glove_recommendations('Avengers: Infinity War'))

1616                                        Monster House
2212                                       Sing On! Spain
6951                                        Hell and Back
7767                    Power Rangers Operation Overdrive
3551                  The Dark Crystal: Age of Resistance
240                    The Witcher: Nightmare of the Wolf
2544                                       Action Replayy
3517    The Crystal Calls Making the Dark Crystal: Age...
7576                                      New York Minute
2878                                       Couple of Days
Name: title, dtype: object


# Neural Collaborative Filtering

In [None]:
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description',
       'year_added', 'month_added', 'month_name', 'season_count',
       'combined_features'],
      dtype='object')

In [None]:
# Encoding user_ids (for demonstration purposes, we simulate users)
df['user_id'] = np.random.randint(0, 1000, size=len(df))  # Randomly simulate users

# Label encoding for user and movie IDs
user_encoder = LabelEncoder()
df['user_id_encoded'] = user_encoder.fit_transform(df['user_id'])

movie_encoder = LabelEncoder()
df['movie_id_encoded'] = movie_encoder.fit_transform(df['show_id'])

# Metadata preparation (e.g., genre, director, actors)
# This will be used for content-based embedding extensions later on
df['genre'] = df['listed_in'].fillna('')
df['director'] = df['director'].fillna('')

# Split the data into training and testing sets
train, test = train_test_split(df[['user_id_encoded', 'movie_id_encoded']], test_size=0.2)

# Hyperparameters
embedding_size = 50  # Size of the embedding vectors for users and movies
num_users = len(df['user_id_encoded'].unique())
num_movies = len(df['movie_id_encoded'].unique())

# Define input layers for users and movies
user_input = Input(shape=(1,), name='user_input')
movie_input = Input(shape=(1,), name='movie_input')

# Embedding layers for users and movies
user_embedding = Embedding(input_dim=num_users, output_dim=embedding_size, name='user_embedding')(user_input)
movie_embedding = Embedding(input_dim=num_movies, output_dim=embedding_size, name='movie_embedding')(movie_input)

# Flatten the embeddings to feed into the neural network
user_vec = Flatten()(user_embedding)
movie_vec = Flatten()(movie_embedding)

# Dot product of user and movie vectors (to model user-movie interactions)
dot_product = Dot(axes=1, normalize=True)([user_vec, movie_vec])

# Output layer (predicting interaction strength, like a rating)
output = Dense(1, activation='sigmoid')(dot_product)

# Define the model
model = Model([user_input, movie_input], output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

# Prepare data for training
train_user_data = train['user_id_encoded'].values
train_movie_data = train['movie_id_encoded'].values
train_labels = np.random.randint(0, 2, size=len(train))  # Simulated interaction data (0/1)

# Fit the model
model.fit([train_user_data, train_movie_data], train_labels, epochs=10, batch_size=64, validation_split=0.1)

# Now you can use the trained embeddings for recommendation
# For example, you can compute similarity between movie embeddings to recommend similar movies


Epoch 1/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.4961 - loss: 0.6951 - val_accuracy: 0.5092 - val_loss: 0.6954
Epoch 2/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.9835 - loss: 0.6011 - val_accuracy: 0.5092 - val_loss: 0.6958
Epoch 3/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.9994 - loss: 0.4944 - val_accuracy: 0.4979 - val_loss: 0.6962
Epoch 4/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 1.0000 - loss: 0.3989 - val_accuracy: 0.5007 - val_loss: 0.6964
Epoch 5/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 1.0000 - loss: 0.3313 - val_accuracy: 0.4965 - val_loss: 0.6971
Epoch 6/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 1.0000 - loss: 0.2836 - val_accuracy: 0.4993 - val_loss: 0.6976
Epoch 7/10
[1m100/100

<keras.src.callbacks.history.History at 0x7ace0aa01a50>

In [None]:
# Hyperparameters
n_users = df['user_id'].nunique()  # Number of unique users
n_movies = df['show_id'].nunique()  # Number of unique movies
embedding_size = 50  # Size of user and movie embeddings
dropout_rate = 0.5  # Dropout rate for regularization

# Input layers for users and movies
user_input = Input(shape=(1,), name='user_input')
movie_input = Input(shape=(1,), name='movie_input')

# Embedding layers for users and movies
user_embedding = Embedding(input_dim=n_users, output_dim=embedding_size, name='user_embedding')(user_input)
movie_embedding = Embedding(input_dim=n_movies, output_dim=embedding_size, name='movie_embedding')(movie_input)

# Flatten the embeddings
user_vec = Flatten()(user_embedding)
movie_vec = Flatten()(movie_embedding)

# Concatenate user and movie embeddings
concat = Concatenate()([user_vec, movie_vec])

# Dense layers to capture higher-order interactions
dense_1 = Dense(128, activation='relu', kernel_regularizer=l2(0.01))(concat)
dropout_1 = Dropout(dropout_rate)(dense_1)
dense_2 = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(dropout_1)
dropout_2 = Dropout(dropout_rate)(dense_2)

# Output layer (interaction prediction)
output = Dense(1, activation='linear', name='output')(dropout_2)

# Define the model
ncf_model = Model([user_input, movie_input], output)

# Compile the model
ncf_model.compile(optimizer=Adam(learning_rate=0.0005), loss='mean_squared_error', metrics=['mean_squared_error'])

# Print model summary
ncf_model.summary()

# Assuming you have the following prepared data:
# X_train_user, X_train_movie: arrays of user and movie IDs for training
# y_train: the target values (e.g., ratings or interaction labels)

# Split the data into training and testing sets
train, test = train_test_split(df[['user_id_encoded', 'movie_id_encoded', 'rating']], test_size=0.2)

# Prepare data for training
X_train_user = train['user_id_encoded'].values
X_train_movie = train['movie_id_encoded'].values
y_train = np.random.randint(0, 2, size=len(train)) # Generate random labels for training since you don't have numerical ratings

# Prepare data for testing
X_test_user = test['user_id_encoded'].values
X_test_movie = test['movie_id_encoded'].values
y_test = np.random.randint(0, 2, size=len(test)) # Generate random labels for testing since you don't have numerical ratings

# Train the model
ncf_model.fit([X_train_user, X_train_movie], y_train, batch_size=64, epochs=100, validation_split=0.2)

Epoch 1/100
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - loss: 2.0666 - mean_squared_error: 0.4202 - val_loss: 1.1471 - val_mean_squared_error: 0.2545
Epoch 2/100
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.9916 - mean_squared_error: 0.2561 - val_loss: 0.6319 - val_mean_squared_error: 0.2528
Epoch 3/100
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 0.5559 - mean_squared_error: 0.2450 - val_loss: 0.4117 - val_mean_squared_error: 0.2525
Epoch 4/100
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 0.3429 - mean_squared_error: 0.2074 - val_loss: 0.3473 - val_mean_squared_error: 0.2537
Epoch 5/100
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 0.1832 - mean_squared_error: 0.0912 - val_loss: 0.3429 - val_mean_squared_error: 0.2609
Epoch 6/100
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - loss:

<keras.src.callbacks.history.History at 0x7acd9423c940>

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_similar_movies(movie_name, movie_embeddings, df, top_n=10):

    # Create a mapping from encoded IDs to movie titles
    movie_idx_to_title = dict(zip(df['movie_id_encoded'], df['title']))

    # Check if the given movie name is in the dataset
    if movie_name not in df['title'].values:
        print(f"Movie '{movie_name}' not found in the dataset.")
        return []

    # Get the encoded ID for the given movie name
    movie_idx = df[df['title'] == movie_name].iloc[0]['movie_id_encoded']

    # Get the embedding for the given movie
    target_movie_embedding = movie_embeddings[movie_idx].reshape(1, -1)

    # Compute cosine similarity between the target movie embedding and all other movie embeddings
    similarities = cosine_similarity(target_movie_embedding, movie_embeddings)[0]

    # Create a dataframe to hold movie indices and similarity scores
    similarity_df = pd.DataFrame({
        'movie_id_encoded': np.arange(len(similarities)),
        'similarity_score': similarities
    })

    # Exclude the target movie itself from the recommendations
    similarity_df = similarity_df[similarity_df['movie_id_encoded'] != movie_idx]

    # Get top N most similar movies
    top_similar_movies = similarity_df.sort_values(by='similarity_score', ascending=False).head(top_n)

    # Map encoded IDs back to movie titles
    recommended_titles = top_similar_movies['movie_id_encoded'].map(lambda x: movie_idx_to_title.get(x, "Unknown Title")).values

    return recommended_titles

# Assuming you have a trained movie embedding layer
# We extract the weights from the embedding layer to get movie embeddings
movie_embeddings_weights = ncf_model.get_layer('movie_embedding').get_weights()[0]

# Example usage:
movie_name = "Avengers: Infinity War"  # Replace with any movie name to get similar movies
similar_movies = recommend_similar_movies(movie_name, movie_embeddings_weights, df, top_n=10)

print(f"Top 10 movies similar to '{movie_name}':")
for idx, title in enumerate(similar_movies, 1):
    print(f"{idx}. {title}")


Top 10 movies similar to 'Avengers: Infinity War':
1. The Society
2. YES DAY
3. Katarzyna Piasecka, Rafał Pacześ Seriously Funny
4. Marianne
5. A Perfect Fit
6. The Babysitter
7. Monster Math Squad
8. Arrow
9. Making The Witcher
10. Axone


In [None]:
df.to_csv('preprocessed_netflix_data.csv', index=False)

In [None]:
import joblib

# Assuming your KNN model is stored in the variable 'knn_classifier'
model_path = '/content/knn_model.pkl'  # Specify the path where you want to save the model
joblib.dump(knn_classifier, model_path)

print(f"Model saved to {model_path}")

Model saved to /content/knn_model.pkl
