In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import joblib

# Load the raw Netflix catalogue from disk.
file_path = 'Datasets/netflix_titles.csv'
df = pd.read_csv(file_path)

# Turn the comma-separated listed_in string into a list of genre labels.
# A missing cell is read by pandas as NaN (a float), which has no .split();
# map any non-string value to an empty list so the row simply gets no genres
# instead of aborting the whole cell with AttributeError.
df['listed_in'] = df['listed_in'].apply(
    lambda genres: genres.split(', ') if isinstance(genres, str) else []
)

# One-hot encode the genre lists: one 0/1 column per distinct genre label.
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(df['listed_in'])

# Wrap the binary matrix in a DataFrame that shares df's index, so the
# column-wise concat below pairs every title with its own genre row even if
# df does not carry the default 0..n-1 index.
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_, index=df.index)

# Append the genre indicator columns to the original columns.
df = pd.concat([df, genres_df], axis=1)

# Persist the processed table and the ordered genre column names.
# NOTE: listed_in now holds Python lists, which to_csv writes as their string
# representation; readers of the processed CSV get text, not lists.
df.to_csv('Datasets/processed_netflix_titles.csv', index=False)
joblib.dump(mlb.classes_, 'Datasets/genre_columns.joblib')


['Datasets/genre_columns.joblib']

In [2]:
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

# Features are the one-hot genre columns; titles are kept alongside as labels.
# The neighbour model below is fitted on X only and does not consume y.
X = genres_df
y = df['title']

# Build the nearest-neighbour index (10 neighbours per query, search
# algorithm chosen automatically by scikit-learn), then fit it on the
# genre matrix.
knn = NearestNeighbors(n_neighbors=10, algorithm='auto')
knn.fit(X)

# Serialise the fitted model with joblib.
joblib_file = 'Datasets/knn_model.joblib'
joblib.dump(knn, joblib_file)


['Datasets/knn_model.joblib']