In [None]:
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from sklearn.neighbors import NearestNeighbors
import pandas as pd

# Content-based movie recommender: encode each film as (scaled rating +
# one-hot genres) and look up its nearest neighbours in that feature space.

# Load the data
df = pd.read_csv('../Data/imdb_top_1000.csv')

# Keep only relevant columns and clean data.
# .copy() detaches the selection from the loaded frame so the column
# assignment below cannot trigger SettingWithCopyWarning; rows with a missing
# title, genre or rating are dropped because they can be neither encoded nor
# fed to the neighbour search.
df = df[['Series_Title', 'Genre', 'IMDB_Rating']].dropna().copy()
df['Genre'] = df['Genre'].apply(lambda x: x.split(', '))

# MultiLabelBinarizer for genres: one 0/1 column per distinct genre
mlb = MultiLabelBinarizer()
genre_encoded = pd.DataFrame(mlb.fit_transform(df['Genre']), columns=mlb.classes_, index=df.index)

# Combine the one-hot-encoded genres with the original DataFrame
df_transformed = pd.concat([df[['Series_Title', 'IMDB_Rating']], genre_encoded], axis=1)

# Scale all features uniformly so the rating lives in [0, 1] like the genre
# flags and does not dominate the Euclidean distance.
feature_frame = df_transformed.drop(columns=['Series_Title'])
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(feature_frame),
    columns=feature_frame.columns,
    index=df.index
)

# Fit the NearestNeighbors model
knn = NearestNeighbors(n_neighbors=5, metric='euclidean')
knn.fit(df_scaled)

# Search for similar movies
movie = 'The Big Lebowski'
title_matches = (df['Series_Title'] == movie).to_numpy()
if not title_matches.any():
    raise ValueError(f"Movie {movie!r} not found in the dataset")
# Row *position* (not index label) of the first matching title; everything
# below (iloc, kneighbors output) works with positions, so this stays correct
# even when the index is not a plain 0..n-1 range.
movie_index = int(title_matches.argmax())
movie_vector = df_scaled.iloc[[movie_index]]

# Find nearest neighbors.
# The query film is part of the fitted data, so it comes back as its own
# closest neighbour (distance 0). Ask for one extra result and drop the query
# row so that n_neighbors genuinely different films remain.
n_similar = knn.n_neighbors
n_query = min(n_similar + 1, len(df_scaled))
distances, indices = knn.kneighbors(movie_vector, n_neighbors=n_query)
not_query = indices[0] != movie_index
distances = distances[:, not_query][:, :n_similar]
indices = indices[:, not_query][:, :n_similar]

# Show similar movies
similar_films = df.iloc[indices[0]]
print(f"Ähnliche Filme zu '{movie}':")
print(similar_films[['Series_Title', 'IMDB_Rating', 'Genre']])
