In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import CountVectorizer

#Load the dataset
file_path = 'final_movies_.csv'  # Replace with your dataset file path
movies = pd.read_csv(file_path)

#Features to use
numerical_features = ["popularity", "runtime", "vote_average"]
categorical_features = ["genres", "original_language"]

#Scale Numerical Features
scaler = MinMaxScaler()
numerical_scaled = scaler.fit_transform(movies[numerical_features])

#Encode 'original_language' 
onehot_encoded_language = pd.get_dummies(movies['original_language'], prefix='lang')

#Remove extra whitespaces from the 'genres' column
movies['genres'] = movies['genres'].str.replace(r'\s+', '', regex=True)

#Encode 'genre'
onehot_encoded_genres = movies['genres'].str.get_dummies(sep=',')

#Combine Features
features_combined = pd.concat(
    [pd.DataFrame(numerical_scaled, columns=numerical_features), onehot_encoded_genres, onehot_encoded_language],
    axis=1
).values

print(features_combined)


[[0.9691176470588233 0.023848112989117848 0.6662222222222223 ... False
  False False]
 [0.8191176470588236 0.030562630238481128 0.6698888888888888 ... False
  False False]
 [0.6926470588235293 0.023848112989117848 0.6006666666666667 ... False
  False False]
 ...
 [0.9117647058823529 0.018291271127575826 0.39999999999999997 ... False
  False False]
 [0.0 0.01203982403334105 0.6222222222222222 ... False False False]
 [0.0 0.013660569576290808 0.5111111111111111 ... False False False]]


In [3]:
print(onehot_encoded_genres)

       Action  Adventure  Animation  Comedy  Crime  Documentary  Drama  \
0           1          1          0       0      0            0      0   
1           1          1          0       0      0            0      0   
2           1          1          0       0      0            0      0   
3           0          0          1       0      0            0      1   
4           0          0          0       1      0            0      0   
...       ...        ...        ...     ...    ...          ...    ...   
23095       0          0          0       0      0            0      1   
23096       0          1          0       1      0            0      1   
23097       0          0          0       0      0            0      1   
23098       0          0          0       0      0            1      0   
23099       0          0          0       0      0            1      0   

       Family  Fantasy  History  Horror  Music  Mystery  Romance  \
0           0        0        0       0    

In [9]:

#Train the KNN Model
knn = NearestNeighbors(metric='cosine')
knn.fit(features_combined)

#Function to Recommend Movies
def recommend_movies_knn(movie_title, n_recommendations=5):
    try:
        # Find the index of the movie
        movie_idx = movies[movies['title'].str.lower() == movie_title.lower()].index[0]
        
        # Get the feature vector for the movie
        movie_vector = features_combined[movie_idx].reshape(1, -1)
        
        # Find the nearest neighbors
        distances, indices = knn.kneighbors(movie_vector, n_neighbors=n_recommendations + 1)
    
        # Get the titles of the recommended movies (exclude the movie itself)
        recommended_movies = movies.iloc[indices[0][1:]]['title'].tolist()
        return recommended_movies
    except IndexError:
        return f"Movie '{movie_title}' not found in the dataset."


movie_to_search = "Dark Silence"  # Replace with a valid movie title
recommended_movies = recommend_movies_knn(movie_to_search)

#Output Recommendations
print(f"Movies similar to '{movie_to_search}':")
print(recommended_movies)


Movies similar to 'Dark Silence':
['Ghosts That Still Walk', 'Raymond Did It', 'Play Dead', 'Fear House', 'Tomboys']
