In [None]:
import kagglehub

# Fetch the latest version of the IMDb movie-rating dataset through kagglehub.
# `path` holds the location kagglehub reports for the downloaded files and is
# reused by the cells that follow.
path = kagglehub.dataset_download("milanvaddoriya/imdb-movie-rating")

print(f"Path to dataset files: {path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/milanvaddoriya/imdb-movie-rating?dataset_version_number=1...


100%|██████████| 24.0k/24.0k [00:00<00:00, 25.9MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/milanvaddoriya/imdb-movie-rating/versions/1





In [None]:
import os
import pandas as pd

# Show what the download directory contains so the CSV name can be confirmed.
dataset_dir = path  # location returned by kagglehub.dataset_download
print("Contents of the dataset directory:", os.listdir(dataset_dir))

# The directory listing reports a single file, "imdb.csv". The earlier
# placeholder name ("imdb-movie-ratings.csv") is not present in the directory,
# so a path built from it could not be opened.
file_path = os.path.join(dataset_dir, "imdb.csv")


Contents of the dataset directory: ['imdb.csv']


In [None]:
import os

# List all files in the dataset directory.
# Reuse the location returned by kagglehub rather than a hard-coded absolute
# cache path, so the cell keeps working when the cache lives elsewhere
# (different user, OS, or KAGGLEHUB cache setting).
dataset_dir = path
print("Files in the dataset directory:", os.listdir(dataset_dir))


Files in the dataset directory: ['imdb.csv']


In [None]:
import pandas as pd

# "imdb.csv" is the file name reported by the directory listing.
file_path = os.path.join(dataset_dir, "imdb.csv")

# Read the CSV into a DataFrame, then print a five-row preview as a sanity
# check that the load succeeded.
data = pd.read_csv(file_path)
print(data.head(5))


  Rank                                     Movie_name  Year Certificate  \
0    1                       The Shawshank Redemption  1994           R   
1    2                                  The Godfather  1972           R   
2    3                                The Dark Knight  2008       PG-13   
3    4  The Lord of the Rings: The Return of the King  2003       PG-13   
4    5                               Schindler's List  1993           R   

   Runtime_in_min                      Genre  Metascore Gross_in_$_M  \
0             142                      Drama       81.0        28.34   
1             175               Crime, Drama      100.0       134.97   
2             152       Action, Crime, Drama       84.0       534.86   
3             201   Action, Adventure, Drama       94.0       377.85   
4             195  Biography, Drama, History       94.0        96.90   

   Rating_from_10  
0             9.3  
1             9.2  
2             9.0  
3             9.0  
4             9.0  

In [None]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.impute import SimpleImputer

In [None]:
# Coerce the gross column to numeric BEFORE slicing out `features`.
# `data[[...]]` produces a separate frame, so a conversion applied to `data`
# afterwards would never reach the columns the pipeline is fitted on.
# Values that cannot be parsed become NaN and are filled by the imputer below.
data['Gross_in_$_M'] = pd.to_numeric(data['Gross_in_$_M'], errors='coerce')
features = data[['Runtime_in_min', 'Genre', 'Metascore', 'Gross_in_$_M', 'Rating_from_10']]

# Preprocessing pipeline: numeric columns are mean-imputed then standardised;
# the Genre string is one-hot encoded as a whole (one category per distinct
# genre combination).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values
            ('scaler', StandardScaler())
        ]), ['Runtime_in_min', 'Metascore', 'Gross_in_$_M', 'Rating_from_10']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Genre'])
    ]
)

X = preprocessor.fit_transform(features)

# Range of k values to test
k_values = np.arange(5, 21, 2)
distances_mean = []

# Iterate through each k and calculate the average neighbour distance.
# NOTE(review): the mean over each row's nearest distances cannot decrease as
# k grows, so the argmin below always selects the smallest candidate k. Treat
# this as a placeholder rather than a real quality metric.
for k in k_values:
    knn = NearestNeighbors(n_neighbors=k, metric='cosine', algorithm='brute')
    knn.fit(X)
    distances, _ = knn.kneighbors(X)
    avg_distance = distances[:, 1:].mean()  # Drop column 0: the closest hit (distance 0 to itself)
    distances_mean.append(avg_distance)

# Select the k with the smallest mean distance
optimal_k = k_values[np.argmin(distances_mean)]
print(f"Optimal k based on average distance: {optimal_k}")

# Train the KNN model with optimal k
knn = NearestNeighbors(n_neighbors=optimal_k, metric='cosine', algorithm='brute')
knn.fit(X)

# Function to recommend movies
def recommend_movies(movie_title, data, model, preprocessor, num_recommendations=5):
    """Return the names of the movies closest to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Exact value to look up in ``data['Movie_name']``. If the title occurs
        more than once, the first occurrence is used.
    data : pandas.DataFrame
        Frame containing ``'Movie_name'`` plus the feature columns the
        preprocessor was fitted on. Row order must match the rows the model
        was fitted on, because neighbour indices are applied positionally.
    model : object
        Fitted neighbour model exposing ``kneighbors(X, n_neighbors=...)``.
    preprocessor : object
        Fitted transformer exposing ``transform``.
    num_recommendations : int, default 5
        Maximum number of titles to return.

    Returns
    -------
    pandas.Series
        ``Movie_name`` values of the nearest neighbours, closest first, with
        the queried movie itself removed.

    Raises
    ------
    IndexError
        If ``movie_title`` is not present in ``data['Movie_name']``.
    """
    # Work with positions, not index labels: the neighbour indices returned by
    # the model are positional, and `data` may not carry a default RangeIndex.
    matches = np.flatnonzero((data['Movie_name'] == movie_title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Movie {movie_title!r} not found in data['Movie_name']")
    movie_pos = int(matches[0])

    # Build the query row from the `data` argument instead of a notebook-level
    # global. Restrict it to the columns the preprocessor saw during fit when
    # that information is available.
    feature_cols = list(getattr(preprocessor, 'feature_names_in_', data.columns))
    movie_features = preprocessor.transform(data.iloc[[movie_pos]][feature_cols])

    # Ask for one extra neighbour to make room for the query itself, but never
    # for more rows than exist.
    n_neighbors = min(num_recommendations + 1, len(data))
    _, indices = model.kneighbors(movie_features, n_neighbors=n_neighbors)

    # Drop the queried movie by position rather than assuming it is the first
    # hit: with tied distances (e.g. identical feature rows) it may not be.
    neighbor_positions = [int(i) for i in indices[0] if int(i) != movie_pos]
    recommended_movies = data.iloc[neighbor_positions[:num_recommendations]]['Movie_name']
    return recommended_movies

# Demo: fetch the closest matches for a single title and print them.
movie_to_search = "The Godfather"
recommendations = recommend_movies(
    movie_title=movie_to_search,
    data=data,
    model=knn,
    preprocessor=preprocessor,
)
print(f"Recommendations for {movie_to_search}:", recommendations, sep="\n")

Optimal k based on average distance: 5
Recommendations for The Godfather:
8                       Pulp Fiction
5              The Godfather Part II
4                   Schindler's List
19                        Goodfellas
16    The Good, the Bad and the Ugly
Name: Movie_name, dtype: object
