# Train a Regression Model 

In [2]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

In [46]:
# Load training data
train_df = pd.read_csv('train.csv')

# Features and Target let's use only movie features and skip user features
X_train = train_df.drop(columns=['Unnamed: 0','userId','movieId','rating', 'user_last_rating_ts', 'user_last_rating_ordinal', 'user_rating_count','user_rating_mean','user_rating_var','user_rating_std'])
y_train = train_df['rating']

In [82]:
X_train

Unnamed: 0,movie_rating_count,movie_rating_mean,movie_rating_var,movie_rating_std,year,Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,668,3.101048,1.420434,1.191819,1995,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,18242,3.715355,0.967558,0.983645,1995,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1124,3.430160,0.924103,0.961303,1995,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,1179,3.486853,1.092145,1.045057,1995,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,3235,3.288099,1.025353,1.012597,1996,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20000071,18474,4.026145,0.782865,0.884797,2007,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20000072,14182,3.491045,0.884871,0.940676,2007,1,0,0,0,0,...,0,1,1,0,0,0,1,1,0,0
20000073,2059,3.122875,1.155934,1.075144,2008,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
20000074,4100,3.451220,1.260001,1.122498,2008,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [5]:
X_train.columns

Index(['movie_rating_count', 'movie_rating_mean', 'movie_rating_var',
       'movie_rating_std', 'year', 'Action', 'Adventure', 'Animation',
       'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
       'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Thriller', 'War', 'Western'],
      dtype='object')

In [83]:
# Define the model
xgb = XGBRegressor(random_state = 42, n_jobs = -1)

# Define hyperparameters to search
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.05, 0.1, 0.2]
}

In [84]:
# Grid Search
grid_search = GridSearchCV(
    estimator = xgb,
    param_grid = param_grid,
    cv = 3,
    scoring = 'neg_root_mean_squared_error',
    verbose = 2
)

In [85]:
# Fit the model
grid_search.fit(X_train, y_train)

print("Best Params: ", grid_search.best_params_)
print("Best RMSE: ", grid_search.best_score_)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] END ..learning_rate=0.05, max_depth=4, n_estimators=100; total time=  17.0s
[CV] END ..learning_rate=0.05, max_depth=4, n_estimators=100; total time=  16.3s
[CV] END ..learning_rate=0.05, max_depth=4, n_estimators=100; total time=  15.9s
[CV] END ..learning_rate=0.05, max_depth=4, n_estimators=200; total time=  23.8s
[CV] END ..learning_rate=0.05, max_depth=4, n_estimators=200; total time=  24.1s
[CV] END ..learning_rate=0.05, max_depth=4, n_estimators=200; total time=  23.9s
[CV] END ..learning_rate=0.05, max_depth=6, n_estimators=100; total time=  21.6s
[CV] END ..learning_rate=0.05, max_depth=6, n_estimators=100; total time=  21.3s
[CV] END ..learning_rate=0.05, max_depth=6, n_estimators=100; total time=  21.1s
[CV] END ..learning_rate=0.05, max_depth=6, n_estimators=200; total time=  32.5s
[CV] END ..learning_rate=0.05, max_depth=6, n_estimators=200; total time=  33.0s
[CV] END ..learning_rate=0.05, max_depth=6, n_es

# Model Evaluation and Exporting the Model

In [47]:
import joblib

# Load Test Set
test_df = pd.read_csv('test.csv')

# Prepare X_test and y_test
X_test = test_df.drop(columns=['Unnamed: 0','userId', 'rating', 'user_last_rating_ts', 'user_last_rating_ordinal', 'user_rating_count','user_rating_mean','user_rating_var','user_rating_std'])
y_test = test_df['rating']

In [92]:
# Load the best model
best_model = grid_search.best_estimator_

In [93]:
# Predict and Evaluate
y_pred = best_model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared = False)
print(f"Test RMSE: {rmse:.4f}")

joblib.dump(best_model, 'xgb_rating_model.joblib')

Test RMSE: 0.9363




['xgb_rating_model.joblib']

In [94]:
y_pred = best_model.predict(X_train)
rmse = mean_squared_error(y_train, y_pred, squared = False)
print(f"Train RMSE: {rmse:.4f}")

Train RMSE: 0.9622




In [95]:
X_train.shape

(20000076, 24)

# Movie Search & Recommendation Interface  
# Using KNN Cosine Metric for Directional Vectors

In [7]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
import joblib

# Load DataSets
movie_feats = train_df.drop(columns=['Unnamed: 0','userId','rating', 'user_last_rating_ts', 'user_last_rating_ordinal', 'user_rating_count','user_rating_mean','user_rating_var','user_rating_std'])

movie_feats = movie_feats.drop_duplicates(subset = 'movieId').reset_index(drop = True)
 
movies = pd.read_csv('../movies.csv')

feature_cols = [
    'movie_rating_count','movie_rating_mean','movie_rating_var','movie_rating_std',
    'year','Action','Adventure','Animation','Children','Comedy','Crime',
    'Documentary','Drama','Fantasy','Film-Noir','Horror','IMAX','Musical',
    'Mystery','Romance','Sci-Fi','Thriller','War','Western'
]

In [120]:
X_movies = movie_feats[feature_cols].values

# Fit NearestNeighbors on movie features (cosine similarity)
nn = NearestNeighbors(n_neighbors = 11, metric = 'cosine', n_jobs = -1, algorithm = 'brute')
nn.fit(X_movies)
print('Model Trained....')

Model Trained....


In [125]:
# Recommendation Function
rating_model = joblib.load("xgb_rating_model.joblib") # Load Rating Model

# Recommend based on rating
def recommend(movie_title, n_similar = 10, per_genre = 3):
    # Lookup movieId
    movie_row = movies[movies.title.str.contains(movie_title, regex = False)]
    if movie_row.empty:
        raise ValueError(f"No Movie found matching '{movie_title}'")
    movie_id = movie_row.iloc[0].movieId
    
    # Predict Rating
    mf = movie_feats[movie_feats.movieId == movie_id]
    if mf.empty:
        raise ValueError(f"Features not found for movieId {movie_id}")
    Xq = mf[feature_cols]
    pred_rating = rating_model.predict(Xq)[0]
    
    # Find similar movies KNN query
    idx = movie_feats.index[movie_feats.movieId == movie_id][0]
    distances, indices = nn.kneighbors(X_movies[idx].reshape(1, -1), n_neighbors = n_similar + 1)
    
    # Skip the first one (itself)
    sim_ids = movie_feats.iloc[indices[0][1:]]['movieId'].values
    sim_titles = movies[movies.movieId.isin(sim_ids)]['title'].tolist()
    
    # Genre-filtered suggestions
    genres = movie_row.iloc[0].genres.split('|')
    genres_suggestions = {}
    for g in genres:
        # find movie in same genre, sort by movie_rating_mean desc
        candidates = movie_feats[movie_feats[g] == 1]
        topn = candidates.sort_values('movie_rating_mean', ascending = False)\
                .movieId.head(per_genre).values
        genres_suggestions[g] = movies[movies.movieId.isin(topn)]['title'].tolist()
        
    # Return all
    return {
        'input_movie': movie_row.iloc[0].title,
        'predicted_rating': round(pred_rating, 2),
        'similar_movies': sim_titles,
        'by_genre': genres_suggestions
    }

In [134]:
# Example Usage
res = recommend("Alien")
print("Input Movie:", res['input_movie'])
print("Predicted Rating:", res['predicted_rating'], "★")
print("Similar Movies:", res['similar_movies'])
print("Genre‐based Suggestions:")
for genre, titles in res['by_genre'].items():
    print(f"  {genre}: {titles}")

Input Movie: Aliens (1986)
Predicted Rating: 4.01 ★
Similar Movies: ['Babe (1995)', 'Batman Forever (1995)', 'Dumb & Dumber (Dumb and Dumber) (1994)', 'Rock, The (1996)', 'American History X (1998)', 'Star Wars: Episode I - The Phantom Menace (1999)', 'Ghostbusters (a.k.a. Ghost Busters) (1984)', 'X-Men (2000)', "Ocean's Eleven (2001)", 'Beautiful Mind, A (2001)']
Genre‐based Suggestions:
  Action: ["Tarzan's Savage Fury (1952)", 'Temple of the White Elephant (1964)', 'To Be the Best (1993)']
  Adventure: ['Bicycle Dreams (2009)', 'The Viking (1931)', 'Windstorm 2 (2015)']
  Horror: ['Fractured (2015)', 'Dark Asylum (2001)', 'Von einem, der auszog, das Fürchten zu lernen (2014)']
  Sci-Fi: ['Awaken (2013)', 'The Electric Grandmother (1982)', 'Pale (2016)']


# Using Surprise Module - For Training and Better Prediction  
  
# SVD and NearestNeighbors

In [18]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from sklearn.neighbors import NearestNeighbors

In [33]:
# -- 1. Load raw ratings (to learn latent factors) -------------------------
ratings = pd.read_csv('../ratings.csv')[['userId','movieId','rating']]

# Build Surprise dataset & train full SVD model
reader = Reader(rating_scale = (0.5, 5.0))
data = Dataset.load_from_df(ratings, reader)
trainset = data.build_full_trainset()
svd = SVD(n_factors=50, random_state=42)
svd.fit(trainset)

# -- 2. Extract movie‐factor matrix ---------------------------------------
# svd.qi is shape (n_items, n_factors), aligned to trainset inner IDs
# Map inner IDs → raw movieId, build DataFrame of latent vectors
movie_inner_ids = list(trainset._raw2inner_id_items.keys())
movie_ids = [int(trainset.to_raw_iid(i)) for i in range(len(movie_inner_ids))] 

latent_df = pd.DataFrame(
    svd.qi, 
    index=movie_ids,  # raw movieId as index
)
latent_df.index.name = 'movieId'

# -- 3. Fit NearestNeighbors on those latent vectors ----------------------
nn_latent = NearestNeighbors(n_neighbors=11, metric='cosine', algorithm='brute')
nn_latent.fit(latent_df.values)

# -- 4. Load movie features & titles for “predicted rating” & genre lists -
movie_feats = pd.read_csv('train.csv')[[
    'movieId','movie_rating_mean'
]]
movies = pd.read_csv('../movies.csv')[['movieId','title','genres']]

# Build lookups
id_to_title = dict(zip(movies.movieId, movies.title))
feat_mean = dict(zip(movie_feats.movieId, movie_feats.movie_rating_mean))

In [21]:
# -- 5. Recommendation function -------------------------------------------
def recommend_hybrid(title, topn=10, per_genre=3):
    # a) Find movieId
    m = movies[movies.title == title]
    if m.empty:
        m = movies[movies.title.str.contains(title, regex=False)]
    if m.empty:
        raise ValueError(f"No movie found matching '{title}'")
    mid = int(m.iloc[0].movieId)
    
    # b) Predicted rating = historical mean
    pred = round(feat_mean.get(mid, 0), 2)
    
    # c) Find latent‐factor neighbors
    #   - Lookup index in latent_df
    if mid not in latent_df.index:
        raise ValueError(f"MovieId {mid} missing latent factors")
    vec = latent_df.loc[mid].values.reshape(1, -1)
    dists, inds = nn_latent.kneighbors(vec, n_neighbors=topn+1)
    
    #   - Map back to movieIds, skip itself
    all_ids = latent_df.index[inds[0]].tolist()
    rec_ids = [i for i in all_ids if i != mid][:topn]
    rec_titles = [id_to_title[i] for i in rec_ids]
    
    # d) Genre‐based top‐K by mean rating
    genres = m.iloc[0].genres.split('|')
    by_genre = {}
    for g in genres:
        # filter train.csv for this genre
        mask = (pd.read_csv('train.csv')[g] == 1)  # inexpensive single‐column read
        top_ids = pd.read_csv('train.csv')[['movieId','movie_rating_mean']][mask] \
                    .nlargest(per_genre, 'movie_rating_mean') \
                    .movieId.tolist()
        by_genre[g] = [id_to_title[i] for i in top_ids]
    
    return {
        'input_movie': title,
        'predicted_rating': pred,
        'similar_movies': rec_titles,
        'by_genre': by_genre
    }

In [23]:
# -- 6. Example ------------------------------------------------------------
res = recommend_hybrid("Iron Man (2008)")
print("Input Movie:", res['input_movie'])
print("Predicted Rating:", res['predicted_rating'], "★")
print("Similar Movies:", res['similar_movies'])
print("Genre‐based Suggestions:")
for g, lst in res['by_genre'].items():
    print(f"  {g}: {lst}")

Input Movie: Iron Man (2008)
Predicted Rating: 3.83 ★
Similar Movies: ['Avengers, The (2012)', 'Captain America: The Winter Soldier (2014)', 'Captain America: The First Avenger (2011)', 'Captain America: Civil War (2016)', 'Thor (2011)', 'X-Men: First Class (2011)', 'Iron Man 2 (2010)', 'Avengers: Age of Ultron (2015)', 'Iron Man 3 (2013)', 'Ant-Man (2015)']
Genre‐based Suggestions:
  Action: ['To Be the Best (1993)', 'FB: Fighting Beat (2007)', 'Day of the Panther (1988)']
  Adventure: ['Bicycle Dreams (2009)', 'Colossus of the Arena (1962)', "Tarzan's Fight for Life (1958)"]
  Sci-Fi: ['Awaken (2013)', 'Pale (2016)', 'Awaken (2013)']


In [24]:
res = recommend_hybrid("Avatar")
print("Input Movie:", res['input_movie'])
print("Predicted Rating:", res['predicted_rating'], "★")
print("Similar Movies:", res['similar_movies'])
print("Genre‐based Suggestions:")
for g, lst in res['by_genre'].items():
    print(f"  {g}: {lst}")

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [25]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


In [27]:
trainset

<surprise.trainset.Trainset at 0x67a96ced0>

In [34]:
joblib.dump(svd, 'svd_model.joblib')
joblib.dump(nn_latent, 'nn_latent.joblib')
latent_df.to_pickle('latent_df.pkl')

In [53]:
movie_titles = pd.read_csv('../movies.csv')
movie_titles = movie_titles[['movieId', 'title']]
data = pd.concat([X_train, X_test], ignore_index = True)
data = data.merge(movie_titles, on = 'movieId')
data.to_csv('movie_feats.csv', index = False)
print('Exported.........')

Exported.........


In [None]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import GridSearchCV

# Prepare Surprise data
ratings = pd.read_csv('ratings.csv')[['userId','movieId','rating']]
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings, reader)

# Define parameter grid for SVD
param_grid = {
    'n_factors': [20, 50, 100],
    'n_epochs': [20, 30],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.02, 0.05]
}

# Grid search
GS = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse', 'mae'],
    cv=3,
    n_jobs=-1
)
GS.fit(data)

# Best parameters
print(GS.best_params['rmse'])
print(GS.best_score['rmse'])

In [58]:
import polars as pl
import joblib
import pandas as pd  # Needed for SVD model, nn, and latent_df.pkl
from sklearn.neighbors import NearestNeighbors

# --- Load models and data ---
svd_model = joblib.load('svd_model.joblib')
nn_latent = joblib.load('nn_latent.joblib')
latent_df = pd.read_pickle('latent_df.pkl')
movie_feats = pl.read_csv('../movies.csv')

# --- Build lookups ---
movies = movie_feats.select(['movieId', 'title', 'genres']).unique()
id_to_title = dict(zip(movies['movieId'].to_list(), movies['title'].to_list()))
feat_mean = dict(zip(movie_feats['movieId'].to_list(), movie_feats['movie_rating_mean'].to_list()))

# --- Recommendation function ---
def recommend_hybrid(title, topn=10, per_genre=3):
    # a) Find movieId
    m = movies.filter(pl.col("title") == title)
    if m.is_empty():
        m = movies.filter(pl.col("title").str.contains(title))
    if m.is_empty():
        raise ValueError(f"No movie found matching '{title}'")
    mid = m[0, "movieId"]
    
    # b) Predicted rating = historical mean
    pred = round(feat_mean.get(mid, 0), 2)
    
    # c) Find latent‐factor neighbors
    if mid not in latent_df.index:
        raise ValueError(f"MovieId {mid} missing latent factors")
    vec = latent_df.loc[mid].values.reshape(1, -1)
    dists, inds = nn_latent.kneighbors(vec, n_neighbors=topn+1)
    
    all_ids = latent_df.index[inds[0]].tolist()
    rec_ids = [i for i in all_ids if i != mid][:topn]
    rec_titles = [id_to_title[i] for i in rec_ids]
    
    # d) Genre-based top-K by mean rating
    genres = m[0, "genres"].split('|')
    by_genre = {}
    for g in genres:
        if g not in movie_feats.columns:
            continue  # skip unknown genres
        top_ids = movie_feats.filter(pl.col(g) == 1) \
            .select(['movieId', 'movie_rating_mean']) \
            .sort('movie_rating_mean', descending=True) \
            .head(per_genre)['movieId'].to_list()
        by_genre[g] = [id_to_title[i] for i in top_ids if i in id_to_title]
    
    return {
        'input_movie': title,
        'predicted_rating': pred,
        'similar_movies': rec_titles,
        'by_genre': by_genre
    }

ColumnNotFoundError: "movie_rating_mean" not found

In [56]:
movie_feats = pl.read_csv('movie_feats.csv')
print(movie_feats.columns)

['movieId', 'movie_rating_count', 'movie_rating_mean', 'movie_rating_var', 'movie_rating_std', 'year', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western', 'title']


In [None]:
res = recommend_hybrid("Avatar")
print("Input Movie:", res['input_movie'])
print("Predicted Rating:", res['predicted_rating'], "★")
print("Similar Movies:", res['similar_movies'])
print("Genre‐based Suggestions:")
for g, lst in res['by_genre'].items():
    print(f"  {g}: {lst}")