# Load the model

In [3]:
import pandas as pd

# Load training data
train_df = pd.read_csv('train.csv')

# Features and Target let's use only movie features and skip user features
X_train = train_df.drop(columns=['Unnamed: 0','userId','movieId','rating', 'user_last_rating_ts', 'user_last_rating_ordinal', 'user_rating_count','user_rating_mean','user_rating_var','user_rating_std'])
y_train = train_df['rating']

In [4]:
X_train

Unnamed: 0,movie_rating_count,movie_rating_mean,movie_rating_var,movie_rating_std,year,Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,668,3.101048,1.420434,1.191819,1995,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,18242,3.715355,0.967558,0.983645,1995,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1124,3.430160,0.924103,0.961303,1995,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,1179,3.486853,1.092145,1.045057,1995,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,3235,3.288099,1.025353,1.012597,1996,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20000071,18474,4.026145,0.782865,0.884797,2007,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20000072,14182,3.491045,0.884871,0.940676,2007,1,0,0,0,0,...,0,1,1,0,0,0,1,1,0,0
20000073,2059,3.122875,1.155934,1.075144,2008,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
20000074,4100,3.451220,1.260001,1.122498,2008,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [5]:
X_train.columns

Index(['movie_rating_count', 'movie_rating_mean', 'movie_rating_var',
       'movie_rating_std', 'year', 'Action', 'Adventure', 'Animation',
       'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
       'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Thriller', 'War', 'Western'],
      dtype='object')

In [6]:
# Load Test Set
test_df = pd.read_csv('test.csv')

# Prepare X_test and y_test
X_test = test_df.drop(columns=['Unnamed: 0','userId', 'movieId', 'rating', 'user_last_rating_ts', 'user_last_rating_ordinal', 'user_rating_count','user_rating_mean','user_rating_var','user_rating_std'])
y_test = test_df['rating']

# Using Surprise Module - For Training and Better Prediction  
  
# SVD and NearestNeighbors

In [7]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from sklearn.neighbors import NearestNeighbors

In [8]:
# -- 1. Load raw ratings (to learn latent factors) -------------------------
ratings = pd.read_csv('../ratings.csv')[['userId','movieId','rating']]

# Build Surprise dataset & train full SVD model
reader = Reader(rating_scale = (0.1, 5.0))
data = Dataset.load_from_df(ratings, reader)
trainset = data.build_full_trainset()
svd = SVD(n_factors=100, random_state=42)
svd.fit(trainset)

# -- 2. Extract movie‐factor matrix ---------------------------------------
# svd.qi is shape (n_items, n_factors), aligned to trainset inner IDs
# Map inner IDs → raw movieId, build DataFrame of latent vectors
movie_inner_ids = list(trainset._raw2inner_id_items.keys())
movie_ids = [int(trainset.to_raw_iid(i)) for i in range(len(movie_inner_ids))] 

latent_df = pd.DataFrame(
    svd.qi, 
    index=movie_ids,  # raw movieId as index
)
latent_df.index.name = 'movieId'

# -- 3. Fit NearestNeighbors on those latent vectors ----------------------
nn_latent = NearestNeighbors(n_neighbors=19, metric='cosine', algorithm='brute')
nn_latent.fit(latent_df.values)

# -- 4. Load movie features & titles for “predicted rating” & genre lists -
movie_feats = pd.read_csv('train.csv')[[
    'movieId','movie_rating_mean'
]]
movies = pd.read_csv('../movies.csv')[['movieId','title','genres']]

# Build lookups
id_to_title = dict(zip(movies.movieId, movies.title))
feat_mean = dict(zip(movie_feats.movieId, movie_feats.movie_rating_mean))

In [7]:
# -- 5. Recommendation function -------------------------------------------
def recommend_hybrid(title, topn=10, per_genre=3):
    # a) Find movieId
    m = movies[movies.title == title]
    if m.empty:
        m = movies[movies.title.str.contains(title, regex=False)]
    if m.empty:
        raise ValueError(f"No movie found matching '{title}'")
    mid = int(m.iloc[0].movieId)
    
    # b) Predicted rating = historical mean
    pred = round(feat_mean.get(mid, 0), 2)
    
    # c) Find latent‐factor neighbors
    #   - Lookup index in latent_df
    if mid not in latent_df.index:
        raise ValueError(f"MovieId {mid} missing latent factors")
    vec = latent_df.loc[mid].values.reshape(1, -1)
    dists, inds = nn_latent.kneighbors(vec, n_neighbors=topn+1)
    
    #   - Map back to movieIds, skip itself
    all_ids = latent_df.index[inds[0]].tolist()
    rec_ids = [i for i in all_ids if i != mid][:topn]
    rec_titles = [id_to_title[i] for i in rec_ids]
    
    # d) Genre‐based top‐K by mean rating
    genres = m.iloc[0].genres.split('|')
    by_genre = {}
    for g in genres:
        # filter train.csv for this genre
        mask = (pd.read_csv('train.csv')[g] == 1)  # inexpensive single‐column read
        top_ids = pd.read_csv('train.csv')[['movieId','movie_rating_mean']][mask] \
                    .nlargest(per_genre, 'movie_rating_mean') \
                    .movieId.tolist()
        by_genre[g] = [id_to_title[i] for i in top_ids]
    
    return {
        'input_movie': title,
        'predicted_rating': pred,
        'similar_movies': rec_titles,
        'by_genre': by_genre
    }

In [8]:
# -- 6. Example ------------------------------------------------------------
res = recommend_hybrid("Iron Man (2008)")

Input Movie: Iron Man (2008)
Predicted Rating: 3.83 ★
Similar Movies: ['Avengers, The (2012)', 'Captain America: The Winter Soldier (2014)', 'Captain America: The First Avenger (2011)', 'Captain America: Civil War (2016)', 'Thor (2011)', 'X-Men: First Class (2011)', 'Iron Man 2 (2010)', 'Avengers: Age of Ultron (2015)', 'Iron Man 3 (2013)', 'Ant-Man (2015)']
Genre‐based Suggestions:
  Action: ['To Be the Best (1993)', 'FB: Fighting Beat (2007)', 'Day of the Panther (1988)']
  Adventure: ['Bicycle Dreams (2009)', 'Colossus of the Arena (1962)', "Tarzan's Fight for Life (1958)"]
  Sci-Fi: ['Awaken (2013)', 'Pale (2016)', 'Awaken (2013)']


In [10]:
print("Input Movie:", res['input_movie'])
print("\nPredicted Rating:", res['predicted_rating'], "★")
print("\nSimilar Movies:", res['similar_movies'])
print("\nGenre‐based Suggestions:")
for g, lst in res['by_genre'].items():
    print(f"\n  {g}: {lst}")

Input Movie: Iron Man (2008)

Predicted Rating: 3.83 ★

Similar Movies: ['Avengers, The (2012)', 'Captain America: The Winter Soldier (2014)', 'Captain America: The First Avenger (2011)', 'Captain America: Civil War (2016)', 'Thor (2011)', 'X-Men: First Class (2011)', 'Iron Man 2 (2010)', 'Avengers: Age of Ultron (2015)', 'Iron Man 3 (2013)', 'Ant-Man (2015)']

Genre‐based Suggestions:

  Action: ['To Be the Best (1993)', 'FB: Fighting Beat (2007)', 'Day of the Panther (1988)']

  Adventure: ['Bicycle Dreams (2009)', 'Colossus of the Arena (1962)', "Tarzan's Fight for Life (1958)"]

  Sci-Fi: ['Awaken (2013)', 'Pale (2016)', 'Awaken (2013)']


In [None]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import GridSearchCV

# Prepare Surprise data
ratings = pd.read_csv('../ratings.csv')[['userId','movieId','rating']]
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings, reader)

# Define parameter grid for SVD
param_grid = {
    'n_factors': [20, 50, 100],
    'n_epochs': [20, 30],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.02, 0.05]
}

# Grid search
GS = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse', 'mae'],
    cv=3,
    n_jobs=-1
)
GS.fit(data)

# Best parameters
print(GS.best_params['rmse'])
print(GS.best_score['rmse'])



In [9]:
import joblib

joblib.dump(svd, 'svd_model.joblib')
joblib.dump(nn_latent, 'nn_latent.joblib')
latent_df.to_pickle('latent_df.pkl')

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('movie_feats.csv')

In [3]:
data.columns

Index(['movieId', 'movie_rating_count', 'movie_rating_mean',
       'movie_rating_var', 'movie_rating_std', 'year', 'Action', 'Adventure',
       'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama',
       'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery',
       'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western', 'title'],
      dtype='object')

In [11]:
data2 = pd.read_csv('/Users/poppinadityagmail.com/Developer/DataScience_Projects/Netflix_Movie_Recomender_System/deployment/data/movies.csv')

In [12]:
data2.columns

Index(['Unnamed: 0', 'movieId', 'title', 'genres', 'imdbId', 'tmdbId'], dtype='object')

In [13]:
data2 = data2.merge(data[['movieId', 'year']], on = 'movieId', how = 'left')

In [15]:
data2

Unnamed: 0.1,Unnamed: 0,movieId,title,genres,imdbId,tmdbId,year
0,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,1995.0
1,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,1995.0
2,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,1995.0
3,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,1995.0
4,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,1995.0
...,...,...,...,...,...,...,...
25003466,62418,209157,We (2018),Drama,6671244,499546.0,2018.0
25003467,62419,209159,Window of the Soul (2001),Documentary,297986,63407.0,2001.0
25003468,62420,209163,Bad Poems (2018),Comedy|Drama,6755366,553036.0,2018.0
25003469,62421,209169,A Girl Thing (2001),(no genres listed),249603,162892.0,2001.0


In [16]:
data2 = data2.drop(columns=['Unnamed: 0'])
data2.columns

Index(['movieId', 'title', 'genres', 'imdbId', 'tmdbId', 'year'], dtype='object')

In [19]:
data2.tail()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,year
25003466,209157,We (2018),Drama,6671244,499546.0,2018.0
25003467,209159,Window of the Soul (2001),Documentary,297986,63407.0,2001.0
25003468,209163,Bad Poems (2018),Comedy|Drama,6755366,553036.0,2018.0
25003469,209169,A Girl Thing (2001),(no genres listed),249603,162892.0,2001.0
25003470,209171,Women of Devil's Island (1962),Action|Adventure|Drama,55323,79513.0,1962.0


In [20]:
data2.to_csv('/Users/poppinadityagmail.com/Developer/DataScience_Projects/Netflix_Movie_Recomender_System/deployment/data/movies_data.csv')