In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
import joblib
import pickle
import os

In [4]:
data_path = 'ml-100k/'  # change if needed

# Ratings file (u.data) is tab-separated: user id | item id | rating | timestamp
ratings = pd.read_csv(
    os.path.join(data_path, 'u.data'),
    sep='\t',
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
)

# Movie file (u.item) is pipe-separated: five descriptive fields followed by
# 19 binary genre flags. The flags get placeholder names g0..g18 on load and
# are relabelled from u.genre further down.
movie_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL']
genre_cols = ['g' + str(i) for i in range(19)]
all_cols = movie_cols + genre_cols
movies = pd.read_csv(
    os.path.join(data_path, 'u.item'),
    sep='|',
    names=all_cols,
    encoding='latin-1',
)

# u.genre holds one "name|code" pair per non-blank line; the names are collected
# in file order and used as labels for the genre flag columns.
genre_map = []
with open(os.path.join(data_path, 'u.genre'), encoding='latin-1') as f:
    for line in f:
        entry = line.strip()
        if not entry:
            continue
        genre_name, _code = entry.split('|')
        genre_map.append(genre_name)

# Work on a copy so the raw `movies` frame keeps its placeholder column names.
movies_genres = movies.copy()
movies_genres.columns = movie_cols + genre_map

movies_genres.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
# Drop unnecessary columns: keep only the id, the title and the genre flags.
# The explicit .copy() makes the result an independent frame, so adding a
# column below does not raise pandas' SettingWithCopyWarning.
movies_genres = movies_genres[['movie_id', 'title'] + genre_map].copy()

# Make a genres string for content-based filtering: the space-joined names of
# every genre whose flag equals 1, in genre_map order. Iterating the flag array
# directly avoids the per-row Series construction of DataFrame.apply(axis=1).
genre_flags = movies_genres[genre_map].to_numpy()
movies_genres['genres_str'] = [
    ' '.join(name for name, flag in zip(genre_map, flags) if flag == 1)
    for flags in genre_flags
]

# Ratings basic info
print('ratings shape', ratings.shape)
print('movies shape', movies_genres.shape)

ratings shape (100000, 4)
movies shape (1682, 22)


In [6]:
from surprise import accuracy

# Hand the (user, item, rating) triples to Surprise, declaring a 1-5 rating scale.
reader = Reader(rating_scale=(1, 5))
ds = Dataset.load_from_df(
    ratings.loc[:, ['user_id', 'movie_id', 'rating']],
    reader,
)

# Hold out 20% of the ratings for evaluation; the seed is fixed for repeatability.
trainset, testset = train_test_split(ds, random_state=42, test_size=0.2)

# Fit an SVD model with 50 latent factors on the training portion only.
algo = SVD(random_state=42, n_factors=50)
algo.fit(trainset)

# Optional: report RMSE on the held-out ratings.
predictions = algo.test(testset)
print('RMSE:', accuracy.rmse(predictions, verbose=False))

RMSE: 0.934761145254825


In [7]:
# Persist the fitted model and both data frames under artifacts/.
os.makedirs('artifacts', exist_ok=True)
joblib.dump(algo, 'artifacts/svd_model.pkl')
for frame, target in (
    (movies_genres, 'artifacts/movies.csv'),
    (ratings, 'artifacts/ratings.csv'),
):
    # index=False: the row index is not written to the CSV.
    frame.to_csv(target, index=False)

In [8]:
# TF-IDF on the short genre strings (could also use title tokens but genres are small).
#
# token_pattern=r'\S+' treats every whitespace-separated genre name as ONE token.
# The vectorizer's default pattern only keeps runs of 2+ word characters, which
# would split "Sci-Fi" into "sci"/"fi" and "Film-Noir" into "film"/"noir" and
# reduce "Children's" to "children" - so hyphenated genres would count as two
# features and distort the similarity scores.
tfidf = TfidfVectorizer(analyzer='word', token_pattern=r'\S+')
genre_matrix = tfidf.fit_transform(movies_genres['genres_str'])

# Save vectorizer and matrix. The directory is created here as well so this
# cell does not depend on an earlier cell having made it.
os.makedirs('artifacts', exist_ok=True)
joblib.dump(tfidf, 'artifacts/tfidf_genre.pkl')
# genre_matrix is sparse — save with pickle
with open('artifacts/genre_matrix.pkl', 'wb') as f:
    pickle.dump(genre_matrix, f)

In [9]:
def get_collab_recommendations(algo, user_id, movies_df, ratings_df, n=10):
    """Return the ``n`` highest-estimated unrated movies for ``user_id``.

    Parameters
    ----------
    algo : object exposing ``predict(user_id, item_id)`` whose result has an
        ``est`` attribute (the estimated rating).
    user_id : id of the user to recommend for.
    movies_df : DataFrame with a ``movie_id`` column (other columns are carried
        through to the result).
    ratings_df : DataFrame with ``user_id`` and ``movie_id`` columns; movies
        this user already rated are excluded from the candidates.
    n : maximum number of recommendations to return.

    Returns
    -------
    DataFrame
        Rows of ``movies_df`` for the selected movies plus an ``est`` column,
        sorted by ``est`` in descending order.
    """
    # Movies the user has already rated are never recommended.
    already_rated = set(ratings_df.loc[ratings_df.user_id == user_id, 'movie_id'])

    scored = []
    for candidate in movies_df['movie_id'].unique():
        if candidate not in already_rated:
            try:
                scored.append((candidate, algo.predict(user_id, int(candidate)).est))
            except Exception:
                # Best effort: a candidate whose prediction fails is skipped.
                pass

    # Highest estimates first, truncated to the requested count.
    best = sorted(scored, key=lambda pair: pair[1], reverse=True)[:n]
    best_df = pd.DataFrame(best, columns=['movie_id', 'est'])
    best_ids = [movie for movie, _ in best]

    # Attach the movie metadata to each estimate and order by estimate.
    shortlist = movies_df[movies_df.movie_id.isin(best_ids)]
    return shortlist.merge(best_df, on='movie_id').sort_values('est', ascending=False)

# For content-based: given a movie id, find similar movies
from sklearn.metrics.pairwise import cosine_similarity

def get_content_recommendations(movie_id, movies_df, genre_matrix, topn=10):
    """Return the ``topn`` movies whose genre vectors are most similar to ``movie_id``.

    Parameters
    ----------
    movie_id : id of the reference movie (matched against ``movies_df['movie_id']``).
    movies_df : DataFrame with a ``movie_id`` column. Row *positions* must line
        up with the rows of ``genre_matrix``; the index labels are irrelevant.
    genre_matrix : feature matrix with one row per row of ``movies_df``.
    topn : maximum number of similar movies to return.

    Returns
    -------
    DataFrame
        The most similar rows of ``movies_df`` (the reference movie excluded)
        with an added ``score`` column, best match first. An empty DataFrame
        is returned when ``movie_id`` is not present.

    Notes
    -----
    ``score`` is the dot product computed by ``linear_kernel``; it equals the
    cosine similarity only if the rows of ``genre_matrix`` are L2-normalised.
    """
    # Look the movie up by row POSITION, not by index label: genre_matrix and
    # .iloc are positional, so using labels breaks on a non-default index
    # (e.g. after filtering movies_df).
    matches = np.flatnonzero((movies_df['movie_id'] == movie_id).to_numpy())
    if matches.size == 0:
        return pd.DataFrame()
    pos = int(matches[0])

    scores = linear_kernel(genre_matrix[pos], genre_matrix).flatten()

    # Descending similarity, with the reference movie itself removed.
    ranked = scores.argsort()[::-1]
    top_positions = ranked[ranked != pos][:topn]
    return movies_df.iloc[top_positions].assign(score=scores[top_positions])

In [10]:
# Example: collaborative-filtering recommendations for user 1
print(get_collab_recommendations(algo, 1, movies_genres, ratings, n=10))

# Example: content-based recommendations for movie_id 1, using the genre
# matrix read back from the artifact file (a pickle written by this notebook).
with open('artifacts/genre_matrix.pkl','rb') as handle:
    gm = pickle.load(handle)
print(get_content_recommendations(movie_id=1, movies_df=movies_genres, genre_matrix=gm, topn=10))

   movie_id                                              title  unknown  \
5       483                                  Casablanca (1942)        0   
2       408                              Close Shave, A (1995)        0   
7       513                              Third Man, The (1949)        0   
6       511                          Lawrence of Arabia (1962)        0   
9       923                       Raise the Red Lantern (1991)        0   
4       480                          North by Northwest (1959)        0   
0       285                              Secrets & Lies (1996)        0   
1       302                           L.A. Confidential (1997)        0   
8       525                              Big Sleep, The (1946)        0   
3       474  Dr. Strangelove or: How I Learned to Stop Worr...        0   

   Action  Adventure  Animation  Children's  Comedy  Crime  Documentary  ...  \
5       0          0          0           0       0      0            0  ...   
2       0     