In [None]:
# To run locally on Mac.

%load_ext autoreload
%autoreload 2

import os, sys
# compute the absolute path to your project root:
root = os.path.abspath(os.path.join(os.getcwd(), '..'))
print(root)
# insert it at the front of Python’s module search path:
sys.path.insert(0, root)
data_dir = "../data"
config_file = "../configs/config_dcn_v2_sequential.yaml"
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/Users/harshadakumbhare/Documents/GitHub/akshaydaf/recommender-system


In [None]:
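# To run in Colab instead of locally: uncomment the lines below (and make sure `torch` is imported before the `device` line).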
# from google.colab import drive
# drive.mount('/content/drive/')

# %cd '/content/drive/MyDrive/gatech_coursework/deep_learning_final_project/GitHub_akshaydaf/recommender-system'
# data_dir = "./data"
# config_file = "./configs/config_dcn_v2_sequential.yaml"
# device = torch.device("cuda") 

In [None]:
# Load, clean and create train, val, test data
import pandas as pd
from data_utils.preprocess import (
    load_movielens, clean_and_filter,
    get_user_sequences, split_sequences,
    build_examples
)

def users_preprocessing(users):
    """Encode the raw user table into integer-coded features.

    The caller's frame is left untouched: all derived columns are added to a
    copy, so the function can be called repeatedly on the same raw input.

    Args:
        users (pd.DataFrame): User table with the columns 'UserID', 'Age',
            'Gender', 'Zip-code' and 'Occupation'.

    Returns:
        pd.DataFrame: Columns 'UserID', 'AgeEncoded', 'GenderEncoded',
            'Zip-codeEncoded', 'Occupation' (in that order).

    Raises:
        ValueError: If the first five characters of a zip code are not an
            integer (raised by the ``astype(int)`` conversion).
    """
    # Age bucket value -> consecutive code 0..6. Values missing from the
    # mapping become NaN in 'AgeEncoded'.
    age_mapping = {
        1: 0,
        18: 1,
        25: 2,
        35: 3,
        45: 4,
        50: 5,
        56: 6
    }
    # Values other than 'F'/'M' become NaN in 'GenderEncoded'.
    gender_mapping = {
        'F' : 0,
        'M' : 1,
    }
    user_columns = ['UserID', 'AgeEncoded', 'GenderEncoded', 'Zip-codeEncoded', 'Occupation']

    # Work on a copy so the input frame is not mutated (and no
    # SettingWithCopyWarning is triggered when the input is itself a slice).
    users = users.copy()
    users['AgeEncoded'] = users['Age'].map(age_mapping)
    users['GenderEncoded'] = users['Gender'].map(gender_mapping)

    # Keep only the first five characters (e.g. '70072-1234' -> 70072), then
    # replace each distinct prefix with a dense code in order of appearance.
    zip_prefix = users['Zip-code'].str[:5].astype(int)
    users['Zip-codeEncoded'] = pd.factorize(zip_prefix)[0]

    return users[user_columns]


def movies_preprocessing(movies):
    """Encode the raw movie table into multi-hot genre flags plus a year code.

    The caller's frame is left untouched; all work happens on a copy.

    Args:
        movies (pd.DataFrame): Movie table with the columns 'MovieID',
            'Title' (expected to end in ' (YYYY)') and 'Genres'
            ('|'-separated genre names).

    Returns:
        pd.DataFrame: 'MovieID', one 0/1 column per genre listed in
            ``genre_columns`` below, and 'Year' (a dense integer code of the
            release year, assigned in order of first appearance).
    """
    genre_columns = ['Action', 'Adventure',
       'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama',
       'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Thriller', 'War', 'Western']

    # Work on a copy so the input frame is not mutated.
    movies = movies.copy()

    # One row per (movie, genre), one-hot encode, then collapse back to one
    # row per movie by taking the max of each genre flag.
    genres_exploded = movies['Genres'].str.split('|').explode()
    one_hot = pd.get_dummies(genres_exploded, dtype=int).groupby(level=0).max()
    # Force the fixed genre schema: a genre that does not occur in this data
    # becomes an all-zero column instead of a KeyError at column selection.
    one_hot = one_hot.reindex(columns=genre_columns, fill_value=0)
    movies = pd.concat([movies, one_hot], axis=1)

    # Preprocess years: split 'Name (YYYY)' into the name and the year.
    movies[["Title", "Year"]] = movies["Title"].str.extract(r'^(.*) \((\d{4})\)$')
    movies["Year"] = pd.to_numeric(movies["Year"], errors="coerce")

    # Titles without a parsable year get the median year.
    median_year = int(movies["Year"].median(skipna=True))
    movies["Year"] = movies["Year"].fillna(median_year).astype(int)
    # Replace each distinct year with a dense code.
    movies['Year'] = pd.factorize(movies['Year'])[0]

    return movies[['MovieID'] + genre_columns + ['Year']]

# 1) Load the raw MovieLens tables, then clean/filter them.
ratings, users, movies = load_movielens(data_dir)
# NOTE(review): this step used to be commented "keep all ratings for training
# DCN v2", yet rating_threshold=4 is passed -- confirm which one is intended.
ratings, users, movies = clean_and_filter(ratings, users, movies, rating_threshold=4)

# 2) Per-user sequences and their train/val/test split.
user_seqs = get_user_sequences(ratings)
user_splits = split_sequences(user_seqs, train_ratio=0.8, val_ratio=0.1)

# 3) Global item set.
all_movies = set(movies["MovieID"].unique())

# 4) Build examples for every split (evaluated in the order train, val, test).
train_exs, val_exs, test_exs = (
    build_examples(user_splits, all_movies, K=5, split=split_name)
    for split_name in ("train", "val", "test")
)

# 5) Encode the user and movie feature tables (users first, then movies).
users, movies = users_preprocessing(users), movies_preprocessing(movies)

In [None]:
# 5) Build the DCN V2 input frames (one positive and one negative row per example) for train/val/test
def build_dcnv2_input(examples, users, movies):
    """Turn ranking examples into a labelled (user, movie) feature frame.

    Every example contributes two rows, in this order: the positive item with
    Rating 1, then the first entry of its 'negatives' list with Rating 0.

    Args:
        examples (iterable of dict): Each item has the keys 'UserID',
            'positive' and 'negatives' (a non-empty sequence).
        users (pd.DataFrame): User features, joined on 'UserID'.
        movies (pd.DataFrame): Movie features, joined on 'MovieID'.

    Returns:
        pd.DataFrame: Columns 'UserID', 'MovieID', 'Rating', followed by the
            remaining columns of ``users`` and then of ``movies``. Pairs with
            no match in a feature table keep NaN in that table's columns
            (left joins).
    """
    user_ids = []
    movie_ids = []
    ratings = []

    for row in examples:
        uid = row['UserID']
        pos = row['positive']
        neg = row['negatives'][0]

        # Positive sample first, then its negative counterpart.
        user_ids += [uid, uid]
        movie_ids += [pos, neg]
        ratings += [1, 0]

    # Build the frame once and attach features with two joins; growing a
    # frame row by row with concat would be quadratic in the example count.
    df = pd.DataFrame({
        'UserID': user_ids,
        'MovieID': movie_ids,
        'Rating': ratings
    })
    df = df.merge(users, on='UserID', how='left')
    df = df.merge(movies, on='MovieID', how='left')
    return df


# Build the DCN V2 input frame for each split (evaluated in the order train, val, test).
train_df, val_df, test_df = (
    build_dcnv2_input(split_examples, users, movies)
    for split_examples in (train_exs, val_exs, test_exs)
)


In [None]:
# Report the (rows, columns) shape of each split's input frame: train, val, test.
for split_frame in (train_df, val_df, test_df):
    print(split_frame.shape)

(891492, 26)
(102444, 26)
(108350, 26)


In [None]:

from models.sequential_dcn_v2 import DCNV2_Sequential
from models.vanilla_nn import TwoLayerNet
from trainer import Trainer
from data_utils.datasets import CustomDataset
import torch
from torch.utils.data import DataLoader, TensorDataset
import argparse
import yaml
from config import Config
import pandas as pd

print("Running on", device)

# Parse the YAML configuration, then wrap the resulting dict in the project's Config object.
with open(config_file, 'r') as file:
    config_dict = yaml.safe_load(file)
config = Config(config_dict=config_dict)
print(config)

# Overwrite the configured training device with the one selected earlier in the notebook.
config.train.device = device

# Name of the label column in the split frames.
target_column = 'Rating'

# Cast every column of the train and validation frames to int
# (test_df is not cast here).
train_df, val_df = train_df.astype(int), val_df.astype(int)

def create_x_sparse_input(df, sparse_feature_info):
    """Build one tensor per sparse feature from the matching column of ``df``.

    Args:
        df (pd.DataFrame): Frame containing a column for every key of
            ``sparse_feature_info``.
        sparse_feature_info (dict): Maps feature name to a
            (vocab_size, embed_size) pair; only the names are used here.

    Returns:
        dict: Feature name -> tensor of that column's values, created on the
            module-level ``device``.
    """
    X_sparse_input = {}
    for name, (_vocab_size, _embed_size) in sparse_feature_info.items():
        column_values = df[name].values
        X_sparse_input[name] = torch.tensor(column_values, device=device)
    return X_sparse_input

# Generate sparse input.
# Each categorical column maps to (vocab_size, embed_size), as handed to
# DCNV2_Sequential below. The id ranges in the comments are carried over from
# the original notes -- TODO confirm them against the loaded data.
sparse_feature_info = {
    # name: (vocab_size, embed_size)
    "UserID": (6500, 64),           # user ids 1..6040, 64-dim embedding
    "MovieID": (4000, 64),          # movie ids 1..3952, 64-dim embedding
    "Occupation": (21, 8),          # occupation codes 0..20, 8-dim embedding
    "AgeEncoded": (8, 4),           # age codes 0..6 (see users_preprocessing), 4-dim embedding
    "Zip-codeEncoded": (3500, 64),  # factorized zip prefixes, 64-dim embedding
    "Year": (81, 8),                # factorized release years, 8-dim embedding
}

# Generate dense input: every column that is neither a sparse feature nor the label.
# NOTE(review): the order of this list comes from set iteration, which is not
# guaranteed to be stable across Python processes (string hashing is
# randomised). Whatever code builds the dense matrix at evaluation time must
# use exactly this column order -- confirm, and consider fixing the order in
# both places together.
dense_columns = list(set(train_df.columns) - set(sparse_feature_info.keys()) - {target_column})
num_dense_features = len(dense_columns)

# Training tensors, dataset and loader (shuffled every epoch).
train_X_dense_input = torch.tensor(train_df[dense_columns].values, device=device)
train_y = torch.tensor(train_df[target_column].values, dtype=torch.float32, device=device)
train_dataset = CustomDataset(create_x_sparse_input(train_df, sparse_feature_info), train_X_dense_input, train_y)
train_loader = DataLoader(train_dataset, batch_size=config.train.batch_size, shuffle=True)

# Validation tensors, dataset and loader. Validation only measures the model,
# so the loader is not shuffled; this keeps validation batches identical from
# epoch to epoch.
val_X_dense_input = torch.tensor(val_df[dense_columns].values, device=device)
val_y = torch.tensor(val_df[target_column].values, dtype=torch.float32, device=device)
val_dataset = CustomDataset(create_x_sparse_input(val_df, sparse_feature_info), val_X_dense_input, val_y)
val_loader = DataLoader(val_dataset, batch_size=config.train.batch_size, shuffle=False)


# Model hyper-parameters come from the `network` section of the config.
model = DCNV2_Sequential(sparse_feature_info=sparse_feature_info, num_dense_features=num_dense_features,
                         cross_layers=config.network.num_cross_layers, deep_hidden_dims=config.network.hidden_dims,
                         dropout_rate=config.network.dropout, device = device)

# NOTE(review): the meaning of the second positional argument (None) is defined
# by Trainer, which is not visible here -- confirm.
trainer = Trainer(model, None, config, train_loader=train_loader, lr=float(config.train.lr), val_loader=val_loader)

trainer.fit()


Running on cpu
<config.Config object at 0x2d97064d0>
Trainer init device: cpu


  8%|▊         | 141/1742 [00:11<02:46,  9.59it/s]

In [None]:
def evaluate_DCNV2Model(
    model: torch.nn.Module,
    user_splits: dict,
    global_items: set,
    device: torch.device,
    *,
    candidate_size: int = 100,
    k: int = 10,
    negative_sampler=uniform_negative_sampler,
    users: pd.DataFrame,
    movies: pd.DataFrame,
    sparse_feature_info: dict,
    dense_columns: list = None,
    verbose: bool = False,
) -> dict:
    """Evaluate a DCN V2 ranking model on each user's first held-out test item.

    For every user with a non-empty test sequence, the first test item is
    ranked against sampled negatives and the rank of that item is turned into
    Hit@k, NDCG@k, MRR and MAP.

    Args:
        model (torch.nn.Module): The trained recommender model, called as
            ``model(sparse_inputs, dense_inputs)``.
        user_splits (dict): Dictionary mapping users to (train_seq, val_seq,
                            test_seq).
        global_items (set): Full set of all item IDs.
        device (torch.device): Device for inference.
        candidate_size (int, optional): Total candidates = 1 pos +
                                         (candidate_size-1) negs. Defaults to 100.
        k (int, optional): Cutoff for Hit@k and NDCG@k. Defaults to 10.
        negative_sampler (function, optional): Function to sample negative IDs;
                                               called as ``(prefix, pool, n)``.
                                               Defaults to uniform_negative_sampler.
        users (pd.DataFrame): User features with columns 'UserID',
                              'AgeEncoded', 'GenderEncoded', 'Zip-codeEncoded'
                              and 'Occupation'.
        movies (pd.DataFrame): Movie features with a 'MovieID' column.
        sparse_feature_info (dict): Sparse feature name -> (vocab_size,
                                    embed_size); only the names are used here.
        dense_columns (list, optional): Dense feature columns in the exact
                                        order the model was trained with. When
                                        omitted, the order is derived from the
                                        candidate frame's columns as before.
        verbose (bool, optional): Print the inputs and scores of the first
                                  evaluated user. Defaults to False.

    Returns:
        dict: Dictionary with averaged metrics: Hit@k, NDCG@k, MRR, MAP.

    Raises:
        KeyError: If a user or a candidate item has no row in ``users`` /
                  ``movies``.
    """
    model.eval()
    hits, ndcgs, mrrs, aps = [], [], [], []

    target_column = 'Rating'
    user_feature_columns = ['AgeEncoded', 'GenderEncoded', 'Zip-codeEncoded', 'Occupation']
    movie_columns = list(movies.columns)

    # Index both feature tables once so the per-user work is a direct lookup
    # instead of a boolean-mask scan of the whole table. Keeping the first row
    # per id mirrors the previous "take the first match" behaviour.
    users_by_id = users.drop_duplicates('UserID').set_index('UserID')
    movies_by_id = movies.drop_duplicates('MovieID').set_index('MovieID')

    if dense_columns is None:
        # Same derivation as before: every candidate-frame column that is
        # neither a sparse feature nor the label.
        # NOTE(review): this order comes from set iteration and must match the
        # order used when the model was trained -- pass `dense_columns`
        # explicitly to guarantee it.
        frame_columns = movie_columns + ['UserID'] + user_feature_columns
        dense_columns = list(set(frame_columns) - set(sparse_feature_info.keys()) - {target_column})

    debug_pending = verbose
    for user, (train_seq, val_seq, test_seq) in user_splits.items():
        if not test_seq:
            continue

        # 1) Build the "prefix" and the held-out positive item
        prefix = train_seq + val_seq
        pos_item = test_seq[0]

        # 2) Sample negatives
        negs = negative_sampler(prefix, global_items - {pos_item},
                                candidate_size - 1)

        # 3) Build candidate list (the positive is always at index 0)
        candidates = [pos_item] + negs

        # 4) One feature row per candidate, in candidate order. A label-based
        #    lookup keeps duplicates and fails loudly on unknown ids, so row i
        #    always belongs to candidates[i]; the rank computation below
        #    relies on that.
        cand_df = movies_by_id.loc[candidates].reset_index()[movie_columns]
        cand_df['UserID'] = user
        for column in user_feature_columns:
            cand_df[column] = users_by_id.at[user, column]

        X_sparse_input = {
            name: torch.tensor(cand_df[name].values, dtype=torch.int64, device=device)
            for name in sparse_feature_info
        }
        X_dense_input = torch.tensor(cand_df[dense_columns].values, dtype=torch.int64, device=device)

        if debug_pending:
            for name, tensor in X_sparse_input.items():
                print(f"Feature name: {name}")
                print(f"Tensor shape: {tensor.shape}")
                print(f"Device: {tensor.device}")
                print(tensor[:5])
            print(X_dense_input.shape)

        # 5) Score all candidates in one forward pass. Flattening makes the
        #    argsort below rank across candidates even if the model returns a
        #    column vector.
        with torch.no_grad():
            scores = model(X_sparse_input, X_dense_input).cpu().numpy().reshape(-1)

        if debug_pending:
            print(scores)
            debug_pending = False

        # 6) Compute the rank of the positive item (index 0 before sorting)
        ranking = np.argsort(-scores)
        rank = np.where(ranking == 0)[0][0] + 1  # 1-based

        # 7) Accumulate metrics
        hits.append(hit_at_k(rank, k))
        ndcgs.append(ndcg_at_k(rank, k))
        mrrs.append(mrr(rank))
        aps.append(average_precision(rank))

    # 8) Return average over all users
    return {
        f"Hit@{k}": np.mean(hits),
        f"NDCG@{k}": np.mean(ndcgs),
        "MRR": np.mean(mrrs),
        "MAP": np.mean(aps)
    }


In [None]:
# Evaluate Model
# NOTE: this import rebinds `evaluate_DCNV2Model`; if the previous notebook cell
# was run, its definition of the same name is replaced by the module's version.
from evaluation import evaluate_DCNV2Model

# Full item universe from which negatives are sampled.
all_movies = set(movies["MovieID"].unique())

eval_metrics = evaluate_DCNV2Model(
    model=model,
    user_splits=user_splits,
    global_items=all_movies,
    device=device,
    users=users,
    movies=movies,
    sparse_feature_info=sparse_feature_info,
)
# Last expression of the cell: display the metric dictionary.
eval_metrics


{'UserID': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0]), 'MovieID': tensor([  14,   64,   67,  142,  144,  151,  163,  199,  239,  267,  321,  335,
         392,  398,  402,  478,  545,  587,  609,  627,  654,  697,  703,  710,
         743,  776,  820,  838,  879,  891,  916, 1000, 1014, 1021, 1061, 1193,
        1206, 1273, 1307, 1315, 1317, 1350, 1376, 1385, 1474, 1492, 1563, 1599,
        1641, 1645, 1729, 1755, 1870, 1901, 1903, 1946, 2034, 2063, 2083, 2129,
        2135, 2145, 2157, 2185, 2225, 2281, 2284, 2347, 2363, 2366, 2381, 2420,
        2428, 2528, 2744, 2876, 3006, 3008, 3053, 3057, 3206, 3225, 3327, 3399,
        3408, 3503, 3525, 3535, 3545, 3582, 3623, 3647, 3674, 3686, 3770, 37

{'Hit@10': 0.17464788732394365,
 'NDCG@10': 0.09162334640655408,
 'MRR': 0.0895953758476729,
 'MAP': 0.0895953758476729}