In [231]:
%load_ext autoreload
%autoreload 2

import os, sys

# Absolute path of the parent of the current working directory; the notebook
# treats it as the project root.
root = os.path.abspath(os.path.join(os.getcwd(), '..'))
print(root)

# Put the project root first on the module search path exactly once. Copies
# left by earlier executions of this cell are removed before re-inserting, so
# re-running the cell does not keep growing sys.path with duplicates.
while root in sys.path:
    sys.path.remove(root)
sys.path.insert(0, root)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/Users/harshadakumbhare/Documents/GitHub/akshaydaf/recommender-system


In [232]:
# Load the raw MovieLens tables and run the project's cleaning step.
import pandas as pd
from data_utils.preprocess import clean_and_filter, load_movielens

data_dir = "../data"

# 1) Load the three raw tables.
ratings, users, movies = load_movielens(data_dir)

# 2) Clean and filter. rating_threshold=0 keeps all ratings, which the
#    DCN v2 model needs for training.
ratings, users, movies = clean_and_filter(
    ratings,
    users,
    movies,
    rating_threshold=0,
)

In [233]:
# User preprocessing: derive integer-encoded demographic columns.

# Age codes found in the data, mapped to consecutive indices 0..6.
age_codes = [1, 18, 25, 35, 45, 50, 56]
age_mapping = {code: position for position, code in enumerate(age_codes)}
users['AgeEncoded'] = users['Age'].map(age_mapping)

gender_mapping = dict(F=0, M=1)
users['GenderEncoded'] = users['Gender'].map(gender_mapping)

# Keep only the first five characters of the zip code and store them as int.
zip_prefix = users['Zip-code'].str[:5]
users['Zip-codeEncoded'] = zip_prefix.astype(int)

# Encoded user features available from here on:
# UserID, AgeEncoded, GenderEncoded, Zip-codeEncoded, Occupation.
users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code,AgeEncoded,GenderEncoded,Zip-codeEncoded
0,0,F,1,10,48067,0,0,48067
1,1,M,56,16,70072,6,1,70072
2,2,M,25,15,55117,2,1,55117
3,3,M,45,7,2460,4,1,2460
4,4,M,25,20,55455,2,1,55455


In [234]:
# Movie preprocessing: one indicator (0/1) column per genre.

# "A|B|C" -> ["A", "B", "C"]
movies['GenresList'] = movies['Genres'].str.split('|')

# One row per (movie, genre) pair, keeping the movie's original index label.
genre_per_row = movies['GenresList'].explode()

# One-hot encode each pair, then collapse back to one row per movie by taking
# the maximum flag within each original index label.
genre_flags = pd.get_dummies(genre_per_row, dtype=int).groupby(level=0).max()

movies = pd.concat([movies, genre_flags], axis=1)

movies.head()

Unnamed: 0,MovieID,Title,Genres,GenresList,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,Toy Story (1995),Animation|Children's|Comedy,"[Animation, Children's, Comedy]",0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,Jumanji (1995),Adventure|Children's|Fantasy,"[Adventure, Children's, Fantasy]",0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
2,2,Grumpier Old Men (1995),Comedy|Romance,"[Comedy, Romance]",0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,3,Waiting to Exhale (1995),Comedy|Drama,"[Comedy, Drama]",0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,4,Father of the Bride Part II (1995),Comedy,[Comedy],0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Rating preprocessing: turn the star rating into a binary label.

# Drop rows with rating 3. The explicit .copy() makes `ratings` an independent
# frame, so adding a column below does not write into a slice of the previous
# frame (which can raise pandas' SettingWithCopyWarning).
ratings = ratings[ratings['Rating'] != 3].copy()

# Label: ratings below 3 become 0, ratings above 3 become 1.
ratings['RatingEncoded'] = (
    ratings['Rating']
    .mask(ratings['Rating'] < 3, 0)
    .mask(ratings['Rating'] > 3, 1)
)

print(ratings.head())
print(ratings['RatingEncoded'].value_counts())



   uid  movie_id  rating         ts
0    1      1193       5  978300760
1    1       661       3  978302109
2    1       914       3  978301968
3    1      3408       4  978300275
4    1      2355       5  978824291
RatingEncoded
1    575281
0    163731
Name: count, dtype: int64


In [242]:
# Join ratings with user and movie features, then persist the result.

# Attach the user columns to every rating (default inner join on UserID).
user_ratings_df = ratings.merge(users, on='UserID')

# Right join on MovieID keeps every movie, including movies without any
# rating; for those rows the rating/user columns are missing (NaN).
# The full movie list is needed for evaluation.
df = user_ratings_df.merge(movies, on='MovieID', how='right')

print(df.head())
df.to_csv('../data/dataset_dcn_v2.csv')

   UserID  MovieID  Rating    Timestamp            Datetime  RatingEncoded  \
0     0.0        0     5.0  978824268.0 2001-01-06 23:37:48            1.0   
1     5.0        0     4.0  978237008.0 2000-12-31 04:30:08            1.0   
2     7.0        0     4.0  978233496.0 2000-12-31 03:31:36            1.0   
3     8.0        0     5.0  978225952.0 2000-12-31 01:25:52            1.0   
4     9.0        0     5.0  978226474.0 2000-12-31 01:34:34            1.0   

  Gender   Age  Occupation Zip-code  ...  Fantasy  Film-Noir  Horror Musical  \
0      F   1.0        10.0    48067  ...        0          0       0       0   
1      F  50.0         9.0    55117  ...        0          0       0       0   
2      M  25.0        12.0    11413  ...        0          0       0       0   
3      M  25.0        17.0    61614  ...        0          0       0       0   
4      F  35.0         1.0    95370  ...        0          0       0       0   

  Mystery Romance  Sci-Fi  Thriller  War  Western 

In [249]:
# Read the merged dataset back from disk.
data = pd.read_csv('../data/dataset_dcn_v2.csv')

# Feature groups.
user_columns = ['UserID', 'AgeEncoded', 'GenderEncoded', 'Zip-codeEncoded', 'Occupation']

genre_columns = [
    'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
movies_columns = ['MovieID'] + genre_columns

ratings_columns = ['RatingEncoded']

# Training data: user + movie features plus the label. Rows with any missing
# value are dropped before everything is cast to int64.
training_columns = user_columns + movies_columns + ratings_columns
training_df = data[training_columns].dropna().astype('int64')

# Evaluation data: the same features without the label; rows with missing
# values are kept.
evaluation_columns = user_columns + movies_columns
evaluation_df = data[evaluation_columns]


In [250]:
# Down-sample the training set.
N_TRAIN_SAMPLES = 100_000  # number of rows drawn for training
SAMPLE_SEED = 42           # fixed seed so the same rows are drawn on every run

# random_state makes the draw reproducible after a kernel restart;
# min(...) keeps the cell from failing when fewer rows than requested exist.
training_df = training_df.sample(
    n=min(N_TRAIN_SAMPLES, len(training_df)),
    random_state=SAMPLE_SEED,
)


In [253]:

from models.sequential_dcn_v2 import DCNV2_Sequential
from models.vanilla_nn import TwoLayerNet
from trainer import Trainer
from data_utils.datasets import CustomDataset
import torch
from torch.utils.data import DataLoader, TensorDataset
import argparse
import yaml
from config import Config
import pandas as pd

# Load the experiment configuration from YAML.
config_file = "../configs/config_dcn_v2_sequential.yaml"
with open(config_file, 'r') as file:
    config_dict = yaml.safe_load(file)
config = Config(config_dict=config_dict)
print(config)

target_column = 'RatingEncoded'
df = training_df

# Sparse features, each described as name -> (vocab_size, embed_size).
sparse_feature_info = {
    "UserID": (6500, 64),               # 64-dim embedding
    "MovieID": (4000, 64),              # 64-dim embedding
    "Occupation": (21, 8),              # 8-dim embedding
    "AgeEncoded": (8, 4),               # 4-dim embedding
    "Zip-codeEncoded": (100000, 128),   # 128-dim embedding
}
sparse_columns = sparse_feature_info.keys()

# One tensor of raw ids per sparse feature.
X_sparse_input = {
    feature: torch.tensor(df[feature].to_numpy())
    for feature in sparse_feature_info
}

# Dense features: every remaining column except the target.
# NOTE(review): a set difference does not guarantee column order; confirm that
# evaluation builds its dense columns in the same order as training.
dense_columns = list(set(df.columns) - set(sparse_columns) - {target_column})
num_dense_features = len(dense_columns)
X_dense_input = torch.tensor(df[dense_columns].to_numpy())
y = torch.tensor(df[target_column].to_numpy(), dtype=torch.float32)

dataset = CustomDataset(X_sparse_input, X_dense_input, y)
loader = DataLoader(
    dataset,
    batch_size=config.train.batch_size,
    shuffle=True,
)

model = DCNV2_Sequential(
    sparse_feature_info=sparse_feature_info,
    num_dense_features=num_dense_features,
    cross_layers=config.network.num_cross_layers,
    deep_hidden_dims=config.network.hidden_dims,
)

# The learning rate is passed through float() before being handed to Trainer.
# NOTE(review): meaning of the second positional argument (None) is not
# visible here — check the Trainer signature.
trainer = Trainer(model, None, config, loader, float(config.train.lr))
trainer.fit()


<config.Config object at 0x2b007f850>


100%|██████████| 196/196 [01:01<00:00,  3.20it/s]


[Epoch 1] Train Loss: 8.2659


100%|██████████| 196/196 [00:08<00:00, 24.16it/s]


{'loss': 0.002346870636343956, 'auc': 0.5491875410079956, 'accuracy': 0.7734599709510803, 'precision': 0.7772437334060669, 'recall': 0.9923419952392578}


100%|██████████| 196/196 [01:05<00:00,  2.98it/s]


[Epoch 2] Train Loss: 0.7639


100%|██████████| 196/196 [00:15<00:00, 12.85it/s]


{'loss': 0.0012273213922977448, 'auc': 0.5555386543273926, 'accuracy': 0.7434200048446655, 'precision': 0.7797008156776428, 'recall': 0.9327540397644043}


100%|██████████| 196/196 [00:39<00:00,  4.93it/s]


[Epoch 3] Train Loss: 0.5914


100%|██████████| 196/196 [00:06<00:00, 28.72it/s]


{'loss': 0.0010763820526003838, 'auc': 0.597252607345581, 'accuracy': 0.7773000001907349, 'precision': 0.7779251933097839, 'recall': 0.9977051615715027}


100%|██████████| 196/196 [00:41<00:00,  4.76it/s]


[Epoch 4] Train Loss: 0.5522


100%|██████████| 196/196 [00:06<00:00, 28.97it/s]


{'loss': 0.001082548856139183, 'auc': 0.6060603260993958, 'accuracy': 0.7777000069618225, 'precision': 0.777518093585968, 'recall': 0.9993682503700256}


100%|██████████| 196/196 [00:39<00:00,  4.96it/s]


[Epoch 5] Train Loss: 0.5337


100%|██████████| 196/196 [00:06<00:00, 29.52it/s]

{'loss': 0.0010224693968892097, 'auc': 0.6461194753646851, 'accuracy': 0.7777400016784668, 'precision': 0.7819039225578308, 'recall': 0.9894412755966187}





In [None]:
# Print the smallest and largest id of every sparse feature in the training
# data (for comparison with the vocab_size entries in sparse_feature_info).
for feature in sparse_feature_info:
    feature_values = training_df[feature]
    print("Name:", feature, feature_values.min(), feature_values.max())

# Ranges recorded from earlier runs.
# Old column names:
#   uid 1 6040
#   movie_id 1 3952
#   occupation 0 20
#   age_encoded 1 7
#   zip_encoded 231 99945
# Current column names:
#   UserID 0 6039
#   MovieID 0 3882
#   Occupation 0 20
#   AgeEncoded 0 6
#   Zip-codeEncoded 231 99945


Name: UserID 0 6039
Name: MovieID 0 3882
Name: Occupation 0 20
Name: AgeEncoded 0 6
Name: Zip-codeEncoded 231 99945


In [None]:
# Evaluate the trained model with ranking metrics.

from data_utils.preprocess import get_user_sequences, split_sequences
from evaluation import evaluate_DCNV2Model

# Use Apple's MPS backend when it is available, otherwise the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Running on", device)

# Keep ratings >= 4 so only positive interactions feed the evaluation data.
is_positive = ratings["Rating"] >= 4
ratings = ratings[is_positive].reset_index(drop=True)

# Build per-user sequences and split them (train 0.8, validation 0.1).
user_seqs = get_user_sequences(ratings)
user_splits = split_sequences(user_seqs, train_ratio=0.8, val_ratio=0.1)

# Global item set: every distinct MovieID.
all_movies = set(movies["MovieID"].unique())

evaluate_DCNV2Model(
    model=model,
    user_splits=user_splits,
    global_items=all_movies,
    device=device,
    data=evaluation_df,
    user_columns=user_columns,
    movies_columns=movies_columns,
    target_column=target_column,
    sparse_feature_info=sparse_feature_info,
)


Running on cpu
X_sparse_input {'UserID': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0]), 'MovieID': tensor([3346,   15,   33,   40,  149,  165,  173,  182,  210,  311,  381,  424,
         427,  441,  461,  475,  526,  540,  555,  615,  619,  642,  645,  670,
         842,  869,  871, 1037, 1043, 1088, 1097, 1099, 1131, 1149, 1170, 1245,
        1283, 1387, 1425, 1469, 1486, 1515, 1534, 1687, 1757, 1775, 1784, 1833,
        2029, 2098, 2194, 2225, 2263, 2289, 2293, 2296, 2301, 2323, 2327, 2416,
        2508, 2536, 2573, 2629, 2630, 2641, 2711, 2753, 2767, 2771, 2829, 2852,
        2867, 2927, 2976, 3075, 3079, 3108, 3198, 3235, 3300, 3387, 3438, 3459,
        3506, 3534, 3566, 3623, 3637, 3658, 36

{'Hit@10': 0.11565865782932891,
 'NDCG@10': 0.054842668114711314,
 'MRR': 0.05986575571497283,
 'MAP': 0.05986575571497283}