In [None]:
!pip install -U deepctr-torch
# !pip install reco-utils==9.9.0

In [None]:
!wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip ml-1m.zip

In [None]:
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [None]:
import torch

In [None]:
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names, VarLenSparseFeat
from deepctr_torch.models import *

In [None]:
def split(x):
    """Turn a '|'-separated string into a list of integer ids.

    Every token is looked up in the module-level ``key2index`` dict. A token
    that is not there yet is registered with the id ``len(key2index) + 1``,
    so ids start at 1 when the dict starts out empty.

    x - string of tokens joined by '|'

    return list of ints, one id per token, in the order the tokens appear
    """
    encoded = []
    for token in x.split('|'):
        if token not in key2index:
            key2index[token] = len(key2index) + 1
        encoded.append(key2index[token])
    return encoded

In [None]:
# Options shared by the three '::'-separated MovieLens-1M tables.
# The multi-character separator requires the python engine. The encoding is
# given explicitly because movies.dat contains accented titles stored as
# Latin-1 bytes, which are not valid UTF-8 and make a strict UTF-8 read fail.
read_opts = dict(sep='::', header=None, engine='python', encoding='latin-1')

COL_NAME = ['uid','mid','rating','timestamp']
ratings = pd.read_csv('ml-1m/ratings.dat', names=COL_NAME, **read_opts)

COL_NAME = ['mid','movie_name','movie_genre']
items = pd.read_csv('ml-1m/movies.dat', names=COL_NAME, **read_opts)

COL_NAME = ['uid','user_fea1','user_fea2','user_fea3','user_fea4']
users = pd.read_csv('ml-1m/users.dat', names=COL_NAME, **read_opts)

# Attach the movie and user attributes to every rating row (left joins keep
# all ratings, even if a movie or user id had no match).
ratings = ratings.join(items.set_index('mid'), on = 'mid', how = 'left')
ratings = ratings.join(users.set_index('uid'), on = 'uid', how = 'left')

In [None]:
# Single-valued categorical columns; each one is label-encoded and gets its
# own embedding. 'movie_genre' is multi-valued and is handled separately.
sparse_features = [
    'uid',
    'mid',
    'movie_name',
    'user_fea1',
    'user_fea2',
    'user_fea3',
    'user_fea4',
]

In [None]:
# Replace each categorical column by integer codes, fitting a fresh encoder
# per column (the encoder of the last column stays available as `lbe`).
for feat in sparse_features:
    lbe = LabelEncoder()
    lbe.fit(ratings[feat])
    ratings[feat] = lbe.transform(ratings[feat])

In [None]:
# Genre token -> integer id vocabulary, filled lazily by split().
key2index = dict()

In [None]:
# Encode the '|'-separated genre string of every rating row as a list of ids.
genres_list = [split(genre_str) for genre_str in ratings['movie_genre'].values]
genres_length = np.array([len(genre_ids) for genre_ids in genres_list])
max_len = max(genres_length)

# Pad at the end ('post') so every row has exactly max_len entries.
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post')

In [None]:
# One embedding (dimension 4) per single-valued categorical column; the
# vocabulary size is the number of distinct codes in that column.
fixlen_feature_columns = [
    SparseFeat(feat, ratings[feat].nunique(), embedding_dim=4)
    for feat in sparse_features
]

# Multi-valued genre feature: the vocabulary has one extra slot on top of
# the ids handed out in key2index (which start at 1); the embeddings of a
# row's genres are combined by their mean.
varlen_feature_columns = [
    VarLenSparseFeat(
        SparseFeat('movie_genre', vocabulary_size=len(key2index) + 1, embedding_dim=4),
        maxlen=max_len,
        combiner='mean',
    )
]

# The linear and the deep part of the models see the same set of features.
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [None]:
# Binary label: 1 for ratings strictly greater than 3, otherwise 0, kept in
# the dtype of the rating column.
# `ratings` is left untouched (no in-place pop), so this cell can be re-run
# without a KeyError; X is the feature frame without the label column.
target = ratings['rating']
y = (target.to_numpy() > 3).astype(target.dtype)
X = ratings.drop(columns='rating')

In [None]:
# Hold out 20% of the rows; features, labels and padded genres are split
# together so they stay aligned. The fixed random_state makes the split
# repeatable.
(X_train, X_test,
 y_train, y_test,
 g_train, g_test) = train_test_split(
    X, y, genres_list,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Model inputs are dicts keyed by feature name: one column per sparse
# feature plus the padded genre matrix under 'movie_genre'.
train_model_input = dict((name, X_train[name]) for name in sparse_features)
train_model_input.update(movie_genre=g_train)

test_model_input = dict((name, X_test[name]) for name in sparse_features)
test_model_input.update(movie_genre=g_test)

In [None]:
# Use the GPU when PyTorch reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Model classes to benchmark, trained and evaluated in this order.
models = [
    FiBiNET,
    AFM,
    DeepFM,
    xDeepFM,
    WDL,
    PNN,
    CCPM,
    NFM,
    DCN,
    ONN,
]

In [None]:
# Evaluation results, keyed by model class name.
res = dict()

In [None]:
# Model classes whose constructor takes only `dnn_feature_columns`.
DNN_ONLY_MODELS = ('PNN', 'DIN', 'DIEN')
# Same value as the random_state used for the train/test split.
SEED = 42

for model_name in models:
    # Re-seed before building each model so that weight initialisation and
    # batch shuffling start from the same RNG state for every model and do
    # not depend on the model's position in `models`.
    # NOTE: some GPU kernels may still be non-deterministic.
    torch.manual_seed(SEED)

    init_kwargs = dict(dnn_feature_columns=dnn_feature_columns,
                       task='binary',
                       l2_reg_embedding=1e-5,
                       device=device)
    if model_name.__name__ not in DNN_ONLY_MODELS:
        init_kwargs['linear_feature_columns'] = linear_feature_columns
    model = model_name(**init_kwargs)

    model.compile("adam", "binary_crossentropy",
                  metrics=["logloss", "auc"])
    # Train silently on the full training split (no validation hold-out).
    model.fit(train_model_input, y_train,
              batch_size=256, epochs=20, verbose=0, validation_split=0.0)

    # Store the test-set metrics under the model's class name.
    res[model_name.__name__] = model.evaluate(test_model_input, y_test)

In [None]:
# Display the test-set metrics collected for every model.
res

{'AFM': {'auc': 0.7946905449616923, 'logloss': 0.5370212439945969},
 'CCPM': {'auc': 0.8019150553637175, 'logloss': 0.5306061214845735},
 'DCN': {'auc': 0.8031605356273024, 'logloss': 0.5272173049426052},
 'DeepFM': {'auc': 0.804853262910007, 'logloss': 0.5253414205088675},
 'FiBiNET': {'auc': 0.8097533435504148, 'logloss': 0.5264089652203264},
 'NFM': {'auc': 0.7947001640896278, 'logloss': 0.5371351416460893},
 'ONN': {'auc': 0.775800398728564, 'logloss': 0.5772175185230914},
 'PNN': {'auc': 0.8118524787710026, 'logloss': 0.5217028688360951},
 'WDL': {'auc': 0.7999925197064117, 'logloss': 0.5302618263096803},
 'xDeepFM': {'auc': 0.8077183401631445, 'logloss': 0.5235092325863476}}

In [None]:
def get_metrics(groups):
    """Arrange per-model metric dicts into a metric-by-model table.

    groups - dictionary where keys - names of models, values - dicts of pairs <metric_name>, <metric_value>

    return pd.DataFrame with one column per model (in the order of `groups`)
    and one row per metric of the first model (in that dict's order); the
    index is named 'metric'. A metric missing from a later model shows up as
    NaN. An empty `groups` gives an empty frame instead of an IndexError.
    """
    if not groups:
        metrics = pd.DataFrame()
    else:
        # The first model's metric names define the rows of the table.
        first_scores = next(iter(groups.values()))
        metrics = pd.DataFrame(groups, index=list(first_scores))
    metrics.index.name = 'metric'
    return metrics

def get_ate(groups, control_name):
    """Average Treatment Effect of every model relative to a control model.

    groups - dictionary where keys - names of models, values - dicts of pairs <metric_name>, <metric_value>
    control_name - name of the baseline model; must be one of the keys of `groups`

    return pd.DataFrame (rows - metrics, columns - all models except the control);
    each cell is (model metric - control metric) * 100
    """
    metrics = get_metrics(groups)
    baseline = metrics[control_name]
    others = metrics.drop(columns=control_name)
    # Align the baseline on the metric index and subtract it from every column.
    deltas = others.sub(baseline, axis='index')
    return deltas * 100

In [None]:
# Expose the log-loss score under the display name 'log-loss'.
# The loop variable is not called `model`, so the trained model object bound
# to that name is not replaced by a string. The membership check keeps the
# cell re-runnable once the 'logloss' key has been removed.
for model_key in res:
    if 'logloss' in res[model_key]:
        res[model_key]['log-loss'] = res[model_key]['logloss']

In [None]:
# Remove the original 'logloss' key from every result dict.
# pop(..., None) makes the cell safe to re-run, and the loop variable is not
# called `model`, so the trained model object bound to that name is not
# replaced by a string.
for model_key in res:
    res[model_key].pop('logloss', None)

In [None]:
# a: metric-by-model table; b: differences to the FiBiNET baseline (x100).
a = get_metrics(groups=res)
b = get_ate(groups=res, control_name='FiBiNET')

In [None]:
# Show floats with 3 decimal places when DataFrames are displayed.
pd.options.display.precision = 3