In [1]:
# Basic setup
import pandas as pd
import numpy as np
import os
import gc
from tqdm import tqdm

# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Visualization and debugging
import seaborn as sns
import matplotlib.pyplot as plt

# Time processing
from datetime import datetime

# File locations
DATA_PATH = "/kaggle/input"
TRAIN_PATH = f"{DATA_PATH}/train-data/train_data.parquet"
TEST_PATH = f"{DATA_PATH}/test-data/test_data.parquet"
EVENT_PATH = f"{DATA_PATH}/other-data/add_event.parquet"
TRANS_PATH = f"{DATA_PATH}/other-data/add_trans.parquet"
OFFER_PATH = f"{DATA_PATH}/other-data/offer_metadata.parquet"

In [2]:
# Load files
train_df = pd.read_parquet(TRAIN_PATH)
test_df = pd.read_parquet(TEST_PATH)
events_df = pd.read_parquet(EVENT_PATH)
txn_df = pd.read_parquet(TRANS_PATH)
offers_df = pd.read_parquet(OFFER_PATH)

# Quick peek
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print("Events shape:", events_df.shape)
print("Transactions shape:", txn_df.shape)
print("Offers shape:", offers_df.shape)

train_df.head()

Train shape: (770164, 372)
Test shape: (369301, 371)
Events shape: (21457473, 5)
Transactions shape: (6339465, 9)
Offers shape: (4164, 12)


Unnamed: 0,id1,id2,id3,id4,id5,y,f1,f2,f3,f4,...,f357,f358,f359,f360,f361,f362,f363,f364,f365,f366
0,1366776_189706075_16-23_2023-11-02 22:22:00.042,1366776,189706075,2023-11-02 22:22:00.042,2023-11-02,0,1.0,,,,...,,-9999.0,0.0,,28.0,0.0,0.0,337.0,0.0,0.0
1,1366776_89227_16-23_2023-11-01 23:51:24.999,1366776,89227,2023-11-01 23:51:24.999,2023-11-01,0,1.0,,,,...,,,0.0,,87.0,0.0,0.0,1010.0,2.0,0.0019801980198019
2,1366776_35046_16-23_2023-11-01 00:30:59.797,1366776,35046,2023-11-01 00:30:59.797,2023-11-01,0,1.0,,,,...,,,0.0,,23.0,0.0,0.0,1010.0,2.0,0.0019801980198019
3,1366776_6275451_16-23_2023-11-02 22:21:32.261,1366776,6275451,2023-11-02 22:21:32.261,2023-11-02,0,1.0,,,,...,,-9999.0,0.0,,277.0,1.0,0.003610108303249,337.0,0.0,0.0
4,1366776_78053_16-23_2023-11-02 22:21:34.799,1366776,78053,2023-11-02 22:21:34.799,2023-11-02,0,1.0,,,,...,,-9999.0,0.0,,359.0,0.0,0.0,337.0,0.0,0.0


In [3]:
train_df["id2"].dtype

dtype('O')

In [4]:
train_df["id3"].dtype

dtype('O')

In [5]:
train_df["y"].dtype

dtype('O')

In [6]:
unique_pairs_df = train_df[['id2', 'id3']].drop_duplicates()
print(f"Number of unique (id2, id3) pairs: {len(unique_pairs_df)}")

Number of unique (id2, id3) pairs: 747696


### **Dropping columns with over 80% missing data**

In [7]:
# Calculate percentage of missing values in each column
missing_percentage = train_df.isnull().mean() * 100

# Find columns with more than 80% missing values
cols_over_80_null = missing_percentage[missing_percentage > 80].index.tolist()

# Print count and column names
print(f"Number of columns with >80% missing values: {len(cols_over_80_null)}")
print("Columns:")
print(cols_over_80_null)

Number of columns with >80% missing values: 40
Columns:
['f3', 'f4', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f33', 'f34', 'f36', 'f37', 'f64', 'f66', 'f70', 'f79', 'f80', 'f81', 'f84', 'f88', 'f92', 'f112', 'f114', 'f117', 'f118', 'f120', 'f121', 'f122', 'f135', 'f136', 'f154', 'f176', 'f189', 'f205', 'f220', 'f221', 'f360']


In [8]:
train_df.drop(columns = cols_over_80_null, inplace = True)

In [9]:
train_df.shape

(770164, 332)

In [10]:
train_df.duplicated().sum()

0

### **Dropping columns with only 1 unique value**

In [11]:
def drop_single_unique_value_columns(df):
    drop_cols = []

    for col in df.columns:

        # Count non-null unique values
        nunique = df[col].dropna().nunique()

        # Drop if there's only one unique value (like only 0 or only "Y")
        if nunique == 1:
            drop_cols.append(col)

    return drop_cols

In [12]:
drop_cols = drop_single_unique_value_columns(train_df)

In [13]:
print(drop_cols)

['f23', 'f24', 'f25', 'f62', 'f102', 'f128', 'f129', 'f144', 'f145', 'f226', 'f229', 'f236', 'f238', 'f240', 'f243', 'f245', 'f246', 'f248', 'f249', 'f258', 'f259', 'f260', 'f262', 'f266', 'f267', 'f268', 'f270', 'f271', 'f277', 'f279', 'f281', 'f286', 'f287', 'f290', 'f291', 'f294', 'f295', 'f298', 'f300', 'f301', 'f303', 'f304', 'f307', 'f308', 'f309', 'f334', 'f335']


In [14]:
train_df.drop(columns = drop_cols, inplace = True)

In [15]:
train_df.shape

(770164, 285)

In [16]:
# train_df["f104"].astype(str).str.strip().eq("0.0").any()

In [17]:
# train_df["f104"].dtype


In [18]:
# train_df["f104"].apply(type).value_counts()

In [19]:
# count_zeros = train_df["f103"].astype(str).str.strip().eq("0.0").sum()
# print(count_zeros)

In [20]:
# train_df["f354"].value_counts()

In [21]:
# redundant (from feature engineering)
train_df.drop(columns = ["f333"], inplace = True)

In [22]:
train_df.shape

(770164, 284)

In [23]:
# ctr ratio and impressions are redundant as ctr and clicks are retained
redundant_cols= ["f320",
"f321",
"f322",
"f323",
"f324",
"f325",
"f326",
"f327",
"f328",
"f329",
"f330",
"f331"
]

In [24]:
train_df.drop(columns = redundant_cols, inplace = True)

In [25]:
train_df.shape

(770164, 272)

In [26]:
# redundant (from feature engineering)
train_df.drop(columns = ["f225"], inplace = True)
train_df.shape

(770164, 271)

In [27]:
# Remove total impressions and total ctr as they are redundant (can be calculated by summation)

train_df.drop(columns = ["f103", "f113"], inplace = True)
train_df.shape

(770164, 269)

In [28]:
# Remove ctr ratios - very sparse and many null values

train_df.drop(columns = ["f115", "f116", "f119"], inplace = True)
train_df.shape

(770164, 266)

### **Some optional preprocessing**

In [29]:
# # Count of columns with dtype 'object'
# num_object_cols = (train_df.dtypes == 'O').sum()
# print(f"Number of object (string) columns: {num_object_cols}")

In [30]:
# numeric_like_object_cols = []

# for col in train_df.columns:
#     if train_df[col].dtype == 'O':  # Object dtype
#         # Try converting non-null values to numeric
#         converted = pd.to_numeric(train_df[col].dropna(), errors='coerce')
#         # If all non-null values are successfully converted (i.e., no NaNs introduced), keep it
#         if converted.notna().all():
#             numeric_like_object_cols.append(col)

# print(f"Numeric-like object columns: {len(numeric_like_object_cols)}")


In [31]:
# # Detect columns with 2 unique values (excluding NaNs)
# cols_with_2_uniques = [
#     col for col in numeric_like_object_cols 
#     if train_df[col].dropna().nunique() == 2
# ]

# print(f"Columns with 2 unique values: {cols_with_2_uniques}")

In [32]:
# # Detect columns with 3 unique values (excluding NaNs)
# cols_with_3_uniques = [
#     col for col in numeric_like_object_cols 
#     if train_df[col].dropna().nunique() == 3
# ]

# print(f"Columns with 3 unique values: {cols_with_3_uniques}")

In [33]:
# # Detect columns with 5 unique values (excluding NaNs)
# cols_with_5_uniques = [
#     col for col in numeric_like_object_cols 
#     if train_df[col].dropna().nunique() == 5
# ]

# print(f"Columns with 5 unique values: {cols_with_5_uniques}")

In [34]:
# train_df["f48"].value_counts()

### **Imputation of the Null Values**

In [35]:
import pandas as pd

# Load the CSV file
df = pd.read_csv("/kaggle/input/new-data-dict/new_data_dictionary.csv")

# Clean the "What to fill ?" column
what_to_fill_col = df["What to fill ?"].astype(str).str.strip()

# Extract column lists based on fill strategy
fill_with_zero     = df[what_to_fill_col == "0"]["masked_column"].tolist()
fill_with_mean     = df[what_to_fill_col == "mean"]["masked_column"].tolist()
fill_with_median   = df[what_to_fill_col == "median"]["masked_column"].tolist()
fill_with_mode     = df[what_to_fill_col == "mode"]["masked_column"].tolist()
fill_with_Unknown  = df[what_to_fill_col == "Unknown"]["masked_column"].tolist()
fill_with_N        = df[what_to_fill_col == "Assign to N"]["masked_column"].tolist()
flag_features      = df[what_to_fill_col == "0 with flag"]["masked_column"].tolist()

# Print counts
print(f"Number of columns to fill with 0         : {len(fill_with_zero)}")
print(f"Number of columns to fill with mean      : {len(fill_with_mean)}")
print(f"Number of columns to fill with median    : {len(fill_with_median)}")
print(f"Number of columns to fill with mode      : {len(fill_with_mode)}")
print(f"Number of columns to fill with 'Unknown' : {len(fill_with_Unknown)}")
print(f"Number of columns to assign to 'N'       : {len(fill_with_N)}")
print(f"Number of flag features                  : {len(flag_features)}")

Number of columns to fill with 0         : 213
Number of columns to fill with mean      : 3
Number of columns to fill with median    : 12
Number of columns to fill with mode      : 6
Number of columns to fill with 'Unknown' : 8
Number of columns to assign to 'N'       : 2
Number of flag features                  : 14


In [36]:
# Convert all your numerical columns (f1 to f366) to numeric, coerce errors to NaN if non-numeric values found

cols_to_convert = fill_with_zero + fill_with_mean + fill_with_median + fill_with_mode

for col in cols_to_convert:
    train_df[col] = pd.to_numeric(train_df[col], errors='coerce')

In [37]:
#Fill Null Values

# Fill 0
train_df[fill_with_zero] = train_df[fill_with_zero].fillna(0)

# Fill mean
for col in fill_with_mean:
    train_df[col].fillna(train_df[col].mean(), inplace=True)

# Fill median
for col in fill_with_median:
    train_df[col].fillna(train_df[col].median(), inplace=True)

# Fill mode
for col in fill_with_mode:
    train_df[col].fillna(train_df[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna(train_df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna(train_df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which

In [38]:
train_df.isnull().any()

id1     False
id2     False
id3     False
id4     False
id5     False
        ...  
f362    False
f363    False
f364    False
f365    False
f366    False
Length: 266, dtype: bool

### **Handling Categorical Nulls**

In [39]:
# Fill with 'Unknown'
for col in fill_with_Unknown:
    train_df[col] = train_df[col].fillna('Unknown')

# Fill with 'N'
for col in fill_with_N:
    train_df[col] = train_df[col].fillna('N')

In [40]:
train_df.shape

(770164, 266)

### **Creating Binary Flags**

In [41]:
# Create flag columns in train_df
for col in flag_features:
    train_df[f"{col}_known"] = train_df[col].notna().astype(int)

  train_df[f"{col}_known"] = train_df[col].notna().astype(int)
  train_df[f"{col}_known"] = train_df[col].notna().astype(int)
  train_df[f"{col}_known"] = train_df[col].notna().astype(int)
  train_df[f"{col}_known"] = train_df[col].notna().astype(int)
  train_df[f"{col}_known"] = train_df[col].notna().astype(int)
  train_df[f"{col}_known"] = train_df[col].notna().astype(int)
  train_df[f"{col}_known"] = train_df[col].notna().astype(int)
  train_df[f"{col}_known"] = train_df[col].notna().astype(int)
  train_df[f"{col}_known"] = train_df[col].notna().astype(int)
  train_df[f"{col}_known"] = train_df[col].notna().astype(int)
  train_df[f"{col}_known"] = train_df[col].notna().astype(int)
  train_df[f"{col}_known"] = train_df[col].notna().astype(int)
  train_df[f"{col}_known"] = train_df[col].notna().astype(int)
  train_df[f"{col}_known"] = train_df[col].notna().astype(int)


In [42]:
train_df.shape

(770164, 280)

In [43]:
for col in flag_features:
    train_df[col] = pd.to_numeric(train_df[col], errors='coerce')

In [44]:
# Fill 0
train_df[flag_features] = train_df[flag_features].fillna(0)

In [45]:
no_missing = not train_df.isnull().any().any()
print(no_missing)  # → True if all values are filled, False if any column has NaNs

True


In [46]:
train_df.shape

(770164, 280)

In [47]:
# # Saving the dataset (with all preprocessing done till now)

# train_df.to_csv("train_dataset.csv")

### **Handling skewness**

**Visualize Skewness**

In [48]:
# import seaborn as sns
# import matplotlib.pyplot as plt
# import scipy.stats as stats

# def visualize_skewness(df, column):
#     data = df[column].dropna().astype(float)

#     plt.figure(figsize=(16, 5))

#     # Q-Q Plot
#     plt.subplot(1, 3, 1)
#     stats.probplot(data, dist="norm", plot=plt)
#     plt.title(f"Q-Q Plot for {column}")

#     # Histogram with KDE
#     plt.subplot(1, 3, 2)
#     sns.histplot(data, kde=True, bins=30)
#     plt.title(f"Histogram + KDE for {column}")

#     # Boxplot
#     plt.subplot(1, 3, 3)
#     sns.boxplot(x=data)
#     plt.title(f"Boxplot for {column}")

#     plt.tight_layout()
#     plt.show()

In [49]:
# visualize_skewness(train_df, "f159")

In [50]:
# Converting a numerical feature into numeric type, skipped during above preprocessing because it has 0 null values
no_null_feature = ["f350"]

for col in no_null_feature:
    train_df[col] = pd.to_numeric(train_df[col], errors='coerce')

In [51]:
train_df["f350"].dtype

dtype('int64')

In [52]:
# this includes continuous numerical columns and one hot encoded columns both
numerical_features = fill_with_zero + fill_with_mean + fill_with_median + fill_with_mode + flag_features + no_null_feature
len (numerical_features)

249

In [53]:
print (numerical_features)

['f1', 'f2', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f28', 'f29', 'f30', 'f31', 'f38', 'f44', 'f45', 'f46', 'f47', 'f49', 'f51', 'f58', 'f59', 'f60', 'f61', 'f63', 'f65', 'f67', 'f68', 'f69', 'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'f78', 'f85', 'f86', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99', 'f100', 'f101', 'f123', 'f124', 'f125', 'f126', 'f127', 'f130', 'f131', 'f132', 'f133', 'f134', 'f137', 'f138', 'f139', 'f140', 'f141', 'f142', 'f143', 'f146', 'f147', 'f148', 'f149', 'f150', 'f151', 'f152', 'f153', 'f155', 'f156', 'f157', 'f158', 'f159', 'f160', 'f161', 'f162', 'f163', 'f164', 'f165', 'f166', 'f167', 'f168', 'f169', 'f170', 'f171', 'f172', 'f173', 'f174', 'f175', 'f177', 'f178', 'f179', 'f180', 'f181', 'f182', 'f183', 'f184', 'f185', 'f186', 'f187', 'f188', 'f190', 'f191', 'f192', 'f193', 'f194', 'f195', 'f196', 'f197', 'f198', 'f199', 'f200', 'f201', 'f202', 'f206', 'f207', 'f208', 'f209', 'f210', 'f211', 'f212', 'f217', 'f218', 'f219', 'f222', 'f227

In [54]:
# # Filter numeric cols with high skew & not binary
# log_candidates = [
#     col for col in skewed_numerical_features
#     if train_df[col].astype(float).nunique() > 5 and train_df[col].astype(float).skew() > 1
# ]

# print(log_candidates)

In [55]:
# len (log_candidates)

In [56]:
# # Check skewness numerically
# # pd.set_option('display.max_rows', None)
# train_df[log_candidates].skew().sort_values(ascending=False)

In [57]:
# from sklearn.preprocessing import PowerTransformer

# # Use only numerical float columns
# pt = PowerTransformer(method='yeo-johnson', standardize=False)

# # Apply only to training set first (fit + transform)
# train_df[log_candidates] = pt.fit_transform(train_df[log_candidates].astype(float))

In [58]:
# # pd.reset_option('display.max_rows', None)
# train_df[log_candidates].skew().sort_values(ascending=False)

In [59]:
# no_missing = not train_df.isnull().any().any()
# print(no_missing)  # → True if all values are filled, False if any column has NaNs

### **Outlier Detection & Imputation**

In [60]:
skewed_numerical_features = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [61]:
# # Filter numeric cols with high skew & not binary
outlier_candidates = [
    col for col in skewed_numerical_features
    if train_df[col].astype(float).nunique() > 5
]

print(outlier_candidates)

['f1', 'f2', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f22', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f35', 'f38', 'f39', 'f40', 'f41', 'f43', 'f44', 'f45', 'f46', 'f47', 'f49', 'f51', 'f58', 'f59', 'f60', 'f61', 'f63', 'f65', 'f67', 'f68', 'f69', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'f78', 'f82', 'f83', 'f85', 'f86', 'f87', 'f89', 'f90', 'f91', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99', 'f100', 'f101', 'f104', 'f105', 'f106', 'f107', 'f108', 'f109', 'f110', 'f111', 'f123', 'f124', 'f125', 'f126', 'f127', 'f130', 'f131', 'f132', 'f133', 'f134', 'f137', 'f138', 'f139', 'f140', 'f141', 'f142', 'f143', 'f146', 'f147', 'f148', 'f149', 'f150', 'f151', 'f152', 'f153', 'f155', 'f156', 'f157', 'f158', 'f159', 'f160', 'f161', 'f162', 'f163', 'f164', 'f165', 'f166', 'f167', 'f168', 'f169', 'f170', 'f171', 'f172', 'f173', 'f174', 'f175', 'f177', 'f178', 'f179', 'f180', 'f181', 'f182', 'f183', 'f184', 'f185', 'f186', 'f187', 'f188', 'f190', 'f191', 'f192', 'f193', 'f1

In [62]:
len (outlier_candidates)

198

In [63]:
skewed_yeojohnson = train_df[outlier_candidates].astype(float).skew()

In [64]:
from scipy.stats import zscore

## Define a threshold for skewness above which median should be used
high_skew_cols = skewed_yeojohnson[skewed_yeojohnson.abs() > 1].index.tolist()
low_skew_cols = skewed_yeojohnson[skewed_yeojohnson.abs() <= 1].index.tolist()

## Detect and replace outliers
for col in outlier_candidates:
    z = zscore(train_df[col].astype(float))
    outliers = np.abs(z) > 3
    num_outliers = outliers.sum()
    print(f"{col}: {num_outliers} outliers")

    if num_outliers > 0:
        if col in high_skew_cols:
            impute_val = train_df[col].astype(float).median()
        else:
            impute_val = train_df[col].astype(float).mean()

        # Replace only the outlier rows
        train_df.loc[outliers, col] = impute_val

f1: 18594 outliers
f2: 16166 outliers
f5: 5453 outliers
f6: 0 outliers
f7: 10232 outliers
f8: 2251 outliers
f9: 0 outliers
f10: 11711 outliers
f11: 1192 outliers
f12: 4174 outliers
f22: 7517 outliers
f26: 16942 outliers
f27: 21440 outliers
f28: 15419 outliers
f29: 13465 outliers
f30: 20237 outliers
f31: 3308 outliers
f32: 6002 outliers
f35: 12125 outliers
f38: 7739 outliers
f39: 829 outliers
f40: 456 outliers
f41: 3760 outliers
f43: 7367 outliers
f44: 4718 outliers
f45: 15701 outliers
f46: 12847 outliers
f47: 10721 outliers
f49: 9698 outliers
f51: 1807 outliers
f58: 11557 outliers
f59: 5510 outliers
f60: 12844 outliers
f61: 1266 outliers
f63: 2670 outliers
f65: 19096 outliers
f67: 5460 outliers
f68: 9058 outliers
f69: 14777 outliers
f72: 5696 outliers
f73: 9072 outliers
f74: 19611 outliers
f75: 359 outliers
f76: 9162 outliers
f77: 18434 outliers
f78: 15808 outliers
f82: 18339 outliers
f83: 16944 outliers
f85: 39983 outliers
f86: 14221 outliers
f87: 4885 outliers
f89: 4437 outliers
f90:

### **Standardization**

In [65]:
# the list of features which shall be standardized (avoid categorical/binary/one hot encoded columns)

# Filter continuous features from numerical_features
continuous_features = [
    col for col in numerical_features
    if train_df[col].nunique() > 2
]

print("Continuous features: \n", continuous_features)
print("Number of Continuous features: ", len(continuous_features))

Continuous features: 
 ['f1', 'f2', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f28', 'f29', 'f30', 'f31', 'f38', 'f44', 'f45', 'f46', 'f47', 'f49', 'f51', 'f58', 'f59', 'f60', 'f61', 'f63', 'f65', 'f67', 'f68', 'f69', 'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'f78', 'f85', 'f86', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99', 'f100', 'f101', 'f123', 'f124', 'f125', 'f126', 'f127', 'f130', 'f131', 'f132', 'f133', 'f134', 'f137', 'f138', 'f139', 'f140', 'f141', 'f142', 'f143', 'f146', 'f147', 'f148', 'f149', 'f150', 'f151', 'f152', 'f153', 'f155', 'f156', 'f157', 'f158', 'f159', 'f160', 'f161', 'f162', 'f163', 'f164', 'f165', 'f166', 'f167', 'f168', 'f169', 'f170', 'f171', 'f172', 'f173', 'f174', 'f175', 'f177', 'f178', 'f179', 'f180', 'f181', 'f182', 'f183', 'f184', 'f185', 'f186', 'f187', 'f188', 'f190', 'f191', 'f192', 'f193', 'f194', 'f195', 'f196', 'f197', 'f198', 'f199', 'f200', 'f201', 'f202', 'f206', 'f207', 'f209', 'f210', 'f211', 'f212', 'f217', 'f218', 'f219'

In [66]:
# list of features that are either one hot encoded or numerically encoded (categorical)
num_categorical = [
    col for col in numerical_features
    if train_df[col].nunique() <= 2
]

print("Numerical Categorical features: \n", num_categorical)
print("Number of Numerical Categorical features: ", len(num_categorical))

Numerical Categorical features: 
 ['f208', 'f227', 'f228', 'f230', 'f231', 'f232', 'f233', 'f234', 'f235', 'f237', 'f239', 'f241', 'f242', 'f244', 'f247', 'f250', 'f251', 'f252', 'f253', 'f254', 'f255', 'f256', 'f257', 'f261', 'f263', 'f264', 'f265', 'f269', 'f272', 'f273', 'f274', 'f275', 'f276', 'f278', 'f280', 'f282', 'f283', 'f284', 'f285', 'f288', 'f289', 'f292', 'f293', 'f296', 'f297', 'f299', 'f302', 'f305', 'f306', 'f332', 'f359']
Number of Numerical Categorical features:  51


In [67]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train_df[continuous_features] = scaler.fit_transform(train_df[continuous_features])

train_df.head()

Unnamed: 0,id1,id2,id3,id4,id5,y,f1,f2,f5,f6,...,f90_known,f91_known,f104_known,f105_known,f106_known,f107_known,f108_known,f109_known,f110_known,f111_known
0,1366776_189706075_16-23_2023-11-02 22:22:00.042,1366776,189706075,2023-11-02 22:22:00.042,2023-11-02,0,-0.521425,-0.662805,-1.013951,-1.272144,...,1,1,1,1,1,0,1,1,1,1
1,1366776_89227_16-23_2023-11-01 23:51:24.999,1366776,89227,2023-11-01 23:51:24.999,2023-11-01,0,-0.521425,-0.662805,-1.013951,-1.272144,...,1,1,1,1,1,0,1,1,1,1
2,1366776_35046_16-23_2023-11-01 00:30:59.797,1366776,35046,2023-11-01 00:30:59.797,2023-11-01,0,-0.521425,-0.662805,-1.013951,-1.272144,...,1,1,1,1,1,0,1,1,1,1
3,1366776_6275451_16-23_2023-11-02 22:21:32.261,1366776,6275451,2023-11-02 22:21:32.261,2023-11-02,0,-0.521425,-0.662805,-1.013951,-1.272144,...,1,1,1,1,1,0,1,1,1,1
4,1366776_78053_16-23_2023-11-02 22:21:34.799,1366776,78053,2023-11-02 22:21:34.799,2023-11-02,0,-0.521425,-0.662805,-1.013951,-1.272144,...,1,1,1,1,1,0,1,1,1,1


In [68]:
# Count of columns with dtype 'object'
num_object_cols = (train_df[numerical_features].dtypes == 'O').sum()
print(f"Number of object (string) columns: {num_object_cols}")

Number of object (string) columns: 0


In [69]:
# Count of columns with dtype 'object'
num_object_cols = (train_df.dtypes == 'O').sum()
print(f"Number of object (string) columns: {num_object_cols}")

Number of object (string) columns: 17


**The above 17 columns represent the identifiers (id1-id5) and the categorical columns**

In [70]:
train_df["f332"].dtype

dtype('float64')

In [71]:
train_df.shape

(770164, 280)

### **Handling id4 (impression timestamp)**

In [72]:
train_df["id4"].dtype

dtype('O')

In [73]:
train_df["id4"] = pd.to_datetime(train_df["id4"])
train_df["id4"].dtype

dtype('<M8[ns]')

In [74]:
# Get max timestamp (most recent impression)
latest_time = train_df["id4"].max()

# Compute number of days since latest impression
train_df["days_since_latest"] = (latest_time - train_df["id4"]).dt.total_seconds() / (3600 * 24)  # in days

# Apply exponential decay (λ = 0.1 is typical; tune as needed)
lambda_ = 0.1
train_df["decayed_impression_score"] = np.exp(-lambda_ * train_df["days_since_latest"])

  train_df["days_since_latest"] = (latest_time - train_df["id4"]).dt.total_seconds() / (3600 * 24)  # in days
  train_df["decayed_impression_score"] = np.exp(-lambda_ * train_df["days_since_latest"])


In [75]:
train_df.drop(columns = ["days_since_latest"], inplace = True)

In [76]:
train_df.shape

(770164, 281)

### **Dropping irrelevant identifiers & storing label (`y`) separately**

In [77]:
# Creating a copy of the train_df to store triplets of id2, id3, and y

train_df_copy = train_df[["id2", "id3", "y"]].copy()
print (f"Shape of the triplet dataframe: {train_df_copy.shape}")
train_df_copy.head()

Shape of the triplet dataframe: (770164, 3)


Unnamed: 0,id2,id3,y
0,1366776,189706075,0
1,1366776,89227,0
2,1366776,35046,0
3,1366776,6275451,0
4,1366776,78053,0


In [78]:
train_df_copy['y'].dtype

dtype('O')

**Checking duplicate entries of `id2`-`id3` pair in `train_df_copy`**

In [79]:
has_duplicates = train_df_copy.duplicated(subset=["id2", "id3"]).any()
print("Duplicates exist:" if has_duplicates else "No duplicates.")

Duplicates exist:


In [80]:
dup_counts = train_df_copy.groupby(["id2", "id3"]).size()
dup_combos = dup_counts[dup_counts > 1].reset_index()[["id2", "id3"]]
print(dup_combos)

           id2        id3
0      1000639  118034590
1      1000639     122267
2      1000639      16099
3      1000639     176404
4      1000639     184034
...        ...        ...
21439  1909794      93516
21440  1909794      95537
21441  1909794      96930
21442  1910264      23311
21443  1910264     807513

[21444 rows x 2 columns]


In [81]:
# Checking the label (y) consistency for these duplicated rows

dups_with_labels = train_df_copy.merge(dup_combos, on=["id2", "id3"], how="inner")

# Count labels per duplicated pair
label_analysis = dups_with_labels.groupby(["id2", "id3"])["y"].value_counts().unstack(fill_value=0)

# Split into categories
mixed_labels = label_analysis[(label_analysis['0'] > 0) & (label_analysis['1'] > 0)]
only_zeros = label_analysis[(label_analysis['0'] > 0) & (label_analysis['1'] == 0)]
only_ones = label_analysis[(label_analysis['1'] > 0) & (label_analysis['0'] == 0)]

# Display results
print("Number of (id2, id3) pairs with mixed labels (0 and 1):", mixed_labels.shape[0])
print(mixed_labels.head())

print("\nNumber of (id2, id3) pairs with only label 0:", only_zeros.shape[0])
print(only_zeros.head())

print("\nNumber of (id2, id3) pairs with only label 1:", only_ones.shape[0])
print(only_ones.head())

Number of (id2, id3) pairs with mixed labels (0 and 1): 1305
y               0  1
id2     id3         
1008125 117520  1  1
        689367  1  1
1009206 97067   1  1
1016376 2788    1  1
1018814 281783  1  1

Number of (id2, id3) pairs with only label 0: 20119
y                  0  1
id2     id3            
1000639 118034590  2  0
        122267     2  0
        16099      2  0
        176404     2  0
        184034     2  0

Number of (id2, id3) pairs with only label 1: 20
y                  0  1
id2     id3            
1029720 973281     0  2
1042939 62259      0  2
1042985 92870      0  2
1157059 403431     0  2
1231632 876665835  0  2


In [82]:
dup_combos = dup_counts[dup_counts > 1]

In [83]:
total_duplicates = dup_combos.sum()
print(f"Total number of duplicate rows: {total_duplicates}")

Total number of duplicate rows: 43912


In [84]:
all_twos = (dup_combos == 2).all()
print("All duplicated pairs appear exactly twice:" if all_twos else "Some pairs appear more than twice.")

Some pairs appear more than twice.


In [85]:
max_count = dup_combos.max()
print(f"The maximum count of any (id2, id3) pair is: {max_count}")

The maximum count of any (id2, id3) pair is: 3


In [86]:
train_df_copy["id2"].nunique()

46550

In [87]:
unique_pairs_df = train_df_copy[['id2', 'id3']].drop_duplicates()
print(f"Number of unique (id2, id3) pairs: {len(unique_pairs_df)}")

Number of unique (id2, id3) pairs: 747696


In [88]:
train_df.drop(columns = ["id1", "id2", "id3", "id4", "id5", "y"], inplace = True)
train_df.shape

(770164, 275)

### **Encoding categorical columns and computing embeddings**

In [89]:
categorical_columns = fill_with_Unknown + fill_with_N + ["f349"]

print("Categorical features: \n", categorical_columns)
print("Number of Categorical features: ", len(categorical_columns))

Categorical features: 
 ['f42', 'f48', 'f53', 'f54', 'f55', 'f56', 'f57', 'f354', 'f50', 'f52', 'f349']
Number of Categorical features:  11


In [90]:
for col in categorical_columns:
    print (train_df[col].value_counts())

f42
Unknown    319428
R          221306
G          164856
P           39548
S           25026
Name: count, dtype: int64
f48
Unknown    503711
2.0        106076
1.0         68086
3.0         62921
4.0         29370
Name: count, dtype: int64
f53
NY         428934
Unknown    254467
NN          86763
Name: count, dtype: int64
f54
Unknown    254467
A          228409
F          169506
C           55019
D           41905
E           18094
B            2764
Name: count, dtype: int64
f55
Unknown    254467
-          209663
H          160422
I           35657
D           34697
M           22232
W           15853
T           14670
A           12027
C            9467
G            1009
Name: count, dtype: int64
f56
Unknown    254467
B          232674
G          150244
D           72022
S           60757
Name: count, dtype: int64
f57
Unknown    432892
H          238536
P           84822
A           13303
Z             611
Name: count, dtype: int64
f354
Rest       537421
Unknown    141991
Phase_1    

In [91]:
# Binary features — convert to int (0/1)
train_df["f50"] = (train_df["f50"] == "Y").astype(int)
train_df["f52"] = (train_df["f52"] == "Y").astype(int)

# These are the categorical columns for embeddings
categorical_columns_for_embedding = [
    "f42", "f48", "f53", "f54", "f55", "f56", "f57", "f354", "f349"
]

In [92]:
train_df["f50"].dtype

dtype('int64')

In [93]:
train_df["f52"].dtype

dtype('int64')

In [94]:
train_df["f50"].value_counts()

f50
0    483835
1    286329
Name: count, dtype: int64

In [95]:
# Save LabelEncoders for inverse transformation
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for col in categorical_columns_for_embedding:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col].astype(str))
    label_encoders[col] = le


In [96]:
import torch
import torch.nn as nn

# Get number of unique values for each categorical column
categorical_cardinalities = [train_df[col].nunique() for col in categorical_columns_for_embedding]

# Define embedding sizes using a common heuristic: min(50, (num_unique + 1) // 2)
embedding_sizes = [(cardinality, min(50, (cardinality + 1) // 2)) for cardinality in categorical_cardinalities]

# Example embedding module
class CategoricalEmbeddingModel(nn.Module):
    def __init__(self, embedding_sizes):
        super().__init__()
        self.embeddings = nn.ModuleList([
            nn.Embedding(num_embeddings=cardinality, embedding_dim=embed_dim)
            for cardinality, embed_dim in embedding_sizes
        ])

    def forward(self, x_categorical):
        # x_categorical should be tensor of shape (batch_size, num_categorical_columns_for_embedding)
        embedded = [emb_layer(x_categorical[:, i]) for i, emb_layer in enumerate(self.embeddings)]
        return torch.cat(embedded, dim=1)  # shape: (batch_size, total_embedding_dim)

# Initialize the model
embedding_model = CategoricalEmbeddingModel(embedding_sizes)

# Convert categorical columns to encoded integer labels
from sklearn.preprocessing import LabelEncoder

for col in categorical_columns_for_embedding:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col].astype(str))  # Store label encoders if needed later

# Prepare tensor
x_categorical_tensor = torch.tensor(train_df[categorical_columns_for_embedding].values, dtype=torch.long)

# Forward pass through the embedding model
embedded_output = embedding_model(x_categorical_tensor)
print(embedded_output.shape)  # Should be (num_samples, total_embedding_dim)


torch.Size([770164, 28])


**Checking whether the 249 numerical_features are number or not**

In [97]:
all_numeric = all(
    pd.api.types.is_numeric_dtype(train_df[col]) for col in numerical_features
)
print(all_numeric)


True


**Checking whether the flagged columns `{col}_known` are numeric or not**

In [98]:
all_numeric = all(
    pd.api.types.is_numeric_dtype(train_df[col]) for col in flag_features
)
print(all_numeric)


True


In [99]:
train_df["f50"].dtype

dtype('int64')

In [100]:
train_df["f52"].dtype

dtype('int64')

In [101]:
train_df["decayed_impression_score"].dtype

dtype('float64')

In [102]:
train_df.head()

Unnamed: 0,f1,f2,f5,f6,f7,f8,f9,f10,f11,f12,...,f91_known,f104_known,f105_known,f106_known,f107_known,f108_known,f109_known,f110_known,f111_known,decayed_impression_score
0,-0.521425,-0.662805,-1.013951,-1.272144,-0.769574,-1.026593,-0.95839,-0.349999,0.23345,-1.022125,...,1,1,1,1,0,1,1,1,1,0.898702
1,-0.521425,-0.662805,-1.013951,-1.272144,-0.769574,-1.026593,-0.95839,-0.349999,0.23345,-1.022125,...,1,1,1,1,0,1,1,1,1,0.818244
2,-0.521425,-0.662805,-1.013951,-1.272144,-0.769574,-1.026593,-0.95839,-0.349999,0.23345,-1.022125,...,1,1,1,1,0,1,1,1,1,0.742416
3,-0.521425,-0.662805,-1.013951,-1.272144,-0.769574,-1.026593,-0.95839,-0.349999,0.23345,-1.022125,...,1,1,1,1,0,1,1,1,1,0.898673
4,-0.521425,-0.662805,-1.013951,-1.272144,-0.769574,-1.026593,-0.95839,-0.349999,0.23345,-1.022125,...,1,1,1,1,0,1,1,1,1,0.898676


**Converting identifiers like `id2` and `id3` to numerical datatype and storing them in a tensor**

In [103]:
# Check if all values in each column are numeric (ignore missing/nulls)
id2_valid = train_df_copy['id2'].dropna().astype(str).str.isdigit().all()
id3_valid = train_df_copy['id3'].dropna().astype(str).str.isdigit().all()
y_valid = train_df_copy['y'].dropna().astype(str).str.isdigit().all()

print("All id2 values are integers:", id2_valid)
print("All id3 values are integers:", id3_valid)
print("All y (label) values are integers:", y_valid)

All id2 values are integers: True
All id3 values are integers: True
All y (label) values are integers: True


In [104]:
# Show non-numeric entries if they exist
non_numeric_id2 = train_df_copy[~train_df_copy['id2'].astype(str).str.isdigit()]
non_numeric_id3 = train_df_copy[~train_df_copy['id3'].astype(str).str.isdigit()]
non_numeric_y = train_df_copy[~train_df_copy['y'].astype(str).str.isdigit()]

print("Invalid id2 rows:\n", non_numeric_id2)
print("Invalid id3 rows:\n", non_numeric_id3)
print("Invalid y (label) rows:\n", non_numeric_y)

Invalid id2 rows:
 Empty DataFrame
Columns: [id2, id3, y]
Index: []
Invalid id3 rows:
 Empty DataFrame
Columns: [id2, id3, y]
Index: []
Invalid y (label) rows:
 Empty DataFrame
Columns: [id2, id3, y]
Index: []


In [105]:
train_df_copy['id2'] = train_df_copy['id2'].astype(int)
train_df_copy['id3'] = train_df_copy['id3'].astype(int)
train_df_copy['y'] = train_df_copy['y'].astype(int)

print (f"Data type of id2: {train_df_copy['id2'].dtype}")
print (f"Data type of id3: {train_df_copy['id3'].dtype}")
print (f"Data type of y: {train_df_copy['y'].dtype}")

Data type of id2: int64
Data type of id3: int64
Data type of y: int64


In [106]:
train_df_copy.head()

Unnamed: 0,id2,id3,y
0,1366776,189706075,0
1,1366776,89227,0
2,1366776,35046,0
3,1366776,6275451,0
4,1366776,78053,0


In [107]:
train_df_copy.to_parquet("train_df_copy.parquet", index = False)

In [108]:
train_df.head()

Unnamed: 0,f1,f2,f5,f6,f7,f8,f9,f10,f11,f12,...,f91_known,f104_known,f105_known,f106_known,f107_known,f108_known,f109_known,f110_known,f111_known,decayed_impression_score
0,-0.521425,-0.662805,-1.013951,-1.272144,-0.769574,-1.026593,-0.95839,-0.349999,0.23345,-1.022125,...,1,1,1,1,0,1,1,1,1,0.898702
1,-0.521425,-0.662805,-1.013951,-1.272144,-0.769574,-1.026593,-0.95839,-0.349999,0.23345,-1.022125,...,1,1,1,1,0,1,1,1,1,0.818244
2,-0.521425,-0.662805,-1.013951,-1.272144,-0.769574,-1.026593,-0.95839,-0.349999,0.23345,-1.022125,...,1,1,1,1,0,1,1,1,1,0.742416
3,-0.521425,-0.662805,-1.013951,-1.272144,-0.769574,-1.026593,-0.95839,-0.349999,0.23345,-1.022125,...,1,1,1,1,0,1,1,1,1,0.898673
4,-0.521425,-0.662805,-1.013951,-1.272144,-0.769574,-1.026593,-0.95839,-0.349999,0.23345,-1.022125,...,1,1,1,1,0,1,1,1,1,0.898676


In [109]:
train_df.shape

(770164, 275)

In [110]:
train_df.isnull().sum().any()

False

In [111]:
train_df_copy.isnull().sum()

id2    0
id3    0
y      0
dtype: int64

In [112]:
# 1. Drop the 9 categorical columns from train_df 
train_df_numerical = train_df.drop(columns = categorical_columns_for_embedding)

# 2. Convert the rest of the features to a tensor
numerical_tensor = torch.tensor(train_df_numerical.values, dtype=torch.float32)
identifiers_tensor = torch.tensor(train_df_copy.values, dtype=torch.int64)

# 3. Concatenate: numerical and embedding tensors
x_combined = torch.cat([numerical_tensor, embedded_output], dim=1)

print (identifiers_tensor.shape) # Should be (770164, 3)
print(x_combined.shape)  # Should be (770164, numerical_features + total_embedding_dim)

torch.Size([770164, 3])
torch.Size([770164, 294])


In [113]:
print(identifiers_tensor[:5]) # First 5 rows

tensor([[  1366776, 189706075,         0],
        [  1366776,     89227,         0],
        [  1366776,     35046,         0],
        [  1366776,   6275451,         0],
        [  1366776,     78053,         0]])


In [114]:
print(x_combined[:5])  # First 5 rows

tensor([[-0.5214, -0.6628, -1.0140,  ...,  0.6598, -0.9385,  0.3864],
        [-0.5214, -0.6628, -1.0140,  ..., -2.7465,  0.4886,  0.3988],
        [-0.5214, -0.6628, -1.0140,  ..., -2.7465,  0.4886,  0.3988],
        [-0.5214, -0.6628, -1.0140,  ...,  0.6598, -0.9385,  0.3864],
        [-0.5214, -0.6628, -1.0140,  ...,  0.6598, -0.9385,  0.3864]],
       grad_fn=<SliceBackward0>)


In [115]:
print(identifiers_tensor[:5])              # From tensor
print(list(dup_combos.index)[:5])          # From pandas

tensor([[  1366776, 189706075,         0],
        [  1366776,     89227,         0],
        [  1366776,     35046,         0],
        [  1366776,   6275451,         0],
        [  1366776,     78053,         0]])
[('1000639', '118034590'), ('1000639', '122267'), ('1000639', '16099'), ('1000639', '176404'), ('1000639', '184034')]


In [116]:
torch.isnan(x_combined).any()

tensor(False)

In [117]:
torch.isnan(identifiers_tensor).any()

tensor(False)

**Count of the duplicate rows**

In [118]:
import torch

# Step 1: Get duplicated (id2, id3) pairs as a set of tuples and convert to int
duplicate_pairs = set((int(id2), int(id3)) for id2, id3 in dup_combos.index.tolist())  # e.g., {(1000639, 118034590), ...}

# Step 2: Extract id2, id3 and y columns from tensor
id2_col = identifiers_tensor[:, 0]
id3_col = identifiers_tensor[:, 1]
y_col   = identifiers_tensor[:, 2]

# Step 3: Compare each (id2, id3) pair in the tensor with the duplicates
mask = torch.tensor(
    [(int(id2.item()), int(id3.item())) in duplicate_pairs for id2, id3 in zip(id2_col, id3_col)],
    dtype=torch.bool
)

# Step 4: Get only the duplicated rows from the tensor
duplicated_rows_tensor = x_combined[mask]

print(f"Duplicated rows in tensor: {duplicated_rows_tensor.shape}")
print(duplicated_rows_tensor[:5])  # view sample

Duplicated rows in tensor: torch.Size([43912, 294])
tensor([[-0.5214, -0.6628, -1.0140,  ...,  0.6598, -0.9385,  0.3864],
        [-0.5796, -0.6628, -0.2871,  ..., -2.7465,  0.4886,  0.3988],
        [-0.5796, -0.6628, -0.2871,  ..., -2.7465, -0.9385,  0.3864],
        [ 0.1766, -0.6628, -0.2443,  ...,  0.2689,  0.4886,  0.3988],
        [ 0.1766, -0.6628, -0.2443,  ...,  0.2689, -0.9385,  0.3864]],
       grad_fn=<SliceBackward0>)


### **Averaging the duplicated embeddings**

In [119]:
# Only use id2 and id3 to define uniqueness
unique_pairs = set([tuple(x[:2]) for x in identifiers_tensor.numpy().tolist()])
print(f"Unique id2-id3 pairs (ignoring label): {len(unique_pairs)}")

Unique id2-id3 pairs (ignoring label): 747696


In [120]:
import torch
from collections import defaultdict

# Step 1: Prepare a mapping from (id2, id3) → list of row indices
pair_to_indices = defaultdict(list)

for idx, (id2, id3) in enumerate(zip(id2_col.tolist(), id3_col.tolist())):
    pair_to_indices[(id2, id3)].append(idx)

# Step 2: For each unique (id2, id3), average features and apply "AnyClick" label logic
averaged_rows = []
anyclick_labels = []
new_id2_id3_list = []

for pair, indices in pair_to_indices.items():
    rows = x_combined[indices]                    # All x rows for that pair
    avg_row = rows.mean(dim=0)                    # Average features
    click_labels = y_col[indices]                 # Corresponding y labels
    label = int((click_labels == 1).any())        # 1 if ANY y == 1 in the group

    averaged_rows.append(avg_row)
    anyclick_labels.append(label)
    new_id2_id3_list.append(pair)


# Step 3: Convert to final tensors
averaged_tensor = torch.stack(averaged_rows)  # Shape: (unique_pairs, feature_dim)
anyclick_labels_tensor = torch.tensor(anyclick_labels)  # shape: (unique_pairs, )
new_identifiers_tensor = torch.tensor(new_id2_id3_list, dtype=torch.int64)  # Shape: (unique_pairs, 2)

# Combine identifiers and label for future use
final_identifiers_tensor = torch.cat([new_identifiers_tensor, anyclick_labels_tensor.unsqueeze(1)], dim=1)

# Final Shapes
print(f"Features:         {averaged_tensor.shape}")
print(f"Identifiers + y:  {final_identifiers_tensor.shape}")

Features:         torch.Size([747696, 294])
Identifiers + y:  torch.Size([747696, 3])


In [121]:
torch.isnan(averaged_tensor).any()

tensor(False)

In [122]:
torch.isnan(final_identifiers_tensor).any()

tensor(False)

In [123]:
id2_unique_count = torch.unique(final_identifiers_tensor[:, 0]).numel()
id3_unique_count = torch.unique(final_identifiers_tensor[:, 1]).numel()

print(f"Unique id2s: {id2_unique_count}")
print(f"Unique id3s: {id3_unique_count}")

Unique id2s: 46550
Unique id3s: 757


**Converting the `new_identifiers_tensor` to a Dataframe: `train_df_averaged`**

In [124]:
import pandas as pd

# Convert final_identifiers_tensor to numpy and then to DataFrame
train_df_averaged = pd.DataFrame(
    final_identifiers_tensor.numpy(), 
    columns=['id2', 'id3', 'y']
)

print(f"Shape of Averaged Training Data: {train_df_averaged.shape}")
train_df_averaged

Shape of Averaged Training Data: (747696, 3)


Unnamed: 0,id2,id3,y
0,1366776,189706075,0
1,1366776,89227,0
2,1366776,35046,0
3,1366776,6275451,0
4,1366776,78053,0
...,...,...,...
747691,1896641,87731,0
747692,1896641,505604,0
747693,1896641,25212,0
747694,1900765,95157,0


In [125]:
train_df_averaged.to_parquet("train_df_averaged.parquet", index=False)

### **Defining the User Tower Model**

In [126]:
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init

class UserTower(nn.Module):
    def __init__(self, input_dim, embedding_dim=64):  # 64 is the user representation size
        super(UserTower, self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.bn1 = nn.BatchNorm1d(256)

        self.fc2 = nn.Linear(256, 128)
        self.bn2 = nn.BatchNorm1d(128)

        self.dropout = nn.Dropout(0.3)
        self.out = nn.Linear(128, embedding_dim)

        # Apply He initialization (Kaiming)
        self._init_weights()

    def _init_weights(self):
        for layer in [self.fc1, self.fc2, self.out]:
            init.kaiming_normal_(layer.weight, nonlinearity='relu')
            if layer.bias is not None:
                init.zeros_(layer.bias)

    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        return self.out(x)  # Final output embedding

In [127]:
user_tower = UserTower(input_dim=averaged_tensor.shape[1])
with torch.no_grad():
    user_embedding = user_tower(averaged_tensor)

print(user_embedding.shape)  # Should be (747696, 64) — a dense user embedding

torch.Size([747696, 64])


In [128]:
import torch
torch.save(user_embedding, "user_embedding.pt")

### **Free the Memory**

In [129]:
import gc
import torch

# 1. Delete the tensors
del train_df
gc.collect()
del x_categorical_tensor
gc.collect()
del embedded_output
gc.collect()
del identifiers_tensor
gc.collect()
del x_combined
gc.collect()
del duplicated_rows_tensor
gc.collect()
del averaged_tensor
gc.collect()
del new_identifiers_tensor
gc.collect()

# 2. Run garbage collection
gc.collect()

# 3. If using GPU (optional, safe to include regardless)
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

### **Creating the Industry Embedding from the transactions data**

In [130]:
txn_df.isnull().sum()

id2        0
f367       0
f368       0
f369       0
f370       0
f371       0
f372       0
id8     1304
f374       0
dtype: int64

In [131]:
txn_df["id8"].nunique()

5597

In [132]:
txn_df["id2"].nunique()

194115

In [133]:
txn_df["id8"].value_counts()

id8
59420000    541576
58120000    285921
58120600    219204
55410000    214695
54110100    198839
             ...  
49590000         1
83510600         1
55710404         1
73591302         1
73350114         1
Name: count, Length: 5597, dtype: int64

In [134]:
txn_df.shape

(6339465, 9)

In [135]:
# Removing the rows where id8 = Null
txn_df = txn_df.dropna(subset=['id8'])

In [136]:
txn_df.shape

(6338161, 9)

In [137]:
txn_df["f369"].value_counts()

f369
D    6114933
C     223228
Name: count, dtype: int64

In [138]:
txn_df.head()

Unnamed: 0,id2,f367,f368,f369,f370,f371,f372,id8,f374
0,2896709,15.6,PBR,D,2023-10-16,19:16:52,202310,59639998,DSE
1,2855047,6.4,PR,D,2023-10-14,13:01:16,202310,59639998,DSE
2,2497175,13.99,PBR,D,2023-10-14,00:31:48,202310,59639998,DSE
3,2655364,15.14,PGC,D,2023-10-13,12:37:25,202310,59639998,DSE
4,2855047,2.12,PR,D,2023-10-09,16:51:21,202310,59639998,DSE


In [139]:
txn_df['is_debit'] = (txn_df['f369'] == 'D').astype(int)
txn_df['is_credit'] = (txn_df['f369'] == 'C').astype(int)

In [140]:
txn_df["f370"].dtype

dtype('O')

In [141]:
# Converting transaction date into exponential decay recency score

txn_df['f370'] = pd.to_datetime(txn_df['f370'])
latest_txn_time = txn_df['f370'].max()
txn_df['days_since_txn'] = (latest_txn_time - txn_df['f370']).dt.total_seconds() / (3600 * 24)
lambda_ = 0.1  # Decay factor; tune if needed
txn_df['recency_score'] = np.exp(-lambda_ * txn_df['days_since_txn'])

# Drop redundant column
txn_df.drop(columns = ["days_since_txn"], inplace = True)
txn_df.shape

(6338161, 12)

In [142]:
txn_df["f370"].nunique()

33

In [143]:
txn_df["recency_score"].nunique()

33

In [144]:
txn_agg = txn_df.groupby('id8').agg({
    # Amount and user behavior
    'f367': ['sum', 'mean', 'std'],

     # Unique users
    'id2': 'nunique',
    
    # Debit/Credit counts
    'is_debit': 'sum',
    'is_credit': 'sum',
    
}).reset_index()

# Flatten multi-level column names
txn_agg.columns = ['_'.join(col).strip('_') for col in txn_agg.columns]

# Aggregate recency scores separately
recency_scores = txn_df.groupby('id8')['recency_score'].mean().reset_index()
recency_scores.columns = ['id8', 'decayed_recency_score']

# Merge recency scores into txn_agg
txn_agg = txn_agg.merge(recency_scores, on='id8', how='left')

# Printing the newly created aggregated Dataframe
txn_agg.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id8,f367_sum,f367_mean,f367_std,id2_nunique,is_debit_sum,is_credit_sum,decayed_recency_score
0,1150000,20.34,20.34,,1,1,0,0.049787
1,1199901,11.61,11.61,,1,1,0,0.074274
2,1610100,317.43,158.715,150.309688,2,2,0,0.420539
3,1619902,265.0,44.166667,44.606801,3,6,0,0.374245
4,1619909,1930.83,37.131346,33.830189,21,52,0,0.296255


In [145]:
txn_agg.shape

(5597, 8)

In [146]:
txn_agg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5597 entries, 0 to 5596
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id8                    5597 non-null   object 
 1   f367_sum               5597 non-null   float64
 2   f367_mean              5597 non-null   float64
 3   f367_std               4479 non-null   float64
 4   id2_nunique            5597 non-null   int64  
 5   is_debit_sum           5597 non-null   int64  
 6   is_credit_sum          5597 non-null   int64  
 7   decayed_recency_score  5597 non-null   float64
dtypes: float64(4), int64(3), object(1)
memory usage: 349.9+ KB


In [147]:
txn_agg.isnull().sum()

id8                         0
f367_sum                    0
f367_mean                   0
f367_std                 1118
id2_nunique                 0
is_debit_sum                0
is_credit_sum               0
decayed_recency_score       0
dtype: int64

In [148]:
# Filling the null values of standard deviation column with 0
# This is because std becomes undefined for one entry mathematically, so we fill with 0 (no variation for one entry)

txn_agg['f367_std'] = txn_agg['f367_std'].fillna(0.0)

In [149]:
txn_agg.isnull().sum()

id8                      0
f367_sum                 0
f367_mean                0
f367_std                 0
id2_nunique              0
is_debit_sum             0
is_credit_sum            0
decayed_recency_score    0
dtype: int64

### **Creating transactions aggregated data for each `id2`-`id8` pair**

**Instead of just aggregating on the basis of `id8` (industry code), we will now aggregate on the basis of both `id2` and `id8` (customer and industry)**
- now each entry will show the transaction behaviour of that customer in that industry
- this is done because industry_features (which is created later using aggregated information of both transactions and offers) shall also include `id2` in order to join it with the `new_events_df`(also created later)

In [150]:
txn_agg_2 = txn_df.groupby(['id8', 'id2']).agg({
    # Amount and user behavior
    'f367': ['sum', 'mean', 'std'],
    
    # Debit/Credit counts
    'is_debit': 'sum',
    'is_credit': 'sum',
    
}).reset_index()

txn_agg_2.columns = ['_'.join(col).strip('_') for col in txn_agg_2.columns]

recency_scores = txn_df.groupby(['id8', 'id2'])['recency_score'].mean().reset_index()
recency_scores.columns = ['id8', 'id2', 'decayed_recency_score']

# Merge with aggregated data
txn_agg_2 = txn_agg_2.merge(recency_scores, on=['id8', 'id2'], how='left')

txn_agg_2.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id8,id2,f367_sum,f367_mean,f367_std,is_debit_sum,is_credit_sum,decayed_recency_score
0,1150000,2091970,20.34,20.34,,1,0,0.049787
1,1199901,2549728,11.61,11.61,,1,0,0.074274
2,1610100,2238916,265.0,265.0,,1,0,0.740818
3,1610100,2323271,52.43,52.43,,1,0,0.100259
4,1619902,2100642,109.0,27.25,15.085865,4,0,0.272532


In [151]:
txn_agg_2.shape

(2841195, 8)

In [152]:
txn_agg_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2841195 entries, 0 to 2841194
Data columns (total 8 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   id8                    object 
 1   id2                    int32  
 2   f367_sum               float64
 3   f367_mean              float64
 4   f367_std               float64
 5   is_debit_sum           int64  
 6   is_credit_sum          int64  
 7   decayed_recency_score  float64
dtypes: float64(4), int32(1), int64(2), object(1)
memory usage: 162.6+ MB


In [153]:
txn_agg_2.isnull().sum()

id8                            0
id2                            0
f367_sum                       0
f367_mean                      0
f367_std                 1746122
is_debit_sum                   0
is_credit_sum                  0
decayed_recency_score          0
dtype: int64

In [154]:
# Filling the null values of standard deviation column with 0
# This is because std becomes undefined for one entry mathematically, so we fill with 0 (no variation for one entry)

txn_agg_2['f367_std'] = txn_agg_2['f367_std'].fillna(0.0)

In [155]:
txn_agg_2.isnull().sum()

id8                      0
id2                      0
f367_sum                 0
f367_mean                0
f367_std                 0
is_debit_sum             0
is_credit_sum            0
decayed_recency_score    0
dtype: int64

### **Embeddings for offer_metadata (Aggregation techniques)**

In [156]:
offers_df.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id3,id9,f375,f376,f377,id10,id11,f378,f374,id8,id12,id13
0,70687,FO5O,2,5.0,,1,,N,,,2018-01-01 00:00:00,2099-12-31 23:59:59
1,900002526,UGE,2,100.0,,1,,N,,,2014-10-20 00:00:00,2099-12-31 23:59:59
2,900002864,UTP,1,100.0,,1,,N,,,2016-07-19 00:00:00,2099-12-31 23:59:59
3,19508,o,2,,,1,,N,,,2019-06-02 17:00:00,2028-12-31 16:59:59
4,35903,o,2,,,1,,N,,,2019-06-02 17:00:00,2028-12-31 16:59:59


In [157]:
offers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4164 entries, 0 to 4163
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id3     4164 non-null   int32  
 1   id9     4164 non-null   object 
 2   f375    4164 non-null   int32  
 3   f376    978 non-null    float64
 4   f377    0 non-null      object 
 5   id10    4164 non-null   object 
 6   id11    0 non-null      object 
 7   f378    4164 non-null   object 
 8   f374    3887 non-null   object 
 9   id8     3887 non-null   object 
 10  id12    4164 non-null   object 
 11  id13    4164 non-null   object 
dtypes: float64(1), int32(2), object(9)
memory usage: 358.0+ KB


In [158]:
offers_df.shape

(4164, 12)

In [159]:
offers_df.isnull().sum()

id3        0
id9        0
f375       0
f376    3186
f377    4164
id10       0
id11    4164
f378       0
f374     277
id8      277
id12       0
id13       0
dtype: int64

In [160]:
offers_df["f375"].value_counts()

f375
1    2824
2    1340
Name: count, dtype: int64

In [161]:
offers_df["f376"].nunique()

21

In [162]:
offers_df["f376"].value_counts()

f376
100.0    236
10.0     172
3.0       92
15.0      83
5.0       73
20.0      61
6.0       56
2.0       41
4.0       40
8.0       34
30.0      29
50.0      15
25.0      15
12.0      13
40.0       8
7.0        3
7.1        2
16.0       2
1.0        1
35.0       1
45.0       1
Name: count, dtype: int64

In [163]:
offers_df["id11"].nunique() # -- no use of this column, redundant

0

In [164]:
offers_df["id11"].isnull().sum() 

4164

In [165]:
offers_df["id8"].nunique()

300

In [166]:
offers_df["f374"].nunique()

133

In [167]:
offers_df["id10"].nunique()

2

In [168]:
# Drop rows with missing industry code
offers_df = offers_df.dropna(subset=['id8'])

offers_df.shape

(3887, 12)

In [169]:
offers_df.isnull().sum()

id3        0
id9        0
f375       0
f376    3022
f377    3887
id10       0
id11    3887
f378       0
f374       0
id8        0
id12       0
id13       0
dtype: int64

In [170]:
offers_df["f376"].dtype

dtype('float64')

In [171]:
# Checking if the Discount column contains zero entries

(offers_df["f376"] == 0.0).any()

False

In [172]:
# Filling the Null values in the discount column (f376) with 0 - denoting no discount

offers_df["f376"] = offers_df["f376"].fillna(0)

offers_df["f376"].isnull().sum()

0

In [173]:
offers_df["id13"].nunique()

654

In [174]:
import numpy as np
import pandas as pd

# Create binary flags for id10
offers_df['is_id10_1'] = (offers_df['id10'] == '1').astype(int)
offers_df['is_id10_2'] = (offers_df['id10'] == '2').astype(int)

# Ensure datetime format
offers_df['id12'] = pd.to_datetime(offers_df['id12'], errors='coerce')
offers_df['id13'] = pd.to_datetime(offers_df['id13'], errors='coerce')

# Offer duration (days active)
offers_df['offers_duration'] = (offers_df['id13'] - offers_df['id12']).dt.days

# # Compute offers recency score
# latest_end_date = offers_df['id13'].max()
# lambda_ = 0.05  # Tuneable
# offers_df['offers_recency_score'] = np.exp(-lambda_ * ((latest_end_date - offers_df['id13']).dt.days))

# Convert f375 to 1/2 integer scale
offers_df['f375'] = offers_df['f375'].astype(int)

# Aggregation
offers_agg = offers_df.groupby('id8').agg({
    'f375': 'mean',                    # Avg redemption type (between 1 and 2)
    'is_id10_1': 'sum',                # Count of type 1 industries
    'is_id10_2': 'sum',                # Count of type 2 industries
    'f376': ['mean', 'std'],           # Discount stats
    'offers_duration': 'mean',         # Avg offers duration
    # 'offers_recency_score': 'mean',    # Offer freshness
    'id3': 'count'                     # Total number of offerss per industry (id3 is offer id)
}).reset_index()

# Rename columns
offers_agg.columns = [
    'id8',
    'avg_redemption_type',
    'type_1_industry_count',
    'type_2_industry_count',
    'discount_mean',
    'discount_std',
    'avg_offer_duration',
    # 'avg_offer_recency_score',
    'num_offers'
]

# Returning the aggregated dataframe
offers_agg.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id8,avg_redemption_type,type_1_industry_count,type_2_industry_count,discount_mean,discount_std,avg_offer_duration,num_offers
0,7420000,2.0,1,0,0.0,,107.0,1
1,20860104,2.0,5,0,8.0,0.0,33.2,5
2,27110000,1.142857,13,1,0.0,0.0,103.428571,14
3,27110100,1.0,0,1,0.0,,92.0,1
4,27410000,1.0,4,1,0.0,0.0,86.0,5


In [175]:
offers_agg.shape

(300, 8)

In [176]:
offers_agg.isnull().sum()

id8                       0
avg_redemption_type       0
type_1_industry_count     0
type_2_industry_count     0
discount_mean             0
discount_std             88
avg_offer_duration        0
num_offers                0
dtype: int64

In [177]:
# Filling the NaNs of standard deviation column with 0
offers_agg['discount_std'] = offers_agg['discount_std'].fillna(0.0)

In [178]:
offers_agg.isnull().sum()

id8                      0
avg_redemption_type      0
type_1_industry_count    0
type_2_industry_count    0
discount_mean            0
discount_std             0
avg_offer_duration       0
num_offers               0
dtype: int64

In [179]:
# offers_agg["avg_offer_recency_score"].nunique()

In [180]:
# offers_agg["avg_offer_recency_score"].value_counts()

### **Merged Transactions Info and Offers Info via id8**

In [181]:
industry_features = pd.merge(txn_agg, offers_agg, on='id8', how='inner')
industry_features.head()

Unnamed: 0,id8,f367_sum,f367_mean,f367_std,id2_nunique,is_debit_sum,is_credit_sum,decayed_recency_score,avg_redemption_type,type_1_industry_count,type_2_industry_count,discount_mean,discount_std,avg_offer_duration,num_offers
0,7420000,1055097.0,324.845135,740.917589,2293,3183,65,0.303679,2.0,1,0,0.0,0.0,107.0,1
1,20860104,36.94,36.94,0.0,1,1,0,0.165299,2.0,5,0,8.0,0.0,33.2,5
2,27110000,2650835.34,43.411482,232.604498,42410,59792,1271,0.301821,1.142857,13,1,0.0,0.0,103.428571,14
3,27110100,460142.87,24.986038,283.902123,11198,14599,3817,0.304311,1.0,0,1,0.0,0.0,92.0,1
4,27410000,2546605.15,517.287254,2970.687404,3181,4799,124,0.31888,1.0,4,1,0.0,0.0,86.0,5


In [182]:
industry_features.shape

(290, 15)

In [183]:
industry_features_2 = pd.merge(txn_agg_2, offers_agg, on='id8', how='inner')
industry_features_2.head()

Unnamed: 0,id8,id2,f367_sum,f367_mean,f367_std,is_debit_sum,is_credit_sum,decayed_recency_score,avg_redemption_type,type_1_industry_count,type_2_industry_count,discount_mean,discount_std,avg_offer_duration,num_offers
0,7420000,2000081,36.37,36.37,0.0,1,0,0.201897,2.0,1,0,0.0,0.0,107.0,1
1,7420000,2000149,457.15,457.15,0.0,1,0,0.040762,2.0,1,0,0.0,0.0,107.0,1
2,7420000,2000212,42.65,42.65,0.0,1,0,0.040762,2.0,1,0,0.0,0.0,107.0,1
3,7420000,2000290,49.88,49.88,0.0,1,0,0.082085,2.0,1,0,0.0,0.0,107.0,1
4,7420000,2000587,267.3,267.3,0.0,1,0,1.0,2.0,1,0,0.0,0.0,107.0,1


In [184]:
(industry_features_2["is_debit_sum"] > 1).any()

True

In [185]:
cols_to_test = ['is_debit_sum', 'is_credit_sum', 'type_1_industry_count', 'type_2_industry_count', 'num_offers']

for col in cols_to_test:
    print(f"\nValue counts for column: {col}")
    print(industry_features_2[col].value_counts())


Value counts for column: is_debit_sum
is_debit_sum
1      1257371
2       384156
3       160989
4        89515
5        54435
        ...   
321          1
147          1
377          1
257          1
142          1
Name: count, Length: 289, dtype: int64

Value counts for column: is_credit_sum
is_credit_sum
0      1992597
1        97945
2        17657
3         5084
4         2496
        ...   
192          1
107          1
71           1
63           1
103          1
Name: count, Length: 73, dtype: int64

Value counts for column: type_1_industry_count
type_1_industry_count
1      203106
2      190085
4      117266
3      114057
13      93597
5       91505
7       89362
44      79238
11      78105
537     75178
29      74204
6       68932
92      63303
58      56475
19      53507
18      49952
0       43283
9       42547
49      39746
131     39546
8       38837
52      35992
84      34242
60      29626
200     27613
51      24503
30      22108
14      21860
82      21521
53      205

In [186]:
industry_features_2["is_debit_sum"].max()

3295

In [187]:
industry_features_2.shape

(2119923, 15)

In [188]:
industry_features_2["id2"].nunique()

188715

In [189]:
industry_features_2["id8"].nunique()

290

In [190]:
num_unique_pairs = industry_features_2[['id2', 'id8']].drop_duplicates().shape[0]
print("Number of unique (id2, id8) pairs:", num_unique_pairs)

Number of unique (id2, id8) pairs: 2119923


In [191]:
# The maximum number of unique id8 values that any single id2 is associated with.
max_id8s_per_id2 = industry_features_2.groupby('id2')['id8'].nunique().max()
print("Maximum number of id8s any id2 is associated with:", max_id8s_per_id2)

Maximum number of id8s any id2 is associated with: 73


In [192]:
industry_features_2.columns

Index(['id8', 'id2', 'f367_sum', 'f367_mean', 'f367_std', 'is_debit_sum',
       'is_credit_sum', 'decayed_recency_score', 'avg_redemption_type',
       'type_1_industry_count', 'type_2_industry_count', 'discount_mean',
       'discount_std', 'avg_offer_duration', 'num_offers'],
      dtype='object')

In [193]:
# Standardizing the numerical columns of industry_features_2

from sklearn.preprocessing import StandardScaler

# Identify continuous features (excluding binary and ID columns)
exclude_cols = ['id2', 'id8', 'decayed_recency_score']  
feat_cols = [c for c in industry_features_2.columns if c not in exclude_cols]

# Standardize only the continuous features
scaler = StandardScaler()
industry_features_2[feat_cols] = scaler.fit_transform(industry_features_2[feat_cols])

In [194]:
industry_features_2.head()

Unnamed: 0,id8,id2,f367_sum,f367_mean,f367_std,is_debit_sum,is_credit_sum,decayed_recency_score,avg_redemption_type,type_1_industry_count,type_2_industry_count,discount_mean,discount_std,avg_offer_duration,num_offers
0,7420000,2000081,-0.06462,-0.082546,-0.066099,-0.207857,-0.124024,0.201897,2.380141,-0.436577,-0.345509,-0.76059,-0.882791,-0.048124,-0.43309
1,7420000,2000149,0.010954,0.158429,-0.066099,-0.207857,-0.124024,0.040762,2.380141,-0.436577,-0.345509,-0.76059,-0.882791,-0.048124,-0.43309
2,7420000,2000212,-0.063492,-0.078949,-0.066099,-0.207857,-0.124024,0.040762,2.380141,-0.436577,-0.345509,-0.76059,-0.882791,-0.048124,-0.43309
3,7420000,2000290,-0.062193,-0.074809,-0.066099,-0.207857,-0.124024,0.082085,2.380141,-0.436577,-0.345509,-0.76059,-0.882791,-0.048124,-0.43309
4,7420000,2000587,-0.023144,0.049705,-0.066099,-0.207857,-0.124024,1.0,2.380141,-0.436577,-0.345509,-0.76059,-0.882791,-0.048124,-0.43309


In [195]:
# Check for duplicates of the same (id2, id8) pair
duplicate_pairs = industry_features_2.duplicated(subset=['id2', 'id8'])

# See if there are any such rows
has_duplicates = duplicate_pairs.any()

print("Any duplicate (id2, id8) pairs?", has_duplicates)


Any duplicate (id2, id8) pairs? False


### **Joining Datasets in Progress...**

### **Free up Memory**

In [196]:
import gc
import torch

# 1. Delete the tensors
del txn_df
gc.collect()
del txn_agg
gc.collect()
del offers_df
gc.collect()
del industry_features
gc.collect()
del txn_agg_2
gc.collect()
del offers_agg
gc.collect()

# 2. Run garbage collection
gc.collect()

# 3. If using GPU (optional, safe to include regardless)
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

### **Feature Engineering for id4 (Impression ts) and id7 (Click ts) from Events Data**

In [197]:
import pandas as pd
import numpy as np

# Step 1: Ensure id4 is datetime
events_df['id4'] = pd.to_datetime(events_df['id4'], errors='coerce')

# Step 2: Compute the latest timestamp in the entire dataset (used as reference)
latest_impression_time = events_df['id4'].max()

# Step 3: Get latest impression per (id2, id3) pair
latest_impressions = events_df.groupby(['id2', 'id3'])['id4'].max().reset_index()
latest_impressions.columns = ['id2', 'id3', 'latest_id4']

# Step 4: Compute "days since latest" impression
latest_impressions['days_since_latest'] = (latest_impression_time - latest_impressions['latest_id4']).dt.total_seconds() / (3600 * 24)

# Step 5: Compute exponential recency score
λ = 0.1  # Tunable decay factor
latest_impressions['recency_score'] = np.exp(-λ * latest_impressions['days_since_latest'])

# Final columns: id2, id3, recency_score
new_events_df = latest_impressions[['id2', 'id3', 'recency_score']]

new_events_df.head()

Unnamed: 0,id2,id3,recency_score
0,2000001,795739,0.456941
1,2000002,982820,0.520139
2,2000003,107213587,0.461675
3,2000003,187169,0.46169
4,2000003,23352,0.46168


In [198]:
new_events_df.shape

(12793562, 3)

In [199]:
new_events_df["id3"].nunique()

923

In [200]:
# Computing CTR for each customer-offer (id2-id3) pair

# Ensure datetime (just in case)
events_df['id7'] = pd.to_datetime(events_df['id7'], errors='coerce')

# Step 1: Create a binary flag for clicks
events_df['is_clicked'] = events_df['id7'].notnull().astype(int)

# Step 2: Group by (id2, id3) to compute impressions and clicks
ctr_df = events_df.groupby(['id2', 'id3']).agg(
    impressions=('id4', 'count'),       # Total events (impressions)
    clicks=('is_clicked', 'sum')        # Sum of 1s = total clicks
).reset_index()

# Step 3: Compute CTR safely (avoid division by zero)
ctr_df['ctr'] = ctr_df['clicks'] / ctr_df['impressions']

# Step 4: Merge CTR with new_events_df
new_events_df = new_events_df.merge(
    ctr_df[['id2', 'id3', 'ctr']],
    on=['id2', 'id3'],
    how='left'
)

new_events_df.head()

Unnamed: 0,id2,id3,recency_score,ctr
0,2000001,795739,0.456941,0.0
1,2000002,982820,0.520139,0.0
2,2000003,107213587,0.461675,0.0
3,2000003,187169,0.46169,0.0
4,2000003,23352,0.46168,0.0


In [201]:
# Checking normalisation for recency_score
(new_events_df["recency_score"] > 1).any()

False

In [202]:
# Checking normalisation for ctr
(new_events_df["ctr"] > 1).any()

False

In [203]:
new_events_df.isnull().sum()

id2              0
id3              0
recency_score    0
ctr              0
dtype: int64

In [204]:
new_events_df.shape

(12793562, 4)

In [205]:
new_events_df.duplicated().any()

False

In [206]:
new_events_df.drop_duplicates()

Unnamed: 0,id2,id3,recency_score,ctr
0,2000001,795739,0.456941,0.0
1,2000002,982820,0.520139,0.0
2,2000003,107213587,0.461675,0.0
3,2000003,187169,0.461690,0.0
4,2000003,23352,0.461680,0.0
...,...,...,...,...
12793557,2899654,65699008,0.976312,0.0
12793558,2899655,14996,0.291568,0.0
12793559,2899655,5411231,0.291567,0.0
12793560,2899655,795739,0.291567,0.0


In [207]:
# The maximum number of unique id2 values that any single id3 is associated with.
max_id2s_per_id3 = new_events_df.groupby('id3')['id2'].nunique().max()
print("Maximum number of id2s any id3 is associated with:", max_id2s_per_id3)

Maximum number of id2s any id3 is associated with: 37936


### **Merging the newly created events dataset with `id8`-`id2` pair from industry_features_2**

**Creating vector embedding of `dim = 32` for each `id8`-`id2` pair from `industry_features_2`**

In [208]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CustIndEncoder(nn.Module):
    def __init__(self, in_dim=15, hid=64, out_dim=32):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, hid)
        self.bn1 = nn.BatchNorm1d(hid)
        self.fc2 = nn.Linear(hid, out_dim)
        nn.init.kaiming_normal_(self.fc1.weight, nonlinearity='relu')
        nn.init.kaiming_normal_(self.fc2.weight, nonlinearity='relu')
    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        return self.fc2(x)  # (batch_size, 32)

encoder = CustIndEncoder(in_dim=industry_features_2.shape[1] - 2)  # minus id2,id8

# Prepare tensor of shape (2.2M, feature_dim)
feat_cols = [c for c in industry_features_2.columns if c not in ('id2','id8')]
X_ci = torch.tensor(industry_features_2[feat_cols].values, dtype=torch.float32)

# Compute embeddings for all (id2,id8) rows
with torch.no_grad():
    emb_ci = encoder(X_ci)  # (2119923, 32)

In [209]:
# Store back into the DataFrame:
industry_features_2['ci_emb'] = list(emb_ci.numpy())  # each row gets a 32‑vector
industry_features_2.head()

Unnamed: 0,id8,id2,f367_sum,f367_mean,f367_std,is_debit_sum,is_credit_sum,decayed_recency_score,avg_redemption_type,type_1_industry_count,type_2_industry_count,discount_mean,discount_std,avg_offer_duration,num_offers,ci_emb
0,7420000,2000081,-0.06462,-0.082546,-0.066099,-0.207857,-0.124024,0.201897,2.380141,-0.436577,-0.345509,-0.76059,-0.882791,-0.048124,-0.43309,"[-0.30181828, 0.23704995, 0.89154184, 0.902370..."
1,7420000,2000149,0.010954,0.158429,-0.066099,-0.207857,-0.124024,0.040762,2.380141,-0.436577,-0.345509,-0.76059,-0.882791,-0.048124,-0.43309,"[-0.41838616, 0.29809868, 0.95096123, 0.818207..."
2,7420000,2000212,-0.063492,-0.078949,-0.066099,-0.207857,-0.124024,0.040762,2.380141,-0.436577,-0.345509,-0.76059,-0.882791,-0.048124,-0.43309,"[-0.35612556, 0.2655586, 0.9437541, 0.91418624..."
3,7420000,2000290,-0.062193,-0.074809,-0.066099,-0.207857,-0.124024,0.082085,2.380141,-0.436577,-0.345509,-0.76059,-0.882791,-0.048124,-0.43309,"[-0.342164, 0.25928515, 0.93102026, 0.9088013,..."
4,7420000,2000587,-0.023144,0.049705,-0.066099,-0.207857,-0.124024,1.0,2.380141,-0.436577,-0.345509,-0.76059,-0.882791,-0.048124,-0.43309,"[-0.06040801, 0.109972715, 0.6046903, 0.707993..."


In [210]:
industry_features_2.isnull().sum()

id8                      0
id2                      0
f367_sum                 0
f367_mean                0
f367_std                 0
is_debit_sum             0
is_credit_sum            0
decayed_recency_score    0
avg_redemption_type      0
type_1_industry_count    0
type_2_industry_count    0
discount_mean            0
discount_std             0
avg_offer_duration       0
num_offers               0
ci_emb                   0
dtype: int64

In [211]:
# Checking the dimensions of the embedding
industry_features_2['list_length'] = industry_features_2['ci_emb'].apply(len)
(industry_features_2['list_length'] != 32).any()

False

In [212]:
industry_features_2.drop(columns = ['list_length'], inplace = True)
industry_features_2.head()

Unnamed: 0,id8,id2,f367_sum,f367_mean,f367_std,is_debit_sum,is_credit_sum,decayed_recency_score,avg_redemption_type,type_1_industry_count,type_2_industry_count,discount_mean,discount_std,avg_offer_duration,num_offers,ci_emb
0,7420000,2000081,-0.06462,-0.082546,-0.066099,-0.207857,-0.124024,0.201897,2.380141,-0.436577,-0.345509,-0.76059,-0.882791,-0.048124,-0.43309,"[-0.30181828, 0.23704995, 0.89154184, 0.902370..."
1,7420000,2000149,0.010954,0.158429,-0.066099,-0.207857,-0.124024,0.040762,2.380141,-0.436577,-0.345509,-0.76059,-0.882791,-0.048124,-0.43309,"[-0.41838616, 0.29809868, 0.95096123, 0.818207..."
2,7420000,2000212,-0.063492,-0.078949,-0.066099,-0.207857,-0.124024,0.040762,2.380141,-0.436577,-0.345509,-0.76059,-0.882791,-0.048124,-0.43309,"[-0.35612556, 0.2655586, 0.9437541, 0.91418624..."
3,7420000,2000290,-0.062193,-0.074809,-0.066099,-0.207857,-0.124024,0.082085,2.380141,-0.436577,-0.345509,-0.76059,-0.882791,-0.048124,-0.43309,"[-0.342164, 0.25928515, 0.93102026, 0.9088013,..."
4,7420000,2000587,-0.023144,0.049705,-0.066099,-0.207857,-0.124024,1.0,2.380141,-0.436577,-0.345509,-0.76059,-0.882791,-0.048124,-0.43309,"[-0.06040801, 0.109972715, 0.6046903, 0.707993..."


In [213]:
industry_features_2.shape

(2119923, 16)

In [214]:
industry_features_2["id2"].nunique()

188715

In [215]:
num_unique_pairs = industry_features_2[['id2', 'id8']].drop_duplicates().shape[0]
print("Number of unique (id2, id8) pairs:", num_unique_pairs)

Number of unique (id2, id8) pairs: 2119923


In [216]:
industry_features_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2119923 entries, 0 to 2119922
Data columns (total 16 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   id8                    object 
 1   id2                    int32  
 2   f367_sum               float64
 3   f367_mean              float64
 4   f367_std               float64
 5   is_debit_sum           float64
 6   is_credit_sum          float64
 7   decayed_recency_score  float64
 8   avg_redemption_type    float64
 9   type_1_industry_count  float64
 10  type_2_industry_count  float64
 11  discount_mean          float64
 12  discount_std           float64
 13  avg_offer_duration     float64
 14  num_offers             float64
 15  ci_emb                 object 
dtypes: float64(13), int32(1), object(2)
memory usage: 250.7+ MB


In [217]:
new_events_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12793562 entries, 0 to 12793561
Data columns (total 4 columns):
 #   Column         Dtype  
---  ------         -----  
 0   id2            int32  
 1   id3            object 
 2   recency_score  float64
 3   ctr            float64
dtypes: float64(2), int32(1), object(1)
memory usage: 341.6+ MB


In [218]:
train_df_averaged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 747696 entries, 0 to 747695
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   id2     747696 non-null  int64
 1   id3     747696 non-null  int64
 2   y       747696 non-null  int64
dtypes: int64(3)
memory usage: 17.1 MB


### **Download Datasets**

In [208]:
industry_features_2.to_csv("industry_features_2.csv", index=False)
new_events_df.to_csv("new_events_df.csv", index=False)
train_df_averaged.to_csv("train_df_averaged.csv", index=False)

In [209]:
import torch

torch.save(user_embedding, "user_embedding.pt")

### **Post Session Crash, Code Starts here**

### **Load the Datasets**

In [219]:
# Basic setup
import pandas as pd
import numpy as np
import os
import gc
from tqdm import tqdm

# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Visualization and debugging
import seaborn as sns
import matplotlib.pyplot as plt

# Time processing
from datetime import datetime

In [220]:
industry_features_2 = pd.read_parquet("/kaggle/input/new-dataset-5/industry_features_2.parquet")
industry_features_2.head()

Unnamed: 0,id8,id2,f367_sum,f367_mean,f367_std,is_debit_sum,is_credit_sum,decayed_recency_score,avg_redemption_type,type_1_industry_count,type_2_industry_count,discount_mean,discount_std,avg_offer_duration,num_offers,ci_emb
0,7420000,2000081,-0.06462,-0.082546,-0.066099,-0.207857,-0.124024,0.201897,2.380141,-0.436577,-0.345509,-0.76059,-0.882791,-0.048124,-0.43309,"[0.76429987, -0.08314366, -0.17707737, -0.5732..."
1,7420000,2000149,0.010954,0.158429,-0.066099,-0.207857,-0.124024,0.040762,2.380141,-0.436577,-0.345509,-0.76059,-0.882791,-0.048124,-0.43309,"[0.7853725, -0.18316688, -0.2903962, -0.593909..."
2,7420000,2000212,-0.063492,-0.078949,-0.066099,-0.207857,-0.124024,0.040762,2.380141,-0.436577,-0.345509,-0.76059,-0.882791,-0.048124,-0.43309,"[0.7329364, -0.11038102, -0.26233298, -0.55286..."
3,7420000,2000290,-0.062193,-0.074809,-0.066099,-0.207857,-0.124024,0.082085,2.380141,-0.436577,-0.345509,-0.76059,-0.882791,-0.048124,-0.43309,"[0.7423198, -0.10513524, -0.24120899, -0.55893..."
4,7420000,2000587,-0.023144,0.049705,-0.066099,-0.207857,-0.124024,1.0,2.380141,-0.436577,-0.345509,-0.76059,-0.882791,-0.048124,-0.43309,"[1.020146, -0.009802185, 0.20149122, -0.697463..."


In [221]:
# Checking the dimensions of the embedding
industry_features_2['list_length'] = industry_features_2['ci_emb'].apply(len)
(industry_features_2['list_length'] != 32).any()

False

In [222]:
industry_features_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2119923 entries, 0 to 2119922
Data columns (total 17 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   id8                    object 
 1   id2                    int32  
 2   f367_sum               float64
 3   f367_mean              float64
 4   f367_std               float64
 5   is_debit_sum           float64
 6   is_credit_sum          float64
 7   decayed_recency_score  float64
 8   avg_redemption_type    float64
 9   type_1_industry_count  float64
 10  type_2_industry_count  float64
 11  discount_mean          float64
 12  discount_std           float64
 13  avg_offer_duration     float64
 14  num_offers             float64
 15  ci_emb                 object 
 16  list_length            int64  
dtypes: float64(13), int32(1), int64(1), object(2)
memory usage: 266.9+ MB


In [223]:
industry_features_2.drop(columns = ['list_length'], inplace = True)
industry_features_2.head()

Unnamed: 0,id8,id2,f367_sum,f367_mean,f367_std,is_debit_sum,is_credit_sum,decayed_recency_score,avg_redemption_type,type_1_industry_count,type_2_industry_count,discount_mean,discount_std,avg_offer_duration,num_offers,ci_emb
0,7420000,2000081,-0.06462,-0.082546,-0.066099,-0.207857,-0.124024,0.201897,2.380141,-0.436577,-0.345509,-0.76059,-0.882791,-0.048124,-0.43309,"[0.76429987, -0.08314366, -0.17707737, -0.5732..."
1,7420000,2000149,0.010954,0.158429,-0.066099,-0.207857,-0.124024,0.040762,2.380141,-0.436577,-0.345509,-0.76059,-0.882791,-0.048124,-0.43309,"[0.7853725, -0.18316688, -0.2903962, -0.593909..."
2,7420000,2000212,-0.063492,-0.078949,-0.066099,-0.207857,-0.124024,0.040762,2.380141,-0.436577,-0.345509,-0.76059,-0.882791,-0.048124,-0.43309,"[0.7329364, -0.11038102, -0.26233298, -0.55286..."
3,7420000,2000290,-0.062193,-0.074809,-0.066099,-0.207857,-0.124024,0.082085,2.380141,-0.436577,-0.345509,-0.76059,-0.882791,-0.048124,-0.43309,"[0.7423198, -0.10513524, -0.24120899, -0.55893..."
4,7420000,2000587,-0.023144,0.049705,-0.066099,-0.207857,-0.124024,1.0,2.380141,-0.436577,-0.345509,-0.76059,-0.882791,-0.048124,-0.43309,"[1.020146, -0.009802185, 0.20149122, -0.697463..."


In [224]:
industry_features_2.shape

(2119923, 16)

In [225]:
industry_features_2.isnull().sum().any()

False

In [226]:
new_events_df = pd.read_parquet("/kaggle/input/new-dataset-6/new_events_df.parquet")
new_events_df.head()

Unnamed: 0,id2,id3,recency_score,ctr
0,2000001,795739,0.456941,0.0
1,2000002,982820,0.520139,0.0
2,2000003,107213587,0.461675,0.0
3,2000003,187169,0.46169,0.0
4,2000003,23352,0.46168,0.0


In [227]:
new_events_df.shape

(12793562, 4)

In [228]:
new_events_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12793562 entries, 0 to 12793561
Data columns (total 4 columns):
 #   Column         Dtype  
---  ------         -----  
 0   id2            int32  
 1   id3            object 
 2   recency_score  float64
 3   ctr            float64
dtypes: float64(2), int32(1), object(1)
memory usage: 341.6+ MB


**Merging the embeddings with offer ids (`id3`)**

In [229]:
import pandas as pd
import numpy as np

# Step 1: Create a lookup dictionary for id2 → ci_emb
id2_emb_map = dict(zip(industry_features_2['id2'], industry_features_2['ci_emb']))

# Step 2: Define function to apply embedding from lookup
def attach_ci_emb(chunk):
    chunk['ci_emb'] = chunk['id2'].map(id2_emb_map)
    return chunk

# Step 3: Split new_events_df into manageable chunks
chunk_size = 500_000  # Tune based on available memory
num_chunks = (len(new_events_df) // chunk_size) + 1

chunks = []
for i in range(num_chunks):
    start = i * chunk_size
    end = (i + 1) * chunk_size
    chunk = new_events_df.iloc[start:end].copy()
    chunk = attach_ci_emb(chunk)
    chunks.append(chunk)

# Step 4: Concatenate all processed chunks
offer2ci = pd.concat(chunks, ignore_index=True)

# Step 5: Drop rows where embedding not found - where id2 do not match between new_events_df and industry_feature_2
offer2ci = offer2ci.dropna(subset=['ci_emb'])

print("Merged shape:", offer2ci.shape)
offer2ci.head()

Merged shape: (5498738, 5)


Unnamed: 0,id2,id3,recency_score,ctr,ci_emb
134,2000010,176404,0.309727,0.0,"[0.11537871, -0.52318406, -0.41542834, 0.14122..."
135,2000010,184034,0.932269,0.0,"[0.11537871, -0.52318406, -0.41542834, 0.14122..."
136,2000010,23398,0.759511,0.0,"[0.11537871, -0.52318406, -0.41542834, 0.14122..."
137,2000010,31794,0.93227,0.0,"[0.11537871, -0.52318406, -0.41542834, 0.14122..."
138,2000010,5718,0.309727,0.0,"[0.11537871, -0.52318406, -0.41542834, 0.14122..."


In [230]:
offer2ci.isnull().sum()

id2              0
id3              0
recency_score    0
ctr              0
ci_emb           0
dtype: int64

In [231]:
# Checking the dimensions of the embedding
offer2ci['list_length'] = offer2ci['ci_emb'].apply(len)
(offer2ci['list_length'] != 32).any()

False

In [232]:
offer2ci.head()

Unnamed: 0,id2,id3,recency_score,ctr,ci_emb,list_length
134,2000010,176404,0.309727,0.0,"[0.11537871, -0.52318406, -0.41542834, 0.14122...",32
135,2000010,184034,0.932269,0.0,"[0.11537871, -0.52318406, -0.41542834, 0.14122...",32
136,2000010,23398,0.759511,0.0,"[0.11537871, -0.52318406, -0.41542834, 0.14122...",32
137,2000010,31794,0.93227,0.0,"[0.11537871, -0.52318406, -0.41542834, 0.14122...",32
138,2000010,5718,0.309727,0.0,"[0.11537871, -0.52318406, -0.41542834, 0.14122...",32


In [233]:
offer2ci.drop(columns = ['list_length'], inplace = True)
offer2ci.head()

Unnamed: 0,id2,id3,recency_score,ctr,ci_emb
134,2000010,176404,0.309727,0.0,"[0.11537871, -0.52318406, -0.41542834, 0.14122..."
135,2000010,184034,0.932269,0.0,"[0.11537871, -0.52318406, -0.41542834, 0.14122..."
136,2000010,23398,0.759511,0.0,"[0.11537871, -0.52318406, -0.41542834, 0.14122..."
137,2000010,31794,0.93227,0.0,"[0.11537871, -0.52318406, -0.41542834, 0.14122..."
138,2000010,5718,0.309727,0.0,"[0.11537871, -0.52318406, -0.41542834, 0.14122..."


In [234]:
offer2ci["id2"].nunique()

112784

In [235]:
offer2ci["id3"].nunique()

865

**Only one vector embedding assigned to each unique offer id (`id3`)**

In [236]:
import numpy as np

# Step 1: Prepare raw data
offer_ids = offer2ci['id3'].values
embs      = np.stack(offer2ci['ci_emb'].values, axis=0)  # shape (N_rows, 32)
ctrs      = offer2ci['ctr'].values
recencies = offer2ci['recency_score'].values

# Step 2: Initialize aggregation containers
emb_sums = {}
ctr_sums = {}
recency_max = {}
counts = {}

# Step 3: Loop through all rows and accumulate
for oid, vec, ctr, rec in zip(offer_ids, embs, ctrs, recencies):
    if oid not in emb_sums:
        emb_sums[oid] = vec.copy()
        ctr_sums[oid] = ctr
        recency_max[oid] = rec
        counts[oid] = 1
    else:
        emb_sums[oid] += vec
        ctr_sums[oid] += ctr
        recency_max[oid] = max(recency_max[oid], rec)
        counts[oid] += 1

# Step 4: Final per-offer output dictionary
item_embs_dict = {
    oid: {
        'mean_ci_emb': emb_sums[oid] / counts[oid],
        'mean_ctr': ctr_sums[oid] / counts[oid],
        'max_recency': recency_max[oid]
    }
    for oid in emb_sums
}

In [237]:
# Printing the dictionary

for i, (oid, features) in enumerate(item_embs_dict.items()):
    print(f"Offer ID: {oid}")
    print(f"  mean_ci_emb (shape {features['mean_ci_emb'].shape}): {features['mean_ci_emb']}")
    print(f"  mean_ctr: {features['mean_ctr']}")
    print(f"  max_recency: {features['max_recency']}")
    print("-" * 40)
    
    if i == 4:  # stop after 5 entries
        break

Offer ID: 176404
  mean_ci_emb (shape (32,)): [ 0.07407805  0.2746413  -1.1541016  -0.3455035  -0.16299927  0.06888774
  0.20126404  0.2718584  -0.39210606  0.3324101  -0.5336381  -0.54025257
 -0.290221   -0.08080662  0.3865123  -0.19144778 -0.07731491 -0.23414448
 -0.6254087   0.2828568  -0.4607382   0.52854913 -0.26970848 -0.4544078
 -0.3889842  -0.5133744   0.08778016 -0.02401526 -0.27835527 -0.03683136
 -0.35195854 -0.46189   ]
  mean_ctr: 0.009607113827723338
  max_recency: 0.9999545149233664
----------------------------------------
Offer ID: 184034
  mean_ci_emb (shape (32,)): [ 0.07174673  0.28379712 -1.1815124  -0.35866165 -0.15932688  0.06679387
  0.20567134  0.27528894 -0.39183986  0.33889383 -0.5499515  -0.54820424
 -0.2897854  -0.07638008  0.3931     -0.20249546 -0.08285243 -0.22743516
 -0.6204503   0.27974778 -0.45876107  0.5286038  -0.26579097 -0.47734362
 -0.3882834  -0.524697    0.09428831 -0.01363788 -0.27542967 -0.03104591
 -0.36885172 -0.47756046]
  mean_ctr: 0.01227

**Converting the dict into a dataframe**

In [238]:
import pandas as pd

# Step 5: Convert dictionary to DataFrame
item_df = pd.DataFrame.from_dict(item_embs_dict, orient='index')

# Optional: Reset index to make 'id3' a column instead of index
item_df = item_df.reset_index().rename(columns={'index': 'id3'})

print(item_df.shape)
item_df.head()

(865, 4)


Unnamed: 0,id3,mean_ci_emb,mean_ctr,max_recency
0,176404,"[0.07407805, 0.2746413, -1.1541016, -0.3455035...",0.009607,0.999955
1,184034,"[0.07174673, 0.28379712, -1.1815124, -0.358661...",0.012276,0.999951
2,23398,"[0.068889834, 0.27875704, -1.1535425, -0.35130...",0.006144,0.999961
3,31794,"[0.06897099, 0.3038569, -1.2212487, -0.3552527...",0.054602,0.999962
4,5718,"[0.080034964, 0.2702342, -1.1526369, -0.347990...",0.005627,0.999832


In [239]:
item_df.isnull().sum()

id3            0
mean_ci_emb    0
mean_ctr       0
max_recency    0
dtype: int64

In [240]:
# Checking the dimensions of the embedding
item_df['list_length'] = item_df['mean_ci_emb'].apply(len)
(item_df['list_length'] != 32).any()

False

In [241]:
item_df.drop(columns = ['list_length'], inplace = True)
item_df.head()

Unnamed: 0,id3,mean_ci_emb,mean_ctr,max_recency
0,176404,"[0.07407805, 0.2746413, -1.1541016, -0.3455035...",0.009607,0.999955
1,184034,"[0.07174673, 0.28379712, -1.1815124, -0.358661...",0.012276,0.999951
2,23398,"[0.068889834, 0.27875704, -1.1535425, -0.35130...",0.006144,0.999961
3,31794,"[0.06897099, 0.3038569, -1.2212487, -0.3552527...",0.054602,0.999962
4,5718,"[0.080034964, 0.2702342, -1.1526369, -0.347990...",0.005627,0.999832


**Converting the `mean_ci_emb` into 32 separate columns (`dim = 32`) instead of lists**

In [242]:
import pandas as pd
import numpy as np

# Step 1: Expand the mean_ci_emb column into 32 separate columns
emb_cols = pd.DataFrame(item_df['mean_ci_emb'].tolist(), 
                        columns=[f'ci_emb_{i}' for i in range(32)])

# Step 2: Concatenate with the rest of item_df (excluding original mean_ci_emb)
item_embs_df = pd.concat([item_df.drop(columns=['mean_ci_emb']), emb_cols], axis=1)

print(item_embs_df.shape)
item_embs_df.head()


(865, 35)


Unnamed: 0,id3,mean_ctr,max_recency,ci_emb_0,ci_emb_1,ci_emb_2,ci_emb_3,ci_emb_4,ci_emb_5,ci_emb_6,...,ci_emb_22,ci_emb_23,ci_emb_24,ci_emb_25,ci_emb_26,ci_emb_27,ci_emb_28,ci_emb_29,ci_emb_30,ci_emb_31
0,176404,0.009607,0.999955,0.074078,0.274641,-1.154102,-0.345504,-0.162999,0.068888,0.201264,...,-0.269708,-0.454408,-0.388984,-0.513374,0.08778,-0.024015,-0.278355,-0.036831,-0.351959,-0.46189
1,184034,0.012276,0.999951,0.071747,0.283797,-1.181512,-0.358662,-0.159327,0.066794,0.205671,...,-0.265791,-0.477344,-0.388283,-0.524697,0.094288,-0.013638,-0.27543,-0.031046,-0.368852,-0.47756
2,23398,0.006144,0.999961,0.06889,0.278757,-1.153543,-0.351307,-0.156932,0.06657,0.203517,...,-0.265371,-0.452915,-0.377224,-0.51789,0.089454,-0.023409,-0.278999,-0.038303,-0.352812,-0.463697
3,31794,0.054602,0.999962,0.068971,0.303857,-1.221249,-0.355253,-0.159546,0.070333,0.198554,...,-0.286841,-0.478586,-0.416443,-0.535675,0.075576,-0.026504,-0.265779,-0.046006,-0.362975,-0.461894
4,5718,0.005627,0.999832,0.080035,0.270234,-1.152637,-0.34799,-0.164516,0.063571,0.201354,...,-0.257521,-0.462187,-0.391506,-0.510224,0.079053,-0.019742,-0.283018,-0.036124,-0.362298,-0.469362


In [243]:
item_embs_df.columns

Index(['id3', 'mean_ctr', 'max_recency', 'ci_emb_0', 'ci_emb_1', 'ci_emb_2',
       'ci_emb_3', 'ci_emb_4', 'ci_emb_5', 'ci_emb_6', 'ci_emb_7', 'ci_emb_8',
       'ci_emb_9', 'ci_emb_10', 'ci_emb_11', 'ci_emb_12', 'ci_emb_13',
       'ci_emb_14', 'ci_emb_15', 'ci_emb_16', 'ci_emb_17', 'ci_emb_18',
       'ci_emb_19', 'ci_emb_20', 'ci_emb_21', 'ci_emb_22', 'ci_emb_23',
       'ci_emb_24', 'ci_emb_25', 'ci_emb_26', 'ci_emb_27', 'ci_emb_28',
       'ci_emb_29', 'ci_emb_30', 'ci_emb_31'],
      dtype='object')

In [244]:
item_embs_df["max_recency"].max()

1.0

In [245]:
item_embs_df["max_recency"].min()

0.30096960690904456

In [246]:
item_embs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 865 entries, 0 to 864
Data columns (total 35 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id3          865 non-null    object 
 1   mean_ctr     865 non-null    float64
 2   max_recency  865 non-null    float64
 3   ci_emb_0     865 non-null    float32
 4   ci_emb_1     865 non-null    float32
 5   ci_emb_2     865 non-null    float32
 6   ci_emb_3     865 non-null    float32
 7   ci_emb_4     865 non-null    float32
 8   ci_emb_5     865 non-null    float32
 9   ci_emb_6     865 non-null    float32
 10  ci_emb_7     865 non-null    float32
 11  ci_emb_8     865 non-null    float32
 12  ci_emb_9     865 non-null    float32
 13  ci_emb_10    865 non-null    float32
 14  ci_emb_11    865 non-null    float32
 15  ci_emb_12    865 non-null    float32
 16  ci_emb_13    865 non-null    float32
 17  ci_emb_14    865 non-null    float32
 18  ci_emb_15    865 non-null    float32
 19  ci_emb_1

**Converting the `id3` of `item_embs_df` from object type to `int64`**

In [247]:
item_embs_df['id3'] = item_embs_df['id3'].astype('int64')
print(item_embs_df.dtypes)

id3              int64
mean_ctr       float64
max_recency    float64
ci_emb_0       float32
ci_emb_1       float32
ci_emb_2       float32
ci_emb_3       float32
ci_emb_4       float32
ci_emb_5       float32
ci_emb_6       float32
ci_emb_7       float32
ci_emb_8       float32
ci_emb_9       float32
ci_emb_10      float32
ci_emb_11      float32
ci_emb_12      float32
ci_emb_13      float32
ci_emb_14      float32
ci_emb_15      float32
ci_emb_16      float32
ci_emb_17      float32
ci_emb_18      float32
ci_emb_19      float32
ci_emb_20      float32
ci_emb_21      float32
ci_emb_22      float32
ci_emb_23      float32
ci_emb_24      float32
ci_emb_25      float32
ci_emb_26      float32
ci_emb_27      float32
ci_emb_28      float32
ci_emb_29      float32
ci_emb_30      float32
ci_emb_31      float32
dtype: object


In [248]:
item_embs_df.isnull().sum().any()

False

### **Number of common offer ids between `train_df` (`train_df_copy`) and `item_embs_df`**

In [249]:
# import pandas as pd

# def check_common_offer_ids(train_df_copy, item_embs_df, id_column='id3'):
#     """
#     Checks how many unique offer IDs are common between train and item embedding datasets.
    
#     Args:
#         train_df_copy (pd.DataFrame): Training dataset id3s.
#         item_embs_df (pd.DataFrame): Item embeddings dataset.
#         id_column (str): Column name representing offer ID (default: 'id3').
        
#     Returns:
#         dict: Summary containing counts
#     """
#     train_offer_ids       =   set(train_df_copy[id_column].unique())
#     item_embs_offer_ids   =   set(item_embs_df[id_column].unique())
    
#     common_offer_ids      =   train_offer_ids.intersection(item_embs_offer_ids)
    
#     result = {
#         "train_unique_offer_ids": len(train_offer_ids),
#         "item_embs_unique_offer_ids": len(item_embs_offer_ids),
#         "common_offer_ids": len(common_offer_ids)
#     }
    
#     return result


# result = check_common_offer_ids(train_df_copy, item_embs_df)

# print("Summary:")
# for key, value in result.items():
#     print(f"{key}: {value}")

### **Number of common offer ids between `test_df` and `item_embs_df`**

In [250]:
# import pandas as pd

# def check_common_offer_ids(test_df, item_embs_df, id_column='id3'):
#     """
#     Checks how many unique offer IDs are common between test and item embedding datasets.
    
#     Args:
#         test_df (pd.DataFrame): Testing dataset id3s.
#         item_embs_df (pd.DataFrame): Item embeddings dataset.
#         id_column (str): Column name representing offer ID (default: 'id3').
        
#     Returns:
#         dict: Summary containing counts
#     """
#     test_offer_ids       =   set(test_df[id_column].unique())
#     item_embs_offer_ids  =   set(item_embs_df[id_column].unique())
    
#     common_offer_ids     =   test_offer_ids.intersection(item_embs_offer_ids)
    
#     result = {
#         "test_unique_offer_ids": len(test_offer_ids),
#         "item_embs_unique_offer_ids": len(item_embs_offer_ids),
#         "common_offer_ids": len(common_offer_ids)
#     }
    
#     return result


# result = check_common_offer_ids(test_df, item_embs_df)

# print("Summary:")
# for key, value in result.items():
#     print(f"{key}: {value}")

### **Number of common offer ids between `train_df` and `test_df`**

In [251]:
# import pandas as pd

# def check_common_offer_ids(train_df_copy, test_df, id_column='id3'):
#     """
#     Checks how many unique offer IDs are common between train and test datasets.
    
#     Args:
#         train_df_copy (pd.DataFrame): Training dataset id3s.
#         test_df (pd.DataFrame): Testing dataset id3s.
#         id_column (str): Column name representing offer ID (default: 'id3').
        
#     Returns:
#         dict: Summary containing counts
#     """
#     train_offer_ids    =   set(train_df[id_column].unique())
#     test_offer_ids     =   set(test_df[id_column].unique())
    
#     common_offer_ids   =   train_offer_ids.intersection(test_offer_ids)
    
#     result = {
#         "train_unique_offer_ids": len(train_offer_ids),
#         "test_unique_offer_ids": len(test_offer_ids),
#         "common_offer_ids": len(common_offer_ids)
#     }
    
#     return result


# result = check_common_offer_ids(train_df_copy, test_df)

# print("Summary:")
# for key, value in result.items():
#     print(f"{key}: {value}")

### **Building the Items (Offers) Tower Architecture**

In [252]:
import torch
import pandas as pd

# Assuming item_embs_df is your final DataFrame
# Exclude 'id3' (it's the offer id, not a feature)
item_features_df = item_embs_df.drop(columns=['id3'])

# Convert to PyTorch tensor
item_input_tensor = torch.tensor(item_features_df.values, dtype=torch.float32)

print(item_input_tensor.shape)  # Should be (865, 34) → 32-d embedding + 1 CTR + 1 recency

torch.Size([865, 34])


In [253]:
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init

class ItemTower(nn.Module):
    def __init__(self, input_dim, embedding_dim=64):  # Final embedding size = 64
        super(ItemTower, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.bn1 = nn.BatchNorm1d(128)

        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)

        self.dropout = nn.Dropout(0.2)
        self.out = nn.Linear(64, embedding_dim)

        self._init_weights()

    def _init_weights(self):
        for layer in [self.fc1, self.fc2, self.out]:
            init.kaiming_normal_(layer.weight, nonlinearity='relu')
            if layer.bias is not None:
                init.zeros_(layer.bias)

    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        return self.out(x)

In [254]:
item_tower = ItemTower(input_dim=item_input_tensor.shape[1], embedding_dim=64)
with torch.no_grad():
    item_embedding = item_tower(item_input_tensor)

print(item_embedding.shape)  # Should be (865, 64)

torch.Size([865, 64])


### **Loading the `user_embedding` tensor**

In [255]:
import torch

# Load the tensor
user_embedding = torch.load("/kaggle/input/new-user-embedding/user_embedding (1).pt")
print(user_embedding.shape) # Should be (747696, 64)

torch.Size([747696, 64])


In [256]:
type(user_embedding)

torch.Tensor

### **Loading the `train_df_averaged` dataframe**

**This dataframe contains only `id2` and `id3` (`int64` format) after averaging the duplicate rows with the same `id2`-`id3` pair in the training data**

In [257]:
train_df_averaged = pd.read_parquet("/kaggle/input/new-train-df-averaged/train_df_averaged.parquet")

print(f"Shape of the train averaged data: {train_df_averaged.shape}")
train_df_averaged.head()

Shape of the train averaged data: (747696, 3)


Unnamed: 0,id2,id3,y
0,1366776,189706075,0
1,1366776,89227,0
2,1366776,35046,0
3,1366776,6275451,0
4,1366776,78053,0


In [258]:
train_df_averaged.isnull().sum()

id2    0
id3    0
y      0
dtype: int64

In [259]:
train_df_averaged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 747696 entries, 0 to 747695
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   id2     747696 non-null  int64
 1   id3     747696 non-null  int64
 2   y       747696 non-null  int64
dtypes: int64(3)
memory usage: 17.1 MB


### **Creating Lookup Embedding Dictionaries**

In [260]:
import torch

# A. User dict — keyed by (id2, id3)
user_df = train_df_averaged[['id2', 'id3']].copy()
user_df['emb_index'] = range(len(user_df))  # to map to user_embedding rows

# Map (id2, id3) → user embedding
user_dict = {
    (row.id2, row.id3): user_embedding[row.emb_index]
    for _, row in user_df.iterrows()
}

# B. Item dict — keyed by id3 (assume order matches)
unique_item_ids = item_embs_df['id3'].tolist()  # Use your item_embs_df id3
item_dict = {
    id3: item_embedding[idx]
    for idx, id3 in enumerate(unique_item_ids)
}

In [261]:
# Printing the user_dict

for i, (key, value) in enumerate(user_dict.items()):
    print(f"{key}: {value}")
    if i >= 1:
        break

(1366776, 189706075): tensor([ 1.3147, -0.3340,  0.0460,  0.8703, -0.2112, -0.4048, -0.8537, -0.0279,
         1.0923,  0.2303,  1.6393,  0.7350,  0.6652, -0.4836,  0.5160,  0.6767,
        -0.5574, -0.1991,  0.2946, -0.3059,  0.6233,  0.9233,  0.2280,  1.1472,
        -0.4959, -0.2782,  0.3556,  0.0962,  1.3734,  0.2468,  0.1084, -0.8467,
         0.7906, -1.0927,  0.9175, -0.1942, -0.2701,  0.5825,  0.1865, -0.4412,
         0.9341, -0.5465,  0.7604,  1.3847,  0.1844,  1.1734, -0.2488, -0.8466,
        -0.8898,  1.1021,  0.1183,  0.6243, -0.3858,  0.8607, -1.0025,  0.9792,
        -0.0176, -1.1017, -0.6438,  0.1403,  0.2864, -0.0399,  0.2352, -1.0797])
(1366776, 89227): tensor([ 0.1888,  0.7250, -0.0696,  0.6640, -0.1850, -0.3519, -0.1113, -0.1358,
         1.3268,  0.5400,  0.8938,  0.3964, -0.1310, -0.1874,  0.5257,  0.3886,
         0.3846, -0.8961, -0.9273, -0.1167,  0.6958, -0.6397,  0.3611,  0.0349,
        -0.9621,  0.1525, -0.1762,  0.0505, -0.4647, -0.6340, -0.6094, -1.0658,

In [262]:
# Print total number of users (keys)
print(f"\nTotal number of user-offer pair in user_dict: {len(user_dict)}")


Total number of user-offer pair in user_dict: 747696


In [263]:
for key in user_dict.keys():
    print(type(key))
    break  # just print the dtype of the first key

<class 'tuple'>


In [264]:
# Inspect the types of elements inside the first key tuple
for key in user_dict.keys():
    print(f"Key: {key}")
    print(f"Type of key[0]: {type(key[0])}")
    print(f"Type of key[1]: {type(key[1])}")
    break  

Key: (1366776, 189706075)
Type of key[0]: <class 'numpy.int64'>
Type of key[1]: <class 'numpy.int64'>


In [265]:
# Printing the item_dict

for i, (key, value) in enumerate(item_dict.items()):
    print(f"{key}: {value}")
    if i >= 1:
        break


176404: tensor([-0.1587,  0.0637,  0.3635,  0.1030,  0.0382,  0.3102,  0.3028,  0.0371,
        -0.0325, -0.1043,  0.0690,  0.5710,  0.0304, -0.3389,  0.3351,  0.3428,
        -0.0678,  0.1817, -0.2916, -0.1922, -0.4435, -0.2195,  0.1153,  0.6057,
        -0.2944, -0.0186, -0.3247,  0.4952,  0.4307,  0.1240,  0.0913, -0.3882,
         0.9533, -0.1198,  0.2527,  0.2804,  0.1598, -0.4970, -0.0152, -0.0194,
         0.2710,  0.0744,  0.1787, -0.3781,  1.0545,  0.0386,  0.1399, -0.4610,
        -0.2317,  0.2909, -0.0712, -0.4793,  0.4826,  0.1336,  0.4207,  0.3976,
         0.2492, -0.4699, -0.1336,  0.3557,  0.0054, -0.2290, -0.0809,  0.4141])
184034: tensor([ 0.2039,  0.1861,  0.2204,  0.0338,  0.1428,  0.2906,  0.1173,  0.1384,
        -0.1720, -0.0669,  0.0967,  0.2722, -0.1171, -0.0357,  0.4285,  0.1480,
        -0.2413, -0.2198, -0.0434, -0.3404,  0.0806, -0.6579,  0.0703,  0.8620,
        -0.2814, -0.0506, -0.3472,  0.6411,  0.3297,  0.2446,  0.0965, -0.3632,
         0.6890, -0.130

In [266]:
# Print total number of offers (keys)
print(f"\nTotal number of offers in item_dict: {len(item_dict)}")


Total number of offers in item_dict: 865


In [267]:
for key in item_dict.keys():
    print(type(key))
    break  # just print the dtype of the first key

<class 'int'>


### **Creating item embeddings for missing offer ids (in train/test but not in `item_embs_df`)**

- **Global average of all the known item embeddings (865 known)**
- **Has to be used in `test_df` but optional for `train_df`, consider drop that missing id3 row**

In [268]:
# import torch

# # Step 1: Compute global mean embedding
# all_item_embs = torch.stack(list(item_dict.values()))  # shape: (num_items, 64)
# global_mean_emb = all_item_embs.mean(dim=0)

# # Step 2: Impute missing entries
# for missing_id3 in ['754452', '453195']:
#     item_dict[missing_id3] = global_mean_emb.clone()  # clone to avoid reference sharing

**Removing the missing entry from user_dict (NOT RECOMMENDED for `test_df`)**

In [269]:
user_dict.pop((1088379, 754452), None)
print(f"\nTotal number of user-offer pair in user_dict after removal: {len(user_dict)}")


Total number of user-offer pair in user_dict after removal: 747695


### **Loading the train data only with identifiers and labels** (`train_df_copy`)

In [270]:
train_df_copy = pd.read_parquet("/kaggle/input/new-train-df-copy/train_df_copy.parquet")

print(f"Shape of the train_df_copy: {train_df_copy.shape}")
train_df_copy.head()

Shape of the train_df_copy: (770164, 3)


Unnamed: 0,id2,id3,y
0,1366776,189706075,0
1,1366776,89227,0
2,1366776,35046,0
3,1366776,6275451,0
4,1366776,78053,0


**Removing the missing entry from `train_df_copy` as well**

In [271]:
train_df_copy = train_df_copy[~((train_df_copy["id2"] == 1088379) & (train_df_copy["id3"] == 754452))]

In [272]:
print(train_df_copy[(train_df_copy["id2"] == 1088379) & (train_df_copy["id3"] == 754452)])
# Should return an empty DataFrame

Empty DataFrame
Columns: [id2, id3, y]
Index: []


In [273]:
train_df_copy.shape

(770163, 3)

### **Training the Model**

**STEP 1: Imports**

In [274]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

**STEP 2: Custom PyTorch Dataset class**

In [275]:
class RecSysDataset(nn.Module):
    def __init__(self, df, user_emb_dict, item_emb_dict):
        self.df = df.reset_index(drop=True)
        self.user_emb_dict = user_emb_dict
        self.item_emb_dict = item_emb_dict

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        uid, iid = row['id2'], row['id3']
        label = torch.tensor(row['y'], dtype=torch.float32)

        user_emb = self.user_emb_dict[(uid, iid)]
        item_emb = self.item_emb_dict[iid]

        return user_emb.detach(), item_emb.detach(), label

**STEP 3: Define dot product scoring func**

In [276]:
def dot_product_score(user_vecs, item_vecs):
    return torch.sum(user_vecs * item_vecs, dim=1)

**STEP 4: Define a simple MLP Tower**

In [277]:
class TowerMLP(nn.Module):
    def __init__(self, input_dim, embedding_dim=64):
        super(TowerMLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.bn1 = nn.BatchNorm1d(256)

        self.fc2 = nn.Linear(256, 128)
        self.bn2 = nn.BatchNorm1d(128)

        self.dropout = nn.Dropout(0.3)
        self.out = nn.Linear(128, embedding_dim)

        self._init_weights()

    def _init_weights(self):
        for layer in [self.fc1, self.fc2, self.out]:
            init.kaiming_normal_(layer.weight, nonlinearity='relu')
            if layer.bias is not None:
                init.zeros_(layer.bias)

    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        return self.out(x)

**STEP 5: Training Setup**

In [278]:
def train_towers(train_df_copy, user_dict, item_dict, epochs=5, batch_size=1024):
    dataset = RecSysDataset(train_df_copy, user_dict, item_dict)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    user_tower = TowerMLP(input_dim=64)
    item_tower = TowerMLP(input_dim=64)

    optimizer = torch.optim.Adam(
        list(user_tower.parameters()) + list(item_tower.parameters()), lr=1e-3)
    criterion = nn.BCEWithLogitsLoss()

    for epoch in range(epochs):
        user_tower.train()
        item_tower.train()

        total_loss = 0.0
        for user_vecs, item_vecs, labels in loader:
            optimizer.zero_grad()
            u = user_tower(user_vecs)
            v = item_tower(item_vecs)

            logits = dot_product_score(u, v)
            loss = criterion(logits, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(loader)
        print(f"Epoch {epoch + 1}: Loss = {avg_loss:.4f}")

    return user_tower, item_tower

**STEP 6: Score all id2-id3 pairs in val_df**

In [280]:
def get_pairwise_scores(val_df, user_tower, item_tower, user_dict, item_dict):
    user_tower.eval()
    item_tower.eval()

    val_df = val_df.reset_index(drop=True)

    with torch.no_grad():
        scores = []
        for i in range(len(val_df)):
            uid, iid = val_df.loc[i, 'id2'], val_df.loc[i, 'id3']
            if (uid, iid) not in user_dict or iid not in item_dict:
                scores.append(-1e9)  # Push missing ones to end
                continue
            u = user_tower(user_dict[(uid, iid)].unsqueeze(0))
            v = item_tower(item_dict[iid].unsqueeze(0))
            score = dot_product_score(u, v).item()
            scores.append(score)

    val_df["score"] = scores
    return val_df

**STEP 7: MAP@7 Evaluation**

In [281]:
def mapk(actual, predicted, k=7):
    def apk(a, p, k):
        if not a: return 0.0
        p = p[:k]
        score, hits = 0.0, 0.0
        for i, pi in enumerate(p):
            if pi in a and pi not in p[:i]:
                hits += 1.0
                score += hits / (i + 1.0)
        return score / min(len(a), k)

    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])


def evaluate_map7_pairwise(val_df):
    actual = (
        val_df[val_df['y'] == 1]
        .groupby('id2')['id3']
        .apply(list)
    )

    predicted = (
        val_df
        .sort_values(['id2', 'score'], ascending=[True, False])
        .groupby('id2')['id3']
        .apply(list)
    )

    return mapk(actual.tolist(), predicted.tolist(), k=7)

**STEP 8: Execution pipeline**

In [282]:
# train_df_copy → Non Averaged train DataFrame
from sklearn.model_selection import GroupShuffleSplit

gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(train_df_copy, groups=train_df_copy['id2']))

train_part = train_df_copy.iloc[train_idx].reset_index(drop=True)
val_part   = train_df_copy.iloc[val_idx].reset_index(drop=True)

# Step 1: Train towers
user_tower, item_tower = train_towers(train_part, user_dict, item_dict, epochs=20)

# Step 2: Predict scores on val_part
val_scored = get_pairwise_scores(val_part, user_tower, item_tower, user_dict, item_dict)

# Step 3: Evaluate
final_map7 = evaluate_map7_pairwise(val_scored)
print(f"\n✅ Final MAP@7 Score on Validation Set: {final_map7:.5f}")

Epoch 1: Loss = 0.3172
Epoch 2: Loss = 0.1657
Epoch 3: Loss = 0.1472
Epoch 4: Loss = 0.1402
Epoch 5: Loss = 0.1364
Epoch 6: Loss = 0.1335
Epoch 7: Loss = 0.1308
Epoch 8: Loss = 0.1286
Epoch 9: Loss = 0.1271
Epoch 10: Loss = 0.1251
Epoch 11: Loss = 0.1229
Epoch 12: Loss = 0.1214
Epoch 13: Loss = 0.1196
Epoch 14: Loss = 0.1175
Epoch 15: Loss = 0.1163
Epoch 16: Loss = 0.1145
Epoch 17: Loss = 0.1133
Epoch 18: Loss = 0.1115
Epoch 19: Loss = 0.1106
Epoch 20: Loss = 0.1091

✅ Final MAP@7 Score on Validation Set: 0.00968


In [64]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# from torch.utils.data import Dataset, DataLoader
# import pandas as pd
# import numpy as np

# #######################################
# # Step 1: Dataset using (id2, id3) pair
# #######################################
# class RecSysDataset(nn.Module):
#     def __init__(self, df, user_emb_dict, item_emb_dict):
#         self.df = df.reset_index(drop=True)
#         self.user_emb_dict = user_emb_dict
#         self.item_emb_dict = item_emb_dict

#     def __len__(self):
#         return len(self.df)

#     def __getitem__(self, idx):
#         row = self.df.iloc[idx]
#         uid, iid = row['id2'], row['id3']
#         label = torch.tensor(row['y'], dtype=torch.float32)

#         user_emb = self.user_emb_dict[(uid, iid)]
#         item_emb = self.item_emb_dict[iid]

#         return user_emb, item_emb, label


# #########################################
# # Step 2: Dot product scoring
# #########################################
# def dot_product_score(user_vecs, item_vecs):
#     return torch.sum(user_vecs * item_vecs, dim=1)


# #########################################
# # Step 3: Define trainable MLP towers
# #########################################
# class TowerMLP(nn.Module):
#     def __init__(self, input_dim=64, hidden_dim=128, output_dim=64):
#         super().__init__()
#         self.mlp = nn.Sequential(
#             nn.Linear(input_dim, hidden_dim),
#             nn.ReLU(),
#             nn.BatchNorm1d(hidden_dim),
#             nn.Dropout(0.3),
#             nn.Linear(hidden_dim, output_dim)
#         )

#     def forward(self, x):
#         return self.mlp(x)


# #########################################
# # Step 4: Training function
# #########################################
# def train_towers(train_df, user_dict, item_dict, epochs=5, batch_size=1024):
#     dataset = RecSysDataset(train_df, user_dict, item_dict)
#     loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

#     user_tower = TowerMLP()
#     item_tower = TowerMLP()

#     optimizer = torch.optim.Adam(
#         list(user_tower.parameters()) + list(item_tower.parameters()), lr=1e-3)
#     criterion = nn.BCEWithLogitsLoss()

#     for epoch in range(epochs):
#         user_tower.train()
#         item_tower.train()

#         total_loss = 0.0
#         for user_vecs, item_vecs, labels in loader:
#             optimizer.zero_grad()
#             u = user_tower(user_vecs)
#             v = item_tower(item_vecs)

#             logits = dot_product_score(u, v)
#             loss = criterion(logits, labels)

#             loss.backward()
#             optimizer.step()

#             total_loss += loss.item()

#         avg_loss = total_loss / len(loader)
#         print(f"Epoch {epoch + 1}: Loss = {avg_loss:.4f}")

#     return user_tower, item_tower


# #########################################
# # Step 5: Predict scores per (id2, id3)
# #########################################
# def get_pairwise_scores(df, user_tower, item_tower, user_dict, item_dict):
#     user_tower.eval()
#     item_tower.eval()

#     scores = []
#     with torch.no_grad():
#         for idx, row in df.iterrows():
#             uid, iid = row['id2'], row['id3']
#             user_emb = user_dict[(uid, iid)].unsqueeze(0)  # shape: [1, 64]
#             item_emb = item_dict[iid].unsqueeze(0)         # shape: [1, 64]

#             u_vec = user_tower(user_emb)
#             v_vec = item_tower(item_emb)

#             score = torch.sum(u_vec * v_vec, dim=1).item()
#             scores.append((uid, iid, score))

#     return pd.DataFrame(scores, columns=['id2', 'id3', 'score'])


# #########################################
# # Step 6: MAP@7 metric
# #########################################
# def mapk(actual, predicted, k=7):
#     def apk(a, p, k):
#         if not a: return 0.0
#         p = p[:k]
#         score, hits = 0.0, 0.0
#         for i, pi in enumerate(p):
#             if pi in a and pi not in p[:i]:
#                 hits += 1.0
#                 score += hits / (i + 1.0)
#         return score / min(len(a), k)

#     return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])


# def evaluate_map7(train_df, scored_df):
#     actual = (
#         train_df[train_df['y'] == 1]
#         .groupby('id2')['id3']
#         .apply(list)
#     )

#     pred = (
#         scored_df.sort_values(by='score', ascending=False)
#         .groupby('id2')['id3']
#         .apply(lambda x: x.tolist())
#     )

#     pred = pred.reindex(actual.index).fillna([])

#     return mapk(actual.tolist(), pred.tolist(), k=7)


# # Step 4: Train
# user_tower, item_tower = train_towers(train_df_copy, user_dict, item_dict, epochs=5)

# # Step 5: Get scores for all user-item pairs
# scored_df = get_pairwise_scores(train_df_copy, user_tower, item_tower, user_dict, item_dict)

# # Step 6: Evaluate
# final_map7 = evaluate_map7(train_df_copy, scored_df)
# print(f"\n✅ Final MAP@7 Score: {final_map7:.5f}")
