In [1]:
import pandas as pd
import torch
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names

2024-12-25 19:35:54.581109: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-25 19:35:57.551336: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
train = pd.read_csv('X_train_features.csv')

In [3]:
test = pd.read_csv("X_test_features.csv")

In [4]:
# sparse_features = set()
# for column in train.columns:
#     if (train[column] == 0).sum() >= len(train) * 0.5:
#         sparse_features.add(column)
# dense_features = set(list(set(train.columns) - sparse_features))

In [5]:
len(dense_features), len(sparse_features)

(415, 113)

In [4]:
sparse_features =  ["user_id", "merchant_id", "item_id", "cat_id", "brand_id", "time_stamp", "time_period", "gender"] + list(train.columns[10:18])
dense_features = list(train.columns[18:])

In [5]:
import xgboost as xgb
model = xgb.XGBClassifier()


In [6]:
model.load_model("xgb_model_best_noID.bin")
best_features = sorted(list(zip(model.feature_names_in_, model.feature_importances_)), reverse=True, key=lambda x: x[1])
top = set([x for x, i in best_features[:150]])

In [7]:
def get_top_features(top, features):  
    # Check if any item in the set is a substring of the column names
    top_feat = []
    for column in features:
        if column in top:
            top_feat.append(column)
            #print(f"Column '{column}' contains a substring from the set.")
    return top_feat

In [8]:
sparse_features = get_top_features(top, sparse_features)
dense_features = get_top_features(top, dense_features)

In [9]:
for feat in sparse_features:
    lbe = LabelEncoder()
    train[feat] = lbe.fit_transform(train[feat])
    test[feat] = lbe.fit_transform(test[feat])

In [10]:
mms = MinMaxScaler(feature_range=(0,1))
train[dense_features] = mms.fit_transform(train[dense_features])
test[dense_features] = mms.fit_transform(test[dense_features])

In [11]:
all = pd.concat([train, test])

In [12]:
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=all[feat].nunique(),embedding_dim=4)
                       for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,)
                      for feat in dense_features]

In [13]:
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

In [14]:
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
feature_names

['age_2',
 'users_merchant',
 'items_merchant',
 'categories_merchant',
 'dates_merchant',
 'periods_merchant',
 'items_user_merchant',
 'categories_user_merchant',
 'dates_user_merchant',
 'periods_user_merchant',
 'users_category',
 'items_category',
 'brands_category',
 'dates_category',
 'users_brand',
 'items_brand',
 'categories_brand',
 'merchants_brand',
 'action_types_brand',
 'items_user_brand',
 'categories_user_brand',
 'dates_user_brand',
 'periods_user_brand',
 'merchants_user_category',
 'brands_user_category',
 'action_types_user_category',
 'periods_user_category',
 'purchases_user',
 'carts_merchant',
 'clicks_brand',
 'carts_item',
 'clicks_user_merchant',
 'purchases_user_merchant',
 'favourites_user_merchant',
 'purchases_user_brand',
 'favourites_merchant_brand',
 'carts_ratio_user',
 'purchases_ratio_user',
 'total_actions_merchant',
 'carts_ratio_merchant',
 'total_actions_brand',
 'carts_ratio_cat',
 'purchases_ratio_cat',
 'favourites_ratio_cat',
 'total_actio

In [15]:
len(feature_names)

150

In [16]:
#train_model_input = {name:train[name] for name in feature_names}

test_model_input = {name:test[name] for name in feature_names}

In [17]:
device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:1'

cuda ready...


In [18]:
from deepctr_torch.models import DeepFM

In [25]:
model = DeepFM(linear_feature_columns,dnn_feature_columns,
               task='binary',device=device,dnn_dropout=0.7)
model.compile(optimizer = 'adam',
              loss = 'binary_crossentropy',
              metrics = ['binary_crossentropy','auc'])

In [28]:
del model

In [29]:
from sklearn.model_selection import train_test_split, StratifiedKFold
import numpy as np
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

test_preds = np.zeros(len(test)) 
print(test_preds.shape)
for fold, (train_idx, val_idx) in enumerate(kf.split(train, train["label"])):
    print(f"Fold {fold}")
    X_t, X_v = train.iloc[train_idx], train.iloc[val_idx]
    y_t, y_v = train["label"].iloc[train_idx], train["label"].iloc[val_idx]
    train_model_input = {name:X_t[name] for name in feature_names}
    val_model_input = {name:X_v[name] for name in feature_names}
    
    model = DeepFM(linear_feature_columns,dnn_feature_columns,
               task='binary',device=device,dnn_dropout=0.4)
    model.compile(optimizer = 'adam',
                loss = 'binary_crossentropy',
                metrics = ['binary_crossentropy','auc'])
    
    print("Train")
    model.train()
    history = model.fit(train_model_input, y_t.values,batch_size=512,epochs=10,verbose=2, validation_split=0.0)
    model.eval()
    val_auc = model.evaluate(val_model_input, y_v.values, batch_size=512)["auc"]
    print("Final AUC score:", val_auc)
    pred = model.predict(test_model_input, 512)
    #print(pred)
    test_preds += pred.flatten() / kf.n_splits

    

(261477,)
Fold 0
Train
cuda:1
Train on 208691 samples, validate on 0 samples, 408 steps per epoch
Epoch 1/10
6s - loss:  0.2386 - binary_crossentropy:  0.2386 - auc:  0.5811
Epoch 2/10
6s - loss:  0.2215 - binary_crossentropy:  0.2215 - auc:  0.6547
Epoch 3/10
6s - loss:  0.2199 - binary_crossentropy:  0.2199 - auc:  0.6661
Epoch 4/10
6s - loss:  0.2195 - binary_crossentropy:  0.2195 - auc:  0.6677
Epoch 5/10
6s - loss:  0.2187 - binary_crossentropy:  0.2187 - auc:  0.6728
Epoch 6/10
6s - loss:  0.2186 - binary_crossentropy:  0.2186 - auc:  0.6725
Epoch 7/10
6s - loss:  0.2183 - binary_crossentropy:  0.2184 - auc:  0.6746
Epoch 8/10
6s - loss:  0.2183 - binary_crossentropy:  0.2183 - auc:  0.6762
Epoch 9/10
6s - loss:  0.2180 - binary_crossentropy:  0.2180 - auc:  0.6764
Epoch 10/10
6s - loss:  0.2179 - binary_crossentropy:  0.2179 - auc:  0.6773
Final AUC score: 0.6853346631615588
Fold 1
Train
cuda:1
Train on 208691 samples, validate on 0 samples, 408 steps per epoch
Epoch 1/10
7s - l

In [34]:
test_preds

array([0.10470224, 0.08669365, 0.08328481, ..., 0.08889342, 0.02131111,
       0.093456  ])

In [32]:
import deepctr_torch.callbacks as callbacks
history = model.fit(train_model_input,train["label"].values,batch_size=512,epochs=50,verbose=2,validation_split=0.2)


cuda:1
Train on 208691 samples, validate on 52173 samples, 408 steps per epoch
Epoch 1/50
6s - loss:  0.2404 - binary_crossentropy:  0.2404 - auc:  0.5790 - val_binary_crossentropy:  0.2203 - val_auc:  0.6637
Epoch 2/50
7s - loss:  0.2190 - binary_crossentropy:  0.2190 - auc:  0.6726 - val_binary_crossentropy:  0.2190 - val_auc:  0.6740
Epoch 3/50
6s - loss:  0.2184 - binary_crossentropy:  0.2184 - auc:  0.6773 - val_binary_crossentropy:  0.2235 - val_auc:  0.6758
Epoch 4/50
7s - loss:  0.2179 - binary_crossentropy:  0.2179 - auc:  0.6803 - val_binary_crossentropy:  0.2175 - val_auc:  0.6821
Epoch 5/50
6s - loss:  0.2176 - binary_crossentropy:  0.2177 - auc:  0.6812 - val_binary_crossentropy:  0.2183 - val_auc:  0.6821
Epoch 6/50
7s - loss:  0.2176 - binary_crossentropy:  0.2176 - auc:  0.6829 - val_binary_crossentropy:  0.2176 - val_auc:  0.6829
Epoch 7/50
6s - loss:  0.2173 - binary_crossentropy:  0.2174 - auc:  0.6826 - val_binary_crossentropy:  0.2172 - val_auc:  0.6838
Epoch 8/50


In [46]:
pred_ans = model.predict(test_model_input, 1024)

In [32]:
submit = pd.read_csv("/home/franklin/BigDataFinalProject/data_format1/test_format1.csv")
submit["prob"] = test_preds

In [33]:
submit

Unnamed: 0,user_id,merchant_id,prob
0,163968,4605,0.104702
1,360576,1581,0.086694
2,98688,1964,0.083285
3,98688,3645,0.037750
4,295296,3361,0.062403
...,...,...,...
261472,228479,3111,0.050900
261473,97919,2341,0.039649
261474,97919,3971,0.088893
261475,32639,3536,0.021311


In [35]:
submit.to_csv("predictions_dpfm_150_5f", index=False)