In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amuxdata/test_data.parquet
/kaggle/input/amuxdata/add_event.parquet
/kaggle/input/amuxdata/685404e30cfdb_submission_template.csv
/kaggle/input/amuxdata/offer_metadata.parquet
/kaggle/input/amuxdata/add_trans.parquet
/kaggle/input/amuxdata/train_data.parquet


In [2]:
# Cell 1: Imports and Data Loading

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler

# Load your data (example: train and test)
df_train = pd.read_parquet('/kaggle/input/amuxdata/train_data.parquet')
df_test = pd.read_parquet('/kaggle/input/amuxdata/test_data.parquet')

In [3]:
# Cell 2: Basic EDA

print(df_train.head())
print(df_train.info())
print(df_train.describe())
print("\nMissing values:\n", df_train.isnull().sum())
print("\nDuplicates:", df_train.duplicated().sum())

                                               id1      id2        id3  \
0  1366776_189706075_16-23_2023-11-02 22:22:00.042  1366776  189706075   
1      1366776_89227_16-23_2023-11-01 23:51:24.999  1366776      89227   
2      1366776_35046_16-23_2023-11-01 00:30:59.797  1366776      35046   
3    1366776_6275451_16-23_2023-11-02 22:21:32.261  1366776    6275451   
4      1366776_78053_16-23_2023-11-02 22:21:34.799  1366776      78053   

                       id4         id5  y   f1    f2    f3    f4  ...  f357  \
0  2023-11-02 22:22:00.042  2023-11-02  0  1.0  None  None  None  ...  None   
1  2023-11-01 23:51:24.999  2023-11-01  0  1.0  None  None  None  ...  None   
2  2023-11-01 00:30:59.797  2023-11-01  0  1.0  None  None  None  ...  None   
3  2023-11-02 22:21:32.261  2023-11-02  0  1.0  None  None  None  ...  None   
4  2023-11-02 22:21:34.799  2023-11-02  0  1.0  None  None  None  ...  None   

      f358 f359  f360   f361 f362               f363    f364 f365  \
0  -9999.0 

In [4]:
print(df_train['id2'].dtype)  # Check dtype

# Try converting to numeric
df_train['id2'] = pd.to_numeric(df_train['id2'], errors='coerce')
print(df_train['id2'].dtype)  # Should now be int or float

object
int64


In [5]:
# Convert columns that look numeric but are object to numeric dtype
for col in df_train.columns:
    if df_train[col].dtype == 'object':
        try:
            df_train[col] = pd.to_numeric(df_train[col])
            df_test[col] = pd.to_numeric(df_test[col])
        except Exception:
            pass  # If conversion fails, keep as object

# Now identify categorical/object columns (masked features)
cat_cols = df_train.select_dtypes(include=['object']).columns.tolist()
print("Categorical columns:", cat_cols)

Categorical columns: ['id1', 'id4', 'id5', 'f42', 'f50', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f354']


In [6]:
# Cell 3: Feature Engineering for Masked Features

# Identify categorical/object columns (masked features)
cat_cols = df_train.select_dtypes(include=['object']).columns.tolist()
print("Categorical columns:", cat_cols)

# Frequency encode all at once
def freq_encode(df_train, df_test, cols):
    train_encoded = pd.DataFrame(index=df_train.index)
    test_encoded = pd.DataFrame(index=df_test.index)
    
    for col in cols:
        freq = df_train[col].value_counts()
        train_encoded[col + '_freq'] = df_train[col].map(freq)
        test_encoded[col + '_freq'] = df_test[col].map(freq)

    return train_encoded, test_encoded

train_freq, test_freq = freq_encode(df_train, df_test, cat_cols)

# Drop original masked columns and add new ones efficiently
df_train = pd.concat([df_train.drop(columns=cat_cols), train_freq], axis=1)
df_test = pd.concat([df_test.drop(columns=cat_cols), test_freq], axis=1)


Categorical columns: ['id1', 'id4', 'id5', 'f42', 'f50', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f354']


In [9]:
# Clean df_test
df_test_clean = df_test.copy()

# Convert to numeric, coerce errors (e.g., strings → NaN)
df_test_clean = df_test_clean.apply(pd.to_numeric, errors='coerce')

# Fill missing values (you can also use imputation or other methods)
df_test_clean = df_test_clean.fillna(0)


TypeError: StandardScaler.transform() missing 1 required positional argument: 'X'

In [11]:
# Cell 4: Prepare Data for Modeling

# Replace with your actual target column name
TARGET = 'y'  # Example: replace with actual target

X = df_train.drop(columns=[TARGET])
y = df_train[TARGET]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(np.isnan(X_train).sum(), np.isinf(X_train).sum())
print(np.isnan(X_val).sum(), np.isinf(X_val).sum())

X_train = np.nan_to_num(X_train, nan=0.0, posinf=0.0, neginf=0.0)
X_val = np.nan_to_num(X_val, nan=0.0, posinf=0.0, neginf=0.0)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
df_test_scaled = scaler.transform(df_test_clean)

id2               0
id3               0
f1           393540
f2           357854
f3           529171
              ...  
f54_freq     203524
f55_freq     203524
f56_freq     203524
f57_freq     346312
f354_freq    113474
Length: 371, dtype: int64 id2          0
id3          0
f1           0
f2           0
f3           0
            ..
f54_freq     0
f55_freq     0
f56_freq     0
f57_freq     0
f354_freq    0
Length: 371, dtype: int64
id2               0
id3               0
f1            98118
f2            89338
f3           132431
              ...  
f54_freq      50943
f55_freq      50943
f56_freq      50943
f57_freq      86580
f354_freq     28517
Length: 371, dtype: int64 id2          0
id3          0
f1           0
f2           0
f3           0
            ..
f54_freq     0
f55_freq     0
f56_freq     0
f57_freq     0
f354_freq    0
Length: 371, dtype: int64




In [12]:
print(y_train.unique())  # should be [0, 1]
print(y_val.unique())


[0 1]
[0 1]


In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Choose device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Convert numpy arrays to torch tensors and move to CPU first
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).unsqueeze(1)


# Create DataLoaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256)

# Define the model
class SimpleNN(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.LeakyReLU(),
            nn.Linear(64, 32),
            nn.LeakyReLU(),
            nn.Linear(32, 1)
        )
    def forward(self, x):
        return self.net(x)

# Instantiate and move model to device
model = SimpleNN(X_train.shape[1]).to(device)

# Loss and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training loop
for epoch in range(10):
    model.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)  # Move batch to GPU
        optimizer.zero_grad()
        preds = model(xb)  # No sigmoid here!
        loss = criterion(preds, yb)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()


    # Validation
    model.eval()
    with torch.no_grad():
        val_preds = model(X_val_tensor.to(device))
        val_loss = criterion(val_preds, y_val_tensor.to(device))
        print(f"Epoch {epoch+1}, Val Loss: {val_loss.item():.4f}")

Epoch 1, Val Loss: 0.0965
Epoch 2, Val Loss: 0.0897
Epoch 3, Val Loss: 0.0888
Epoch 4, Val Loss: 0.0865
Epoch 5, Val Loss: 0.0861
Epoch 6, Val Loss: 0.0846
Epoch 7, Val Loss: 0.0843
Epoch 8, Val Loss: 0.0839
Epoch 9, Val Loss: 0.0837
Epoch 10, Val Loss: 0.0847


In [14]:
# For final predictions (e.g., test set)
model.eval()
with torch.no_grad():
    test_tensor = torch.tensor(df_test_scaled, dtype=torch.float32).to(device)
    test_logits = model(test_tensor)
    test_pred = torch.sigmoid(test_logits).cpu().numpy().flatten()

In [15]:
# Cell 6: Validation and Prediction

val_pred = torch.sigmoid(model(X_val_tensor.to(device))).cpu().detach().numpy().flatten()

auc = roc_auc_score(y_val, val_pred)
print(f"Validation ROC-AUC: {auc:.4f}")

# Predict on test set (if target is available)
# test_pred = model.predict(df_test)

Validation ROC-AUC: 0.9354


In [16]:
submission = pd.read_csv('/kaggle/input/amuxdata/685404e30cfdb_submission_template.csv')
submission['pred'] = test_pred
submission.to_csv('submission.csv', index=False)
print("Submission file saved as submission.csv")

Submission file saved as submission.csv


In [17]:
print("submission.shape:", submission.shape)
print("test_pred.shape:", test_pred.shape)

submission.shape: (369301, 5)
test_pred.shape: (369301,)


In [18]:
print(test_pred[:10])
print(np.isnan(test_pred).sum(), np.isinf(test_pred).sum())

[0.02200713 0.00231058 0.98975116 0.06414712 0.06035268 0.05715678
 0.02846122 0.0198497  0.14258733 0.04859566]
0 0


In [19]:
print(test_logits[:10])
print(torch.isnan(test_logits).sum())


tensor([[-3.7941],
        [-6.0679],
        [ 4.5703],
        [-2.6803],
        [-2.7453],
        [-2.8031],
        [-3.5303],
        [-3.8995],
        [-1.7940],
        [-2.9744]], device='cuda:0')
tensor(0, device='cuda:0')
