# Initialize

In [None]:
# Mount Google Drive into the Colab VM; the data paths used below all live
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

In [None]:
# Read the combined test set up front: it is used both when fitting the label
# encoders (together with train) and for the final scoring.
test_path = '/content/drive/MyDrive/Amex/combined/test_combined.parquet'
test_data = pd.read_parquet(path=test_path)


# All Clicks + Sampled non Clicks

In [None]:
# Full combined training table, before any down-sampling.
train_pf = '/content/drive/MyDrive/Amex/combined/train_combined.parquet'
train_data = pd.read_parquet(path=train_pf)
train_data.shape


(770164, 391)

In [None]:
# Inspect how the target is stored; the sampling cell below compares it against
# the strings '1' and '0'.
train_data['y'].dtype

dtype('O')

In [None]:
# The target is compared as a string: '1' marks a click, '0' a non-click.
is_click = train_data['y'] == '1'
is_non_click = train_data['y'] == '0'

# Positives: every click is kept.
clicks = train_data[is_click]

# Negatives: a reproducible 25% random subset of the non-clicks.
non_clicks_sampled = train_data[is_non_click].sample(frac=0.25, random_state=42)

# Stack both parts, shuffle the rows, then renumber the index from 0.
train_sample = pd.concat([clicks, non_clicks_sampled])
train_sample = train_sample.sample(frac=1.0, random_state=42)
train_sample = train_sample.reset_index(drop=True)


In [None]:
# Sanity check: row count after down-sampling, column count unchanged.
train_sample.shape

(220329, 391)

In [None]:
# Persist the down-sampled training set; the same path is read back in the
# next section.
train_sample.to_parquet(r"/content/drive/MyDrive/Amex/combined/train_sample_25p.parquet",index = True)

# Load Training Data

In [None]:
# NOTE: this rebinds `train_data` to the down-sampled table; everything below
# works on the sample, not on the full training set.
train_path = '/content/drive/MyDrive/Amex/combined/train_sample_25p.parquet'
train_data = pd.read_parquet(path=train_path)
train_data.shape

(220329, 391)

In [None]:
# Columns handled as categorical: they are label-encoded and passed to LightGBM
# via `categorical_feature`. Order matters because it fixes the feature order.
categorical_cols = (
    ['id4', 'f42', 'f48', 'f50', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57']
    + [f'f{i}' for i in range(226, 310)]  # f226 ... f309
    + ['f349', 'f354', 'id6', 'id7', 'f368', 'f369', 'f370',
       'f371', 'f372', 'id8', 'id9', 'id10', 'id11', 'f378', 'f374', 'id12', 'id13']
)

# Numerical features: int64/float64 columns whose name starts with 'f'.
# Categorical columns are excluded explicitly; otherwise any of them stored with
# a numeric dtype would be label-encoded AND fed through scaling/PCA as if its
# codes were quantities, entering the model twice.
_categorical_lookup = set(categorical_cols)
numerical_cols = [
    col for col in train_data.columns
    if train_data[col].dtype in ['int64', 'float64']
    and col.startswith('f')
    and col not in _categorical_lookup
]

In [None]:
# Label-encode every categorical column with one encoder fitted on train + test,
# so both frames share the same integer codes.
# NOTE: the columns are overwritten in place, so this cell is not idempotent;
# reload the data before re-running it.
for col in categorical_cols:
    # Cast each frame to str BEFORE concatenating. Concatenating first can
    # upcast mismatched dtypes (e.g. int64 in train, float64 in test), so the
    # fitted labels ('1.0') would differ from the transformed ones ('1') and
    # `transform` would raise on unseen labels.
    train_as_str = train_data[col].astype(str)
    test_as_str = test_data[col].astype(str)

    le = LabelEncoder()
    all_data = pd.concat([train_as_str, test_as_str], axis=0)
    le.fit(all_data)

    train_data[col] = le.transform(train_as_str)
    test_data[col] = le.transform(test_as_str)


In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Z-score the numerical features; the scaler is fitted on the training sample.
scaler = StandardScaler()
scaler.fit(train_data[numerical_cols])
X_numerical_scaled = scaler.transform(train_data[numerical_cols])

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [None]:
from sklearn.impute import SimpleImputer

# Fill the missing values left in the scaled matrix with each column's mean.
imputer = SimpleImputer(strategy='mean')
X_numerical_imputed = imputer.fit_transform(X_numerical_scaled)

# A fractional n_components keeps as many components as are needed to explain
# 95% of the variance.
pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X_numerical_imputed)

# Wrap the components in a frame that shares train_data's index so it can be
# concatenated with the categorical columns.
pca_columns = ['pca_' + str(i) for i in range(X_pca.shape[1])]
X_pca_df = pd.DataFrame(data=X_pca, index=train_data.index, columns=pca_columns)



In [None]:
# Model inputs: PCA components first, then the label-encoded categoricals.
features = pca_columns + categorical_cols
X = pd.concat(
    [X_pca_df, train_data[categorical_cols]],
    axis=1,
)

# Model

In [None]:
# Cast the target to integers for LightGBM.
y = train_data['y'].astype(int)

# 80/20 hold-out split, stratified on the label so both parts keep the same
# click rate.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# LightGBM datasets; the validation set references the training set so both use
# the same feature binning.
train_set = lgb.Dataset(
    data=X_train,
    label=y_train,
    categorical_feature=categorical_cols,
)
val_set = lgb.Dataset(
    data=X_val,
    label=y_val,
    reference=train_set,
    categorical_feature=categorical_cols,
)

In [None]:

# Baseline LightGBM configuration: binary objective, AUC as evaluation metric.
# L1/L2 regularisation (lambda_l1 / lambda_l2) is left at the library defaults.
params = dict(
    objective='binary',
    metric='auc',  # alternative: 'binary_logloss'
    learning_rate=0.01,
    num_leaves=64,
    max_depth=-1,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
    random_state=42,
)

# Train for at most 1000 rounds, stopping once the validation metric has not
# improved for 50 consecutive rounds.
model = lgb.train(
    params=params,
    train_set=train_set,
    num_boost_round=1000,
    valid_sets=[train_set, val_set],
    valid_names=['train', 'valid'],
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=True)],
)


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	train's auc: 0.975803	valid's auc: 0.926045


In [None]:
# Reproducibility
SEED = 42

# Cross-validation settings. The single train/validation run at the end of this
# cell does not read them.
N_FOLDS, STRATIFIED, SHUFFLE = 5, True, True

# Training-loop settings
NUM_BOOST_ROUND = 1000
EARLY_STOPPING_ROUNDS = 200
VERBOSE_EVAL = 100  # log the metrics every this many rounds

# LightGBM parameters. L1/L2 regularisation (lambda_l1 / lambda_l2) is left at
# the library defaults.
LGB_PARAMS = dict(
    objective='binary',
    metric='auc',
    learning_rate=0.01,
    num_leaves=64,
    max_depth=-1,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    random_state=SEED,
    verbose=-1,
    n_jobs=-1,
    force_col_wise=True,  # force column-wise histogram building
)

# Annotated categorical features. NOTE(review): this list is not passed to the
# training call below; the datasets were built with `categorical_cols`, which
# contains 'id4' rather than 'id3'. Confirm which list is intended.
CATEGORICAL_FEATURES = [
    'id3',   # Offer ID
    'f42',   # Membership level
    'f48',   # Total DL Flights digit representation
    'f50',   # Account Creation Indicator
    'f52',   # Active Part y1
    'f53',   # Member value (1=lowest, 5=highest)
    'f54',   # Honors enrollees indicator
    'f55',   # HG Vacation Club
    'f56',   # H Code Tier
    'f57',   # H Promus Indicator
    'f349',  # Day of week (1=Monday, 7=Sunday)
    'f354',  # Days since launch
]

# Retrain on the same datasets with the settings above; this rebinds `model`.
model = lgb.train(
    params=LGB_PARAMS,
    train_set=train_set,
    num_boost_round=NUM_BOOST_ROUND,
    valid_sets=[train_set, val_set],
    valid_names=['train', 'valid'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=True),
        lgb.log_evaluation(period=VERBOSE_EVAL),
    ],
)


Training until validation scores don't improve for 200 rounds
[100]	train's auc: 0.915896	valid's auc: 0.905879
[200]	train's auc: 0.928289	valid's auc: 0.913309
[300]	train's auc: 0.93727	valid's auc: 0.917685
[400]	train's auc: 0.945251	valid's auc: 0.919934
[500]	train's auc: 0.952289	valid's auc: 0.921775
[600]	train's auc: 0.958436	valid's auc: 0.923073
[700]	train's auc: 0.963754	valid's auc: 0.924063
[800]	train's auc: 0.968305	valid's auc: 0.924963
[900]	train's auc: 0.972366	valid's auc: 0.925531
[1000]	train's auc: 0.975803	valid's auc: 0.926045
Did not meet early stopping. Best iteration is:
[1000]	train's auc: 0.975803	valid's auc: 0.926045


In [None]:
# Rank the model's features by gain-based importance.
feature_names = model.feature_name()
feature_importance = model.feature_importance(importance_type='gain')

# One row per feature, for easy sorting and display.
importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': feature_importance}
)

# Number of features to show; adjust as needed.
top_n = 20
ranked_importance = importance_df.sort_values(by='importance', ascending=False)
top_features = ranked_importance.iloc[:top_n]

print("Top features from the LightGBM model:")
top_features

# Optional: Visualize the top features
# import matplotlib.pyplot as plt
# import seaborn as sns

# plt.figure(figsize=(10, 8))
# sns.barplot(x='importance', y='feature', data=top_features.sort_values(by='importance', ascending=False))
# plt.title('Top Feature Importance (Gain)')
# plt.xlabel('Importance (Gain)')
# plt.ylabel('Feature')
# plt.tight_layout()
# plt.show()


In [None]:
# Build the test feature matrix exactly as the training matrix was built.
# `features` contains the pca_* columns, which do not exist on `test_data`, so
# the raw numerical columns must first pass through the scaler, imputer and PCA
# fitted on the training sample (transform only, never re-fit on test).
test_numerical = imputer.transform(scaler.transform(test_data[numerical_cols]))
test_pca_df = pd.DataFrame(
    pca.transform(test_numerical),
    columns=pca_columns,
    index=test_data.index,
)

# Same column order as in training: PCA components, then encoded categoricals.
X_test = pd.concat([test_pca_df, test_data[categorical_cols]], axis=1)[features].fillna(0)
test_data['score'] = model.predict(X_test)

KeyboardInterrupt: 

# Submission

In [None]:
# Keep the identifier columns plus the model score, and expose the score under
# the column name 'pred'.
submission = (
    test_data[['id1', 'id2', 'id3', 'id5', 'score']]
    .rename(columns={'score': 'pred'})
)

In [None]:
# id5 is parsed as YYYY-MM-DD and written to the submission as MM-DD-YYYY text.
# The conversion always starts from the untouched test_data['id5'] (aligned on
# the shared index), so re-running this cell does not try to re-parse strings
# that were already converted.
id5_dates = pd.to_datetime(test_data['id5'], format='%Y-%m-%d')
submission['id5'] = id5_dates.dt.strftime('%m-%d-%Y')

In [None]:
# Preview the first rows to check column names and the id5 date format.
submission.head()

Unnamed: 0,id1,id2,id3,id5,pred
0,1362907_91950_16-23_2023-11-04 18:56:26.000794,1362907,91950,11-04-2023,0.01895
1,1082599_88356_16-23_2023-11-04 06:08:53.373,1082599,88356,11-04-2023,0.129862
2,1888466_958700_16-23_2023-11-05 10:07:28.000725,1888466,958700,11-05-2023,0.984698
3,1888971_795739_16-23_2023-11-04 12:25:28.244,1888971,795739,11-04-2023,0.029769
4,1256369_82296_16-23_2023-11-05 06:45:26.657,1256369,82296,11-05-2023,0.022112


In [None]:
# Write the submission to Drive; index=False keeps the row index out of the CSV.
submission.to_csv('/content/drive/MyDrive/Amex/r2_submission_fileTrianon.csv', index=False)

# MAP@7 Evaluation

In [None]:
def mapk(actual, predicted, k=7):
    """Mean reciprocal rank of the single relevant item within the top ``k``.

    Each query has exactly one relevant item, so its average precision at k
    reduces to ``1 / rank`` of that item, or 0 when it is not in the top ``k``.

    Args:
        actual: Sequence with one relevant item per query.
        predicted: Sequence of ranked candidate sequences, best first, parallel
            to ``actual``. Each element only needs to support slicing.
        k: Cut-off rank; items ranked below ``k`` contribute nothing.

    Returns:
        The per-query scores averaged over ``len(actual)``; ``0.0`` when
        ``actual`` is empty.
    """
    # Guard against ZeroDivisionError when there is nothing to score.
    if len(actual) == 0:
        return 0.0

    score = 0.0
    for a, p in zip(actual, predicted):
        # Materialise the top-k slice as a list so `.index` is available
        # regardless of the container type of `p`.
        top_k = list(p[:k])
        if a in top_k:
            score += 1 / (top_k.index(a) + 1)
    return score / len(actual)


In [None]:
# Validation frame for the ranking metric: the model inputs plus the id columns
# (looked up in train_data by index), the true label and the predicted score.
val_ids = train_data.loc[X_val.index, ['id2', 'id3', 'id5']]
val_df = X_val.assign(
    id2=val_ids['id2'],
    id3=val_ids['id3'],
    id5=val_ids['id5'],
    y=y_val,
    score=model.predict(X_val),
)

In [None]:
# Rank items within each id2 group.
# Only the columns the lambdas read are selected, so `apply` never operates on
# the grouping column itself (that behaviour is deprecated in recent pandas).
grouped = val_df.groupby('id2')[['id3', 'score', 'y']]

# Per group: id3 values ordered by descending score.
predicted = grouped.apply(lambda x: x.sort_values('score', ascending=False)['id3'].tolist())

# Per group: the first id3 with y == 1, or the sentinel -1 when the group has
# no positive row (those groups are filtered out before scoring).
actual = grouped.apply(lambda x: x[x['y'] == 1]['id3'].tolist()[0] if (x['y'] == 1).any() else -1)

  predicted = grouped.apply(lambda x: x.sort_values('score', ascending=False)['id3'].tolist())
  actual = grouped.apply(lambda x: x[x['y'] == 1]['id3'].tolist()[0] if any(x['y'] == 1) else -1)


In [None]:
# Drop the groups without a positive row (marked with the sentinel -1).
valid = actual != -1
predicted, actual = predicted[valid], actual[valid]

# Report the ranking metric over the remaining groups.
map_at_7 = mapk(actual.tolist(), predicted.tolist(), k=7)
print("MAP@7:", map_at_7)

MAP@7: 0.629145890467881
