In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/val_temporal.csv
/kaggle/input/train_temporal.csv
/kaggle/input/test_features_final.csv
/kaggle/input/test_features_final_r3.csv
/kaggle/input/train_features_final.csv
/kaggle/input/final_feature_names.csv


In [2]:
train_features_final=pd.read_csv('/kaggle/input/train_features_final.csv')
test_features_final=pd.read_csv('/kaggle/input/test_features_final.csv')
final_feature_names=pd.read_csv('/kaggle/input/final_feature_names.csv')

In [3]:
# Total popularity of an offer across all users
global_offer_stats = train_features_final.groupby('id3').agg(
    global_offer_impressions=('id1', 'count'),
    global_offer_clicks=('y', 'sum')
).reset_index()

global_offer_stats['global_offer_ctr'] = global_offer_stats['global_offer_clicks'] / global_offer_stats['global_offer_impressions']

bins = [0, 0.01, 0.03, 0.07, 0.15, 1]
labels = [0,1,2,3,4]
global_offer_stats['offer_ctr_bucket'] = pd.cut(global_offer_stats['global_offer_ctr'], bins=bins, labels=labels, include_lowest=True)

In [4]:
train_features_final = train_features_final.merge(
    global_offer_stats[['id3', 'offer_ctr_bucket']],
    on='id3',
    how='left'
)
test_features_final = test_features_final.merge(
    global_offer_stats[['id3', 'offer_ctr_bucket']],
    on='id3',
    how='left'
)

In [5]:
# 1. click_rate_30d
train_features_final['click_rate_30d'] = train_features_final['f206'] / (train_features_final['f207'] + 1e-5)
test_features_final['click_rate_30d'] = test_features_final['f206'] / (test_features_final['f207'] + 1e-5)

# 4. engagement_score
train_features_final['engagement_score'] = train_features_final['f137'] * train_features_final['f206']
test_features_final['engagement_score'] = test_features_final['f137'] * test_features_final['f206']

# 5. log_ctr
train_features_final['log_ctr_f137'] = np.log1p(train_features_final['f137'])
test_features_final['log_ctr_f137'] = np.log1p(test_features_final['f137'])

# 6. exp_ctr
train_features_final['exp_ctr_f134'] = np.exp(-train_features_final['f134'])
test_features_final['exp_ctr_f134'] = np.exp(-test_features_final['f134'])


In [6]:
# Step 1: Map category one-hot to corresponding CTR feature
category_ctr_map = { 
    'f227': 'f130',  # Dining
    'f228': 'f131',  # Entertainment
    'f230': 'f134',  # Services
    'f231': 'f132',  # Shopping
    'f232': 'f133',  # Travel
}

# Step 2: Create category_ctr by using the matching category's CTR
def get_category_ctr(row):
    for cat_flag, ctr_col in category_ctr_map.items():
        if row[cat_flag] == 1:
            return row[ctr_col]
    return np.nan  # fallback in case no match

train_features_final['category_ctr'] = train_features_final.apply(get_category_ctr, axis=1)
test_features_final['category_ctr'] = test_features_final.apply(get_category_ctr, axis=1)

# Drop NaNs
plot_df = train_features_final[['category_ctr', 'y']].dropna()

# Bin CTR for plotting clarity
plot_df['ctr_bin'] = pd.cut(plot_df['category_ctr'], bins=10)

# Grouped mean redemption rate per CTR bin
grouped = plot_df.groupby('ctr_bin')['y'].mean().reset_index()
grouped['ordinal_label'] = grouped['y'].rank(method='first').astype(int) - 1 
print(grouped['ordinal_label'])

grouped['interval'] = pd.IntervalIndex.from_tuples([
    (-0.001, 0.1), (0.1, 0.2), (0.2, 0.3), (0.3, 0.4), (0.4, 0.5),
    (0.5, 0.6), (0.6, 0.7), (0.7, 0.8), (0.8, 0.9), (0.9, 1.0)
])
bin_to_label = dict(zip(grouped['interval'], grouped['ordinal_label']))
bins = [-0.001, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
bin_intervals = pd.cut(train_features_final['category_ctr'], bins=bins)

# Step 4: Map to new label based on redemption rate
train_features_final['category_ctr_bin'] = bin_intervals.map(bin_to_label).astype('Int64')
bin_intervals = pd.cut(test_features_final['category_ctr'], bins=bins)
test_features_final['category_ctr_bin'] = bin_intervals.map(bin_to_label).astype('Int64')


  grouped = plot_df.groupby('ctr_bin')['y'].mean().reset_index()


0    0
1    5
2    6
3    8
4    9
5    7
6    3
7    4
8    1
9    2
Name: ordinal_label, dtype: int64


In [7]:
train_features_final=train_features_final.drop(columns='category_ctr')
test_features_final=test_features_final.drop(columns='category_ctr')

In [8]:
train_features_final["id5"]=pd.to_datetime(train_features_final['id5'],errors='coerce')
test_features_final["id5"]=pd.to_datetime(test_features_final['id5'],errors='coerce')

In [9]:
train_features_final["minute"]=train_features_final['id5'].dt.minute
test_features_final["minute"]=test_features_final['id5'].dt.minute

In [10]:
print(train_features_final.columns.tolist())

['id1', 'id2', 'id3', 'id4', 'id5', 'id10', 'id8', 'id12', 'id13', 'id6', 'id7', 'time_since_last_impression', 'user_session_length', 'f366', 'hourly_offer_frequency', 'user_cumulative_impressions', 'f132', 'f125', 'f207', 'f223', 'f363', 'f210', 'f314', 'f206', 'f358', 'user_avg_session_length', 'f38', 'f337', 'f26', 'f139', 'f151', 'f39', 'f212', 'f203', 'f343', 'f350', 'f95', 'f127', 'f137', 'f204', 'f312', 'f365', 'f96', 'f224', 'f324', 'offer_position_in_session', 'f142', 'f340', 'f97', 'f149', 'f319', 'f67', 'f124', 'f126', 'f138', 'f140', 'f361', 'f28', 'f94', 'f173', 'f186', 'f214', 'f321', 'f346', 'interest_strength_ratio', 'f90', 'f99', 'f121', 'f130', 'f134', 'f141', 'f150', 'f342', 'f22', 'f31', 'f43', 'f59', 'f107', 'f113', 'f336', 'f351', 'f74', 'f76', 'f147', 'f167', 'f195', 'f215', 'f345', 'f2', 'f9', 'f41', 'f133', 'f216', 'f313', 'hour_sin', 'f30', 'f77', 'f98', 'f101', 'f143', 'f179', 'f196', 'f209', 'f318', 'f320', 'f1', 'f5', 'f27', 'f51', 'f58', 'f60', 'f68', 'f73

In [11]:
train_features_final['id4']=pd.to_datetime(train_features_final['id4'],errors='coerce')

In [12]:
test_features_final['id4']=pd.to_datetime(test_features_final['id4'],errors='coerce')

In [13]:
train_features_final=train_features_final.sort_values(by='id4',ascending=True)

In [14]:
train_features_final=train_features_final.drop(columns='id5')
test_features_final=test_features_final.drop(columns='id5')

In [15]:
cols_to_convert = ['id4', 'id12', 'id13', 'impression_time', 'click_time'] # Added 'click_time' for completeness

# List of DataFrames to apply the conversion to
dataframes = [train_features_final, test_features_final]

for df in dataframes:
    for col in cols_to_convert:
        # Check if the column exists in the dataframe before converting
        if col in df.columns:
            # Convert column to datetime objects, then to Unix timestamp in seconds
            df[col] = pd.to_datetime(df[col], errors='coerce').astype('int64') // 10**9 # Added errors='coerce' for robustness

# Convert seconds to days (24 hours * 60 minutes * 60 seconds)
train_features_final['days_since_offer_start'] = (train_features_final['id4'] - train_features_final['id12']) / (24 * 3600)
test_features_final['days_since_offer_start'] = (test_features_final['id4'] - test_features_final['id12']) / (24 * 3600)
# Handle negative values if event timestamp is before offer start (impute to 0 or mean)
train_features_final.loc[train_features_final['days_since_offer_start'] < 0, 'days_since_offer_start'] = 0
test_features_final.loc[test_features_final['days_since_offer_start'] < 0, 'days_since_offer_start'] = 0


# 3. offer_duration
# Convert seconds to days
train_features_final['offer_duration'] = (train_features_final['id13'] - train_features_final['id12']) / (24 * 3600)
test_features_final['offer_duration'] = (test_features_final['id13'] - test_features_final['id12']) / (24 * 3600)
# Handle negative duration if end is before start (impute to 0 or mean)
train_features_final.loc[train_features_final['offer_duration'] < 0, 'offer_duration'] = 0
test_features_final.loc[test_features_final['offer_duration'] < 0, 'offer_duration'] = 0


# --- Insert the Mean/Target Encoding for id3 (and id2 if desired) here ---
# 2. id3_mean_encoded_ctr (Offer ID Mean/Target Encoding)
from sklearn.model_selection import KFold

NFOLDS = 5
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=42)

train_features_final['id3_mean_encoded_ctr'] = np.nan
for fold, (train_idx, val_idx) in enumerate(kf.split(train_features_final)):
    fold_mean_ctr = train_features_final.iloc[train_idx].groupby('id3')['y'].mean()
    train_features_final.loc[val_idx, 'id3_mean_encoded_ctr'] = train_features_final.iloc[val_idx]['id3'].map(fold_mean_ctr)

global_id3_mean_ctr = train_features_final.groupby('id3')['y'].mean()
test_features_final['id3_mean_encoded_ctr'] = test_features_final['id3'].map(global_id3_mean_ctr)
test_features_final['id3_mean_encoded_ctr'] = test_features_final['id3_mean_encoded_ctr'].fillna(train_features_final['y'].mean())

# (Optional) If you want to also mean-encode 'id2' (Customer ID):
# train_features_final['id2_mean_encoded_ctr'] = np.nan
# for fold, (train_idx, val_idx) in enumerate(kf.split(train_features_final)):
#     fold_mean_ctr = train_features_final.iloc[train_idx].groupby('id2')['y'].mean()
#     train_features_final.loc[val_idx, 'id2_mean_encoded_ctr'] = train_features_final.iloc[val_idx]['id2'].map(fold_mean_ctr)
# global_id2_mean_ctr = train_features_final.groupby('id2')['y'].mean()
# test_features_final['id2_mean_encoded_ctr'] = test_features_final['id2'].map(global_id2_mean_ctr)
# test_features_final['id2_mean_encoded_ctr'] = test_features_final['id2_mean_encoded_ctr'].fillna(train_features_final['y'].mean())

In [16]:
X=train_features_final.drop(columns=['y','id1'])
y=train_features_final['y']
X_train=X[:616131]
y_train=y[:616131]
X_test=X[616131:]
y_test=y[616131:]

In [17]:
X_train

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id2,id3,id4,id10,id8,id12,id13,id6,id7,time_since_last_impression,...,offer_ctr_bucket,click_rate_30d,engagement_score,log_ctr_f137,exp_ctr_f134,category_ctr_bin,minute,days_since_offer_start,offer_duration,id3_mean_encoded_ctr
86947,1111176,58396,1698796823,1.0,57130000.0,1694995200,1698796799,,,0.000000,...,0,0.000000,0.000000,0.123859,0.865915,5,0,44.000266,43.999988,
86948,1111176,74465,1698796823,1.0,58120500.0,1690156800,1698796799,,,0.032000,...,0,0.000000,0.000000,0.123859,0.865915,5,0,100.000266,99.999988,
86949,1111176,566484,1698796823,2.0,59991100.0,1690848000,1698883199,,,0.071000,...,1,0.000000,0.000000,0.123859,0.865915,5,0,92.000266,92.999988,0.013029
86950,1111176,762903,1698796823,1.0,80990000.0,1690848000,1698883199,,,0.009000,...,1,0.000000,0.000000,0.123859,0.865915,5,0,92.000266,92.999988,
86951,1111176,14996,1698796823,1.0,56510000.0,1693526400,1698796799,,,0.004000,...,0,0.000000,0.000000,0.123859,0.865915,5,0,61.000266,60.999988,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
760465,1899411,689367,1699000256,1.0,53110100.0,1698796800,1704067199,,,1.000924,...,3,0.000000,0.000000,0.101904,0.884635,5,0,2.354815,60.999988,0.139013
626139,1742558,146933,1699000256,1.0,53110000.0,1693872000,1704067199,,,0.000000,...,1,0.000000,0.000000,0.002841,1.000000,0,0,59.354815,117.999988,
252976,1315150,24840,1699000256,1.0,56510500.0,1695168000,1703116799,,,0.695000,...,1,0.142857,0.137371,0.128719,0.886920,5,0,44.354815,91.999988,
252977,1315150,2552,1699000256,1.0,56510000.0,1695168000,1703116799,,,0.001000,...,1,0.333332,0.137371,0.128719,0.886920,5,0,44.354815,91.999988,


In [18]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import average_precision_score
from sklearn.model_selection import train_test_split

# Compute scale_pos_weight for class imbalance
scale_pos_weight = (y == 0).sum() / (y == 1).sum()
print(f"scale_pos_weight = {scale_pos_weight:.2f}")

# Best params you recovered
params = {
    "learning_rate": 0.04588242171630075,
    "num_leaves": 39,
    "max_depth": 10,
    "min_child_samples": 25,
    "feature_fraction": 0.5269864948058149,
    "bagging_fraction": 0.9954022568460998,
    "bagging_freq": 2,
    "lambda_l1": 0.015544600650003099,
    "lambda_l2": 2.1832264146154266e-08,
    "objective": "binary",
    "metric": "auc",  # Can keep this even if optimizing for PR-AUC externally
    "verbosity": -1,
    "boosting_type": "gbdt",
    "device": "gpu",  # Remove this if no GPU is available
    "scale_pos_weight": 1,
}

# Train on full dataset (or use train/test split to evaluate if needed)
final_train = lgb.Dataset(X_train, y_train)
model = lgb.train(params, final_train, num_boost_round=1000)

# Optionally evaluate on test split if X_test/y_test available
if 'X_test' in locals():
    preds = model.predict(X_test)
    pr_auc = average_precision_score(y_test, preds)
    print(f"PR-AUC on holdout set: {pr_auc:.4f}")

# Save model
model.save_model("lgbm_classweighted_optuna_pr_auc.txt")
print("✅ Model retrained and saved.")


scale_pos_weight = 19.79




PR-AUC on holdout set: 0.7469
✅ Model retrained and saved.


In [20]:
feat_imp = pd.DataFrame({
    'feature': model.feature_name(),
    'importance': model.feature_importance(importance_type='gain')  # or 'gain'
}).sort_values(by='importance', ascending=False)


In [20]:
feat_imp_df = pd.DataFrame({
    'Variable': model.feature_name(),
    'Importance_Score': model.feature_importance(importance_type='gain')
}).sort_values(by='Importance_Score', ascending=False).reset_index(drop=True)

# You can add a 'Rank' column if desired
feat_imp_df['Rank'] = feat_imp_df.index + 1

# Reorder columns for better presentation
feat_imp_df = feat_imp_df[['Rank', 'Variable', 'Importance_Score']]

# Define the output file name
output_csv_file = 'variable_importance_for_submission.csv'

# Save the DataFrame to a CSV file
feat_imp_df.to_csv(output_csv_file, index=False)

print(f"\n✅ Variable importance file '{output_csv_file}' created successfully!")
print(f"It contains {len(feat_imp_df)} variables.")
print("\nFirst 30 rows of the importance file:")
print(feat_imp_df.head(30))



✅ Variable importance file 'variable_importance_for_submission.csv' created successfully!
It contains 421 variables.

First 30 rows of the importance file:
    Rank                     Variable  Importance_Score
0      1   time_since_last_impression     716198.770964
1      2                         f366     402674.544489
2      3                         f363     143538.844168
3      4                         f137      96671.873459
4      5             offer_ctr_bucket      66029.619229
5      6                 log_ctr_f137      54913.177975
6      7                         f210      54605.794117
7      8                         f125      47033.955250
8      9                         f132      34413.605288
9     10          user_session_length      29160.861381
10    11                         id12      26123.675611
11    12                         f353      20166.022076
12    13                         f207      15402.721170
13    14                         f365      14402.594923
14 

In [21]:
top300=feat_imp['feature'][:320].tolist()

In [22]:
top300.append('id1')

In [23]:
print("\n--- New Feat Imp (Top Pachaas) ---")
print(feat_imp.head(50))


--- New Feat Imp (Top Pachaas) ---
                         feature     importance
9     time_since_last_impression  716477.763930
11                          f366  402809.727688
18                          f363  143524.676861
36                          f137   99707.214157
411             offer_ctr_bucket   66037.856720
19                          f210   54618.039416
414                 log_ctr_f137   51904.371281
15                          f125   47164.186733
14                          f132   34463.202847
10           user_session_length   29288.687277
5                           id12   26041.226748
157                         f353   20193.652378
16                          f207   15397.296996
39                          f365   14305.923556
23       user_avg_session_length   13993.790203
35                          f127   13657.290727
12        hourly_offer_frequency   13039.739840
415                 exp_ctr_f134   11009.379808
17                          f223   10649.192960
13  

In [None]:
# train_features_final.to_csv('finaltrainig.csv')
# test_features_final.to_csv('finaltestig.csv')

In [None]:
test_filtered=test_features_final[top300]
top300.append('y')
train_filtered=train_features_final[top300]

In [None]:
print(test_filtered.shape,train_filtered.shape)

In [None]:
X=train_filtered.drop(columns=['y','id1'])
y=train_filtered['y']
X_train=X[:616131]
y_train=y[:616131]
X_test=X[616131:]
y_test=y[616131:]

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import average_precision_score
from sklearn.model_selection import train_test_split


# Compute scale_pos_weight for class imbalance
scale_pos_weight = (y == 0).sum() / (y == 1).sum()
print(f"scale_pos_weight = {scale_pos_weight:.2f}")

# Best params you recovered
params = {
    'learning_rate': 0.016623218477757856, 'num_leaves': 88, 'max_depth': 12, 'min_child_samples': 31, 'feature_fraction': 0.7857024999905345, 'bagging_fraction': 0.8106542792647912, 'bagging_freq': 2, 'lambda_l1': 9.625481788587463, 'lambda_l2': 4.969459532508245e-05,
    "objective": "binary",
    "metric": "auc",  # Can keep this even if optimizing for PR-AUC externally
    "verbosity": -1,
    "boosting_type": "gbdt",
    "device": "gpu",  # Remove this if no GPU is available
    "scale_pos_weight": 1,
}

# Train on full dataset (or use train/test split to evaluate if needed)
final_train = lgb.Dataset(X, y)
model_final = lgb.train(params, final_train, num_boost_round=1000)

# Optionally evaluate on test split if X_test/y_test available
if 'X_test' in locals():
    preds = model_final.predict(X_test)
    pr_auc = average_precision_score(y_test, preds)
    print(f"PR-AUC on holdout set: {pr_auc:.4f}")

# Save model
model_final.save_model("lgbm_classweighted_optuna_pr_auc1.txt")
print("✅ Model retrained and saved.")


In [None]:
df_test=pd.read_csv('/kaggle/input/kanafrendatasetsplswork/test_features_final.csv')
df_test=df_test[['id1','id2','id3','id5']]

In [None]:
test_filtered

In [None]:
pred_test_filtered=test_filtered.drop(columns='id1')

In [None]:
pred_test_filtered

In [None]:
prediction=model_final.predict(pred_test_filtered)

In [None]:
prediction

In [None]:
test_filtered['pred']=prediction


In [None]:
test_filtered

In [None]:
submission_df=df_test.merge(test_filtered[['id1','pred']], on='id1', how='left')

In [None]:
submission_df

In [None]:
submission_df.to_csv('finalsubmissionplswork.csv')

In [None]:
# import pandas as pd
# import numpy as np
# import lightgbm as lgb
# from sklearn.metrics import average_precision_score
# from sklearn.model_selection import KFold # For Mean/Target Encoding

# # Load DataFrames
# train_features_final = pd.read_csv('/kaggle/input/train_features_final.csv')
# test_features_final = pd.read_csv('/kaggle/input/test_features_final.csv')
# final_feature_names = pd.read_csv('/kaggle/input/final_feature_names.csv')

# # --- Existing Feature Engineering from your notebook (re-ordered for logical flow) ---

# # Handle id5 date conversion early for minute extraction
# train_features_final["id5"] = pd.to_datetime(train_features_final['id5'], errors='coerce')
# test_features_final["id5"] = pd.to_datetime(test_features_final['id5'], errors='coerce')

# train_features_final["minute"] = train_features_final['id5'].dt.minute
# test_features_final["minute"] = test_features_final['id5'].dt.minute

# # Extract hour from id4 (event timestamp) and convert to sin/cos features
# # Assuming id4 will be converted to datetime later for this purpose if it's not already
# # If id4 is directly unix timestamp, you can convert it to datetime first here
# # For now, let's keep id4 conversion logic later, but add hour feature based on it
# # You had 'hour_sin' and 'hour_cos' in your feature list, let's re-create them properly
# # based on 'id4' once it's a timestamp.
# # For now, we'll create them after id4 is processed to Unix timestamp.


# # Total popularity of an offer across all users
# global_offer_stats = train_features_final.groupby('id3').agg(
#     global_offer_impressions=('id1', 'count'),
#     global_offer_clicks=('y', 'sum')
# ).reset_index()

# global_offer_stats['global_offer_ctr'] = global_offer_stats['global_offer_clicks'] / (global_offer_stats['global_offer_impressions'] + 1e-5) # Added epsilon for stability

# bins = [0, 0.01, 0.03, 0.07, 0.15, 1]
# labels = [0,1,2,3,4]
# global_offer_stats['offer_ctr_bucket'] = pd.cut(global_offer_stats['global_offer_ctr'], bins=bins, labels=labels, include_lowest=True)

# train_features_final = train_features_final.merge(
#     global_offer_stats[['id3', 'offer_ctr_bucket']],
#     on='id3',
#     how='left'
# )
# test_features_final = test_features_final.merge(
#     global_offer_stats[['id3', 'offer_ctr_bucket']],
#     on='id3',
#     how='left'
# )
# # Fill NaN for new offers in test set
# test_features_final['offer_ctr_bucket'] = test_features_final['offer_ctr_bucket'].fillna(train_features_final['offer_ctr_bucket'].mode()[0])


# # Step 1: Map category one-hot to corresponding CTR feature
# category_ctr_map = {
#     'f227': 'f130',  # Dining
#     'f228': 'f131',  # Entertainment
#     'f230': 'f134',  # Services
#     'f231': 'f132',  # Shopping
#     'f232': 'f133',  # Travel
# }

# # Step 2: Create category_ctr by using the matching category's CTR
# def get_category_ctr(row):
#     for cat_flag, ctr_col in category_ctr_map.items():
#         if cat_flag in row and row[cat_flag] == 1: # Check if category flag exists and is 1
#             return row[ctr_col]
#     return np.nan  # fallback in case no match

# train_features_final['category_ctr'] = train_features_final.apply(get_category_ctr, axis=1)
# test_features_final['category_ctr'] = test_features_final.apply(get_category_ctr, axis=1)

# # Drop NaNs for plotting clarity
# plot_df = train_features_final[['category_ctr', 'y']].dropna().copy() # .copy() to avoid SettingWithCopyWarning

# # Bin CTR for plotting clarity
# if not plot_df.empty:
#     plot_df['ctr_bin'] = pd.cut(plot_df['category_ctr'], bins=10)

#     # Grouped mean redemption rate per CTR bin
#     grouped = plot_df.groupby('ctr_bin')['y'].mean().reset_index()
#     grouped['ordinal_label'] = grouped['y'].rank(method='first').astype(int) - 1

#     grouped['interval'] = pd.IntervalIndex.from_tuples([
#         (-0.001, 0.1), (0.1, 0.2), (0.2, 0.3), (0.3, 0.4), (0.4, 0.5),
#         (0.5, 0.6), (0.6, 0.7), (0.7, 0.8), (0.8, 0.9), (0.9, 1.0)
#     ])
#     bin_to_label = dict(zip(grouped['interval'], grouped['ordinal_label']))
#     bins = [-0.001, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

#     # Step 4: Map to new label based on redemption rate
#     # Apply pd.cut directly to the full dataframes
#     train_features_final['category_ctr_bin'] = pd.cut(train_features_final['category_ctr'], bins=bins).map(bin_to_label).astype('Int64')
#     test_features_final['category_ctr_bin'] = pd.cut(test_features_final['category_ctr'], bins=bins).map(bin_to_label).astype('Int64')
# else:
#     print("Warning: plot_df is empty. Cannot create category_ctr_bin based on redemption rate.")
#     train_features_final['category_ctr_bin'] = train_features_final['category_ctr'].fillna(train_features_final['category_ctr'].median()).astype('Int64') # Fallback if empty
#     test_features_final['category_ctr_bin'] = test_features_final['category_ctr'].fillna(test_features_final['category_ctr'].median()).astype('Int64') # Fallback if empty


# train_features_final = train_features_final.drop(columns='category_ctr')
# test_features_final = test_features_final.drop(columns='category_ctr')

# # Convert relevant columns to datetime and then Unix timestamp in seconds
# cols_to_convert_to_unix = ['id4', 'id12', 'id13', 'impression_time', 'click_time'] # Added 'click_time' for completeness, though often target
# # Ensure 'id1' is also kept as a string initially if it's the primary key as it's not converted to unix timestamp.
# # It seems 'id1' contains string values like '1000061_9914_16-23_2023-11-05 09:11:35.557'

# for df in [train_features_final, test_features_final]:
#     for col in cols_to_convert_to_unix:
#         if col in df.columns and df[col].dtype != 'int64': # Only convert if not already int64 (Unix)
#             df[col] = pd.to_datetime(df[col], errors='coerce').astype('int64') // 10**9

# # Drop id5 after minute is extracted, as per your original notebook flow
# # If id5 is not used for anything else. If you use it for date features, keep it until then.
# # Your original code drops it here: train_features_final=train_features_final.drop(columns='id5')
# # Let's keep it until after new date features are made based on it, then drop.
# # For now, let's proceed with dropping as per original notebook.
# # train_features_final=train_features_final.drop(columns='id5')
# # test_features_final=test_features_final.drop(columns='id5')


# # --- Start of Jahnavi's new feature additions (incorporating previous suggestions) ---

# # 1. click_rate_30d = f206 / (f207 + 1e-5)
# train_features_final['click_rate_30d'] = train_features_final['f206'] / (train_features_final['f207'] + 1e-5)
# test_features_final['click_rate_30d'] = test_features_final['f206'] / (test_features_final['f207'] + 1e-5)

# # 2. days_since_offer_start = (timestamp - offer_start_timestamp) in days
# # id4 is 'Event timestamp', id12 is 'Start Timestamp' (now Unix)
# train_features_final['days_since_offer_start'] = (train_features_final['id4'] - train_features_final['id12']) / (24 * 3600)
# test_features_final['days_since_offer_start'] = (test_features_final['id4'] - test_features_final['id12']) / (24 * 3600)
# # Handle negative values if event timestamp is before offer start (impute to 0 or mean)
# train_features_final.loc[train_features_final['days_since_offer_start'] < 0, 'days_since_offer_start'] = 0
# test_features_final.loc[test_features_final['days_since_offer_start'] < 0, 'days_since_offer_start'] = 0


# # 3. offer_duration = offer_end - offer_start in days
# # id13 is 'End Timestamp', id12 is 'Start Timestamp' (now Unix)
# train_features_final['offer_duration'] = (train_features_final['id13'] - train_features_final['id12']) / (24 * 3600)
# test_features_final['offer_duration'] = (test_features_final['id13'] - test_features_final['id12']) / (24 * 3600)
# # Handle negative duration if end is before start (impute to 0 or mean)
# train_features_final.loc[train_features_final['offer_duration'] < 0, 'offer_duration'] = 0
# test_features_final.loc[test_features_final['offer_duration'] < 0, 'offer_duration'] = 0


# # 4. engagement_score = f137 × f206
# train_features_final['engagement_score'] = train_features_final['f137'] * train_features_final['f206']
# test_features_final['engagement_score'] = test_features_final['f137'] * test_features_final['f206']

# # 5. log_ctr = log1p(f137)
# train_features_final['log_ctr_f137'] = np.log1p(train_features_final['f137'])
# test_features_final['log_ctr_f137'] = np.log1p(test_features_final['f137'])

# # 6. exp_ctr = np.exp(-f134)
# train_features_final['exp_ctr_f134'] = np.exp(-train_features_final['f134'])
# test_features_final['exp_ctr_f134'] = np.exp(-test_features_final['f134'])

# # --- End of Jahnavi's new feature additions ---


# # --- Start of Additional Suggested Features ---

# # 1. Time of Day/Week Interaction
# # Extract hour from id4 (Event timestamp - now Unix)
# # Convert Unix timestamp back to datetime temporarily for hour extraction
# train_features_final['event_datetime'] = pd.to_datetime(train_features_final['id4'], unit='s')
# test_features_final['event_datetime'] = pd.to_datetime(test_features_final['id4'], unit='s')

# train_features_final['hour'] = train_features_final['event_datetime'].dt.hour
# test_features_final['hour'] = test_features_final['event_datetime'].dt.hour

# train_features_final['day_of_week'] = train_features_final['event_datetime'].dt.dayofweek # Monday=0, Sunday=6
# test_features_final['day_of_week'] = test_features_final['event_datetime'].dt.dayofweek

# # Cyclical features for hour (already present as 'hour_sin', 'hour_cos' in your column list)
# train_features_final['hour_sin'] = np.sin(2 * np.pi * train_features_final['hour'] / 24)
# train_features_final['hour_cos'] = np.cos(2 * np.pi * train_features_final['hour'] / 24)
# test_features_final['hour_sin'] = np.sin(2 * np.pi * test_features_final['hour'] / 24)
# test_features_final['hour_cos'] = np.cos(2 * np.pi * test_features_final['hour'] / 24)

# # Interaction: hourly_offer_frequency_x_category_ctr
# # Ensure 'hourly_offer_frequency' exists (it was in your feature list)
# if 'hourly_offer_frequency' in train_features_final.columns:
#     train_features_final['hourly_freq_x_cat_ctr_bin'] = train_features_final['hourly_offer_frequency'] * train_features_final['category_ctr_bin']
#     test_features_final['hourly_freq_x_cat_ctr_bin'] = test_features_final['hourly_offer_frequency'] * test_features_final['category_ctr_bin']
# else:
#     print("Warning: 'hourly_offer_frequency' not found, skipping 'hourly_freq_x_cat_ctr_bin'")
#     train_features_final['hourly_freq_x_cat_ctr_bin'] = 0 # Placeholder
#     test_features_final['hourly_freq_x_cat_ctr_bin'] = 0 # Placeholder


# # Interaction: cm_ctr_x_offer_industry_ctr
# # Ensure f366 and f363 exist and are numeric
# train_features_final['cm_ctr_x_offer_industry_ctr'] = train_features_final['f366'] * train_features_final['f363']
# test_features_final['cm_ctr_x_offer_industry_ctr'] = test_features_final['f366'] * test_features_final['f363']

# # Interaction: log_ctr_f137_x_offer_ctr_bucket
# train_features_final['log_ctr_f137_x_offer_bucket'] = train_features_final['log_ctr_f137'] * train_features_final['offer_ctr_bucket']
# test_features_final['log_ctr_f137_x_offer_bucket'] = test_features_final['log_ctr_f137'] * test_features_final['offer_ctr_bucket']

# # Ratio of Decaying to Non-Decaying Impressions/Clicks (if f148, f147, f207, f206 exist)
# if 'f148' in train_features_final.columns and 'f207' in train_features_final.columns:
#     train_features_final['decay_imp_30d_ratio'] = train_features_final['f148'] / (train_features_final['f207'] + 1e-5)
#     test_features_final['decay_imp_30d_ratio'] = test_features_final['f148'] / (test_features_final['f207'] + 1e-5)
# else:
#     print("Warning: Missing f148 or f207 for 'decay_imp_30d_ratio'")
#     train_features_final['decay_imp_30d_ratio'] = 0
#     test_features_final['decay_imp_30d_ratio'] = 0

# if 'f147' in train_features_final.columns and 'f206' in train_features_final.columns:
#     train_features_final['decay_clicks_30d_ratio'] = train_features_final['f147'] / (train_features_final['f206'] + 1e-5)
#     test_features_final['decay_clicks_30d_ratio'] = test_features_final['f147'] / (test_features_final['f206'] + 1e-5)
# else:
#     print("Warning: Missing f147 or f206 for 'decay_clicks_30d_ratio'")
#     train_features_final['decay_clicks_30d_ratio'] = 0
#     test_features_final['decay_clicks_30d_ratio'] = 0

# # Interest Score Aggregations (example: max interest related to dining)
# # This requires mapping offer categories to interest IDs. Let's assume a simplified mapping for demonstration
# # You would need to refine this based on your understanding of 'f_interest_id' and 'f_offer_category' mappings
# # For example, if offer is 'dining' (f227=1), use f8 (restaurant interest score)
# interest_score_map = {
#     'f227': 'f8',   # Dining -> Restaurant interest
#     'f231': 'f41' # Shopping -> assuming f41 is related to general consumption, or pick a specific shopping interest
#     # Add more mappings based on your actual feature mapping
# }

# def get_relevant_interest_score(row):
#     relevant_score = 0
#     for cat_flag, interest_col in interest_score_map.items():
#         if cat_flag in row and row[cat_flag] == 1 and interest_col in row:
#             # We are taking the score of the interest related to the offer's category
#             relevant_score = max(relevant_score, row[interest_col])
#     return relevant_score

# train_features_final['relevant_interest_score'] = train_features_final.apply(get_relevant_interest_score, axis=1)
# test_features_final['relevant_interest_score'] = test_features_final.apply(get_relevant_interest_score, axis=1)


# # --- Mean/Target Encoding for id3 (Offer ID) and id2 (Customer ID) ---
# # This is a powerful technique, but requires careful handling to avoid data leakage.
# # We'll use KFold for training data and simple mean for test data.

# NFOLDS = 5 # Number of folds for K-Fold target encoding
# kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=42) # Use a fixed random state for reproducibility

# # Mean encode 'id3' (Offer ID)
# train_features_final['id3_mean_encoded_ctr'] = np.nan
# for fold, (train_idx, val_idx) in enumerate(kf.split(train_features_final)):
#     # Calculate mean on the training fold only
#     fold_mean_ctr = train_features_final.iloc[train_idx].groupby('id3')['y'].mean()
#     # Apply to the validation fold
#     train_features_final.loc[val_idx, 'id3_mean_encoded_ctr'] = train_features_final.iloc[val_idx]['id3'].map(fold_mean_ctr)

# # For test_features_final, use the mean calculated from the entire train_features_final
# global_id3_mean_ctr = train_features_final.groupby('id3')['y'].mean()
# test_features_final['id3_mean_encoded_ctr'] = test_features_final['id3'].map(global_id3_mean_ctr)
# # Handle new id3s in test set (not seen in training) by filling with global mean y
# test_features_final['id3_mean_encoded_ctr'] = test_features_final['id3_mean_encoded_ctr'].fillna(train_features_final['y'].mean())


# # Mean encode 'id2' (Customer ID) - similar logic
# train_features_final['id2_mean_encoded_ctr'] = np.nan
# for fold, (train_idx, val_idx) in enumerate(kf.split(train_features_final)):
#     fold_mean_ctr = train_features_final.iloc[train_idx].groupby('id2')['y'].mean()
#     train_features_final.loc[val_idx, 'id2_mean_encoded_ctr'] = train_features_final.iloc[val_idx]['id2'].map(fold_mean_ctr)

# global_id2_mean_ctr = train_features_final.groupby('id2')['y'].mean()
# test_features_final['id2_mean_encoded_ctr'] = test_features_final['id2'].map(global_id2_mean_ctr)
# test_features_final['id2_mean_encoded_ctr'] = test_features_final['id2_mean_encoded_ctr'].fillna(train_features_final['y'].mean())

# # --- End of Additional Suggested Features ---

# # Final timestamp dropping (assuming id5, event_datetime, hour, day_of_week are now used for features)
# # It's generally good practice to keep original ID columns (id1, id2, id3, id4) for debugging/merging.
# # Only drop id5 if no other datetime features will be derived from it.
# train_features_final = train_features_final.drop(columns=['id5', 'event_datetime'])
# test_features_final = test_features_final.drop(columns=['id5', 'event_datetime'])


# # Sort by id4 (event timestamp) for time-based splitting
# train_features_final = train_features_final.sort_values(by='id4', ascending=True)

# # Define X and y for the model
# # Drop 'id1' (Primary Key) and 'y' (Target) from features
# # Also drop 'click_time' if it exists and is not intended as a feature itself but rather for 'y' derivation
# columns_to_drop = ['y', 'id1']
# if 'click_time' in train_features_final.columns:
#     columns_to_drop.append('click_time')

# X = train_features_final.drop(columns=columns_to_drop)
# y = train_features_final['y']

# # Filter out non-numeric columns that might have slipped through (e.g., if a feature was completely NaN)
# # Or if 'id1' (the string-based one) accidentally got into X after drops.
# # This ensures all features passed to LightGBM are numeric.
# X_cols = X.select_dtypes(include=np.number).columns.tolist()
# X = X[X_cols]
# test_filtered_cols = test_features_final.select_dtypes(include=np.number).columns.tolist()
# # Ensure test set has the same columns as train set before making predictions
# # Identify columns in X that are not in test_features_final
# missing_in_test = set(X.columns) - set(test_features_final.columns)
# for col in missing_in_test:
#     # Add missing columns to test_features_final with a default value (e.g., 0 or mean of train)
#     test_features_final[col] = 0 # Or use X[col].mean()

# # Align columns between training and test sets after feature engineering
# test_filtered = test_features_final[X.columns]


# # Split into training and validation sets based on chronological order
# # Use the same split point as your original notebook for consistency
# split_point = 616131
# X_train = X[:split_point]
# y_train = y[:split_point]
# X_val = X[split_point:] # Renamed X_test to X_val for clarity as it's a validation set
# y_val = y[split_point:]


# # Handle any remaining NaNs in numeric columns (after all feature engineering)
# # LightGBM can handle NaNs, but it's often better to explicitly fill them
# # using a strategy (e.g., median, mean, or 0) for consistency and to avoid unexpected behavior.
# # Filling with 0 for simplicity, but consider the median for continuous features.
# X_train = X_train.fillna(0)
# X_val = X_val.fillna(0)
# test_filtered = test_filtered.fillna(0)


# # Compute scale_pos_weight for class imbalance
# scale_pos_weight = (y == 0).sum() / (y == 1).sum()
# print(f"scale_pos_weight = {scale_pos_weight:.2f}")

# # Best params you recovered
# params = {
#     'learning_rate': 0.016623218477757856,
#     'num_leaves': 88,
#     'max_depth': 12,
#     'min_child_samples': 31,
#     'feature_fraction': 0.7857024999905345,
#     'bagging_fraction': 0.8106542792647912,
#     'bagging_freq': 2,
#     'lambda_l1': 9.625481788587463,
#     'lambda_l2': 4.969459532508245e-05,
#     "objective": "binary",
#     "metric": "auc",
#     "verbosity": -1,
#     "boosting_type": "gbdt",
#     "device": "gpu",  # Remove this if no GPU is available
#     "scale_pos_weight": scale_pos_weight, # Apply the calculated scale_pos_weight
# }

# # Train on the training split (X_train, y_train) for evaluating on X_val, y_val
# train_data = lgb.Dataset(X_train, y_train)
# val_data = lgb.Dataset(X_val, y_val, reference=train_data) # Use validation set for early stopping if desired

# model = lgb.train(
#     params,
#     train_data,
#     num_boost_round=1000,
#     valid_sets=[val_data],
#     callbacks=[lgb.early_stopping(100, verbose=False)] # Early stopping based on validation set
# )

# # Evaluate on validation set
# preds_val = model.predict(X_val)
# pr_auc_val = average_precision_score(y_val, preds_val)
# print(f"PR-AUC on validation set: {pr_auc_val:.4f}")

# # Re-train on the full dataset (X, y) with potentially more rounds (or use best_iteration from early stopping)
# # This is often done for the final model to be saved for submission
# print("\nRetraining final model on full training data...")
# final_model = lgb.train(params, lgb.Dataset(X, y), num_boost_round=model.best_iteration) # Use best iteration from validation
# final_model.save_model("lgbm_classweighted_optuna_pr_auc_new_features.txt")
# print("✅ Final Model retrained and saved.")


# # --- Get and Display New Feature Importance ---
# # Use the final_model trained on the full dataset for importance
# feat_imp_new = pd.DataFrame({
#     'feature': final_model.feature_name(),
#     'importance': final_model.feature_importance(importance_type='gain')
# }).sort_values(by='importance', ascending=False)

# print("\n--- NEW Feature Importance (Top 50) ---")
# print(feat_imp_new.head(50))

# # Optional: Visualize the new feature importance
# import matplotlib.pyplot as plt
# import seaborn as sns

# plt.figure(figsize=(12, 18))
# sns.barplot(x="importance", y="feature", data=feat_imp_new.head(50))
# plt.title("LightGBM Feature Importance (Gain) with New Features")
# plt.xlabel("Feature Importance (Gain)")
# plt.ylabel("Feature Name")
# plt.tight_layout()
# plt.show()


# # --- Generate Predictions for Submission ---
# # Use the 'test_filtered' DataFrame (which is the original test_features_final after all transformations)
# prediction = final_model.predict(test_filtered)

# # Add predictions to the original test_features_final (which still contains id1 for submission)
# # Make sure df_test is the original test data with id1, id2, id3, id5 as used in your submission cell
# # df_test = pd.read_csv('/kaggle/input/kanafrendatasetsplswork/test_features_final.csv')
# # df_test_submission = df_test[['id1','id2','id3','id5']].copy() # Create a copy to avoid modifying original df_test
# # df_test_submission['pred'] = prediction

# # # Sort by id1 if needed for submission format
# # # submission_df = submission_df.sort_values(by='id1')

# # submission_df.to_csv('finalsubmission_with_new_features.csv', index=False)
# # print("✅ Submission file created with new features.")