In [1]:
import pandas as pd
import numpy as np

In [2]:
file_name = 'train_dataset_full.csv'  
df1 = pd.read_csv(file_name)

df1["DateTime"] = pd.to_datetime(df1["DateTime"], errors='coerce')
df1['DateTime'] = df1['DateTime'].ffill() # Forward fill
df1['DateTime'] = df1['DateTime'].bfill() # Backward fill

df1['hour'] = df1['DateTime'].dt.hour
df1['hour_sin'] = np.sin(2 * np.pi * df1['hour'] / 24)
df1['hour_cos'] = np.cos(2 * np.pi * df1['hour'] / 24)
df1 = df1.drop(columns=['hour'])

df1['day_of_week'] = df1['DateTime'].dt.dayofweek

df1["DateTime"].isna().sum()

0

In [3]:
file2_name = 'X_test_1st_label.csv'
df2 = pd.read_csv(file2_name)

df2['DateTime'] = pd.to_datetime(df2["DateTime"], errors='coerce')
df2['DateTime'] = df2['DateTime'].ffill() # Forward fill
df2['DateTime'] = df2['DateTime'].bfill() # Backward fill

df2['hour'] = df2['DateTime'].dt.hour
df2['hour_sin'] = np.sin(2 * np.pi * df2['hour'] / 24)
df2['hour_cos'] = np.cos(2 * np.pi * df2['hour'] / 24)
df2 = df2.drop(columns=['hour'])

df2['day_of_week'] = df2['DateTime'].dt.dayofweek

df2["DateTime"].isna().sum()

0

In [4]:
df = pd.concat([df1, df2], ignore_index=True)
df = df.drop_duplicates()
# Impute missing values with Null

df["city_development_index"] = df["city_development_index"].fillna(0)
df["product_category_2"] = df["product_category_2"].fillna(0)

print(df[["city_development_index", "product_category_2"]].isnull().sum())

df=df.dropna()

city_development_index    0
product_category_2        0
dtype: int64


In [5]:
# import ydata_profiling
# ydata_profiling.ProfileReport(df_combined)

In [6]:
# DATA SPLITTING

import pickle
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# ✅ Step 1: Define the function for splitting users while preserving behavior
def split_by_user(df, split_ratios=(0.6, 0.2, 0.2)):
    """Split DataFrame into train, validation, and test while preserving user behavior distributions."""

    if sum(split_ratios) != 1:
        raise ValueError("Split ratios must sum to 1")

    # ✅ Compute user statistics
    user_stats = df.groupby('user_id').agg(
        n_sessions=('session_id', 'nunique'),
        n_clicks=('is_click', 'sum')
    ).reset_index()

    # ✅ Create stratification groups based on session & click activity
    user_stats['strat_group'] = user_stats.apply(
        lambda x: f"s{x['n_sessions']}_c{x['n_clicks']}", axis=1
    )

    # ✅ Ensure stratification groups have at least 2 users
    group_counts = user_stats['strat_group'].value_counts()
    valid_groups = group_counts[group_counts >= 2].index  # Keep only groups with at least 2 users
    user_stats = user_stats[user_stats['strat_group'].isin(valid_groups)]

    # ✅ Check if stratification is possible
    stratify_col = user_stats['strat_group'] if len(valid_groups) > 1 else None

    # ✅ First Split: Train vs. Temp (Val + Test)
    temp_size = split_ratios[1] + split_ratios[2]  # Combined val + test size
    train_users, temp_users = train_test_split(
        user_stats,
        test_size=temp_size,
        stratify=stratify_col,
        random_state=42
    )

    # ✅ Recalculate stratification for temp split (val vs. test)
    temp_users['strat_group'] = temp_users.apply(
        lambda x: f"s{x['n_sessions']}_c{x['n_clicks']}", axis=1
    )

    temp_group_counts = temp_users['strat_group'].value_counts()
    valid_temp_groups = temp_group_counts[temp_group_counts >= 2].index
    temp_users = temp_users[temp_users['strat_group'].isin(valid_temp_groups)]
    stratify_temp = temp_users['strat_group'] if len(valid_temp_groups) > 1 else None

    # ✅ Second Split: Val vs. Test
    val_size = split_ratios[1] / temp_size  # Relative size of val within temp
    val_users, test_users = train_test_split(
        temp_users,
        test_size=1 - val_size,
        stratify=stratify_temp,
        random_state=42
    )

    # ✅ Assign masks for filtering the original dataset
    train_mask = df['user_id'].isin(train_users['user_id'])
    val_mask = df['user_id'].isin(val_users['user_id'])
    test_mask = df['user_id'].isin(test_users['user_id'])

    # ✅ Return the split dataframes
    return df[train_mask], df[val_mask], df[test_mask]

# ✅ Step 2: Apply the user-based split
train, val, test = split_by_user(df)

print(f"Train set: {train.shape}")
print(f"Validation set: {val.shape}")
print(f"Test set: {test.shape}")

print(train['is_click'].mean(), val['is_click'].mean(), test['is_click'].mean())

Train set: (237627, 18)
Validation set: (77095, 18)
Test set: (77112, 18)
0.06789211663657749 0.06776055515921915 0.06758999896254798


In [7]:

def compute_historical_click(df):
    """Calculate historical click without modifying the original index."""
    temp_df = df[['user_id', 'DateTime', 'is_click']].copy()  # Keep only necessary columns
    
    temp_df = temp_df.sort_values(by=['user_id', 'DateTime'])  # Sort per user
    temp_df['historical_click'] = temp_df.groupby('user_id')['is_click'].cumsum().shift(1).fillna(0)

    return temp_df[['user_id', 'DateTime', 'historical_click']]  # Return only relevant columns

# Compute historical click for each dataset
train_historical = compute_historical_click(train)
val_historical = compute_historical_click(val)
test_historical = compute_historical_click(test)

# Merge back with the original datasets (ensuring correct alignment)
train = train.merge(train_historical, on=['user_id', 'DateTime'], how='left')
val = val.merge(val_historical, on=['user_id', 'DateTime'], how='left')
test = test.merge(test_historical, on=['user_id', 'DateTime'], how='left')


In [8]:
def compute_sessions(df):
    """Calculate the number of historical sessions per user.
    
    For each session, 'sessions' represents the count of previous sessions
    for that user. The first session will have a count of 0.
    """
    # Keep only the necessary columns
    temp_df = df[['user_id', 'DateTime', 'session_id']].copy()
    
    # Sort by user and DateTime (earlier sessions come first)
    temp_df = temp_df.sort_values(by=['user_id', 'DateTime'])
    
    # Count previous sessions using cumcount
    temp_df['sessions'] = temp_df.groupby('user_id').cumcount()
    
    return temp_df[['user_id', 'DateTime', 'session_id', 'sessions']]

# Compute historical sessions for each dataset
train_historical = compute_sessions(train)
val_historical = compute_sessions(val)
test_historical = compute_sessions(test)

# Merge back with the original datasets (ensuring correct alignment)
train = train.merge(train_historical, on=['user_id', 'DateTime', 'session_id'], how='left')
val = val.merge(val_historical, on=['user_id', 'DateTime', 'session_id'], how='left')
test = test.merge(test_historical, on=['user_id', 'DateTime', 'session_id'], how='left')

In [9]:
# ✅ Step 3: Define categorical features
CATEGORICAL = ['product', 'campaign_id', 'webpage_id', 'product_category_1', 
               'product_category_2', 'user_group_id', 'gender', 'user_depth', 
               'city_development_index', 'var_1', 'day_of_week']

# ✅ Step 4: Fit One-Hot Encoder ONLY on the Train Set (to avoid leakage)
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
ohe.fit(train[CATEGORICAL])  # 🚨 Fit ONLY on train data!

# ✅ Step 5: Apply Encoding to Train, Val, and Test
train_encoded = ohe.transform(train[CATEGORICAL])
val_encoded = ohe.transform(val[CATEGORICAL])
test_encoded = ohe.transform(test[CATEGORICAL])

# ✅ Convert Encoded Data to DataFrames
train_encoded_df = pd.DataFrame(train_encoded, columns=ohe.get_feature_names_out(CATEGORICAL), index=train.index)
val_encoded_df = pd.DataFrame(val_encoded, columns=ohe.get_feature_names_out(CATEGORICAL), index=val.index)
test_encoded_df = pd.DataFrame(test_encoded, columns=ohe.get_feature_names_out(CATEGORICAL), index=test.index)

# ✅ Step 6: Ensure Val/Test Have the Same Columns as Train (Add Missing Ones)
for col in train_encoded_df.columns:
    if col not in val_encoded_df.columns:
        val_encoded_df[col] = 0  # Add missing column with zeros
    if col not in test_encoded_df.columns:
        test_encoded_df[col] = 0

# ✅ Step 7: Reorder Val/Test Columns to Match Train Order
val_encoded_df = val_encoded_df[train_encoded_df.columns]
test_encoded_df = test_encoded_df[train_encoded_df.columns]

# ✅ Step 8: Replace Categorical Features with Encoded Data
train = pd.concat([train.drop(columns=CATEGORICAL), train_encoded_df], axis=1)
val = pd.concat([val.drop(columns=CATEGORICAL), val_encoded_df], axis=1)
test = pd.concat([test.drop(columns=CATEGORICAL), test_encoded_df], axis=1)

# ✅ Step 9: Save Encoder for Later Use (Preprocessing External Test Data)
with open("onehot_encoder.pkl", "wb") as f:
    pickle.dump(ohe, f)

print("✅ One-Hot Encoding applied successfully on train, validation and internal test without leakage!")

✅ One-Hot Encoding applied successfully on train, validation and internal test without leakage!


In [11]:
# ✅ Define features and target AFTER all preprocessing
X_train = train.drop(columns=['is_click'])  # Features
y_train = train['is_click']  # Target

X_val = val.drop(columns=['is_click'])
y_val = val['is_click']

X_test = test.drop(columns=['is_click'])
y_test = test['is_click']

# ✅ Drop unnecessary columns
#drop_cols = ["session_id", "DateTime", "user_id"]

#X_train = X_train.drop(columns=drop_cols, errors='ignore')
#X_val = X_val.drop(columns=drop_cols, errors='ignore')
#X_test = X_test.drop(columns=drop_cols, errors='ignore')

# ✅ Save Processed Data to CSV
X_train.to_csv("X_train.csv", index=False)
y_train.to_csv("y_train.csv", index=False)

X_val.to_csv("X_val.csv", index=False)
y_val.to_csv("y_val.csv", index=False)

X_test.to_csv("X_test.csv", index=False)
y_test.to_csv("y_test.csv", index=False)

print("✅ Processed internal datasets saved successfully!")

✅ Processed internal datasets saved successfully!


**Handling External Test Data**

In [12]:
# Handling External Test Data

file3_name = 'X_test_1st.csv'  
external_test = pd.read_csv(file3_name)

    # ✅ Ensure `is_click` is NOT in external test before saving
external_test = external_test.drop(columns=['is_click'], errors='ignore') 

external_test["DateTime"] = pd.to_datetime(external_test["DateTime"], errors='coerce')
external_test['DateTime'] = external_test['DateTime'].ffill()  # Forward fill
external_test['DateTime'] = external_test['DateTime'].bfill()  # Backward fill

external_test['hour'] = external_test['DateTime'].dt.hour
external_test['hour_sin'] = np.sin(2 * np.pi * external_test['hour'] / 24)
external_test['hour_cos'] = np.cos(2 * np.pi * external_test['hour'] / 24)
external_test = external_test.drop(columns=['hour'])  # Remove raw hour column

external_test['day_of_week'] = external_test['DateTime'].dt.dayofweek

print("Missing DateTime values:", external_test["DateTime"].isna().sum())

# Drop duplicates
external_test.drop_duplicates(inplace=True)


external_test["city_development_index"] = external_test["city_development_index"].fillna(0)
external_test["product_category_2"] = external_test["product_category_2"].fillna(0)

print(external_test[["city_development_index", "product_category_2"]].isnull().sum())

# ✅ Fill missing values for all other columns
NUMERIC_COLUMN = "age_group"
fill_values = {}

for col in external_test.columns:
    if col == NUMERIC_COLUMN and external_test[col].notna().sum() > 0:
        fill_values[col] = external_test[col].median()
    else:
        fill_values[col] = external_test[col].mode()[0] if external_test[col].notna().sum() > 0 else 0

external_test.fillna(fill_values, inplace=True)
print("Missing values after imputation:", external_test.isnull().sum().sum())  # Should be 0

# ✅ Load the Saved Encoder
with open("onehot_encoder.pkl", "rb") as f:
    ohe = pickle.load(f)

# ✅ Ensure all categorical columns exist
available_categorical = [col for col in CATEGORICAL if col in external_test.columns]
print("Categorical Features Used for External Test Encoding:", available_categorical)

# ✅ Apply One-Hot Encoding
external_test_encoded = ohe.transform(external_test[available_categorical])

# ✅ Convert to DataFrame
external_test_encoded_df = pd.DataFrame(
    external_test_encoded, 
    columns=ohe.get_feature_names_out(available_categorical),  # ✅ Fix: Use only available columns
    index=external_test.index
)

# ✅ Ensure consistency with train (Handle Missing/New Columns)
# Ensure all training columns exist in external test
for col in train.columns:
    if col not in external_test_encoded_df.columns:
        external_test_encoded_df[col] = 0  # ✅ Add missing column

# ✅ Remove extra columns not in train
extra_cols = [col for col in external_test_encoded_df.columns if col not in train.columns]
external_test_encoded_df.drop(columns=extra_cols, inplace=True)

# ✅ Reorder columns to match train
external_test_encoded_df = external_test_encoded_df[train.columns]

# ✅ Drop original categorical columns before merging encoded data
external_test.drop(columns=CATEGORICAL, errors='ignore', inplace=True)

# ✅ Merge One-Hot Encoded Data
external_test = pd.concat([external_test, external_test_encoded_df], axis=1)

# ✅ Ensure no duplicate columns before final output
external_test = external_test.loc[:, ~external_test.columns.duplicated()]

print("✅ External test set processed successfully and is aligned with train!")
print("Final Columns:", external_test.columns)


Missing DateTime values: 0
city_development_index    0
product_category_2        0
dtype: int64
Missing values after imputation: 0
Categorical Features Used for External Test Encoding: ['product', 'campaign_id', 'webpage_id', 'product_category_1', 'product_category_2', 'user_group_id', 'gender', 'user_depth', 'city_development_index', 'var_1', 'day_of_week']
✅ External test set processed successfully and is aligned with train!
Final Columns: Index(['session_id', 'DateTime', 'user_id', 'age_level', 'hour_sin',
       'hour_cos', 'is_click', 'historical_click', 'sessions', 'product_A',
       'product_B', 'product_C', 'product_D', 'product_E', 'product_F',
       'product_G', 'product_H', 'product_I', 'product_J',
       'campaign_id_82320.0', 'campaign_id_98970.0', 'campaign_id_105960.0',
       'campaign_id_118601.0', 'campaign_id_359520.0', 'campaign_id_360936.0',
       'campaign_id_396664.0', 'campaign_id_404347.0', 'campaign_id_405490.0',
       'campaign_id_414149.0', 'webpage_id_

In [13]:
# ✅ Check for duplicate column names (show only those with count > 1)
duplicate_columns = external_test.columns[external_test.columns.duplicated()].tolist()

if duplicate_columns:
    print("⚠️ Duplicate Columns Detected:")
    for col in set(duplicate_columns):
        count = (external_test.columns == col).sum()
        if count > 1:
            print(f"Column '{col}' appears {count} times.")
else:
    print("✅ No duplicate columns found.")


✅ No duplicate columns found.


In [14]:

# Concatenate train, validation, and test data to compute historical clicks
train_val_test_data = pd.concat([train, val, test], axis=0)

def compute_historical_clicks_external(external_test, train_val_test_data):
    """
    Impute historical click counts per user for external test set based on train-val-test clicks.
    Ensures no future leakage and correctly handles first-time users.
    """

    # Ensure DateTime columns are in datetime format (if not already)
    train_val_test_data['DateTime'] = pd.to_datetime(train_val_test_data['DateTime'])
    external_test['DateTime'] = pd.to_datetime(external_test['DateTime'])
    
    # Sort the internal and external data by user_id and DateTime
    train_val_test_data_sorted = train_val_test_data.sort_values(by=['user_id', 'DateTime'])
    external_test_sorted = external_test.sort_values(by=['user_id', 'DateTime'])
    
    imputed_historical_clicks = []
    
    # Debug: count how many times we find historical data for a sample of users
    debug_counter = {}
    
    for idx, external_row in external_test_sorted.iterrows():
        user_id = external_row['user_id']
        current_datetime = external_row['DateTime']
        
        # Get all historical sessions for the user that occurred before current_datetime
        user_data_up_to_now = train_val_test_data_sorted[
            (train_val_test_data_sorted['user_id'] == user_id) &
            (train_val_test_data_sorted['DateTime'] < current_datetime)
        ]
        
        if user_data_up_to_now.empty:
            historical_clicks_for_user = 0
        else:
            # Compute cumulative historical clicks for this user
            historical_clicks_for_user = user_data_up_to_now['is_click'].cumsum().shift(1).fillna(0).iloc[-1]
        
        # Debug count per user
        debug_counter[user_id] = debug_counter.get(user_id, 0) + 1
        imputed_historical_clicks.append(historical_clicks_for_user)
    
    # Optionally, print out a sample of the debug information
    for user, count in list(debug_counter.items())[:5]:
        print(f"User {user} processed for {count} external rows.")
    
    # Assign the computed values back using the sorted index
    external_test.loc[external_test_sorted.index, 'historical_click'] = imputed_historical_clicks
    
    return external_test

# Compute once and store the result
external_test_imputed = compute_historical_clicks_external(external_test, train_val_test_data)

# Debug prints (optional)
print("Sample of historical_click values:")
print(external_test_imputed[['user_id', 'DateTime', 'historical_click']].head())

User 19 processed for 1 external rows.
User 46 processed for 2 external rows.
User 59 processed for 1 external rows.
User 62 processed for 2 external rows.
User 67 processed for 2 external rows.
Sample of historical_click values:
   user_id            DateTime  historical_click
0   352186 2017-07-03 10:03:00               0.0
1   980231 2017-07-03 14:21:00               0.0
2   610332 2017-07-05 17:47:00               2.0
3   849506 2017-07-06 11:01:00               0.0
4   499495 2017-07-02 07:50:00               0.0


  external_test.loc[external_test_sorted.index, 'historical_click'] = imputed_historical_clicks


In [24]:
def compute_sessions_external(external_test, train_val_test_data):
    """
    Compute a historical session count for each row in the external_test set,
    taking into account sessions from the internal data (train, val, test) as well
    as prior sessions from external_test for the same user.
    
    For each external test session, the session count is defined as:
    
        sessions = (# internal sessions for user with DateTime < current)
                   + (# external test sessions for user already processed)
    
    This ensures that if no sessions exist before, the count is 0, aligning with
    the logic for train, val, and test.
    """
    import pandas as pd

    # Ensure DateTime columns are datetime objects
    train_val_test_data['DateTime'] = pd.to_datetime(train_val_test_data['DateTime'])
    external_test['DateTime'] = pd.to_datetime(external_test['DateTime'])
    
    # Sort the internal data by user_id and DateTime
    internal_sorted = train_val_test_data.sort_values(by=['user_id', 'DateTime'])
    
    # Precompute a dictionary mapping each user to a list of internal session DateTimes.
    internal_sessions = {}
    for user, group in internal_sorted.groupby('user_id'):
        internal_sessions[user] = group['DateTime'].tolist()
    
    # Sort external_test by user_id and DateTime.
    # The original index is preserved so we can assign back later.
    external_test_sorted = external_test.sort_values(by=['user_id', 'DateTime']).copy()
    
    # List to store the computed session counts.
    sessions = []
    # Dictionary to track how many external sessions for a given user have been processed so far.
    external_counts = {}
    
    # Iterate over the sorted external test rows.
    for idx, row in external_test_sorted.iterrows():
        user = row['user_id']
        current_dt = row['DateTime']
        
        # Count internal sessions for this user with a DateTime strictly before the current one.
        internal_count = sum(1 for dt in internal_sessions.get(user, []) if dt < current_dt)
        
        # Count the external sessions already processed for this user.
        ext_count = external_counts.get(user, 0)
        
        # Compute the historical session number without adding 1.
        # This means that if there are no previous sessions (internal or external),
        # the session count will be 0.
        hist_sessions = internal_count + ext_count
        sessions.append(hist_sessions)
        
        # Update the external session count for this user.
        external_counts[user] = ext_count + 1

    # Assign the computed sessions back to external_test using the sorted index.
    external_test.loc[external_test_sorted.index, 'sessions'] = sessions
    
    return external_test


# =============================================================================
# Example usage:

# Assume train_val_test_data is already computed (concatenated train, val, test)
# and external_test contains columns: user_id, DateTime, session_id

external_test_imputed = compute_sessions_external(external_test, train_val_test_data)

# Inspect the results
print("Sample of computed session numbers:")
print(external_test_imputed[['user_id', 'DateTime', 'session_id', 'sessions']].head())


Sample of computed session numbers:
   user_id            DateTime  session_id  sessions
0   352186 2017-07-03 10:03:00      121580        13
1   980231 2017-07-03 14:21:00       95831         2
2   610332 2017-07-05 17:47:00      421806        81
3   849506 2017-07-06 11:01:00      585403         1
4   499495 2017-07-02 07:50:00      496398         4


In [25]:
# Save the computed DataFrame to CSV
#external_test_imputed.to_csv('X_test_2nd_with_historical_click.csv', index=False)

#print("✅ The external test data with imputed historical_click has been saved as 'X_test_2nd_with_historical_click.csv'")

In [26]:
# ✅ Identify extra columns (present in external_test but not in train)
extra_cols = [col for col in external_test.columns if col not in train.columns]

# ✅ Identify missing columns (present in train but not in external_test)
missing_cols = [col for col in train.columns if col not in external_test.columns]

print(f"⚠️ Extra columns in external_test (removing): {extra_cols}")
print(f"⚠️ Missing columns in external_test (adding as 0): {missing_cols}")

# ✅ Drop extra columns from external_test
external_test.drop(columns=extra_cols, inplace=True, errors='ignore')

# ✅ Add missing columns with default value 0
for col in missing_cols:
    external_test[col] = 0  # Fill missing feature columns with 0

# ✅ Ensure column order matches train
external_test = external_test[train.columns]

print("✅ External test set aligned successfully with train!")
print("Final column count:", len(external_test.columns), " (Should match train -1)")


⚠️ Extra columns in external_test (removing): []
⚠️ Missing columns in external_test (adding as 0): []
✅ External test set aligned successfully with train!
Final column count: 98  (Should match train -1)


In [27]:
# ✅ Ensure `is_click` is NOT in external test before saving
X_test_external = external_test.drop(columns=['is_click'], errors='ignore')  

# ✅ Drop unnecessary columns
#drop_cols = ["session_id", "DateTime", "user_id"]
#X_test_external = X_test_external.drop(columns=drop_cols, errors='ignore')

# ✅ Save Processed External Test Data to CSV
X_test_external.to_csv("X_test_external.csv", index=False)

print("✅ Processed external test dataset saved successfully! (Features only, no labels)")

✅ Processed external test dataset saved successfully! (Features only, no labels)


In [28]:
import pandas as pd

# Load the datasets
X_train = pd.read_csv("X_train.csv")
X_val = pd.read_csv("X_val.csv")
X_test = pd.read_csv("X_test.csv")
X_test_external = pd.read_csv("X_test_external.csv")

# Check column consistency
base_columns = set(X_train.columns)

for name, file in [("X_val", X_val), ("X_test", X_test), ("X_test_external", X_test_external)]:
    current_columns = set(file.columns)
    missing = base_columns - current_columns
    extra = current_columns - base_columns
    print(f"\n{name} column issues:")
    print(f"Missing: {missing}" if missing else "✅ No missing columns")
    print(f"Extra: {extra}" if extra else "✅ No extra columns")



X_val column issues:
✅ No missing columns
✅ No extra columns

X_test column issues:
✅ No missing columns
✅ No extra columns

X_test_external column issues:
✅ No missing columns
✅ No extra columns


In [29]:
y_train = pd.read_csv("y_train.csv")
y_val = pd.read_csv("y_val.csv")
y_test = pd.read_csv("y_test.csv")

for name, (X, y) in {
    "train": (X_train, y_train),
    "val": (X_val, y_val),
    "test": (X_test, y_test),
}.items():
    if len(X) != len(y):
        print(f"⚠️ {name.upper()} mismatch: X_{name} ({len(X)}) vs. y_{name} ({len(y)})")
    else:
        print(f"✅ {name.upper()} length matches: {len(X)} rows")


✅ TRAIN length matches: 532239 rows
✅ VAL length matches: 167749 rows
✅ TEST length matches: 167976 rows


In [30]:
missing_external = base_columns - set(X_test_external.columns)
extra_external = set(X_test_external.columns) - base_columns

print("\nExternal Test Set Issues:")
print(f"Missing columns in X_test_external: {missing_external}" if missing_external else "✅ No missing columns in X_test_external")
print(f"Extra columns in X_test_external: {extra_external}" if extra_external else "✅ No extra columns in X_test_external")



External Test Set Issues:
✅ No missing columns in X_test_external
✅ No extra columns in X_test_external


**Train** is script train2.py

In [31]:
X_test_external.columns

Index(['session_id', 'DateTime', 'user_id', 'age_level', 'hour_sin',
       'hour_cos', 'historical_click', 'sessions', 'product_A', 'product_B',
       'product_C', 'product_D', 'product_E', 'product_F', 'product_G',
       'product_H', 'product_I', 'product_J', 'campaign_id_82320.0',
       'campaign_id_98970.0', 'campaign_id_105960.0', 'campaign_id_118601.0',
       'campaign_id_359520.0', 'campaign_id_360936.0', 'campaign_id_396664.0',
       'campaign_id_404347.0', 'campaign_id_405490.0', 'campaign_id_414149.0',
       'webpage_id_1734.0', 'webpage_id_6970.0', 'webpage_id_11085.0',
       'webpage_id_13787.0', 'webpage_id_28529.0', 'webpage_id_45962.0',
       'webpage_id_51181.0', 'webpage_id_53587.0', 'webpage_id_60305.0',
       'product_category_1_1.0', 'product_category_1_2.0',
       'product_category_1_3.0', 'product_category_1_4.0',
       'product_category_1_5.0', 'product_category_2_0.0',
       'product_category_2_18595.0', 'product_category_2_32026.0',
       'product_