In [781]:
# Import necessary modules
import sys
sys.path.append('..')
import pandas as pd # type: ignore
import constants as cons
import numpy as np

# Load the raw data - using the correct path

# Clean the data using the existing clean_data function
# Use the constants file path instead of hardcoded path
df = pd.read_csv('../' + cons.DATA_PATH + cons.DEFAULT_RAW_TRAIN_FILE)
df = df.drop(columns=cons.COLUMNS_TO_DROP)
df = df.drop_duplicates()
df = df.dropna()

# Add engineered features

# Display basic information about the preprocessed dataset
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
display(df.head())


Dataset shape: (349024, 13)

First few rows:


Unnamed: 0,session_id,DateTime,user_id,product,campaign_id,webpage_id,product_category_1,user_group_id,gender,age_level,user_depth,var_1,is_click
0,98528.0,2017-07-04 16:42,7716.0,C,405490.0,60305.0,3.0,3.0,Male,3.0,3.0,1.0,1.0
1,589714.0,2017-07-07 07:40,1035283.0,I,118601.0,28529.0,4.0,10.0,Female,4.0,3.0,1.0,0.0
2,478652.0,2017-07-07 20:42,65994.0,H,359520.0,13787.0,4.0,4.0,Male,4.0,3.0,0.0,0.0
3,34536.0,2017-07-05 15:05,75976.0,H,405490.0,60305.0,3.0,3.0,Male,3.0,3.0,0.0,0.0
4,71863.0,2017-07-06 20:11,987498.0,C,405490.0,60305.0,3.0,2.0,Male,2.0,3.0,0.0,0.0


In [782]:
print("Adding engineered time related features")
df['DateTime'] = pd.to_datetime(df['DateTime'])
df['day_of_week'] = df['DateTime'].dt.dayofweek
df['hour'] = df['DateTime'].dt.hour
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)


Adding engineered time related features


In [783]:
print("Splitting data into train/val/test sets (60/20/20) while maintaining click distribution")

from sklearn.model_selection import train_test_split
# Get click distribution for stratification
y = df['is_click']

# Split into train and temp sets (60% train, 40% temp)
train_df_naive, temp_df = train_test_split(
    df,
    train_size=0.6,
    stratify=y,
    random_state=42
)

# Split temp into validation and test sets (50% each, so 20% of original data each)
val_df_naive, test_df_naive = train_test_split(
    temp_df,
    train_size=0.5, 
    stratify=temp_df['is_click'],
    random_state=42
)

print(f"Train set size: {len(train_df_naive)} ({len(train_df_naive)/len(df):.1%})")
print(f"Validation set size: {len(val_df_naive)} ({len(val_df_naive)/len(df):.1%})")
print(f"Test set size: {len(test_df_naive)} ({len(test_df_naive)/len(df):.1%})")

print("\nClick rates:")
print(f"Overall: {df['is_click'].mean():.3f}")
print(f"Train: {train_df_naive['is_click'].mean():.3f}")
print(f"Validation: {val_df_naive['is_click'].mean():.3f}") 
print(f"Test: {test_df_naive['is_click'].mean():.3f}")


Splitting data into train/val/test sets (60/20/20) while maintaining click distribution
Train set size: 209414 (60.0%)
Validation set size: 69805 (20.0%)
Test set size: 69805 (20.0%)

Click rates:
Overall: 0.068
Train: 0.068
Validation: 0.068
Test: 0.068


In [784]:

print("Calculating baseline F1 score with naive splitting")
from sklearn.metrics import f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier

# One-hot encode categorical columns
print("One-hot encoding categorical features")
from sklearn.preprocessing import OneHotEncoder

# Initialize the encoder
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Drop DateTime, user_id, and session_id columns first
columns_to_drop = ['DateTime', 'user_id', 'session_id']
train_df_processed = train_df_naive.drop(columns=columns_to_drop)
val_df_processed = val_df_naive.drop(columns=columns_to_drop) 
test_df_processed = test_df_naive.drop(columns=columns_to_drop)

# Separate features
categorical_features = [col for col in cons.CATEGORICAL if col not in columns_to_drop]
numeric_features = [col for col in train_df_processed.columns if col not in categorical_features + ['is_click']]

# Fit and transform on training data
X_train_encoded = encoder.fit_transform(train_df_processed[categorical_features])
X_val_encoded = encoder.transform(val_df_processed[categorical_features])
X_test_encoded = encoder.transform(test_df_processed[categorical_features])

# Get feature names
feature_names = encoder.get_feature_names_out(categorical_features)

# Convert to DataFrames
X_train_encoded = pd.DataFrame(X_train_encoded, columns=feature_names, index=train_df_processed.index)
X_val_encoded = pd.DataFrame(X_val_encoded, columns=feature_names, index=val_df_processed.index)
X_test_encoded = pd.DataFrame(X_test_encoded, columns=feature_names, index=test_df_processed.index)

# Add numeric columns
X_train = pd.concat([X_train_encoded, train_df_processed[numeric_features]], axis=1)
X_val = pd.concat([X_val_encoded, val_df_processed[numeric_features]], axis=1)
X_test = pd.concat([X_test_encoded, test_df_processed[numeric_features]], axis=1)

y_train = train_df_processed['is_click']
y_val = val_df_processed['is_click']
y_test = test_df_processed['is_click']

# Train Random Forest model with reasonable parameters
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10, 
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1,
    class_weight='balanced'
)

print("Training Random Forest model...")
rf_model.fit(X_train, y_train)

# Make predictions
y_train_pred = rf_model.predict(X_train)
y_val_pred = rf_model.predict(X_val)
y_test_pred = rf_model.predict(X_test)

# Print F1 scores
print("\nF1 Scores:")
baseline_f1_naive = f1_score(y_test, y_test_pred)
print(f"Test F1 (naive splitting): {baseline_f1_naive:.3f}")


Calculating baseline F1 score with naive splitting
One-hot encoding categorical features
Training Random Forest model...

F1 Scores:
Test F1 (naive splitting): 0.151


In [785]:
from sklearn.model_selection import train_test_split

print(f"Splitting data by user, maintaining click and session distribution")
# Create user-level features for stratification
user_features = df.groupby('user_id').agg({
    'session_id': 'count',  # number of sessions
    'is_click': 'sum'       # number of clicks (not rate)
}).reset_index()

# Create stratification group using actual values
user_features['strat_group'] = user_features.apply(
    lambda x: f"sessions_{int(x['session_id'])}_clicks_{int(x['is_click'])}", 
    axis=1
)

# Identify common and rare groups
group_counts = user_features['strat_group'].value_counts()
common_groups = group_counts[group_counts >= 6].index

# Split users into common and rare groups
common_users = user_features[user_features['strat_group'].isin(common_groups)]
rare_users = user_features[~user_features['strat_group'].isin(common_groups)]

# Split common users with stratification
train_users_common, temp_users_common = train_test_split(
    common_users['user_id'],
    train_size=0.6,
    stratify=common_users['strat_group'],
    random_state=42
)

val_users_common, test_users_common = train_test_split(
    temp_users_common,
    train_size=0.5,
    stratify=common_users.loc[common_users['user_id'].isin(temp_users_common), 'strat_group'],
    random_state=42
)

# Randomly assign rare users to maintain approximately 60-20-20 split
rare_users_shuffled = rare_users['user_id'].sample(frac=1, random_state=42)
n_rare = len(rare_users_shuffled)
n_train_rare = int(0.6 * n_rare)
n_val_rare = int(0.2 * n_rare)

train_users_rare = rare_users_shuffled[:n_train_rare]
val_users_rare = rare_users_shuffled[n_train_rare:n_train_rare + n_val_rare]
test_users_rare = rare_users_shuffled[n_train_rare + n_val_rare:]

# Combine common and rare users
train_users = pd.concat([train_users_common, train_users_rare])
val_users = pd.concat([val_users_common, val_users_rare])
test_users = pd.concat([test_users_common, test_users_rare])

# Create the final dataframes
df_train = df[df['user_id'].isin(train_users)].copy()
df_val = df[df['user_id'].isin(val_users)].copy()
df_test = df[df['user_id'].isin(test_users)].copy()

# Print statistics to verify the split
print("Number of users in each set:")
print(f"Train: {len(train_users)} ({len(train_users)/len(user_features):.1%})")
print(f"Validation: {len(val_users)} ({len(val_users)/len(user_features):.1%})")
print(f"Test: {len(test_users)} ({len(test_users)/len(user_features):.1%})")

print("\nNumber of sessions in each set:")
print(f"Train: {len(df_train)} ({len(df_train)/len(df):.1%})")
print(f"Validation: {len(df_val)} ({len(df_val)/len(df):.1%})")
print(f"Test: {len(df_test)} ({len(df_test)/len(df):.1%})")

# Verify click distributions are similar
print("\nClick rates in each set:")
print(f"Train: {df_train['is_click'].mean():.3f}")
print(f"Validation: {df_val['is_click'].mean():.3f}")
print(f"Test: {df_test['is_click'].mean():.3f}")

# Print distribution of sessions per user in each set
print("\nAverage sessions per user in each set:")
print(f"Train: {df_train.groupby('user_id')['session_id'].count().mean():.2f}")
print(f"Validation: {df_val.groupby('user_id')['session_id'].count().mean():.2f}")
print(f"Test: {df_test.groupby('user_id')['session_id'].count().mean():.2f}")

Splitting data by user, maintaining click and session distribution
Number of users in each set:
Train: 76507 (60.0%)
Validation: 25502 (20.0%)
Test: 25503 (20.0%)

Number of sessions in each set:
Train: 209531 (60.0%)
Validation: 69610 (19.9%)
Test: 69883 (20.0%)

Click rates in each set:
Train: 0.068
Validation: 0.068
Test: 0.068

Average sessions per user in each set:
Train: 2.74
Validation: 2.73
Test: 2.74


In [786]:
# Create feature for whether user has viewed product before
def add_product_history(df):
    # Sort by user and datetime
    df = df.sort_values(['user_id', 'DateTime'])
    
    # Initialize the new feature
    df['product_viewed_before'] = 0
    
    # For each user
    for user_id in df['user_id'].unique():
        user_sessions = df[df['user_id'] == user_id]
        
        # For each session of this user (already sorted chronologically)
        for i, (_, current_session) in enumerate(user_sessions.iterrows()):
            if i > 0:  # Skip first session
                # Get all previous sessions for this user
                previous_sessions = user_sessions.iloc[:i]
                # Check if current product was viewed in any previous session
                if current_session['product'] in previous_sessions['product'].values:
                    df.loc[current_session.name, 'product_viewed_before'] = 1
    
    return df

# Add the feature to each dataset
print("Adding product history feature to train set...")
df_train = add_product_history(df_train)

print("Adding product history feature to validation set...")
df_val = add_product_history(df_val)

print("Adding product history feature to test set...")
df_test = add_product_history(df_test)


Adding product history feature to train set...
Adding product history feature to validation set...
Adding product history feature to test set...


In [787]:
from sklearn.preprocessing import OneHotEncoder
# Initialize OneHotEncoder for categorical features
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit encoder on train set categorical columns
encoder.fit(df_train[cons.CATEGORICAL])

# Transform train set
train_cat_encoded = encoder.transform(df_train[cons.CATEGORICAL])
train_cat_cols = encoder.get_feature_names_out(cons.CATEGORICAL)
df_train_encoded = pd.concat([
    df_train.drop(columns=cons.CATEGORICAL),
    pd.DataFrame(train_cat_encoded, columns=train_cat_cols, index=df_train.index)
], axis=1)

# Transform validation set using fitted encoder
val_cat_encoded = encoder.transform(df_val[cons.CATEGORICAL]) 
df_val_encoded = pd.concat([
    df_val.drop(columns=cons.CATEGORICAL),
    pd.DataFrame(val_cat_encoded, columns=train_cat_cols, index=df_val.index)
], axis=1)

# Transform test set using fitted encoder
test_cat_encoded = encoder.transform(df_test[cons.CATEGORICAL])
df_test_encoded = pd.concat([
    df_test.drop(columns=cons.CATEGORICAL),
    pd.DataFrame(test_cat_encoded, columns=train_cat_cols, index=df_test.index)
], axis=1)

df_train = df_train_encoded
df_val = df_val_encoded
df_test = df_test_encoded

In [788]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Calculate baseline F1 score without product history feature
baseline_features = [col for col in df_train.columns 
                    if col not in ['user_id', 'session_id', 'DateTime', 'is_click', 'product_viewed_before']]

# Initialize baseline model with reasonable parameters
baseline_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10, 
    min_samples_split=5,
    class_weight='balanced',
    random_state=42
)

# Train model
baseline_model.fit(df_train[baseline_features], df_train['is_click'])

# Generate predictions
baseline_predictions = baseline_model.predict(df_test[baseline_features])

# Calculate and print F1 score
baseline_f1 = f1_score(df_test['is_click'], baseline_predictions)
print(f"Baseline Model F1 Score: {baseline_f1:.3f}")


Baseline Model F1 Score: 0.152


In [789]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Features to exclude from model
exclude_cols = ['user_id', 'session_id', 'DateTime']

# Get feature columns with and without product history
features_with_history = [col for col in df_train.columns if col not in exclude_cols + ['is_click']]
features_without_history = [col for col in features_with_history if col != 'product_viewed_before']

# Initialize models
model_with_history = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=5, 
                                          class_weight='balanced', random_state=42)

# Train and evaluate model with product history
model_with_history.fit(df_train[features_with_history], df_train['is_click'])
y_pred_with_history = model_with_history.predict(df_test[features_with_history])
f1_with_history = f1_score(df_test['is_click'], y_pred_with_history)


print("Test Set F1 Scores:")
print(f"With product_viewed_before:    {f1_with_history:.4f}")
print(f"Without product_viewed_before: {baseline_f1:.4f}")
print(f"Improvement:                   {((f1_with_history - baseline_f1) / baseline_f1 * 100):.1f}%")


Test Set F1 Scores:
With product_viewed_before:    0.1605
Without product_viewed_before: 0.1519
Improvement:                   5.6%


In [790]:
# Function to split dataframe into initial and final sessions for each user
def split_initial_final_sessions(df):
    # Sort by user_id and DateTime to ensure chronological order
    df = df.sort_values(['user_id', 'DateTime'])
    
    # Get the last session for each user
    final_sessions = df.groupby('user_id').last().reset_index()
    
    # Get all sessions except the last one for each user
    initial_sessions = df.merge(
        final_sessions[['user_id', 'session_id']], 
        on='user_id', 
        how='left', 
        suffixes=('', '_final')
    )
    initial_sessions = initial_sessions[
        initial_sessions['session_id'] != initial_sessions['session_id_final']
    ].drop('session_id_final', axis=1)
    
    return initial_sessions, final_sessions

# Split each dataset into initial and final sessions
print("Splitting datasets into initial and final sessions...")

train_initial, train_final = split_initial_final_sessions(df_train)
val_initial, val_final = split_initial_final_sessions(df_val)
test_initial, test_final = split_initial_final_sessions(df_test)

# Print summary statistics
print("\nDataset sizes after splitting:")
print(f"Training set:")
print(f"  Initial sessions: {len(train_initial)} rows")
print(f"  Final sessions:   {len(train_final)} rows")
print(f"  Total users:      {train_final['user_id'].nunique()}")

print(f"\nValidation set:")
print(f"  Initial sessions: {len(val_initial)} rows")
print(f"  Final sessions:   {len(val_final)} rows")
print(f"  Total users:      {val_final['user_id'].nunique()}")

print(f"\nTest set:")
print(f"  Initial sessions: {len(test_initial)} rows")
print(f"  Final sessions:   {len(test_final)} rows")
print(f"  Total users:      {test_final['user_id'].nunique()}")


Splitting datasets into initial and final sessions...

Dataset sizes after splitting:
Training set:
  Initial sessions: 133024 rows
  Final sessions:   76507 rows
  Total users:      76507

Validation set:
  Initial sessions: 44108 rows
  Final sessions:   25502 rows
  Total users:      25502

Test set:
  Initial sessions: 44380 rows
  Final sessions:   25503 rows
  Total users:      25503


In [791]:
pip install xgboost


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [805]:
from xgboost import XGBClassifier

# First, get the final session for each user
test_final = df_test.sort_values(['user_id', 'DateTime']).groupby('user_id').last().reset_index()
train_final = df_train.sort_values(['user_id', 'DateTime']).groupby('user_id').last().reset_index()

# Get all other sessions for initial
test_initial = df_test[~df_test['session_id'].isin(test_final['session_id'])]
train_initial = df_train[~df_train['session_id'].isin(train_final['session_id'])]

# Define columns to drop only for model input
model_cols_to_drop = ['DateTime', 'user_id', 'session_id']

# Initialize models with parameters for high class imbalance
model_params = {
    'n_estimators': 100,
    'max_depth': 8,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 3,
    'scale_pos_weight': 20,  # ~1/0.00068 to handle class imbalance
    'random_state': 42
}

xgb_initial = XGBClassifier(**model_params)
xgb_final = XGBClassifier(**model_params)

# Keep full features for calculations
X_train_initial_full = train_initial.drop(['is_click'], axis=1)
# Drop user_id only for model
X_train_initial_model = X_train_initial_full.drop(model_cols_to_drop, axis=1)
y_train_initial = train_initial['is_click']

xgb_initial.fit(X_train_initial_model, y_train_initial)

# Get probabilities for train_initial
train_initial_probs = xgb_initial.predict_proba(X_train_initial_model)[:, 1]

# Calculate user-level mean probabilities using the full features
user_probs = pd.DataFrame({
    'user_id': X_train_initial_full['user_id'],
    'mean_prob': train_initial_probs
}).groupby('user_id')['mean_prob'].mean()

# Overall mean probability for users with no history
global_mean = train_initial_probs.mean()

results = []
for alpha in [0.1, 0.2, 0.5, 0.8, 0.9]:
    print(f"\nTesting with alpha = {alpha}")
    
    # Keep full features for calculations
    X_train_final_full = train_final.drop(['is_click'], axis=1).copy()
    X_train_final_full['user_mean_prob'] = (
        alpha * X_train_final_full['user_id'].map(user_probs).fillna(global_mean) +
        (1 - alpha) * global_mean
    )
    # Create model features by dropping user_id
    X_train_final_model = X_train_final_full.drop(model_cols_to_drop, axis=1)
    y_train_final = train_final['is_click']
    
    # Train final model
    xgb_final.fit(X_train_final_model, y_train_final)
    
    # Keep full features for test set calculations
    X_test_initial_full = test_initial.drop(['is_click'], axis=1)
    X_test_initial_model = X_test_initial_full.drop(model_cols_to_drop, axis=1)
    test_initial_probs = xgb_initial.predict_proba(X_test_initial_model)[:, 1]
    
    # Calculate user-level mean probabilities for test set
    test_user_probs = pd.DataFrame({
        'user_id': X_test_initial_full['user_id'],
        'mean_prob': test_initial_probs
    }).groupby('user_id')['mean_prob'].mean()
    
    # Add mean probabilities to test_final
    X_test_final_full = test_final.drop(['is_click'], axis=1).copy()
    X_test_final_full['user_mean_prob'] = (
        alpha * X_test_final_full['user_id'].map(test_user_probs).fillna(global_mean) +
        (1 - alpha) * global_mean
    )
    X_test_final_model = X_test_final_full.drop(model_cols_to_drop, axis=1)
    
    # Get predictions
    initial_preds = xgb_initial.predict(X_test_initial_model)
    final_preds = xgb_final.predict(X_test_final_model)
    
    # Create a dictionary mapping session_id to prediction
    initial_pred_dict = dict(zip(test_initial['session_id'], initial_preds))
    final_pred_dict = dict(zip(test_final['session_id'], final_preds))
    
    # Combine all predictions using the original df_test order
    all_preds = df_test['session_id'].map({**initial_pred_dict, **final_pred_dict})
    
    # Calculate F1 score
    overall_f1 = f1_score(df_test['is_click'], all_preds)
    results.append({'alpha': alpha, 'overall_f1': overall_f1})
    
    print(f"F1 score: {overall_f1:.4f}")

best_f1 = max([r['overall_f1'] for r in results])
print(f"\nBest F1 score: {best_f1:.4f}")
# Combine train sets
X_train_combined = pd.concat([
    train_initial.drop(['is_click'], axis=1).drop(model_cols_to_drop, axis=1),
    train_final.drop(['is_click'], axis=1).drop(model_cols_to_drop, axis=1)
])
y_train_combined = pd.concat([train_initial['is_click'], train_final['is_click']])

# Train XGBoost on combined data
xgb_combined = XGBClassifier(**model_params)
xgb_combined.fit(X_train_combined, y_train_combined)

# Combine test sets
X_test_combined = pd.concat([
    test_initial.drop(['is_click'], axis=1).drop(model_cols_to_drop, axis=1),
    test_final.drop(['is_click'], axis=1).drop(model_cols_to_drop, axis=1)
])
y_test_combined = pd.concat([test_initial['is_click'], test_final['is_click']])

# Get predictions on combined test set
combined_preds = xgb_combined.predict(X_test_combined)
combined_f1 = f1_score(y_test_combined, combined_preds)

print(f"F1 score with combined training: {combined_f1:.4f}")
print(f"F1 score with recursive forecasting: {best_f1:.4f}")
print(f"Difference: {(combined_f1 - best_f1):.4f} ({(combined_f1/best_f1 - 1)*100:.1f}%)")



Testing with alpha = 0.1
F1 score: 0.1425

Testing with alpha = 0.2
F1 score: 0.1425

Testing with alpha = 0.5
F1 score: 0.1425

Testing with alpha = 0.8
F1 score: 0.1430

Testing with alpha = 0.9
F1 score: 0.1430

Best F1 score: 0.1430
F1 score with combined training: 0.1426
F1 score with recursive forecasting: 0.1430
Difference: -0.0005 (-0.3%)
