In [8]:
import sys
import pandas as pd # type: ignore
sys.path.append('..')
import constants as cons
import numpy as np

from preprocess import split_by_user
from preprocess import clean_data

df = pd.read_csv('../' + cons.DATA_PATH + cons.DEFAULT_RAW_TRAIN_FILE)
df = df.drop(columns=cons.COLUMNS_TO_DROP)
df = clean_data(df)

In [9]:
print("Adding engineered time related features")
df['DateTime'] = pd.to_datetime(df['DateTime'])
df['day_of_week'] = df['DateTime'].dt.dayofweek
df['hour'] = df['DateTime'].dt.hour
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)



Adding engineered time related features


In [10]:
print("Splitting data into train/val/test sets (60/20/20) while maintaining click distribution")

from sklearn.model_selection import train_test_split
# Get click distribution for stratification
y = df['is_click']

# Split into train and temp sets (60% train, 40% temp)
train_df_naive, temp_df = train_test_split(
    df,
    train_size=0.6,
    stratify=y,
    random_state=42
)

# Split temp into validation and test sets (50% each, so 20% of original data each)
val_df_naive, test_df_naive = train_test_split(
    temp_df,
    train_size=0.5, 
    stratify=temp_df['is_click'],
    random_state=42
)

print(f"Train set size: {len(train_df_naive)} ({len(train_df_naive)/len(df):.1%})")
print(f"Validation set size: {len(val_df_naive)} ({len(val_df_naive)/len(df):.1%})")
print(f"Test set size: {len(test_df_naive)} ({len(test_df_naive)/len(df):.1%})")

print("\nClick rates:")
print(f"Overall: {df['is_click'].mean():.3f}")
print(f"Train: {train_df_naive['is_click'].mean():.3f}")
print(f"Validation: {val_df_naive['is_click'].mean():.3f}") 
print(f"Test: {test_df_naive['is_click'].mean():.3f}")


Splitting data into train/val/test sets (60/20/20) while maintaining click distribution
Train set size: 209414 (60.0%)
Validation set size: 69805 (20.0%)
Test set size: 69805 (20.0%)

Click rates:
Overall: 0.068
Train: 0.068
Validation: 0.068
Test: 0.068


In [11]:

print("Calculating baseline F1 score with naive splitting")
from sklearn.metrics import f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier

# One-hot encode categorical columns
print("One-hot encoding categorical features")
from sklearn.preprocessing import OneHotEncoder

# Initialize the encoder
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Drop DateTime, user_id, and session_id columns first
columns_to_drop = ['DateTime', 'user_id', 'session_id']
train_df_processed = train_df_naive.drop(columns=columns_to_drop)
val_df_processed = val_df_naive.drop(columns=columns_to_drop) 
test_df_processed = test_df_naive.drop(columns=columns_to_drop)

# Separate features
categorical_features = [col for col in cons.CATEGORICAL if col not in columns_to_drop]
numeric_features = [col for col in train_df_processed.columns if col not in categorical_features + ['is_click']]

# Fit and transform on training data
X_train_encoded = encoder.fit_transform(train_df_processed[categorical_features])
X_val_encoded = encoder.transform(val_df_processed[categorical_features])
X_test_encoded = encoder.transform(test_df_processed[categorical_features])

# Get feature names
feature_names = encoder.get_feature_names_out(categorical_features)

# Convert to DataFrames
X_train_encoded = pd.DataFrame(X_train_encoded, columns=feature_names, index=train_df_processed.index)
X_val_encoded = pd.DataFrame(X_val_encoded, columns=feature_names, index=val_df_processed.index)
X_test_encoded = pd.DataFrame(X_test_encoded, columns=feature_names, index=test_df_processed.index)

# Add numeric columns
X_train = pd.concat([X_train_encoded, train_df_processed[numeric_features]], axis=1)
X_val = pd.concat([X_val_encoded, val_df_processed[numeric_features]], axis=1)
X_test = pd.concat([X_test_encoded, test_df_processed[numeric_features]], axis=1)

y_train = train_df_processed['is_click']
y_val = val_df_processed['is_click']
y_test = test_df_processed['is_click']

# Train Random Forest model with reasonable parameters
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10, 
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1,
    class_weight='balanced'
)

print("Training Random Forest model...")
rf_model.fit(X_train, y_train)

# Make predictions
y_train_pred = rf_model.predict(X_train)
y_val_pred = rf_model.predict(X_val)
y_test_pred = rf_model.predict(X_test)

# Print F1 scores
print("\nF1 Scores:")
baseline_f1_naive = f1_score(y_test, y_test_pred)
print(f"Test F1 (naive splitting): {baseline_f1_naive:.3f}")


Calculating baseline F1 score with naive splitting
One-hot encoding categorical features
Training Random Forest model...

F1 Scores:
Test F1 (naive splitting): 0.151


In [12]:
from sklearn.model_selection import train_test_split

print(f"Splitting data by user, maintaining click and session distribution")
# Create user-level features for stratification
user_features = df.groupby('user_id').agg({
    'session_id': 'count',  # number of sessions
    'is_click': 'sum'       # number of clicks (not rate)
}).reset_index()

# Create stratification group using actual values
user_features['strat_group'] = user_features.apply(
    lambda x: f"sessions_{int(x['session_id'])}_clicks_{int(x['is_click'])}", 
    axis=1
)

# Identify common and rare groups
group_counts = user_features['strat_group'].value_counts()
common_groups = group_counts[group_counts >= 6].index

# Split users into common and rare groups
common_users = user_features[user_features['strat_group'].isin(common_groups)]
rare_users = user_features[~user_features['strat_group'].isin(common_groups)]

# Split common users with stratification
train_users_common, temp_users_common = train_test_split(
    common_users['user_id'],
    train_size=0.6,
    stratify=common_users['strat_group'],
    random_state=42
)

val_users_common, test_users_common = train_test_split(
    temp_users_common,
    train_size=0.5,
    stratify=common_users.loc[common_users['user_id'].isin(temp_users_common), 'strat_group'],
    random_state=42
)

# Randomly assign rare users to maintain approximately 60-20-20 split
rare_users_shuffled = rare_users['user_id'].sample(frac=1, random_state=42)
n_rare = len(rare_users_shuffled)
n_train_rare = int(0.6 * n_rare)
n_val_rare = int(0.2 * n_rare)

train_users_rare = rare_users_shuffled[:n_train_rare]
val_users_rare = rare_users_shuffled[n_train_rare:n_train_rare + n_val_rare]
test_users_rare = rare_users_shuffled[n_train_rare + n_val_rare:]

# Combine common and rare users
train_users = pd.concat([train_users_common, train_users_rare])
val_users = pd.concat([val_users_common, val_users_rare])
test_users = pd.concat([test_users_common, test_users_rare])

# Create the final dataframes
df_train = df[df['user_id'].isin(train_users)].copy()
df_val = df[df['user_id'].isin(val_users)].copy()
df_test = df[df['user_id'].isin(test_users)].copy()

# Print statistics to verify the split
print("Number of users in each set:")
print(f"Train: {len(train_users)} ({len(train_users)/len(user_features):.1%})")
print(f"Validation: {len(val_users)} ({len(val_users)/len(user_features):.1%})")
print(f"Test: {len(test_users)} ({len(test_users)/len(user_features):.1%})")

print("\nNumber of sessions in each set:")
print(f"Train: {len(df_train)} ({len(df_train)/len(df):.1%})")
print(f"Validation: {len(df_val)} ({len(df_val)/len(df):.1%})")
print(f"Test: {len(df_test)} ({len(df_test)/len(df):.1%})")

# Verify click distributions are similar
print("\nClick rates in each set:")
print(f"Train: {df_train['is_click'].mean():.3f}")
print(f"Validation: {df_val['is_click'].mean():.3f}")
print(f"Test: {df_test['is_click'].mean():.3f}")

# Print distribution of sessions per user in each set
print("\nAverage sessions per user in each set:")
print(f"Train: {df_train.groupby('user_id')['session_id'].count().mean():.2f}")
print(f"Validation: {df_val.groupby('user_id')['session_id'].count().mean():.2f}")
print(f"Test: {df_test.groupby('user_id')['session_id'].count().mean():.2f}")

Splitting data by user, maintaining click and session distribution
Number of users in each set:
Train: 76507 (60.0%)
Validation: 25502 (20.0%)
Test: 25503 (20.0%)

Number of sessions in each set:
Train: 209531 (60.0%)
Validation: 69610 (19.9%)
Test: 69883 (20.0%)

Click rates in each set:
Train: 0.068
Validation: 0.068
Test: 0.068

Average sessions per user in each set:
Train: 2.74
Validation: 2.73
Test: 2.74


In [13]:
# Create feature for whether user has viewed product before
def add_product_history(df):
    # Sort by user and datetime
    df = df.sort_values(['user_id', 'DateTime'])
    
    # Initialize the new feature
    df['product_viewed_before'] = 0
    
    # For each user
    for user_id in df['user_id'].unique():
        user_sessions = df[df['user_id'] == user_id]
        
        # For each session of this user (already sorted chronologically)
        for i, (_, current_session) in enumerate(user_sessions.iterrows()):
            if i > 0:  # Skip first session
                # Get all previous sessions for this user
                previous_sessions = user_sessions.iloc[:i]
                # Check if current product was viewed in any previous session
                if current_session['product'] in previous_sessions['product'].values:
                    df.loc[current_session.name, 'product_viewed_before'] = 1
    
    return df

# Add the feature to each dataset
print("Adding product history feature to train set...")
df_train = add_product_history(df_train)

print("Adding product history feature to validation set...")
df_val = add_product_history(df_val)

print("Adding product history feature to test set...")
df_test = add_product_history(df_test)


Adding product history feature to train set...
Adding product history feature to validation set...
Adding product history feature to test set...


In [14]:
from sklearn.preprocessing import OneHotEncoder
# Initialize OneHotEncoder for categorical features
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit encoder on train set categorical columns
encoder.fit(df_train[cons.CATEGORICAL])

# Transform train set
train_cat_encoded = encoder.transform(df_train[cons.CATEGORICAL])
train_cat_cols = encoder.get_feature_names_out(cons.CATEGORICAL)
df_train_encoded = pd.concat([
    df_train.drop(columns=cons.CATEGORICAL),
    pd.DataFrame(train_cat_encoded, columns=train_cat_cols, index=df_train.index)
], axis=1)

# Transform validation set using fitted encoder
val_cat_encoded = encoder.transform(df_val[cons.CATEGORICAL]) 
df_val_encoded = pd.concat([
    df_val.drop(columns=cons.CATEGORICAL),
    pd.DataFrame(val_cat_encoded, columns=train_cat_cols, index=df_val.index)
], axis=1)

# Transform test set using fitted encoder
test_cat_encoded = encoder.transform(df_test[cons.CATEGORICAL])
df_test_encoded = pd.concat([
    df_test.drop(columns=cons.CATEGORICAL),
    pd.DataFrame(test_cat_encoded, columns=train_cat_cols, index=df_test.index)
], axis=1)

df_train = df_train_encoded
df_val = df_val_encoded
df_test = df_test_encoded

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, average_precision_score

# Calculate baseline F1 score without product history feature
baseline_features = [col for col in df_train.columns 
                    if col not in ['user_id', 'session_id', 'DateTime', 'is_click', 'product_viewed_before']]

# Initialize baseline model with reasonable parameters
baseline_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10, 
    min_samples_split=5,
    class_weight='balanced',
    random_state=42
)

# Train model
baseline_model.fit(df_train[baseline_features], df_train['is_click'])

# Generate predictions
baseline_predictions = baseline_model.predict(df_test[baseline_features])

# Calculate and print F1 score
baseline_avg_precision = average_precision_score(df_test['is_click'], baseline_predictions)
print(f"Baseline Model F1 Score: {baseline_avg_precision:.3f}")


Baseline Model F1 Score: 0.078


In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score
import xgboost as xgb

# Features to exclude from model
exclude_cols = ['user_id', 'session_id', 'DateTime']

# Get feature columns with and without product history
features_with_history = [col for col in df_train.columns if col not in exclude_cols + ['is_click']]
features_without_history = [col for col in features_with_history if col != 'product_viewed_before']

# Initialize models
model_with_history = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=1,
    reg_lambda=1,
    scale_pos_weight=df_train['is_click'].value_counts()[0]/df_train['is_click'].value_counts()[1],
    random_state=42
)

# Train and evaluate model with product history
model_with_history.fit(df_train[features_with_history], df_train['is_click'])
y_pred_with_history = model_with_history.predict(df_test[features_with_history])
avg_precision_with_history = average_precision_score(df_test['is_click'], y_pred_with_history)

# Calculate and print the normalized average precision score
# (Normalizing by the mean click rate helps account for class imbalance)
normalized_avg_precision = avg_precision_with_history / df_test['is_click'].mean()
print(f"Normalized Average Precision Score: {normalized_avg_precision:.4f}")
print("Test Set F1 Scores:")
print(f"With product_viewed_before:    {avg_precision_with_history:.4f}")
print(f"Without product_viewed_before: {baseline_avg_precision:.4f}")
print(f"Improvement:                   {((avg_precision_with_history - baseline_avg_precision) / baseline_avg_precision * 100):.1f}%")


Normalized Average Precision Score: 1.1814
Test Set F1 Scores:
With product_viewed_before:    0.0800
Without product_viewed_before: 0.0784
Improvement:                   2.1%


In [17]:
# Calculate overall CTR (mean click-through rate)
overall_ctr = df_train['is_click'].mean()

# Calculate CTR conditional on product being viewed before
ctr_viewed_before = df_train[df_train['product_viewed_before'] == 1]['is_click'].mean()
ctr_not_viewed_before = df_train[df_train['product_viewed_before'] == 0]['is_click'].mean()

print("\nClick-through Rate Analysis:")
print(f"Overall CTR: {overall_ctr:.4f}")
print(f"CTR when product viewed before: {ctr_viewed_before:.4f}")
print(f"Relative lift when previously viewed: {((overall_ctr - ctr_viewed_before) / overall_ctr * 100):.1f}%")


Click-through Rate Analysis:
Overall CTR: 0.0677
CTR when product viewed before: 0.0492
Relative lift when previously viewed: 27.4%


In [18]:
# Sort by user and datetime
df_train = df_train.sort_values(['user_id', 'DateTime'])

# Calculate time since previous session for each user
df_train['time_since_prev'] = df_train.groupby('user_id')['DateTime'].diff()

# Convert timedelta to hours
df_train['hours_since_prev'] = df_train['time_since_prev'].dt.total_seconds() / 360

# Get users with more than 2 sessions
users_multi_sessions = df_train.groupby('user_id').size()
users_multi_sessions = users_multi_sessions[users_multi_sessions > 2].index

# Filter for those users and sessions after the first one
filtered_df = df_train[
    (df_train['user_id'].isin(users_multi_sessions)) & 
    (df_train['hours_since_prev'].notna())
]

# Create binary features
filtered_df['had_recent_session'] = (filtered_df['hours_since_prev'] <= 1).astype(int)

# Calculate CTR for sessions with and without recent activity
ctr_with_recent = filtered_df[filtered_df['had_recent_session'] == 1]['is_click'].mean()
ctr_without_recent = filtered_df[filtered_df['had_recent_session'] == 0]['is_click'].mean()

print("\nClick-through Rate Analysis for Users with Multiple Sessions (2nd session onwards):")
print(f"CTR when had session within last hour: {ctr_with_recent:.4f}")
print(f"CTR when no recent session: {ctr_without_recent:.4f}")
print(f"Relative lift with recent session: {((ctr_with_recent - ctr_without_recent) / ctr_without_recent * 100):.1f}%")

# Calculate statistical significance using chi-square test
contingency = pd.crosstab(filtered_df['had_recent_session'], filtered_df['is_click'])
from scipy.stats import chi2_contingency
chi2, p_value = chi2_contingency(contingency)[:2]

print(f"\nChi-square statistic: {chi2:.2f}")
print(f"p-value: {p_value:.4e}")



Click-through Rate Analysis for Users with Multiple Sessions (2nd session onwards):
CTR when had session within last hour: 0.0489
CTR when no recent session: 0.0591
Relative lift with recent session: -17.4%

Chi-square statistic: 60.46
p-value: 7.5225e-15


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['had_recent_session'] = (filtered_df['hours_since_prev'] <= 1).astype(int)
