In [648]:
# Import necessary modules
import sys
sys.path.append('..')
import pandas as pd # type: ignore
import constants as cons
import numpy as np

# Load the raw data - using the correct path

# Clean the data using the existing clean_data function
# Use the constants file path instead of hardcoded path
df = pd.read_csv('../' + cons.DATA_PATH + cons.DEFAULT_RAW_TRAIN_FILE)
df = df.drop(columns=cons.COLUMNS_TO_DROP)
df = df.drop_duplicates()
df = df.dropna()

# Add engineered features

# Display basic information about the preprocessed dataset
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
display(df.head())


Dataset shape: (349024, 13)

First few rows:


Unnamed: 0,session_id,DateTime,user_id,product,campaign_id,webpage_id,product_category_1,user_group_id,gender,age_level,user_depth,var_1,is_click
0,98528.0,2017-07-04 16:42,7716.0,C,405490.0,60305.0,3.0,3.0,Male,3.0,3.0,1.0,1.0
1,589714.0,2017-07-07 07:40,1035283.0,I,118601.0,28529.0,4.0,10.0,Female,4.0,3.0,1.0,0.0
2,478652.0,2017-07-07 20:42,65994.0,H,359520.0,13787.0,4.0,4.0,Male,4.0,3.0,0.0,0.0
3,34536.0,2017-07-05 15:05,75976.0,H,405490.0,60305.0,3.0,3.0,Male,3.0,3.0,0.0,0.0
4,71863.0,2017-07-06 20:11,987498.0,C,405490.0,60305.0,3.0,2.0,Male,2.0,3.0,0.0,0.0


In [649]:
print("Adding engineered time related features")
df['DateTime'] = pd.to_datetime(df['DateTime'])
df['day_of_week'] = df['DateTime'].dt.dayofweek
df['hour'] = df['DateTime'].dt.hour
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)


Adding engineered time related features


In [650]:
print(f"Removing users with more than 10 sessions -- likely bots")
# Count sessions per user
session_counts = df.groupby('user_id')['session_id'].count()

# Get users with 10 or fewer sessions
valid_users = session_counts[session_counts <= 10].index

# Filter dataframe to only include those users
df = df[df['user_id'].isin(valid_users)]


Removing users with more than 10 sessions -- likely bots


In [651]:
from sklearn.model_selection import train_test_split

print(f"Splitting data by user, maintaining click and session distribution")
# Create user-level features for stratification
user_features = df.groupby('user_id').agg({
    'session_id': 'count',  # number of sessions
    'is_click': 'sum'       # number of clicks (not rate)
}).reset_index()

# Create stratification group using actual values
user_features['strat_group'] = user_features.apply(
    lambda x: f"sessions_{int(x['session_id'])}_clicks_{int(x['is_click'])}", 
    axis=1
)

# Identify common and rare groups
group_counts = user_features['strat_group'].value_counts()
common_groups = group_counts[group_counts >= 6].index

# Split users into common and rare groups
common_users = user_features[user_features['strat_group'].isin(common_groups)]
rare_users = user_features[~user_features['strat_group'].isin(common_groups)]

# Split common users with stratification
train_users_common, temp_users_common = train_test_split(
    common_users['user_id'],
    train_size=0.6,
    stratify=common_users['strat_group'],
    random_state=42
)

val_users_common, test_users_common = train_test_split(
    temp_users_common,
    train_size=0.5,
    stratify=common_users.loc[common_users['user_id'].isin(temp_users_common), 'strat_group'],
    random_state=42
)

# Randomly assign rare users to maintain approximately 60-20-20 split
rare_users_shuffled = rare_users['user_id'].sample(frac=1, random_state=42)
n_rare = len(rare_users_shuffled)
n_train_rare = int(0.6 * n_rare)
n_val_rare = int(0.2 * n_rare)

train_users_rare = rare_users_shuffled[:n_train_rare]
val_users_rare = rare_users_shuffled[n_train_rare:n_train_rare + n_val_rare]
test_users_rare = rare_users_shuffled[n_train_rare + n_val_rare:]

# Combine common and rare users
train_users = pd.concat([train_users_common, train_users_rare])
val_users = pd.concat([val_users_common, val_users_rare])
test_users = pd.concat([test_users_common, test_users_rare])

# Create the final dataframes
df_train = df[df['user_id'].isin(train_users)].copy()
df_val = df[df['user_id'].isin(val_users)].copy()
df_test = df[df['user_id'].isin(test_users)].copy()

# Print statistics to verify the split
print("Number of users in each set:")
print(f"Train: {len(train_users)} ({len(train_users)/len(user_features):.1%})")
print(f"Validation: {len(val_users)} ({len(val_users)/len(user_features):.1%})")
print(f"Test: {len(test_users)} ({len(test_users)/len(user_features):.1%})")

print("\nNumber of sessions in each set:")
print(f"Train: {len(df_train)} ({len(df_train)/len(df):.1%})")
print(f"Validation: {len(df_val)} ({len(df_val)/len(df):.1%})")
print(f"Test: {len(df_test)} ({len(df_test)/len(df):.1%})")

# Verify click distributions are similar
print("\nClick rates in each set:")
print(f"Train: {df_train['is_click'].mean():.3f}")
print(f"Validation: {df_val['is_click'].mean():.3f}")
print(f"Test: {df_test['is_click'].mean():.3f}")

# Print distribution of sessions per user in each set
print("\nAverage sessions per user in each set:")
print(f"Train: {df_train.groupby('user_id')['session_id'].count().mean():.2f}")
print(f"Validation: {df_val.groupby('user_id')['session_id'].count().mean():.2f}")
print(f"Test: {df_test.groupby('user_id')['session_id'].count().mean():.2f}")

Splitting data by user, maintaining click and session distribution
Number of users in each set:
Train: 73924 (60.0%)
Validation: 24642 (20.0%)
Test: 24642 (20.0%)

Number of sessions in each set:
Train: 156266 (60.0%)
Validation: 52277 (20.1%)
Test: 51932 (19.9%)

Click rates in each set:
Train: 0.076
Validation: 0.076
Test: 0.076

Average sessions per user in each set:
Train: 2.11
Validation: 2.12
Test: 2.11


In [652]:
# Create feature for whether user has viewed product before
def add_product_history(df):
    # Sort by user and datetime
    df = df.sort_values(['user_id', 'DateTime'])
    
    # Initialize the new feature
    df['product_viewed_before'] = 0
    
    # For each user
    for user_id in df['user_id'].unique():
        user_sessions = df[df['user_id'] == user_id]
        
        # For each session of this user (already sorted chronologically)
        for i, (_, current_session) in enumerate(user_sessions.iterrows()):
            if i > 0:  # Skip first session
                # Get all previous sessions for this user
                previous_sessions = user_sessions.iloc[:i]
                # Check if current product was viewed in any previous session
                if current_session['product'] in previous_sessions['product'].values:
                    df.loc[current_session.name, 'product_viewed_before'] = 1
    
    return df

# Add the feature to each dataset
print("Adding product history feature to train set...")
df_train = add_product_history(df_train)

print("Adding product history feature to validation set...")
df_val = add_product_history(df_val)

print("Adding product history feature to test set...")
df_test = add_product_history(df_test)

# Verify the feature was added correctly
print("\nProduct viewed before rates in each set:")
print(f"Train: {df_train['product_viewed_before'].mean():.3f}")
print(f"Validation: {df_val['product_viewed_before'].mean():.3f}")
print(f"Test: {df_test['product_viewed_before'].mean():.3f}")


Adding product history feature to train set...
Adding product history feature to validation set...
Adding product history feature to test set...

Product viewed before rates in each set:
Train: 0.305
Validation: 0.307
Test: 0.302


In [653]:
from sklearn.preprocessing import OneHotEncoder
# Initialize OneHotEncoder for categorical features
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit encoder on train set categorical columns
encoder.fit(df_train[cons.CATEGORICAL])

# Transform train set
train_cat_encoded = encoder.transform(df_train[cons.CATEGORICAL])
train_cat_cols = encoder.get_feature_names_out(cons.CATEGORICAL)
df_train_encoded = pd.concat([
    df_train.drop(columns=cons.CATEGORICAL),
    pd.DataFrame(train_cat_encoded, columns=train_cat_cols, index=df_train.index)
], axis=1)

# Transform validation set using fitted encoder
val_cat_encoded = encoder.transform(df_val[cons.CATEGORICAL]) 
df_val_encoded = pd.concat([
    df_val.drop(columns=cons.CATEGORICAL),
    pd.DataFrame(val_cat_encoded, columns=train_cat_cols, index=df_val.index)
], axis=1)

# Transform test set using fitted encoder
test_cat_encoded = encoder.transform(df_test[cons.CATEGORICAL])
df_test_encoded = pd.concat([
    df_test.drop(columns=cons.CATEGORICAL),
    pd.DataFrame(test_cat_encoded, columns=train_cat_cols, index=df_test.index)
], axis=1)

df_train = df_train_encoded
df_val = df_val_encoded
df_test = df_test_encoded

In [661]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Features to exclude from model
exclude_cols = ['user_id', 'session_id', 'DateTime']

# Get feature columns with and without product history
features_with_history = [col for col in df_train.columns if col not in exclude_cols + ['is_click']]
features_without_history = [col for col in features_with_history if col != 'product_viewed_before']

# Initialize models
model_with_history = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=5, 
                                          class_weight='balanced', random_state=42)
model_without_history = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=5,
                                             class_weight='balanced', random_state=42)

# Train and evaluate model with product history
model_with_history.fit(df_train[features_with_history], df_train['is_click'])
y_pred_with_history = model_with_history.predict(df_test[features_with_history])
f1_with_history = f1_score(df_test['is_click'], y_pred_with_history)

# Train and evaluate model without product history
model_without_history.fit(df_train[features_without_history], df_train['is_click'])
y_pred_without_history = model_without_history.predict(df_test[features_without_history])
f1_without_history = f1_score(df_test['is_click'], y_pred_without_history)

print("Test Set F1 Scores:")
print(f"With product_viewed_before:    {f1_with_history:.4f}")
print(f"Without product_viewed_before: {f1_without_history:.4f}")
print(f"Improvement:                   {((f1_with_history - f1_without_history) / f1_without_history * 100):.1f}%")


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Test Set F1 Scores:
With product_viewed_before:    0.1604
Without product_viewed_before: 0.1558
Ratio:                         1.0295


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [654]:
# Sort sessions chronologically for each user
df_train = df_train.sort_values(['user_id', 'DateTime'])
df_val = df_val.sort_values(['user_id', 'DateTime'])
df_test = df_test.sort_values(['user_id', 'DateTime'])



# Split train set into first and returning sessions
train_first = df_train.groupby('user_id').first().reset_index()
train_returning = df_train.merge(train_first[['user_id', 'session_id']], 
                                on='user_id', 
                                suffixes=('', '_first'))
train_returning = train_returning[train_returning['session_id'] != train_returning['session_id_first']]
train_returning = train_returning.drop('session_id_first', axis=1)

# Split validation set into first and returning sessions
val_first = df_val.groupby('user_id').first().reset_index()
val_returning = df_val.merge(val_first[['user_id', 'session_id']], 
                            on='user_id',
                            suffixes=('', '_first'))
val_returning = val_returning[val_returning['session_id'] != val_returning['session_id_first']]
val_returning = val_returning.drop('session_id_first', axis=1)

# Split test set into first and returning sessions
test_first = df_test.groupby('user_id').first().reset_index()
test_returning = df_test.merge(test_first[['user_id', 'session_id']], 
                              on='user_id',
                              suffixes=('', '_first'))
test_returning = test_returning[test_returning['session_id'] != test_returning['session_id_first']]
test_returning = test_returning.drop('session_id_first', axis=1)

# Print statistics about the splits
print("Number of sessions in first/returning splits:")
print(f"Train first: {len(train_first)} ({len(train_first)/len(df_train):.1%})")
print(f"Train returning: {len(train_returning)} ({len(train_returning)/len(df_train):.1%})")
print(f"\nVal first: {len(val_first)} ({len(val_first)/len(df_val):.1%})")
print(f"Val returning: {len(val_returning)} ({len(val_returning)/len(df_val):.1%})")
print(f"\nTest first: {len(test_first)} ({len(test_first)/len(df_test):.1%})")
print(f"Test returning: {len(test_returning)} ({len(test_returning)/len(df_test):.1%})")


Number of sessions in first/returning splits:
Train first: 73924 (47.3%)
Train returning: 82342 (52.7%)

Val first: 24642 (47.1%)
Val returning: 27635 (52.9%)

Test first: 24642 (47.5%)
Test returning: 27290 (52.5%)


In [655]:
alpha = 0.1
# Train logistic regression model on first sessions
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Prepare features for first sessions model
cols_to_drop = ['DateTime', 'user_id', 'session_id']
X_train_first = train_first.drop(cols_to_drop + ['is_click'], axis=1)
y_train_first = train_first['is_click']

X_test_first = test_first.drop(cols_to_drop + ['is_click'], axis=1)
y_test_first = test_first['is_click']

# Train model
first_sessions_model = LogisticRegression(C=0.01, class_weight='balanced', random_state=42)
first_sessions_model.fit(X_train_first, y_train_first)

# Get predictions and F1 score on test set
test_first_preds = first_sessions_model.predict(X_test_first)
print(f"F1 score on test first sessions: {f1_score(y_test_first, test_first_preds):.3f}")

# Calculate mean CTR on returning sessions
mean_returning_ctr = train_returning['is_click'].mean()
print(f"Mean CTR on returning sessions: {mean_returning_ctr:.3f}")

# Get first session features for each user in train_returning
user_first_sessions = train_first.drop(['DateTime', 'session_id', 'is_click', 'user_id'], axis=1)

# Get model predictions for first sessions
first_session_preds = first_sessions_model.predict_proba(user_first_sessions)[:, 1]
first_session_preds = pd.Series(first_session_preds, index=train_first['user_id'])

# Map predictions to returning sessions and calculate weighted feature
train_returning['first_session_weighted_pred'] = train_returning['user_id'].map(first_session_preds)
train_returning['first_session_weighted_pred'] = (1 - alpha) * train_returning['first_session_weighted_pred'] + alpha * mean_returning_ctr

# Prepare features for returning sessions model
X_train_returning = train_returning.drop(cols_to_drop + ['is_click'], axis=1)
y_train_returning = train_returning['is_click']

# Train returning sessions model
returning_sessions_model = LogisticRegression(C=0.01, class_weight='balanced', random_state=42)
from sklearn.ensemble import RandomForestClassifier
returning_sessions_model = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=10, 
                                                class_weight='balanced', random_state=42)
returning_sessions_model.fit(X_train_returning, y_train_returning)

# Get first session features for test returning users
test_user_first_sessions = test_first.drop(['DateTime', 'session_id', 'is_click', 'user_id'], axis=1)

# Get model predictions for test users' first sessions
test_first_session_preds = first_sessions_model.predict_proba(test_user_first_sessions)[:, 1]
test_first_session_preds = pd.Series(test_first_session_preds, index=test_first['user_id'])

# Map predictions to test returning sessions and calculate weighted feature
test_returning['first_session_weighted_pred'] = test_returning['user_id'].map(test_first_session_preds)
test_returning['first_session_weighted_pred'] = (1 - alpha) * test_returning['first_session_weighted_pred'] + alpha * mean_returning_ctr

# Prepare test features and get predictions
X_test_returning = test_returning.drop(cols_to_drop + ['is_click'], axis=1)
y_test_returning = test_returning['is_click']

test_returning_preds = returning_sessions_model.predict(X_test_returning)
print(f"F1 score on test returning sessions: {f1_score(y_test_returning, test_returning_preds):.3f}")
# For comparison, get predictions from first sessions model on returning sessions
X_test_returning_first = test_returning.drop(cols_to_drop + ['is_click', 'first_session_weighted_pred'], axis=1)
naive_returning_preds = first_sessions_model.predict(X_test_returning_first)
print(f"Naive F1 score (first sessions model on returning): {f1_score(y_test_returning, naive_returning_preds):.3f}")
# Combine predictions from both models for overall score
combined_preds = []
combined_true = []

# Add first session predictions
combined_preds.extend(test_first_preds)
combined_true.extend(y_test_first)

# Add returning session predictions
combined_preds.extend(test_returning_preds) 
combined_true.extend(y_test_returning)

print(f"Overall F1 score using specialized models: {f1_score(combined_true, combined_preds):.3f}")

# For naive approach - use first sessions model for all predictions
naive_combined_preds = []
naive_combined_true = []

# Add first session predictions (same as above)
naive_combined_preds.extend(test_first_preds)
naive_combined_true.extend(y_test_first)

# Add naive predictions on returning sessions
naive_combined_preds.extend(naive_returning_preds)
naive_combined_true.extend(y_test_returning)

print(f"Overall Naive F1 score using only first sessions model: {f1_score(naive_combined_true, naive_combined_preds):.3f}")




F1 score on test first sessions: 0.183
Mean CTR on returning sessions: 0.066
F1 score on test returning sessions: 0.134
Naive F1 score (first sessions model on returning): 0.127
Overall F1 score using specialized models: 0.164
Overall Naive F1 score using only first sessions model: 0.157


In [658]:
# First, get the last session for each user - this will be our "returning" set
train_returning_last = df_train.sort_values(['user_id', 'DateTime']).groupby('user_id').last().reset_index()
val_returning_last = df_val.sort_values(['user_id', 'DateTime']).groupby('user_id').last().reset_index()
test_returning_last = df_test.sort_values(['user_id', 'DateTime']).groupby('user_id').last().reset_index()

# Now get all other sessions for the "first" set
train_first = pd.concat([
    df_train[~df_train['session_id'].isin(train_returning_last['session_id'])],
    train_first  # Include the original first sessions
]).drop_duplicates()

val_first = pd.concat([
    df_val[~df_val['session_id'].isin(val_returning_last['session_id'])],
    val_first  # Include the original first sessions
]).drop_duplicates()

test_first = pd.concat([
    df_test[~df_test['session_id'].isin(test_returning_last['session_id'])],
    test_first  # Include the original first sessions
]).drop_duplicates()

# Train first model (RandomForest) on all sessions except last
cols_to_drop = ['DateTime', 'user_id', 'session_id']
X_train_first = train_first.drop(cols_to_drop + ['is_click'], axis=1)
y_train_first = train_first['is_click']

first_sessions_model = RandomForestClassifier(
    n_estimators=100, 
    max_depth=10, 
    min_samples_split=10,
    class_weight='balanced',
    random_state=42
)
first_sessions_model.fit(X_train_first, y_train_first)
alpha = 0.8
# Get predictions for all training first sessions
train_first_preds = first_sessions_model.predict(X_train_first)

# Calculate if user had ANY predicted clicks in first sessions
user_has_predicted_click = pd.DataFrame({
    'user_id': train_first['user_id'],
    'pred': train_first_preds
}).groupby('user_id')['pred'].max()  # max will be 1 if ANY session predicted click, 0 otherwise

# Add binary prediction feature to train_returning_last
train_returning_last['weighted_pred'] = train_returning_last['user_id'].map(user_has_predicted_click)
train_returning_last['weighted_pred'] = train_returning_last['weighted_pred'] * alpha + (1 - alpha) * mean_returning_ctr

# Train returning model on last sessions with the new feature
X_train_returning = train_returning_last.drop(cols_to_drop + ['is_click'], axis=1)
y_train_returning = train_returning_last['is_click']

returning_sessions_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=10,
    class_weight='balanced',
    random_state=42
)
returning_sessions_model.fit(X_train_returning, y_train_returning)

# Get predictions for all test first sessions
X_test_first = test_first.drop(cols_to_drop + ['is_click'], axis=1)
test_first_preds = first_sessions_model.predict(X_test_first)

# Calculate if user had ANY predicted clicks in test first sessions
test_user_has_predicted_click = pd.DataFrame({
    'user_id': test_first['user_id'],
    'pred': test_first_preds
}).groupby('user_id')['pred'].max()

# Add binary prediction feature to test_returning_last
test_returning_last['weighted_pred'] = test_returning_last['user_id'].map(test_user_has_predicted_click)
test_returning_last['weighted_pred'] = test_returning_last['weighted_pred'] * alpha + (1 - alpha) * mean_returning_ctr

print(test_returning_last.head())
# Get predictions and evaluate
X_test_returning = test_returning_last.drop(cols_to_drop + ['is_click'], axis=1)
y_test_returning = test_returning_last['is_click']

test_returning_preds = returning_sessions_model.predict(X_test_returning)
print(f"F1 score on test returning (last) sessions: {f1_score(y_test_returning, test_returning_preds):.3f}")

# For comparison, get predictions from first sessions model
X_test_returning_first = test_returning_last.drop(cols_to_drop + ['is_click', 'weighted_pred'], axis=1)
naive_returning_preds = first_sessions_model.predict(X_test_returning_first)
print(f"Naive F1 score (first sessions model on last sessions): {f1_score(y_test_returning, naive_returning_preds):.3f}")




   user_id  session_id            DateTime  age_level  user_depth  is_click  \
0     64.0    313949.0 2017-07-07 17:55:00        2.0         2.0       1.0   
1     76.0    241652.0 2017-07-06 20:11:00        2.0         3.0       0.0   
2     97.0    419274.0 2017-07-03 07:53:00        2.0         3.0       0.0   
3    150.0     63075.0 2017-07-05 19:08:00        3.0         3.0       0.0   
4    156.0    582378.0 2017-07-07 03:43:00        2.0         3.0       0.0   

   hour  hour_cos  hour_sin  product_viewed_before  ...  gender_Male  \
0    17 -0.258819 -0.965926                      0  ...          0.0   
1    20  0.500000 -0.866025                      1  ...          1.0   
2     7 -0.258819  0.965926                      0  ...          1.0   
3    19  0.258819 -0.965926                      1  ...          1.0   
4     3  0.707107  0.707107                      0  ...          1.0   

   var_1_0.0  var_1_1.0  day_of_week_0  day_of_week_1  day_of_week_2  \
0        0.0        