In [2]:
# Import necessary modules
import sys
sys.path.append('..')
import pandas as pd # type: ignore
import constants as cons
from preprocess import clean_data, add_engineered_features

# Load the raw data - using the correct path

# Clean the data using the existing clean_data function
# Use the constants file path instead of hardcoded path
df = pd.read_csv('../' + cons.DATA_PATH + cons.DEFAULT_RAW_TRAIN_FILE)
df = clean_data(df)

# Add engineered features
df = add_engineered_features(df)

# Display basic information about the preprocessed dataset
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
display(df.head())


Dataset shape: (349024, 52)

First few rows:


Unnamed: 0,user_id,age_level,user_depth,is_click,product_B,product_C,product_D,product_E,product_F,product_G,...,user_group_id_9.0,user_group_id_10.0,user_group_id_11.0,user_group_id_12.0,gender_Male,var_1_1.0,hour,day_of_week,cum_ctr,sessions_per_user
0,858557.0,4.0,3.0,0.0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,6,0.0,2
1,243253.0,2.0,2.0,0.0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,6,0.0,3
2,1097446.0,3.0,3.0,0.0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,6,0.0,13
3,243253.0,2.0,2.0,0.0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,6,0.0,3
4,469098.0,4.0,3.0,0.0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,6,0.0,1


In [3]:
# Filter users with exactly 2 sessions
users_with_2_sessions = df.groupby('user_id').size()
users_with_2_sessions = users_with_2_sessions[users_with_2_sessions == 2].index

# Get data only for users with 2 sessions
two_session_data = df[df['user_id'].isin(users_with_2_sessions)].copy()

# Sort by user_id and DateTime to get 1st and 2nd entries
two_session_data = two_session_data.sort_values(['user_id', 'hour', 'day_of_week'])

# Create first and second entry dataframes
first_entries = two_session_data.groupby('user_id').first()
second_entries = two_session_data.groupby('user_id').last()

# Calculate conditional probabilities
clicked_first = first_entries['is_click'] == 1
not_clicked_first = first_entries['is_click'] == 0

# P(click on 2nd | clicked 1st)
p_click2_given_click1 = second_entries[clicked_first]['is_click'].mean()
n_clicked_first = clicked_first.sum()

# P(click on 2nd | not clicked 1st)
p_click2_given_noclick1 = second_entries[not_clicked_first]['is_click'].mean()
n_not_clicked_first = not_clicked_first.sum()

# Calculate ratio
ratio = p_click2_given_click1 / p_click2_given_noclick1

print(f"Number of users with exactly 2 sessions: {len(users_with_2_sessions)}")
print(f"\nUsers who clicked on first entry: {n_clicked_first}")
print(f"P(click on 2nd | clicked 1st) = {p_click2_given_click1:.3f}")

print(f"\nUsers who didn't click on first entry: {n_not_clicked_first}")
print(f"P(click on 2nd | not clicked 1st) = {p_click2_given_noclick1:.3f}")

print(f"\nRatio of probabilities (click1/noclick1): {ratio:.3f}")


Number of users with exactly 2 sessions: 26292

Users who clicked on first entry: 2285
P(click on 2nd | clicked 1st) = 0.134

Users who didn't click on first entry: 24007
P(click on 2nd | not clicked 1st) = 0.074

Ratio of probabilities (click1/noclick1): 1.809


In [6]:
# Filter users with 3 or more sessions
users_with_3plus_sessions = df.groupby('user_id').size()
users_with_3plus_sessions = users_with_3plus_sessions[users_with_3plus_sessions >= 3].index

# Get data only for users with 3+ sessions
three_plus_session_data = df[df['user_id'].isin(users_with_3plus_sessions)].copy()

# Sort by user_id and DateTime to get 1st and 3rd entries
three_plus_session_data = three_plus_session_data.sort_values(['user_id', 'hour', 'day_of_week'])

# Create first and third entry dataframes
first_entries = three_plus_session_data.groupby('user_id').first()
third_entries = three_plus_session_data.groupby('user_id').nth(2)  # Index 2 gives 3rd entry

# Calculate conditional probabilities
clicked_first = first_entries['is_click'] == 1
not_clicked_first = first_entries['is_click'] == 0

# P(click on 3rd | clicked 1st)
p_click3_given_click1 = third_entries[clicked_first]['is_click'].mean()
n_clicked_first = clicked_first.sum()

# P(click on 3rd | not clicked 1st)
p_click3_given_noclick1 = third_entries[not_clicked_first]['is_click'].mean()
n_not_clicked_first = not_clicked_first.sum()

# Calculate ratio
ratio = p_click3_given_click1 / p_click3_given_noclick1

print(f"Number of users with 3 or more sessions: {len(users_with_3plus_sessions)}")
print(f"\nUsers who clicked on first entry: {n_clicked_first}")
print(f"P(click on 3rd | clicked 1st) = {p_click3_given_click1:.3f}")

print(f"\nUsers who didn't click on first entry: {n_not_clicked_first}")
print(f"P(click on 3rd | not clicked 1st) = {p_click3_given_noclick1:.3f}")

print(f"\nRatio of probabilities (click1/noclick1): {ratio:.3f}")

# Print statistical significance info
print(f"\nSample sizes:")
print(f"Users with click on 1st: {n_clicked_first:,}")
print(f"Users without click on 1st: {n_not_clicked_first:,}")
print(f"Total sample size: {len(users_with_3plus_sessions):,}")


Number of users with 3 or more sessions: 35606

Users who clicked on first entry: 2889
P(click on 3rd | clicked 1st) = 0.114

Users who didn't click on first entry: 32717
P(click on 3rd | not clicked 1st) = 0.063

Ratio of probabilities (click1/noclick1): 1.813

Sample sizes:
Users with click on 1st: 2,889
Users without click on 1st: 32,717
Total sample size: 35,606


In [8]:
# Get second entries and calculate probabilities for click patterns
second_entries = three_plus_session_data.groupby('user_id').nth(1)  # Index 1 gives 2nd entry

# Create masks for different click patterns in 1st and 2nd sessions
clicked_both = (first_entries['is_click'] == 1) & (second_entries['is_click'] == 1)
clicked_neither = (first_entries['is_click'] == 0) & (second_entries['is_click'] == 0) 
clicked_exactly_one = ((first_entries['is_click'] == 1) & (second_entries['is_click'] == 0)) | \
                     ((first_entries['is_click'] == 0) & (second_entries['is_click'] == 1))

# Calculate conditional probabilities
p_click3_given_neither = third_entries[clicked_neither]['is_click'].mean()
p_click3_given_both = third_entries[clicked_both]['is_click'].mean()
p_click3_given_one = third_entries[clicked_exactly_one]['is_click'].mean()

# Get sample sizes
n_clicked_neither = clicked_neither.sum()
n_clicked_both = clicked_both.sum()
n_clicked_one = clicked_exactly_one.sum()

print("Conditional probabilities of clicking on 3rd session:")
print(f"\nGiven no clicks in 1st and 2nd:")
print(f"P(click on 3rd | no clicks) = {p_click3_given_neither:.3f}")
print(f"Sample size: {n_clicked_neither:,} users")

print(f"\nGiven clicks in both 1st and 2nd:")
print(f"P(click on 3rd | both clicks) = {p_click3_given_both:.3f}")
print(f"Sample size: {n_clicked_both:,} users")

print(f"\nGiven exactly one click in 1st or 2nd:")
print(f"P(click on 3rd | one click) = {p_click3_given_one:.3f}")
print(f"Sample size: {n_clicked_one:,} users")

print(f"\nTotal users analyzed: {len(users_with_3plus_sessions):,}")
# Calculate conditional probabilities for specific click patterns
clicked_first_not_second = (first_entries['is_click'] == 1) & (second_entries['is_click'] == 0)
clicked_second_not_first = (first_entries['is_click'] == 0) & (second_entries['is_click'] == 1)

p_click3_given_first_not_second = third_entries[clicked_first_not_second]['is_click'].mean()
p_click3_given_second_not_first = third_entries[clicked_second_not_first]['is_click'].mean()

# Get sample sizes
n_clicked_first_not_second = clicked_first_not_second.sum()
n_clicked_second_not_first = clicked_second_not_first.sum()

print("\nAdditional conditional probabilities:")
print(f"\nGiven click on 1st but not 2nd:")
print(f"P(click on 3rd | click1, no click2) = {p_click3_given_first_not_second:.3f}")
print(f"Sample size: {n_clicked_first_not_second:,} users")

print(f"\nGiven click on 2nd but not 1st:")
print(f"P(click on 3rd | no click1, click2) = {p_click3_given_second_not_first:.3f}") 
print(f"Sample size: {n_clicked_second_not_first:,} users")


Conditional probabilities of clicking on 3rd session:

Given no clicks in 1st and 2nd:
P(click on 3rd | no clicks) = 0.061
Sample size: 30,365 users

Given clicks in both 1st and 2nd:
P(click on 3rd | both clicks) = 0.144
Sample size: 354 users

Given exactly one click in 1st or 2nd:
P(click on 3rd | one click) = 0.101
Sample size: 4,887 users

Total users analyzed: 35,606

Additional conditional probabilities:

Given click on 1st but not 2nd:
P(click on 3rd | click1, no click2) = 0.110
Sample size: 2,535 users

Given click on 2nd but not 1st:
P(click on 3rd | no click1, click2) = 0.091
Sample size: 2,352 users
