 ### Homework  Notebook

### Generating Synthetic Dataset for Use

In [35]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

np.random.seed(123)
n = 2500

df = pd.DataFrame({
    'user_id': range(1, n+1),
    'sessions_per_week': np.random.poisson(lam=5, size=n),
    'avg_session_time': np.random.normal(15, 5, size=n).clip(1, 60),  # minutes
    'pages_viewed': np.random.poisson(lam=20, size=n).clip(1, 100),
    'device': np.random.choice(['Mobile','Desktop','Tablet'], size=n, p=[0.6,0.3,0.1]),
    'country': np.random.choice(['US','India','Germany','Brazil','Canada'], size=n),
    'purchase_amount': np.random.lognormal(mean=3.2, sigma=0.7, size=n).round(2),
    'churn_flag': np.random.choice([0,1], size=n, p=[0.75,0.25])  # 1=churned, 0=active
})


### Engagement ratio
This feature measures how engaged a user is during each session. Someone who visits many pages per session is likely more curious or motivated to purchase 
than someone who opens the app frequently but only views a few pages. It captures depth of engagement rather than just frequency.

In [36]:
df['pages_per_session'] = (df['pages_viewed'] / df['sessions_per_week']).replace([np.inf, -np.inf], 0).round(2)


### Rolling mean of session time

A rolling average of session time smooths out spikes from unusually short or long sessions. This helps models detect trends in user attention span over time, which can be a better predictor of churn or spending than any single session.

In [37]:
df['rolling_session_time'] = df['avg_session_time'].rolling(3).mean().fillna(df['avg_session_time'])

### High spender flag

This binary indicator distinguishes users who consistently spend more than the median. Splitting users into high vs. low spenders helps classification models, especially for churn or upselling predictions, by highlighting a segment of users with different value to the business.

In [38]:
df['high_spender'] = (df['purchase_amount'] > df['purchase_amount'].median()).astype(int)

### Country frequency encoding
Frequency encoding converts each country into its relative share of users. This captures how common or rare a country is in the dataset, which may correlate with business priorities or market size. Unlike one-hot encoding, it avoids creating many extra columns.

In [39]:
country_freq = df['country'].value_counts(normalize=True)
df['country_freq'] = df['country'].map(country_freq)

### One-hot encoding for device
Device type (Mobile, Desktop, Tablet) can strongly influence user behavior. For example, mobile users might have shorter sessions, while desktop users may browse more before purchasing. One-hot encoding ensures the model treats devices as distinct, not ordered.

In [40]:
df = pd.get_dummies(df, columns=['device'], drop_first=True)

### Polynomial & Interaction Features

### Interaction: pages viewed × session time
Captures the intensity of user engagement. A user who spends a long time per session and views many pages is different from one who does only one of these. Interaction terms help models learn patterns that only appear when features combine.

In [41]:
df['pages_x_time'] = df['pages_viewed'] * df['avg_session_time']

### Squared: purchase_amount
Squaring amplifies the effect of extreme purchase values. For example, very high spenders may behave differently (VIP customers), and this transformation allows models to give more weight to these outliers.

In [42]:
df['purchase_amount_sq'] = df['purchase_amount'] ** 2

### Polynomial features for sessions + purchase_amount

Adds squared terms and interaction between sessions and purchases. This captures nonlinear relationships such as “moderate sessions + moderate spend” being more predictive of churn than either extreme alone. Polynomial terms give models more flexibility to learn curved trends instead of straight lines.

In [43]:
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(df[['sessions_per_week','purchase_amount']])

poly_df = pd.DataFrame(
    poly_features,
    columns=poly.get_feature_names_out(['sessions_per_week','purchase_amount'])
)

In [None]:
# Concatenate back
df = pd.concat([df, poly_df], axis=1)
df.head()

Unnamed: 0,user_id,sessions_per_week,avg_session_time,pages_viewed,country,purchase_amount,churn_flag,pages_per_session,rolling_session_time,high_spender,country_freq,device_Mobile,device_Tablet,pages_x_time,purchase_amount_sq,sessions_per_week.1,purchase_amount.1,sessions_per_week^2,sessions_per_week purchase_amount,purchase_amount^2
0,1,7,16.951129,24,Brazil,46.03,0,3.43,16.951129,1,0.192,True,False,406.827105,2118.7609,7.0,46.03,49.0,322.21,2118.7609
1,2,5,19.709662,18,Canada,15.61,1,3.6,19.709662,0,0.208,True,False,354.773923,243.6721,5.0,15.61,25.0,78.05,243.6721
2,3,4,22.544426,32,US,28.88,0,8.0,19.735073,1,0.2068,False,True,721.421638,834.0544,4.0,28.88,16.0,115.52,834.0544
3,4,8,5.922137,26,India,35.74,0,3.25,16.058742,1,0.1932,False,False,153.975561,1277.3476,8.0,35.74,64.0,285.92,1277.3476
4,5,4,13.572359,18,US,35.79,1,4.5,14.012974,1,0.2068,False,False,244.302467,1280.9241,4.0,35.79,16.0,143.16,1280.9241
