In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter

import seaborn as sns
import plotly.express as px

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

from scipy.stats import pointbiserialr
from xgboost import XGBClassifier


# 1. Understand the Problem and Data

In [None]:
# Load the raw conversion dataset; the path is relative to the notebook's working directory.
DATA_PATH = 'conversion_data.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows to check that the columns parsed as expected.
df.head(n=5)

In [None]:
# Column dtypes, non-null counts and memory footprint of the loaded frame.
df.info()

In [None]:
# Summary statistics for every column; include='all' adds the non-numeric columns as well.
df.describe(include='all')

In [None]:
# List the available column names.
df.columns

In [None]:
# Per-column overview: column name, number of distinct values, and the first
# five distinct values in sorted order.
for col in df.columns:
    # Drop NaN before sorting: sorted() raises TypeError when NaN (a float) is
    # mixed with strings, and NaN does not order consistently among numbers.
    uniques = sorted(df[col].dropna().unique())
    # Count with dropna=False so a missing value still counts as one distinct
    # value, exactly as len(df[col].unique()) would.
    n_unique = df[col].nunique(dropna=False)
    print(f"{col:<20}{n_unique:<5}{uniques[:5]}")

# 2. Data Preparation

## i. Handle Outliers

### a. Domain-Specific Rules

In [None]:
# The age variable has a maximum of 123, which is unrealistic for a real user.
# Option (not applied): cap implausible ages at a fixed ceiling of 100 instead of dropping the rows.
# A percentile-based cap (e.g. the 99th percentile) would be another alternative.
# df['age'] = np.where(df['age'] > 100, 100, df['age'])

In [None]:
# Show the rows whose age exceeds the plausibility threshold of 100.
df.loc[df['age'] > 100]

In [None]:
# Drop the records with an implausible age (> 100). Only a couple of rows are
# affected, so they are removed outright rather than capped.
# `.copy()` makes the result an independent frame, so the column assignments in
# later cells (df['...'] = ...) do not raise SettingWithCopyWarning.
df = df[~(df['age'] > 100)].copy()

### b. Statistical Methods

#### (1) Z-Score Method
Measures how far a data point is from the mean in terms of standard deviations.
- A common threshold is |z| > 3.
- Use Case: Normally distributed data.
- Limitations: Sensitive to non-normal distributions.

In [None]:
# # Calculate Z-scores
# data['z_score'] = (data['value'] - data['value'].mean()) / data['value'].std()

# # Identify outliers
# outliers = data[np.abs(data['z_score']) > 3]
# print(outliers)


#### (2) Interquartile Range (IQR) Method
Defines outliers as data points outside the range [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
- Use Case: Skewed or non-normal distributions.
- Strength: Less sensitive to outliers than Z-score.

In [None]:
# # Calculate IQR
# Q1 = data['value'].quantile(0.25)
# Q3 = data['value'].quantile(0.75)
# IQR = Q3 - Q1

# # Define bounds
# lower_bound = Q1 - 1.5 * IQR
# upper_bound = Q3 + 1.5 * IQR

# # Identify outliers
# outliers = data[(data['value'] < lower_bound) | (data['value'] > upper_bound)]
# print(outliers)


### c. Visualization Methods

#### (1) Boxplot
Highlights outliers beyond the whiskers.

#### (2) Scatter Plot
Visualize relationships between two variables and detect outliers.

#### (3) Distribution Plot
Check for unusual peaks or long tails.

### d. Machine Learning-Based Methods

#### (1) Isolation Forest
Anomaly detection algorithm that isolates outliers in the data.

- Use Case: Multivariate or high-dimensional data.
- Strength: Works well with large datasets.

In [None]:
# from sklearn.ensemble import IsolationForest

# # Train Isolation Forest
# iso = IsolationForest(contamination=0.05, random_state=42)
# data['anomaly'] = iso.fit_predict(data[['value']])

# # Anomalies are labeled as -1
# outliers = data[data['anomaly'] == -1]
# print(outliers)


## ii. Missing or Incorrect Values

In [None]:
# Number of missing values per column.
df.isna().sum()

# 3. Exploratory Data Analysis (EDA)

## i. Univariate Analysis
Analyze each feature independently to understand its distribution.

### a. Numerical Features

- Histogram
- Boxplot

#### Age

##### 1. Summary Statistics

In [None]:
# Summary statistics of age, computed separately for converters and non-converters.
age_by_conversion = df.groupby('converted')['age']
age_by_conversion.describe().reset_index()

##### 2. Visualize the Distributions
What to Look For:
- Are there clusters or gaps in age (e.g., young users dominating the data)?
- Are there outliers (e.g., extremely high ages)?

In [None]:
# Side-by-side view of the age distribution for converters vs. non-converters.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
density_ax, box_ax = ax

# Left panel: one KDE per class; common_norm=False normalises each class on its own.
sns.kdeplot(
    data=df,
    x="age",
    hue="converted",
    fill=True,
    common_norm=False,
    ax=density_ax,
)
density_ax.set(title="Age Distribution by Conversion Status", xlabel="Age", ylabel="Density")

# Right panel: one box per conversion status.
sns.boxplot(data=df, x="converted", y="age", ax=box_ax)
box_ax.set(title="Boxplot of Age by Conversion Status", xlabel="Conversion Status", ylabel="Age")

fig.tight_layout()
plt.show()


#### Pages Visited

In [None]:
# Side-by-side view of the pages-visited distribution for converters vs. non-converters.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
density_ax, box_ax = ax

# Left panel: one KDE per class; common_norm=False normalises each class on its own.
sns.kdeplot(
    data=df,
    x="total_pages_visited",
    hue="converted",
    fill=True,
    common_norm=False,
    ax=density_ax,
)
density_ax.set(
    title="Pages Visited Distribution by Conversion Status",
    xlabel="Pages Visited",
    ylabel="Density",
)

# Right panel: one box per conversion status.
sns.boxplot(data=df, x="converted", y="total_pages_visited", ax=box_ax)
box_ax.set(
    title="Boxplot of Pages Visited by Conversion Status",
    xlabel="Conversion Status",
    ylabel="Pages Visited",
)

fig.tight_layout()
plt.show()


### b. Categorical Features
- Frequency counts
- Bar plot

#### Country

In [None]:
# Country: raw counts split by conversion status (left) and mean conversion rate (right).
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
count_ax, rate_ax = ax

sns.countplot(data=df, x='country', hue='converted', ax=count_ax)
count_ax.set_title("Count Plot of Country ")

# barplot's bar height is the mean of `converted`, i.e. the conversion rate.
sns.barplot(data=df, x='country', y='converted', ax=rate_ax)
rate_ax.set(title='Conversion Rate per Country', ylabel="Conversion Rate")

fig.tight_layout()
plt.show()

#### New User

In [None]:
# New vs. returning users: raw counts split by conversion status (left) and
# mean conversion rate (right).
fig, ax = plt.subplots(1,2,figsize=(10,5))

sns.countplot(data=df, x='new_user', hue='converted', ax=ax[0])
ax[0].set_title("Count Plot of User Types")
# barplot's bar height is the mean of `converted`, i.e. the conversion rate.
sns.barplot(data=df, x='new_user', y='converted', ax=ax[1])
ax[1].set_title('Conversion Rate per User Type')
# Label the axis with what is plotted instead of the raw column name,
# matching the Country figure.
ax[1].set_ylabel("Conversion Rate")
plt.tight_layout()
plt.show()

#### Source

In [None]:
# Traffic source: raw counts split by conversion status (left) and mean
# conversion rate (right).
fig, ax = plt.subplots(1,2,figsize=(10,5))

sns.countplot(data=df, x='source', hue='converted', ax=ax[0])
ax[0].set_title("Count Plot of Sources")
# barplot's bar height is the mean of `converted`, i.e. the conversion rate.
sns.barplot(data=df, x='source', y='converted', ax=ax[1])
ax[1].set_title('Conversion Rate per Source')
# Label the axis with what is plotted instead of the raw column name,
# matching the Country figure.
ax[1].set_ylabel("Conversion Rate")
plt.tight_layout()
plt.show()

## ii. Multivariate Analysis
Explore relationships between two or more features.

#### a. Correlation Analysis (Numerical Features)
- Compute pairwise correlations.
- Scatter Plots
- Pair Plot: Visualize pairwise relationships among all numerical features.
- Point-Biserial Correlation: Since converted is binary, the Point-Biserial Correlation is more suitable for quantifying the relationship.
    - If p < 0.05, the correlation is statistically significant.

In [None]:
# Re-list the column names for reference before selecting features to correlate.
df.columns

Correlation Between Age, Pages, and Conversion
What to Look For:
- If age and total_pages_visited are highly correlated, normalization may be redundant.
- If both are weakly correlated with converted individually but together show stronger relationships, normalization or interaction terms may help.

In [None]:
# Pairwise correlations between the two numeric features and the binary target.
numeric_cols = ['age', 'total_pages_visited', 'converted']
correlation_matrix = df[numeric_cols].corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    cbar=True,
)
heatmap_ax.set_title('Correlation Matrix for Numerical Features')
plt.show()


In [None]:
# Lower-triangle pair plot of the frame's numeric columns, with KDEs on the diagonal.
pairplot_options = dict(diag_kind="kde", corner=True, markers="o")
sns.pairplot(df, **pairplot_options)
plt.suptitle("Pair Plot of Numerical Features", y=1.02)
plt.show()

In [None]:
# Point-biserial correlation between the binary target and age, with its p-value.
correlation, p_value = pointbiserialr(x=df['converted'], y=df['age'])
report_line = f"Point-Biserial Correlation: {correlation:.2f}, p-value: {p_value:.4f}"
print(report_line)

#### b. Categorical vs. Numerical Features
Compare distributions of numerical features across categories.

- Boxplot
- Violin Plot

In [None]:
# 3x2 grid of boxplots: one row per categorical feature, one column per numeric feature.
fig, axes = plt.subplots(3, 2, figsize=(15, 16))

# One grid row per categorical column: (column, title suffix, x-axis label).
category_rows = [
    ('country', "Country", "Country"),
    ('source', "Source", "Source"),
    ('new_user', "New User Status", "New User"),
]
# One grid column per numeric column: (column, title prefix, y-axis label).
numeric_columns = [
    ('age', "Age Distribution", "Age"),
    ('total_pages_visited', "Total Pages Visited", "Total Pages Visited"),
]

for row_idx, (cat_col, cat_title, x_label) in enumerate(category_rows):
    for col_idx, (num_col, num_title, y_label) in enumerate(numeric_columns):
        panel = axes[row_idx, col_idx]
        sns.boxplot(x=cat_col, y=num_col, data=df, ax=panel)
        panel.set_title(f"{num_title} by {cat_title}")
        panel.set_xlabel(x_label)
        panel.set_ylabel(y_label)

# Adjust layout
plt.tight_layout()
plt.show()


- Age Distribution by Country
    - The median age is similar across all countries (around 30).
    - The US has a slightly narrower interquartile range, meaning the age distribution is more consistent compared to the other countries.
    - Outliers (older users) are present in all countries but appear more frequently in the US.
- Total Pages Visited by Country
    - All countries have a similar median for pages visited, with most users visiting around 4–5 pages.
    - Outliers exist in all countries where users have visited significantly more pages (over 20).
    - The US shows slightly more variability in total pages visited compared to the other countries.
- Age Distribution by Source
    - Users from all sources have a similar median age of around 30.
    - Users from Ads show slightly more variability in age, with a broader interquartile range.
    - Outliers are present in all sources but are more prominent for users from Ads.
- Total Pages Visited by Source
    - Users from all sources have a similar median for pages visited, with most users visiting around 4–5 pages.
    - Users from SEO show slightly more variability, with more outliers (users visiting over 20 pages).
- Age Distribution by New User Status
    - The median age for both new and returning users is around 30.
    - The distribution is nearly identical for both groups, with no clear distinction.
- Total Pages Visited by New User Status
    - Both groups have a similar distribution for pages visited, with medians around 4–5 pages.
    - Outliers (users visiting many more pages) exist in both groups, but there is no significant difference in variability.

##### Pages Visited vs. Conversion Rate

In [None]:
# 2x2 grid: conversion rate as a function of pages visited, overall and split
# by new-user status, country and source.
fig, ax = plt.subplots(2, 2, figsize=(12, 12))

# Overall conversion rate per page count. The aggregated frame is what gets
# plotted (previously it was computed but the raw frame was passed instead),
# consistent with the "Conversion Rate vs. Age" figure.
data = df.groupby('total_pages_visited')['converted'].mean().reset_index()
sns.lineplot(data=data, x='total_pages_visited', y='converted', marker='o', linestyle='-', color='#F7B32B', ax=ax[0,0])
ax[0,0].set_title("Conversion Rate vs. Pages Visited")
ax[0,0].set_xlabel("Pages Visited")
ax[0,0].set_ylabel("Conversion Rate")
ax[0,0].grid(True)

# Remaining panels: (hue column, grid position, title, legend title).
# These are drawn from the raw frame; lineplot averages `converted` per x value and hue level.
split_panels = [
    ('new_user', (1, 0), 'Conversion Rate vs. Pages Visited by New User Status', 'New User (1 = New, 0 = Existing)'),
    ('country', (0, 1), 'Conversion Rate vs. Pages Visited by Country', 'Country'),
    ('source', (1, 1), 'Conversion Rate vs. Pages Visited by Source', 'Source'),
]
for hue_col, position, panel_title, legend_title in split_panels:
    panel = ax[position]
    sns.lineplot(data=df, x='total_pages_visited', y='converted', hue=hue_col, marker='o', ax=panel)
    panel.set_title(panel_title)
    panel.set_xlabel('Pages Visited')
    panel.set_ylabel('Conversion Rate')
    panel.legend(title=legend_title)
    panel.grid(True)

plt.tight_layout()
plt.show()

##### Age vs. Conversion Rate

In [None]:
# Mean conversion rate for each distinct age, drawn as a line.
data = df.groupby('age', as_index=False)['converted'].mean()

plt.figure(figsize=(6, 5))
rate_ax = sns.lineplot(
    data=data,
    x='age',
    y='converted',
    marker='o',
    linestyle='-',
    color='#F7B32B',
)
rate_ax.set(title="Conversion Rate vs. Age", xlabel="Age", ylabel="Conversion Rate")
rate_ax.grid(True)
plt.show()

#### c. Categorical vs. Categorical Features
- Use crosstabs or stacked bar charts.

## iii. Target Variable Analysis

- Analyze the target variable in the context of:
    - Distribution: For numerical targets, plot histograms or boxplots.
    - Class Balance: For classification tasks, check class distribution.

In [None]:
# Class balance of the target: number of rows per value of `converted`.
plt.figure(figsize=(6, 5))
target_ax = sns.countplot(data=df, x='converted')
target_ax.set(
    title="Distribution of Conversion Status",
    xlabel="Converted (0 = Not Converted, 1 = Converted)",
    ylabel="Count",
)
plt.show()


## iv. Missing Data Analysis

- Patterns of Missingness:
    - Check if missing values are random or follow a pattern.
- Imputation Strategies:
    - Fill missing values with mean/median/mode, forward fill, or interpolation.
    - For categorical features, use the most frequent category.


## v. Time-Based Analysis

- Trend Analysis:
    - Plot trends over time for features like sales, clicks, etc.
- Seasonality: Look for patterns across months, days, or hours.

# 4. Feature Engineering

## i. Binning

- Fixed-Width Bins: You know meaningful ranges (e.g., decades or demographics).
- Equal-Width Bins: Data ranges widely and you need uniform bins.
- Quantile-Based Bins: Data is skewed, and you want evenly sized groups.
- Statistical Measures: Binning by quartiles or percentiles for comparative analysis.
- Clustering-Based Bins: Grouping based on natural clusters in the data.
- Custom Conditions: Full control, such as age-based policies or rules.
- Demographic Ranges: Predefined groups for demographic analysis.

### a. Bin Age into Groups

#### 1. Clustering-Based Bins (K-Means)

In [None]:
# K-Means expects a 2-D array of shape (n_samples, n_features), so reshape the age column.
ages = df['age'].values.reshape(-1, 1)

# Fit K-Means with 4 clusters. n_init is set explicitly because its default
# differs between scikit-learn releases; pinning it (together with
# random_state) keeps the clustering reproducible across versions.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42).fit(ages)

# Assign cluster labels (the label numbers are arbitrary, not ordered by age).
df['age_group_cluster_based'] = kmeans.labels_

# Age range, size and conversion rate per cluster. The cluster label is kept
# as the index so each row can be matched to `age_group_cluster_based`, and
# rows are ordered by the youngest age in the cluster.
(
    df.groupby('age_group_cluster_based')[['age', 'converted']]
    .agg({'age': ['min', 'max', 'count'], 'converted': ['mean']})
    .sort_values(('age', 'min'))
)

#### 2. Custom Conditions

In [None]:
# Hand-picked age brackets; both ends of each bracket are inclusive.
age = df["age"]
conditions = [
    age.le(24),
    age.between(25, 32),
    age.between(33, 41),
    age.ge(42),
]

choices = ['0-24', '25-32', '33-41', '42+']

# Rows matching none of the brackets are labelled 'Unknown'.
df['custom_age_group'] = np.select(conditions, choices, default='Unknown')


#### 3. Equal-width bins

In [None]:
# Create equal-width bins covering the full observed age range.
# Two issues with pd.interval_range(start=min, end=max, freq=20) are avoided here:
#   * its intervals are right-closed, so the minimum age itself fell outside the first bin;
#   * it drops the trailing partial interval when (max - min) is not a multiple
#     of the width, so the oldest users fell outside every bin.
# In both cases pd.cut assigned NaN to those rows.
AGE_BIN_WIDTH = 20  # years
age_min, age_max = df['age'].min(), df['age'].max()
# Enough whole bins that the last edge lies strictly above the maximum age.
n_bins = int((age_max - age_min) // AGE_BIN_WIDTH) + 1
bins = age_min + AGE_BIN_WIDTH * np.arange(n_bins + 1)
# right=False gives left-closed bins [lo, hi), so the minimum age lands in the first bin.
df['age_group_equal_width'] = pd.cut(df['age'], bins=bins, right=False)

# Age range, size and conversion rate per bin; observed=False keeps empty bins visible.
df.groupby('age_group_equal_width', observed=False)[['age', 'converted']].agg({'age':['min','max','count'], 'converted':['mean']}).reset_index()

#### 4. Quantile-based bins

In [None]:
# Split age into four quantile-based groups and label them Q1..Q4.
quartile_labels = ['Q1', 'Q2', 'Q3', 'Q4']
df['age_group_quantile_based'] = pd.qcut(df['age'], q=len(quartile_labels), labels=quartile_labels)

# Age range, size and conversion rate per quantile group.
summary_spec = {'age': ['min', 'max', 'count'], 'converted': ['mean']}
df.groupby('age_group_quantile_based')[['age', 'converted']].agg(summary_spec).reset_index()


### b. Analyze Across Ages

#### 1. Age Groups vs. Conversion Rates
Check the Relationship Between Age and Conversion Rate

What to Look For:
- Do conversion rates increase, decrease, or remain stable across age groups?
- If conversion rates vary significantly, age likely plays a role in engagement and normalization may help.

In [None]:
# Mean conversion rate within each custom age bracket.
conversion_by_age = df.groupby('custom_age_group', as_index=False)['converted'].mean()

plt.figure(figsize=(6, 5))
bar_ax = sns.barplot(
    data=conversion_by_age,
    x='custom_age_group',
    y='converted',
    palette='Blues_d',
)
bar_ax.set(title='Conversion Rate by Age Group', xlabel='Age Group', ylabel='Conversion Rate')
# plt.ylim(0, 1)
plt.show()


In [None]:
# data = (
#     df.groupby("custom_age_group")["converted"]
#     .agg(["mean", "count"])
#     .reset_index()
#     .assign(mean=lambda x: x["mean"] * 100)
# )
# fig, ax1 = plt.subplots(figsize=(15, 5))
# sns.barplot(data=data, x="custom_age_group", y="count", ax=ax1, alpha=0.8)
# ax1.set_ylabel("Count", fontsize=12)
# ax1.set_title("Conversion Counts and Rates by Age Group", fontsize=12, pad=10)

# ax2 = ax1.twinx()
# sns.lineplot(
#     data=data,
#     x="custom_age_group",
#     y="mean",
#     marker="o",
#     linestyle="-",
#     linewidth=4,
#     ax=ax2,
#     color="#F7ACCF",
# )
# ax2.set_ylabel("Conversion Rate", fontsize=12)
# ax2.yaxis.set_major_formatter(PercentFormatter())

# plt.tight_layout()
# plt.show()


#### 2. Age Groups vs. Pages Visited

What to Look For:
- Do younger or older users visit significantly more pages on average?
- Is there high variability in total_pages_visited within certain age groups?

In [None]:
# Spread of pages visited within each custom age bracket.
plt.figure(figsize=(6, 5))
box_ax = sns.boxplot(data=df, x='custom_age_group', y='total_pages_visited')
box_ax.set(
    title='Total Pages Visited Across Age Groups',
    xlabel='Age Group',
    ylabel='Total Pages Visited',
)
plt.show()

#### 3. Age Groups vs. Pages Visited vs. Conversion Rates


In [None]:
# Conversion rate for every (age bracket, page count) combination.
group_keys = ['custom_age_group', 'total_pages_visited']
data = df.groupby(group_keys)['converted'].mean().reset_index()

# Single line chart with one line per age bracket.
plt.figure(figsize=(6, 5))
line_ax = sns.lineplot(data=data, x='total_pages_visited', y='converted', hue='custom_age_group', marker='o')
line_ax.set_title("Conversion Patterns for Age Groups by Total Pages Visited")
line_ax.set_xlabel("Total Pages Visited")
line_ax.set_ylabel("Conversion Rate")
line_ax.legend(title="Age Group")
line_ax.grid(True)
plt.show()

Based on EDA, decide whether normalization is necessary:

- Normalize If:

    - total_pages_visited varies significantly across age groups.
    - Conversion patterns differ for younger vs. older users with similar page visits.
    - total_pages_visited and age interact in a way that directly influences conversion.
- Do Not Normalize If:

    - total_pages_visited has a strong independent relationship with converted regardless of age.
    - age and total_pages_visited are weakly correlated or show no significant interaction.

Conclusion

- Conversion patterns differ for younger vs. older users with similar page visits:
    - Younger age groups (0–24, 25–32) achieve higher conversion rates earlier (around 10–15 pages visited).
    - Older age groups (33–41, 42+) require slightly more page visits (closer to 15–20) to reach similar conversion rates.
- Interaction between age and total_pages_visited:
    - There is a clear interaction between age and page visits, as the conversion rate's behavior varies by age group.
    - This suggests that total pages visited alone may not be sufficient without considering the age factor.

You should normalize or adjust for total_pages_visited across different age groups because the interaction between age and page visits impacts conversion behavior. This normalization will ensure fair comparisons and improve model accuracy when analyzing conversion patterns.

## ii. Interaction Features

Creating interaction features involves combining two or more features to capture relationships that may improve your model's predictive performance. These interactions can help capture non-linear relationships or dependencies between variables that single features alone cannot explain.

Types of Interaction Features
1. Numerical-Numerical Interactions
    - Combine numerical features by performing arithmetic operations like:

        - Multiplication
        - Division
        - Addition or subtraction
2. Numerical-Categorical Interactions
    - Combine numerical features with categorical features, such as grouping numerical features by categories.

3. Categorical-Categorical Interactions
    - Create interaction features by combining multiple categorical variables (e.g., cross-features).

### a. Methods to Create Interaction Features

##### 1. Numerical-Numerical Example
Use arithmetic operations between numerical columns.

In [None]:
# Interaction term: equals the page count for new users (new_user == 1) and 0 otherwise.
df['new_user_pages_interaction'] = df['new_user'].mul(df['total_pages_visited'])

**How `new_user_pages_interaction` Impacts the Model**
- Without the Interaction Feature:
    - The model might treat new_user and total_pages_visited independently. For example:
        - It might learn that being a new user has a weak positive effect on conversion.
        - It might also learn that visiting more pages has a strong positive effect on conversion.
- With the Interaction Feature:
    - The model can now understand the conditional relationship:
        - New users who visit many pages are more likely to convert.
        - Existing users who visit many pages might have a different conversion pattern, which is not captured by this interaction.

**When to Use This Interaction**
- Use `new_user * total_pages_visited` if:
    - You believe new users' behavior (page visits) affects conversion differently than existing users.
        - For example, new users might be more curious or engaged if they visit more pages, leading to higher conversion rates.
    - You observe this pattern during Exploratory Data Analysis (EDA):
        - Plot conversion rates against total_pages_visited for new users and existing users separately. If the patterns differ, this interaction is likely meaningful.


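The `age_pages_interaction` feature (`age * total_pages_visited`, created in a code cell further below) lets a model capture a joint effect of the two features that neither feature expresses on its own: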

- For example:
    - A 25-year-old who visits 10 pages (age * total_pages_visited = 250) may behave differently than a 40-year-old who visits 10 pages (age * total_pages_visited = 400).
    - The interaction may reveal patterns like "older users who visit more pages are more likely to convert."
- Hypothesis:
    - Some age groups may require fewer or more pages to convert compared to others.
    - Users who are older and visit many pages might show different behavior compared to younger users with the same level of engagement.

**Scaling Before Multiplication**
- If age and total_pages_visited have very different scales (e.g., age ranges from 17-100, but total_pages_visited ranges from 1-29), scale the features before creating the interaction to avoid large values dominating.

**Alternative Interactions**
- Categorical Interaction: Group total_pages_visited into bins (e.g., Low, Medium, High) and create a combined feature with new_user:

In [None]:
# Sketch only (not executed); `data` stands for the working DataFrame.
# NOTE(review): the last bin edge is 20, so page counts above 20 would fall outside
# every bin and become NaN - extend the upper edge (e.g. to np.inf) before enabling this.
# bins = [0, 5, 10, 20]
# labels = ['Low', 'Medium', 'High']
# data['pages_group'] = pd.cut(data['total_pages_visited'], bins=bins, labels=labels)
# data['new_user_pages_group'] = data['new_user'].astype(str) + '_' + data['pages_group'].astype(str)

In [None]:
# Interaction term: element-wise product of age and pages visited.
df['age_pages_interaction'] = df['age'].mul(df['total_pages_visited'])

**Why Multiply Instead of Adding or Dividing?**
- Multiplication (age * total_pages_visited):
    - Amplifies the combined effect of age and total_pages_visited.
    - Captures non-linear patterns where the two features interact strongly.
- Addition (age + total_pages_visited):
    - Simpler but less powerful for non-linear relationships.
- Division (total_pages_visited / age):
    - Useful if you think normalized engagement by age matters (e.g., younger users with high pages relative to their age).
        - Younger users may be more engaged on average than older users, and their behavior (e.g., pages visited) needs to be compared relative to their typical age group behavior.
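        - Example: A 20-year-old visiting 5 pages might show low engagement, but a 60-year-old visiting 5 pages could indicate high engagement. Note that the plain ratio `total_pages_visited / age` does not express this: it gives 5/20 = 0.25 for the younger user and 5/60 ≈ 0.08 for the older one, so capturing this idea requires comparing each user with the typical page count of their own age group.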
    - When Age Moderates Engagement
        - If the relationship between engagement (e.g., total pages visited) and conversion depends on age, normalized metrics can help capture this interaction.
        - For example, younger users may need to visit fewer pages to convert, while older users may need more.
    - Adjusting for Different User Expectations
        - Certain age groups may inherently exhibit different browsing patterns:
            - Younger users might visit fewer pages due to shorter attention spans.
            - Older users might browse more pages to make informed decisions.
        - Normalizing helps adjust for these natural tendencies to better understand conversion likelihood.

Avoid using the interaction feature if:

- The relationship between age and total_pages_visited is weak or independent.
- Your model is tree-based (e.g., Random Forest, XGBoost), as these algorithms capture interactions naturally.

In [None]:
# Pages visited relative to age. This encodes the assumption that engagement
# should be proportional to age.
age_plus_one = df['age'].add(1)  # +1 keeps the denominator non-zero
df['pages_per_age'] = df['total_pages_visited'].div(age_plus_one)

#### 2. Numerical-Categorical Example
Multiply or group numerical features by categorical variables.

It involves calculating aggregate statistics (e.g., mean, median, sum) for a numerical feature within groups defined by a categorical feature.

- Example:
    - Numerical Feature: total_pages_visited
    - Categorical Feature: country (e.g., US, UK, Germany)
- Result:
    - Calculate the mean pages visited for users in each country.


Why Group Numerical Features by Categorical Variables?
- (a) Reveal Group-Specific Patterns: Grouping helps identify how numerical behavior varies across categories.
    - Example:
        - Do users from different countries (country) visit more or fewer pages (total_pages_visited) on average?
    - Insights:
        - Users from "US" might visit more pages than users from "Germany."
        - This could indicate differences in engagement across countries.
- (b) Create Features for Modeling: Aggregated statistics (e.g., mean, max, sum) can be used as new features in your machine learning model. They help capture group-level effects.
- (c) Highlight Outliers: Grouping can help detect outliers within specific categories.
    - Example:
        - If the mean number of pages visited in the "UK" is 3, and a user from the "UK" visits 20 pages, this might be an anomaly worth investigating.

When Should You Group Numerical Features by Categorical Variables?
- Use Cases:
    - Category-Dependent Behavior:
        - When numerical variables are influenced by categorical groupings.
        - Example: Engagement (pages visited) varies across countries or marketing sources.
    - Outlier Detection:
        - Find users whose behavior deviates significantly from their group.

In [None]:
# Each user's page count relative to the average page count of their country.
pages_by_country = df.groupby('country')['total_pages_visited']
# transform('mean') broadcasts each country's mean back onto that country's rows.
mean_pages_by_country = pages_by_country.transform('mean')
df['pages_vs_country_mean'] = df['total_pages_visited'].div(mean_pages_by_country)

#### 3. Categorical-Categorical Example
Combine two or more categorical features using concatenation.

Why Combine Categorical Features?
- (a) Capture Interactions Between Categories
    - Some categorical variables may have interacting effects on the target variable. Analyzing them in isolation may miss this relationship.

        - Example:
            - country and source independently affect conversion, but the combination of country and source might reveal stronger patterns.
            - Users from the "US" converting through "SEO" might behave differently than users from "UK" through "Direct."
- (b) Create More Granular Groups
    - Concatenation creates finer-grained categories, which may better represent the data.
    - Example:
        - If country has 3 categories and source has 3 categories, combining them creates 9 unique groups.
        - This allows the model to differentiate between granular segments like US_SEO vs. US_Ads.
- (c) Improve Predictive Power
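    - Linear models (e.g., Logistic Regression) often perform better when given explicit interaction terms, because they cannot learn interactions on their own. Tree-based models (e.g., Random Forest, XGBoost) can learn interactions through successive splits, so an explicit combined feature usually helps them less, although it can still make an important segment easier to isolate.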
- (d) Uncover Hidden Insights
    - EDA using combined categorical features can uncover hidden relationships that are not apparent when analyzing each feature independently.
    - Example:
        - Users from "Germany" might perform poorly overall, but users from "Germany" who come through "SEO" might convert well. This insight is only visible when combining country and source.

When Should You Combine Categorical Features?

- Suspected Interactions:
    - When you believe that two or more categories interact and jointly affect the target.
        - Example: marketing_channel and device_type (e.g., "SEO + Mobile" vs. "SEO + Desktop").
- Low Cardinality Features:
    - If your original categorical features have low cardinality (few unique values), combining them won't lead to an explosion in the number of categories.
    - Example:
        - country (3 categories) × source (3 categories) = 9 combined categories.
- Improving Model Performance:
    - For linear models or when your current features aren't providing enough predictive power.
- Segmentation Analysis:
    - Helps you analyze specific groups in detail.
    - Example: "US_SEO" may have a conversion rate of 10%, while "Germany_Direct" is 2%.

When Not to Combine Categorical Features
- High Cardinality Explosion:
    - If the original categorical features have many unique values, combining them can lead to an unmanageable number of unique categories.
    - Example:
        - city (500 categories) × source (10 categories) = 5000 combined categories.
- Sparse Data:
    - If your dataset is small, combining categorical features can result in sparse data, where some combinations appear infrequently or not at all.
- Tree-Based Models:
    - Tree-based models (e.g., Random Forest, XGBoost) can automatically capture interactions between features. Explicitly combining categories might be redundant.

In [None]:
# Example: Combine `country` and `source` into a single feature
# data['country_source_interaction'] = data['country'] + '_' + data['source']

# If your model requires numerical input, encode the new feature (e.g., one-hot or label encoding):

# data = pd.get_dummies(data, columns=['country_source_interaction'], drop_first=True)

#### 4. Best Practices
- Domain Knowledge: Focus on interactions that make logical sense in your context (e.g., new_user * total_pages_visited might reflect user engagement).
- Avoid Overfitting: Be cautious when creating too many interaction features, especially for small datasets.
- Model-Specific Considerations:
    - Tree-based models (e.g., Random Forest, XGBoost) can capture interactions naturally, so explicit interaction features might be redundant.
    - Linear models (e.g., Logistic Regression) often benefit significantly from interaction features.


## iii. Categorical Encoding

In [None]:
# One-hot encode the nominal columns; drop_first=True drops one level per
# column to avoid perfectly collinear dummy columns.
categorical_cols = ['country', 'source']
df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)

In [None]:
# Spot-check a few encoded rows; the fixed seed makes the displayed rows
# identical on every run of the notebook.
df.sample(3, random_state=42)

When Should You Encode Age Bins?
- Yes, encode if:
    - You plan to use age_group as a feature in your model.
    - Binning introduces meaningful categorical distinctions (e.g., age ranges like 0-20, 21-40).
    - Use **one-hot encoding** for linear models or when the bins have no inherent order.
    - Use **label encoding** or **ordinal mapping** for tree-based models or when the bins have a natural order.
- No, do not encode if:
    - You decide to work with the raw numerical age directly without binning.
    - The model (e.g., tree-based methods like Random Forest or XGBoost) can handle the numerical representation without encoding.

In [None]:
# Integer-encode the age bracket labels. LabelEncoder numbers the classes in
# sorted (lexicographic) order of the label strings.
label_encoder = LabelEncoder()
label_encoder.fit(df['custom_age_group'])
df['age_group_encoded'] = label_encoder.transform(df['custom_age_group'])

# Show the bracket label next to its integer code.
print(df[['custom_age_group', 'age_group_encoded']].head())

# 5. Normalize/Scale Numerical Features

Scaling numerical features is an essential step in preparing data for machine learning models, especially when features have different scales or ranges. Whether or not you need to scale depends on the type of model you're using and the nature of the data.

1. Why Scale Numerical Features?
    - Ensure Equal Contribution: Features with larger ranges can dominate those with smaller ranges in distance-based models (e.g., k-NN) or optimization algorithms (e.g., gradient descent).
    - Improve Convergence: Scaling speeds up convergence in gradient-based optimization algorithms (e.g., in Logistic Regression, Neural Networks).
    - Prevent Numerical Instability: Extremely large feature values can cause numerical instability in some algorithms.

2. When to Scale Numerical Features
    - (a) Always Scale When Using Distance-Based Algorithms
        - For models where distances or similarity measures are important, scaling ensures that all features contribute equally.

        - Examples:
            - k-Nearest Neighbors (k-NN): Uses Euclidean or Manhattan distance; unscaled features with large ranges dominate the distance calculation.
            - Support Vector Machines (SVM): Uses a kernel function (e.g., radial basis function) that depends on distances.
            - Principal Component Analysis (PCA): Maximizes variance; unscaled features with large variance dominate the principal components.
            - Clustering (e.g., K-Means): Relies on distances to assign clusters; unscaled features distort the clusters.

    - (b) Scale for Gradient-Based Models
        - Models that rely on gradient descent optimization (e.g., Logistic Regression, Neural Networks) benefit from scaling, as it ensures:
            - Faster convergence.
            - Balanced updates for all weights during training.
        - Examples:
            - Logistic Regression
            - Neural Networks
            - Linear Regression (if regularized, e.g., Ridge, Lasso)
    - (c) Scale for Regularized Models
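        - Regularization techniques penalize the magnitude of coefficients (e.g., L1/L2 regularization in Ridge, Lasso, or ElasticNet). Without scaling, the penalty is applied unevenly: a feature measured on a small scale needs a larger coefficient and is therefore penalized more heavily than an equally informative feature measured on a large scale.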
        - Examples:
            - Ridge Regression
            - Lasso Regression
            - ElasticNet
    - (d) When Feature Values Have Drastically Different Ranges
        - If numerical features have drastically different scales, scaling is required to prevent models from being biased toward features with larger ranges.

3. When Not to Scale
    - (a) Tree-Based Models
        - Decision trees and ensemble models like Random Forest, XGBoost, LightGBM, and CatBoost do not require scaling, as they are not sensitive to the magnitude of feature values.
        - Why?:
            - These models split features based on thresholds (e.g., age > 30), not distances or gradients.
    - (b) Features with Meaningful Units
        - Some features have ranges that are meaningful or interpretable in their original units. Scaling might make these values harder to interpret.
        - Examples:
            - house_price (in dollars): Scaling might obscure meaningful dollar values.
            - number_of_items_sold: The raw counts are often interpretable as-is.
    - (c) When All Features Are on the Same Scale
        - If all numerical features already have similar ranges, scaling is unnecessary.
        - Examples:
            - temperature (in °C) and precipitation (in mm), both ranging from 0 to 100.


## i. Scaling Methods

### a. Standardization (Z-Score Scaling)
Standardization rescales features to have a mean of 0 and standard deviation of 1. Use this when features are approximately normally distributed.

Best For:
- Gradient-based models
- Regularized models
- PCA

In [None]:
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# data[['age_scaled', 'income_scaled']] = scaler.fit_transform(data[['age', 'income']])


### b. Min-Max Scaling
Scales features to a specific range (default: 0 to 1). Use this when all features need to be scaled proportionally.

Best For:
- Distance-based models (e.g., k-NN, k-Means)
- When features have a non-Gaussian distribution
- Neural networks (keeps all inputs in a small, bounded range, which helps gradient-based training stay stable)

In [None]:
# from sklearn.preprocessing import MinMaxScaler

# scaler = MinMaxScaler()
# data[['age_scaled', 'income_scaled']] = scaler.fit_transform(data[['age', 'income']])


### c. Robust Scaling
Scales features using the median and interquartile range, making it robust to outliers.

Best For:
- Datasets with outliers
- Gradient-based models

In [None]:
# from sklearn.preprocessing import RobustScaler

# scaler = RobustScaler()
# data[['age_scaled', 'income_scaled']] = scaler.fit_transform(data[['age', 'income']])


## ii. Visualizing the Need for Scaling

What to Look For:
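- Before scaling, check whether some features span much larger ranges than others; after scaling, all features should have similar ranges so that they contribute equally to the model.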

In [None]:
# sns.boxplot(data=data[['age', 'income']])
# plt.title('Feature Ranges Before Scaling')
# plt.show()


In [None]:
# scaled_data = StandardScaler().fit_transform(data[['age', 'income']])
# sns.boxplot(data=pd.DataFrame(scaled_data, columns=['age_scaled', 'income_scaled']))
# plt.title('Feature Ranges After Scaling')
# plt.show()


# 6. Split Data for Training

In [None]:
# Show the columns currently present in df.
df.columns

In [None]:
# Drop the binned age-group columns (the integer-coded age_group_encoded column is kept).
# Rebinding df with errors="ignore" instead of inplace=True makes the cell idempotent:
# re-running it after the columns are already gone no longer raises KeyError.
# Trade-off: a misspelled column name is skipped silently, so keep this list in sync
# with the binning cells.
df = df.drop(
    columns=[
        "age_group_cluster_based",
        "age_group_equal_width",
        "age_group_quantile_based",
        "custom_age_group",
    ],
    errors="ignore",
)


In [None]:
# Separate the target from the predictors
y = df['converted']
X = df.drop(columns=['converted'])

# Hold out 20% of the rows for testing; stratifying on y keeps the class ratio
# the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 7. Feature Selection

## i. Understand Feature Importance

### a. Correlation Analysis (For Numerical Features)

- Check the correlation between numerical features and the target variable (e.g., converted).
- Remove features with very low or no correlation with the target.

In [None]:
# Select numerical columns only
numerical_df = df.select_dtypes(include=['float64', 'int64'])

# Calculate correlation
correlation = numerical_df.corr()
print(correlation['converted'].sort_values(ascending=False))

### b. Feature Importance from Models
- Use tree-based models like Random Forest, Gradient Boosting (e.g., XGBoost, LightGBM) to compute feature importance.

In [None]:
# Fit a random forest on the training split and rank features by its importances.
# random_state is fixed (same seed as the other models in this notebook) so the
# ranking is reproducible across kernel restarts.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Label each importance with its column name and list the strongest features first
feature_importances = pd.Series(rf.feature_importances_, index=X_train.columns)
print(feature_importances.sort_values(ascending=False))

## ii. Use Statistical Tests

### a. Chi-Square Test (For Categorical Features)
- Measure the dependency of categorical features on the target.

In [None]:
# from sklearn.feature_selection import chi2
# chi_scores, p_values = chi2(X_train, y_train)
# print(p_values)  # Features with low p-values are significant


### b. ANOVA F-Test
- Use F-test for numerical features to check how well they distinguish the target variable.


## iii. Feature Selection Based on Domain Knowledge

### a. Cross-Validation for Feature Subsets
- Train models with different subsets of features and compare performance using cross-validation (e.g., AUC, F1 score).

## iv. Automated Feature Selection Tools
- Use libraries like Boruta or SHAP for automated and interpretable feature selection.

### a. Boruta

In [None]:
from boruta import BorutaPy

# Boruta wraps a random forest and reports which features it keeps
rf = RandomForestClassifier()
boruta_selector = BorutaPy(estimator=rf, n_estimators='auto', random_state=42)

# Fit on plain NumPy arrays taken from the training split only
boruta_selector.fit(X_train.to_numpy(), y_train.to_numpy())

# support_ is used as a mask over the training columns
selected_features = X_train.columns[boruta_selector.support_]
print(selected_features)


In [None]:
# Display the feature names selected by Boruta in the previous cell.
selected_features

## v. Dimensionality Reduction
- If you have a large number of features:
    - Use PCA (Principal Component Analysis) to reduce dimensionality while retaining variance.
    - Be cautious when using PCA for interpretability-focused models.

In [None]:
# from sklearn.decomposition import PCA

# pca = PCA(n_components=10)  # Reduce to 10 components
# X_pca = pca.fit_transform(X_train)
# print(pca.explained_variance_ratio_)


# 8. Train Predictive Models

In [None]:
# Rebuild the feature matrix from the Boruta-selected columns only
y = df['converted']
X = df.loc[:, selected_features]

# Same 80/20 stratified split and seed as the earlier split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


## i. Logistic Regression
Start with a simple model as a baseline.

In [None]:
# Train logistic regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Evaluate performance
y_pred = logreg.predict(X_test)
print(classification_report(y_test, y_pred))
print(f"AUC: {roc_auc_score(y_test, logreg.predict_proba(X_test)[:, 1]):.2f}")

## ii. Decision Tree / Random Forest
Use tree-based models to capture non-linear relationships.

In [None]:
# Train random forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Evaluate performance
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))


## iii. Gradient Boosting (e.g., XGBoost, LightGBM)
Boosting models often outperform simpler models in binary classification tasks.

In [None]:
# Train XGBoost
xgb = XGBClassifier(eval_metric='auc', random_state=42)
xgb.fit(X_train, y_train)

# Evaluate performance
y_pred = xgb.predict(X_test)
print(classification_report(y_test, y_pred))

# 9. Model Evaluation

Evaluate your models on metrics like:
- Accuracy: The proportion of correct predictions.
- Precision: The ability of the model to avoid false positives (relevant for imbalanced classes).
- Recall (Sensitivity): The ability to identify all true positives.
- F1-Score: A balance between precision and recall.
- AUC-ROC: Measures the ability to discriminate between positive and negative classes.


## i. ROC and Precision-Recall Curves

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Draw every curve on one explicit axes object so the figure is self-contained
fig, ax = plt.subplots()

# Logistic Regression
y_pred_proba_lr = logreg.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_pred_proba_lr)
ax.plot(fpr, tpr, label="Logistic Regression (AUC=%.2f)" % roc_auc_score(y_test, y_pred_proba_lr))

# Random Forest
y_pred_proba_rf = rf.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_proba_rf)
ax.plot(fpr_rf, tpr_rf, label="Random Forest (AUC=%.2f)" % roc_auc_score(y_test, y_pred_proba_rf))

# XGBoost (trained in Section 8 but previously missing from this comparison)
y_pred_proba_xgb = xgb.predict_proba(X_test)[:, 1]
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_pred_proba_xgb)
ax.plot(fpr_xgb, tpr_xgb, label="XGBoost (AUC=%.2f)" % roc_auc_score(y_test, y_pred_proba_xgb))

# Dashed diagonal as the reference line from (0, 0) to (1, 1)
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend()
ax.set_title("ROC Curve Comparison")
plt.show()


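Logistic Regression slightly outperforms Random Forest in precision and recall (see the classification reports in Section 8); the ROC curves above compare how well each model ranks converters across all decision thresholds.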

## ii. Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Logistic Regression: counts of true vs. predicted labels on the test split
cm_lr = confusion_matrix(y_test, logreg.predict(X_test))
(
    ConfusionMatrixDisplay(confusion_matrix=cm_lr)
    .plot(cmap="Blues", values_format="d")
    .ax_.set_title("Logistic Regression Confusion Matrix")
)

# Random Forest: same view, drawn in its own figure
cm_rf = confusion_matrix(y_test, rf.predict(X_test))
(
    ConfusionMatrixDisplay(confusion_matrix=cm_rf)
    .plot(cmap="Greens", values_format="d")
    .ax_.set_title("Random Forest Confusion Matrix")
)
plt.show()


- If recall (catching all conversions) is critical, focus on improving models to reduce False Negatives (e.g., use SMOTE for balancing the classes).
- If precision (avoiding false positives) is more important, Logistic Regression already does well.


## iii. Cross-Validation

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated F1, computed on the training split only

# Logistic Regression
scores_lr = cross_val_score(estimator=logreg, X=X_train, y=y_train, scoring="f1", cv=5)
print("Logistic Regression F1 Cross-Validation Scores:", scores_lr)
print("Mean F1:", scores_lr.mean())

# Random Forest
scores_rf = cross_val_score(estimator=rf, X=X_train, y=y_train, scoring="f1", cv=5)
print("Random Forest F1 Cross-Validation Scores:", scores_rf)
print("Mean F1:", scores_rf.mean())


# 10. Provide Recommendations
Based on the model results and feature importance, provide actionable insights:

1. Feature Importance:

    - Use SHAP values or model coefficients to understand which features drive conversion.
    - Example: If total_pages_visited has high importance, encourage users to view more pages by improving navigation or content.
2. Segment Analysis:

    - Identify underperforming segments (e.g., low conversion rates for specific country or source) and target them with customized strategies.
3. Test Ideas to Improve Revenue:

    - For new users (new_user=1), create onboarding flows to improve engagement.
    - Invest in high-performing sources (e.g., SEO) while revisiting strategies for weaker ones.


- **Insight**: Users with higher page visits tend to convert more often, as observed from EDA (steep conversion increase between 10–20 pages visited).

- **Recommendation**:
    - Encourage users to visit more pages by optimizing internal linking and providing engaging content (e.g., recommended pages).
    - Add incentives (e.g., discounts or pop-ups) when users cross a threshold of visited pages to push conversions.

- **Insight**: Conversion patterns vary significantly for younger and older users, especially for similar levels of engagement.
    - Younger users may require fewer page visits to convert, while older users need more engagement.
- **Recommendation**:
    - Personalize the user experience by age group:
        - Younger Users: Streamline the user journey (fewer pages, quicker call-to-action).
        - Older Users: Provide more detailed content or targeted information to build trust.

- **Insight**: Some age groups (e.g., younger users) may naturally visit more pages per unit of time, driving conversion rates up.
- **Recommendation**:
    - Use age-targeted marketing campaigns:
        - For older users, reduce friction by displaying essential information upfront.
        - For younger users, leverage engaging visuals or interactive elements to maintain interest.


- **Insight**: Users who visit significantly more pages than their country’s average may be more likely to convert.
- **Recommendation**:
    - Optimize the experience for underperforming regions:
        - Identify countries with lower average page visits and localize content (e.g., language, cultural preferences).
        - Test regional variations of the website to improve engagement.


***Recommendations Summary***
- Enhance Page Engagement:
    - Improve internal linking and recommended content to encourage more page visits.
    - Use pop-ups or incentives to reward high engagement (e.g., after visiting 10 pages).
- Age-Based Personalization:
    - Simplify journeys for younger users with quick actions.
    - Provide more detailed content for older users.

# 11. Deployment
If you're satisfied with the model's performance:

- Save the model using joblib or pickle.
- Deploy it in production to predict conversion probabilities for new users.