<a href="https://colab.research.google.com/github/anjineyulutv/Amazon_Fine_Food_Reviews/blob/master/GIM_Workshop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- Feature Scaling: why it matters for distance-based clustering ---
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import numpy as np

# Synthetic marketing data: 50 customers with an income and a spend score.
# The two randint draws happen in the same order as the dict literal below.
np.random.seed(42)
feature_cols = ['income', 'spend_score']
df = pd.DataFrame({
    feature_cols[0]: np.random.randint(20000, 100000, 50),
    feature_cols[1]: np.random.randint(1, 100, 50),
})

# Baseline: cluster the raw values. Income spans tens of thousands while the
# spend score stays below 100, so Euclidean distance is driven by income.
kmeans_unscaled = KMeans(n_clusters=3, random_state=42)
kmeans_unscaled.fit(df[feature_cols])
df['cluster_unscaled'] = kmeans_unscaled.labels_

# Standardise both features (zero mean, unit variance) and cluster again.
scaler = StandardScaler()
scaled = scaler.fit_transform(df[feature_cols])
kmeans_scaled = KMeans(n_clusters=3, random_state=42)
kmeans_scaled.fit(scaled)
df['cluster_scaled'] = kmeans_scaled.labels_

print(df[feature_cols + ['cluster_unscaled', 'cluster_scaled']].head())

# Lesson: without scaling, 'income' (20k-100k) dominates the clustering.
# After scaling, both 'income' and 'spend_score' influence the result equally.
# Always scale distance-based features in marketing segmentation.

   income  spend_score  cluster_unscaled  cluster_scaled
0   35795            6                 1               1
1   20860           54                 1               2
2   96820            4                 2               1
3   74886           54                 2               0
4   26265           93                 1               2


In [None]:
# Concept: Data Leakage -- When Your Model Knows the Future
# -------------------------------------------------------------
# Marketing scenario:
# You are predicting customer churn from engagement data, but the dataset
# mistakenly contains a feature that only exists AFTER the churn event
# (something like "last_month_purchase_drop"). The model latches onto it and
# reports unrealistically high performance.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

np.random.seed(42)

# ----- Step 1: Create dummy marketing data -----
n = 500
data = pd.DataFrame({
    'email_click_rate': np.random.beta(2, 5, n),       # engagement level
    'support_tickets': np.random.poisson(1.5, n),      # number of support issues
    'last_login_days': np.random.randint(1, 60, n),    # days since last login
})

# Hidden ground truth: a noisy linear score, thresholded at 0.2 to get the label.
click_term = 0.3 * data['email_click_rate']
ticket_term = 0.2 * (data['support_tickets'] > 2)
login_term = 0.01 * data['last_login_days']
true_prob = click_term + ticket_term - login_term + np.random.normal(0, 0.05, n)
data['churn'] = (true_prob > 0.2).astype(int)

# ----- Step 2: Add a 'leaked' feature -----
# The label plus a little noise: it is near-perfectly predictive, but it stands
# in for information that would only be known after the event.
leak_col = 'post_event_engagement'
data[leak_col] = data['churn'] + np.random.normal(0, 0.1, n)  # leakage!

# Features / target, then a 70/30 split.
X = data.drop(columns='churn')
y = data['churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


def _fit_and_score(train_features, test_features):
    """Fit a logistic regression on the training rows and return (model, test ROC AUC)."""
    clf = LogisticRegression(max_iter=500)
    clf.fit(train_features, y_train)
    positive_scores = clf.predict_proba(test_features)[:, 1]
    return clf, roc_auc_score(y_test, positive_scores)


# ----- Step 3: Model with leakage -----
model_leak, auc_leak = _fit_and_score(X_train, X_test)

# ----- Step 4: Model without leakage -----
X_train_noleak = X_train.drop(columns=leak_col)
X_test_noleak = X_test.drop(columns=leak_col)
model_clean, auc_clean = _fit_and_score(X_train_noleak, X_test_noleak)

# ----- Step 5: Compare Results -----
print(f"AUC with leakage:   {auc_leak:.3f}")
print(f"AUC without leakage: {auc_clean:.3f}")

# Life lesson:
# If your model performs *too perfectly*, double-check your features.
# Leakage often happens when post-event or derived data sneaks into training.

AUC with leakage:   1.000
AUC without leakage: 0.966


In [None]:
# --- Data Granularity Demo: daily vs monthly forecasting ---
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# 60 consecutive days of synthetic sales starting 2024-01-01.
np.random.seed(42)
days = pd.date_range("2024-01-01", periods=60)
daily_sales = np.random.randint(100, 500, 60)
df = pd.DataFrame({'date': days, 'sales': daily_sales})

# Coarse view: average sales per calendar month.
df['month'] = df['date'].dt.month
monthly = df.groupby('month', as_index=False)['sales'].mean()

# Fine-grained trend: regress daily sales on a running day counter.
df['day_idx'] = np.arange(df.shape[0])
model_daily = LinearRegression()
model_daily.fit(df[['day_idx']], df['sales'])

# Coarse-grained trend: regress the monthly means on the month's row position.
# NOTE(review): 60 days from Jan 1 cover only January and February, so this fit
# has two points; its slope is per month while the daily slope is per day.
monthly['month_idx'] = monthly.index
model_monthly = LinearRegression()
model_monthly.fit(monthly[['month_idx']], monthly['sales'])

print(f"Daily model coef: {model_daily.coef_[0]:.2f} | Monthly model coef: {model_monthly.coef_[0]:.2f}")

# Lesson: aggregating to months smooths variation -- it hides weekend/holiday spikes.
# In marketing, the wrong granularity may erase actionable temporal insights.

Daily model coef: 0.80 | Monthly model coef: -9.01


In [None]:
# --- RFM Features: Recency, Frequency, Monetary ---
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Generate fake transaction data: 20 purchases spread over 4 customers,
# dated within 90 days of 2025-01-01.
np.random.seed(42)
df = pd.DataFrame({
    'customer_id': np.random.choice(['C1','C2','C3','C4'], 20),
    'purchase_date': [datetime(2025,1,1)+timedelta(days=int(x)) for x in np.random.randint(0,90,20)],
    'amount': np.random.randint(50,500,20)
})

# Reference date that recency is measured against.
today = datetime(2025,4,1)

# Named aggregation builds the three RFM columns directly, one row per customer.
# Frequency counts purchase rows via 'purchase_date' rather than aggregating the
# groupby key itself, and no follow-up rename is needed.
rfm = df.groupby('customer_id').agg(
    Recency=('purchase_date', lambda dates: (today - dates.max()).days),  # days since last purchase
    Frequency=('purchase_date', 'count'),                                # number of purchases
    Monetary=('amount', 'sum'),                                          # total spend
)

print(rfm)

# Lesson: RFM encodes business intuition into features.
# Recency (how recently bought), Frequency (how often), Monetary (how much spent)
# are powerful signals for marketing segmentation & campaign targeting.

             Recency  Frequency  Monetary
customer_id                              
C1                27          4      1228
C2                70          1       369
C3                 3          9      2487
C4                 2          6      1615


In [None]:
# --- Temporal Features in Marketing Data ---
import pandas as pd
import numpy as np

# Ten days of synthetic site visits starting 2024-01-01, with calendar and
# trend features derived in a single chained expression.
np.random.seed(42)
df = (
    pd.DataFrame({
        'date': pd.date_range("2024-01-01", periods=10),
        'visits': np.random.randint(100, 500, 10),
    })
    .assign(
        # Monday=0 ... Sunday=6
        day_of_week=lambda frame: frame['date'].dt.dayofweek,
        # 1 for Saturday/Sunday, 0 otherwise
        is_weekend=lambda frame: frame['day_of_week'].isin([5, 6]).astype(int),
        # Mean of the current and up to two previous days; min_periods=1 keeps
        # the first rows from being NaN.
        rolling_mean=lambda frame: frame['visits'].rolling(window=3, min_periods=1).mean(),
    )
)

print(df)

# Lesson: adding day-of-week, weekend, and rolling trends helps capture seasonality.
# In marketing, these features help models learn weekday vs. weekend traffic behavior.

        date  visits  day_of_week  is_weekend  rolling_mean
0 2024-01-01     202            0           0    202.000000
1 2024-01-02     448            1           0    325.000000
2 2024-01-03     370            2           0    340.000000
3 2024-01-04     206            3           0    341.333333
4 2024-01-05     171            4           0    249.000000
5 2024-01-06     288            5           1    221.666667
6 2024-01-07     120            6           1    193.000000
7 2024-01-08     202            0           0    203.333333
8 2024-01-09     221            1           0    181.000000
9 2024-01-10     314            2           0    245.666667


In [None]:
# Concept 6: Handling Imbalanced Targets -- Don't Let the Minority Class Disappear
# -----------------------------------------------------------------------------------
# Marketing scenario:
# You're predicting whether a customer will churn.
# Only a small fraction actually churn (~10%).
# The model might just predict "no churn" for everyone and still appear accurate.
# Let's see why using class weights (or other techniques) is essential.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

np.random.seed(42)

# ----- Step 1: Create an imbalanced dataset -----
n = 800
data = pd.DataFrame({
    'usage_hours': np.random.exponential(10, n),            # engagement level
    # Stored as float up front: the churner adjustment below multiplies this
    # column by 0.7, and writing fractional values into an int64 column is
    # deprecated in pandas (it emits an incompatible-dtype warning).
    'email_opens': np.random.poisson(3, n).astype(float),   # marketing interactions
    'support_calls': np.random.poisson(1, n),               # customer support contacts
})

# 10% churners only
data['churn'] = (np.random.rand(n) < 0.1).astype(int)

# Slight pattern: churners tend to have lower usage and fewer email opens
churners = data['churn'] == 1
data.loc[churners, 'usage_hours'] *= 0.5
data.loc[churners, 'email_opens'] *= 0.7

# Split data (stratified so train and test keep the same churn rate)
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('churn', axis=1), data['churn'], test_size=0.3, random_state=42, stratify=data['churn']
)

# ----- Step 2: Train model without handling imbalance -----
model_plain = LogisticRegression(max_iter=500)
model_plain.fit(X_train, y_train)
pred_plain = model_plain.predict(X_test)
auc_plain = roc_auc_score(y_test, model_plain.predict_proba(X_test)[:, 1])

# ----- Step 3: Train model with class weights -----
model_weighted = LogisticRegression(max_iter=500, class_weight='balanced')
model_weighted.fit(X_train, y_train)
pred_weighted = model_weighted.predict(X_test)
auc_weighted = roc_auc_score(y_test, model_weighted.predict_proba(X_test)[:, 1])

# ----- Step 4: Compare performance -----
# zero_division=0 reports 0.0 for a class that is never predicted (the plain
# model's churn class here) without raising UndefinedMetricWarning; the
# printed numbers are the same as with the default setting.
print("Without Handling Imbalance:")
print(classification_report(y_test, pred_plain, digits=3, zero_division=0))
print(f"AUC: {auc_plain:.3f}\n")

print("With Class Weight Balancing:")
print(classification_report(y_test, pred_weighted, digits=3, zero_division=0))
print(f"AUC: {auc_weighted:.3f}")

# Life lesson:
# High accuracy doesn't always mean a good model.
# In marketing churn, fraud, or rare-event prediction, the *minority class* matters most.
# Always inspect recall/precision for that class or use class weights, SMOTE, or stratified sampling.

Without Handling Imbalance:
              precision    recall  f1-score   support

           0      0.904     1.000     0.950       217
           1      0.000     0.000     0.000        23

    accuracy                          0.904       240
   macro avg      0.452     0.500     0.475       240
weighted avg      0.818     0.904     0.859       240

AUC: 0.715

With Class Weight Balancing:
              precision    recall  f1-score   support

           0      0.951     0.627     0.756       217
           1      0.165     0.696     0.267        23

    accuracy                          0.633       240
   macro avg      0.558     0.661     0.511       240
weighted avg      0.876     0.633     0.709       240

AUC: 0.714


 2.1 0.  3.5 0.7 2.1 4.2 0.7 1.4 0.  2.8 2.8 2.8 2.1 2.8 1.4 0.7 4.9 2.8
 4.2 4.9 2.1 2.8 3.5 0.7 3.5 2.1 2.8 0.7 1.4 2.1 3.5 2.1 2.8 1.4 3.5 0.7
 1.4 4.2 2.1 4.9 2.8 1.4 0.7 2.8 0.7 3.5 5.6 2.1 2.1 2.1 2.1 2.8 0.7 1.4
 2.8 0.7 0.7 0.7]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data.loc[data['churn'] == 1, 'email_opens'] *= 0.7
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# --- One-Hot Encoding vs Label Encoding ---
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

channels = ['Email', 'Social', 'Affiliate', 'Social', 'Email']
df = pd.DataFrame({'channel': channels})

# Label encoding (WRONG for non-ordinal data): each category becomes an
# integer, which a model can misread as a ranking.
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['channel'])

# One-hot encoding (correct): one 0/1 indicator column per category.
ohe = OneHotEncoder(sparse_output=False)
indicator_matrix = ohe.fit_transform(df[['channel']])
indicator_names = ohe.get_feature_names_out(['channel'])
encoded = pd.DataFrame(indicator_matrix, columns=indicator_names)

# Append the indicator columns next to the original ones.
df = pd.concat([df, encoded], axis=1)

print(df)

# Lesson: label encoding imposes a fake numeric order ("Social" > "Email").
# One-hot keeps equality among categories, preserving campaign semantics.

     channel  label_encoded  channel_Affiliate  channel_Email  channel_Social
0      Email              1                0.0            1.0             0.0
1     Social              2                0.0            0.0             1.0
2  Affiliate              0                1.0            0.0             0.0
3     Social              2                0.0            0.0             1.0
4      Email              1                0.0            1.0             0.0


In [None]:
# --- Feature Selection Importance ---
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Simulated marketing dataset with redundant features
np.random.seed(42)
df = pd.DataFrame({
    'age': np.random.randint(18,60,100),
    'income': np.random.randint(30000,100000,100),
    'click_rate': np.random.rand(100),
    'random_noise': np.random.rand(100),  # useless feature
})

# Build the target from click_rate and (rescaled) income plus noise, so some
# features carry real signal. A purely random target is independent of every
# feature, which makes all importances roughly equal and lets 'random_noise'
# outrank genuine columns by chance. 'age' and 'random_noise' do not enter
# the score, so they carry no signal by construction.
purchase_score = (
    2.0 * df['click_rate']
    + (df['income'] - 30000) / 70000
    + np.random.normal(0, 0.3, 100)
)
# Median split keeps the two classes balanced.
df['target'] = (purchase_score > purchase_score.median()).astype(int)

X = df.drop('target', axis=1)
y = df['target']

# Pin the forest's seed explicitly so the ranking does not depend on how much
# of the global NumPy random stream was consumed before this line.
model = RandomForestClassifier(random_state=42).fit(X, y)
feat_imp = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
print(feat_imp)

# Lesson: not all features help. Drop low-importance ones to simplify the model.
# In marketing, too many noisy metrics -> overfit and confusion.

click_rate      0.299527
income          0.242154
random_noise    0.238246
age             0.220074
dtype: float64


In [None]:
# --- Handling Missing Values ---
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Four customers; 'age' and 'spend_score' each have gaps (NaN).
df = pd.DataFrame({
    'customer_id': ['C1', 'C2', 'C3', 'C4'],
    'age': [25, np.nan, 30, np.nan],
    'spend_score': [80, 70, np.nan, 60],
})

# Mean imputation: every NaN in a numeric column is replaced by that column's
# mean over the observed values.
numeric_cols = ['age', 'spend_score']
imputer = SimpleImputer(strategy='mean')
filled_values = imputer.fit_transform(df[numeric_cols])
df[numeric_cols] = filled_values

print(df)

# Lesson: missing data is unavoidable -- impute wisely.
# Mean/median fills are simple; advanced methods (KNN, model-based) are better for large gaps.
# Always inspect the cause before imputing.

  customer_id   age  spend_score
0          C1  25.0         80.0
1          C2  27.5         70.0
2          C3  30.0         70.0
3          C4  27.5         60.0
