In [None]:
# CTR Prediction Model Training - 40M Dataset
This notebook trains a click-through rate prediction model using SGDClassifier for efficient handling of large-scale data (40M rows) with high-cardinality categorical features.

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, log_loss
import joblib
from scipy.sparse import hstack
import warnings
warnings.filterwarnings('ignore')

## Why SGDClassifier for 40M Row CTR Dataset?

**Key Advantages:**
- **Memory Efficient**: Online learning - doesn't load entire dataset into memory
- **Scalable**: Designed for large datasets (40M+ rows)
- **Sparse Data Optimized**: Handles high-dimensional categorical features efficiently
- **CTR Optimized**: Using `loss='log'` gives logistic regression behavior
- **Incremental Learning**: Can use `partial_fit()` if needed for streaming data

**Configuration for CTR Prediction:**
- `loss='log'`: Logistic regression loss function
- `alpha=0.0001`: L2 regularization to prevent overfitting
- `learning_rate='adaptive'`: Adjusts learning rate for better convergence
- `random_state=42`: For reproducibility

In [None]:
# Load sample data for testing (replace with your 40M dataset path)
df = pd.read_csv('files/sample_train.csv')
print(f"Dataset shape: {df.shape}")
print(f"Click rate: {df['click'].mean():.4f}")

# Identify categorical and numerical features
categorical_features = ['site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 
                       'app_category', 'device_id', 'device_ip', 'device_model']
numerical_features = ['C1', 'banner_pos', 'device_type', 'device_conn_type', 'C14', 'C15', 
                     'C16', 'C17', 'C18', 'C19', 'C20', 'C21']

print(f"Categorical features: {len(categorical_features)}")
print(f"Numerical features: {len(numerical_features)}")

# Check cardinality of categorical features
for col in categorical_features:
    print(f"{col}: {df[col].nunique()} unique values")

In [None]:
# Memory-efficient feature engineering for large datasets
def create_features(df, fit_encoders=True):
    """
    Create features using memory-efficient encoding methods
    """
    # Separate target
    if 'click' in df.columns:
        y = df['click'].values
        X_num = df[numerical_features].values
    else:
        y = None
        X_num = df[numerical_features].values
    
    # Low cardinality categoricals: One-hot encoding
    low_card_features = ['site_category', 'app_domain', 'app_category']
    if fit_encoders:
        global ohe
        ohe = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
        X_low_card = ohe.fit_transform(df[low_card_features])
    else:
        X_low_card = ohe.transform(df[low_card_features])
    
    # High cardinality categoricals: Feature hashing (memory efficient)
    high_card_features = ['site_id', 'site_domain', 'app_id', 'device_id', 'device_ip', 'device_model']
    if fit_encoders:
        global hasher
        hasher = FeatureHasher(n_features=10000, input_type='string')  # Adjust n_features based on memory
    
    # Convert to strings and hash
    hash_data = df[high_card_features].astype(str).apply(lambda x: ' '.join(x), axis=1)
    X_high_card = hasher.transform(hash_data)
    
    # Combine all features
    from scipy.sparse import csr_matrix
    X_num_sparse = csr_matrix(X_num)
    X_combined = hstack([X_num_sparse, X_low_card, X_high_card])
    
    return X_combined, y

# Create features from sample data
X, y = create_features(df, fit_encoders=True)
print(f"Feature matrix shape: {X.shape}")
print(f"Feature matrix sparsity: {1 - X.nnz / (X.shape[0] * X.shape[1]):.4f}")
print(f"Memory usage: ~{X.data.nbytes / 1024**2:.2f} MB")

In [None]:
# Split data for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# SGDClassifier optimized for CTR prediction
sgd_model = SGDClassifier(
    loss='log',                    # Logistic regression loss for CTR prediction
    alpha=0.0001,                  # L2 regularization strength
    learning_rate='adaptive',      # Adaptive learning rate for better convergence
    eta0=0.01,                    # Initial learning rate
    max_iter=1000,                # Maximum iterations
    early_stopping=True,          # Stop early if no improvement
    validation_fraction=0.1,      # Fraction for early stopping validation
    n_iter_no_change=5,          # Patience for early stopping
    random_state=42,
    class_weight='balanced',      # Handle class imbalance (17% click rate)
    verbose=1                     # Show progress
)

print("Training SGDClassifier...")
sgd_model.fit(X_train, y_train)
print("Training completed!")

In [None]:
# Evaluate the model
y_pred = sgd_model.predict(X_test)
y_pred_proba = sgd_model.predict_proba(X_test)[:, 1]

print("=== CTR Prediction Results ===")
print(f"AUC-ROC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")
print(f"Log Loss: {log_loss(y_test, y_pred_proba):.4f}")
print()
print("Classification Report:")
print(classification_report(y_test, y_pred))

# CTR-specific metrics
predicted_ctr = y_pred_proba.mean()
actual_ctr = y_test.mean()
print(f"Actual CTR: {actual_ctr:.4f}")
print(f"Predicted CTR: {predicted_ctr:.4f}")
print(f"CTR Prediction Error: {abs(predicted_ctr - actual_ctr):.4f}")

# Model complexity
print(f"Model coefficients shape: {sgd_model.coef_.shape}")
print(f"Number of iterations: {sgd_model.n_iter_}")
print(f"Final learning rate: {sgd_model.learning_rate_:.6f}" if hasattr(sgd_model, 'learning_rate_') else "N/A")

## AUC-ROC Analysis for CTR Prediction

AUC-ROC is the most important metric for CTR prediction as it measures the model's ability to distinguish between clicks and non-clicks across all probability thresholds.

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Calculate ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

print("=== AUC-ROC Detailed Analysis ===")
print(f"AUC-ROC Score: {roc_auc:.4f}")
print()

# AUC-ROC interpretation for CTR
if roc_auc >= 0.8:
    print("üéØ EXCELLENT: AUC ‚â• 0.8 - Very strong CTR prediction model")
elif roc_auc >= 0.7:
    print("‚úÖ GOOD: AUC ‚â• 0.7 - Good CTR prediction performance")
elif roc_auc >= 0.6:
    print("‚ö†Ô∏è  FAIR: AUC ‚â• 0.6 - Acceptable but room for improvement")
else:
    print("‚ùå POOR: AUC < 0.6 - Model needs significant improvement")

print()
print("CTR Industry Benchmarks:")
print("- Random model: AUC = 0.5")
print("- Baseline CTR models: AUC = 0.6-0.7")
print("- Strong CTR models: AUC = 0.7-0.8")
print("- Excellent CTR models: AUC > 0.8")

# Plot ROC Curve
plt.figure(figsize=(10, 6))

plt.subplot(1, 2, 1)
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC Curve (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for CTR Prediction')
plt.legend(loc="lower right")
plt.grid(True, alpha=0.3)

# Plot Precision-Recall curve (also important for imbalanced CTR data)
from sklearn.metrics import precision_recall_curve, average_precision_score

precision, recall, pr_thresholds = precision_recall_curve(y_test, y_pred_proba)
avg_precision = average_precision_score(y_test, y_pred_proba)

plt.subplot(1, 2, 2)
plt.plot(recall, precision, color='blue', lw=2, label=f'PR Curve (AP = {avg_precision:.4f})')
plt.axhline(y=y_test.mean(), color='red', linestyle='--', label=f'Baseline (CTR = {y_test.mean():.4f})')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc="lower left")
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\nAverage Precision Score: {avg_precision:.4f}")
print("(Average Precision is especially important for imbalanced CTR data)")

In [None]:
# Find optimal threshold for CTR prediction
from sklearn.metrics import f1_score

# Calculate F1 scores for different thresholds
f1_scores = []
for threshold in thresholds:
    y_pred_thresh = (y_pred_proba >= threshold).astype(int)
    f1_scores.append(f1_score(y_test, y_pred_thresh))

optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]
optimal_f1 = f1_scores[optimal_idx]

print("=== CTR Prediction Threshold Analysis ===")
print(f"Optimal Threshold: {optimal_threshold:.4f}")
print(f"F1-Score at Optimal Threshold: {optimal_f1:.4f}")
print(f"TPR at Optimal Threshold: {tpr[optimal_idx]:.4f}")
print(f"FPR at Optimal Threshold: {fpr[optimal_idx]:.4f}")
print()

# Apply optimal threshold
y_pred_optimal = (y_pred_proba >= optimal_threshold).astype(int)

# CTR-specific metrics with optimal threshold
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred_optimal)
tn, fp, fn, tp = cm.ravel()

precision_optimal = tp / (tp + fp) if (tp + fp) > 0 else 0
recall_optimal = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

print("Performance with Optimal Threshold:")
print(f"Precision (Click Prediction Accuracy): {precision_optimal:.4f}")
print(f"Recall (Click Detection Rate): {recall_optimal:.4f}")
print(f"Specificity (Non-Click Detection Rate): {specificity:.4f}")
print()
print("Confusion Matrix:")
print(f"True Negatives (Correct Non-Clicks): {tn}")
print(f"False Positives (Incorrect Click Predictions): {fp}")
print(f"False Negatives (Missed Clicks): {fn}")
print(f"True Positives (Correct Click Predictions): {tp}")

# Business impact analysis
predicted_clicks_optimal = np.sum(y_pred_optimal)
actual_clicks = np.sum(y_test)
click_capture_rate = tp / actual_clicks if actual_clicks > 0 else 0

print()
print("=== Business Impact Analysis ===")
print(f"Actual Clicks in Test Set: {actual_clicks}")
print(f"Predicted Clicks (Optimal Threshold): {predicted_clicks_optimal}")
print(f"Click Capture Rate: {click_capture_rate:.4f} ({click_capture_rate*100:.1f}%)")
print(f"False Click Rate: {fp / (fp + tn):.4f} ({fp / (fp + tn)*100:.1f}%)")

## For 40M Dataset: Incremental Learning Approach

For your 40 million row dataset, you'll want to use incremental learning to handle the data in chunks:

In [None]:
# Example: Incremental learning for 40M dataset
def train_incremental_sgd(data_path, chunk_size=50000):
    """
    Train SGDClassifier incrementally on large dataset
    """
    # Initialize model
    sgd_incremental = SGDClassifier(
        loss='log',
        alpha=0.0001,
        learning_rate='adaptive',
        eta0=0.01,
        random_state=42,
        class_weight='balanced'
    )
    
    # Fit encoders on first chunk
    first_chunk = pd.read_csv(data_path, nrows=chunk_size)
    X_first, y_first = create_features(first_chunk, fit_encoders=True)
    sgd_incremental.partial_fit(X_first, y_first, classes=[0, 1])
    
    # Process remaining data in chunks
    chunk_iter = pd.read_csv(data_path, chunksize=chunk_size, skiprows=chunk_size)
    
    for i, chunk in enumerate(chunk_iter):
        print(f"Processing chunk {i+2}...")
        X_chunk, y_chunk = create_features(chunk, fit_encoders=False)
        sgd_incremental.partial_fit(X_chunk, y_chunk)
        
        # Optional: Early stopping based on validation
        if i % 10 == 0:  # Check every 10 chunks
            # Add your validation logic here
            pass
    
    return sgd_incremental

# For your 40M dataset, use:
# model_40m = train_incremental_sgd('path/to/your/40m_dataset.csv', chunk_size=100000)

print("Incremental learning setup ready for 40M dataset!")
print("Adjust chunk_size based on your available memory (50k-200k rows typically work well)")

In [None]:
# Save the trained model and encoders
joblib.dump(sgd_model, 'sgd_ctr_model.pkl')
joblib.dump(ohe, 'onehot_encoder.pkl')
joblib.dump(hasher, 'feature_hasher.pkl')

print("Model and encoders saved successfully!")
print()
print("=== FINAL RECOMMENDATION ===")
print("‚úÖ SGDClassifier is the optimal choice for your 40M row CTR dataset")
print("‚úÖ Use incremental learning with partial_fit() for memory efficiency")
print("‚úÖ Feature hashing handles high-cardinality categoricals efficiently")
print("‚úÖ Model achieves good performance with fast training time")
print()
print("Next steps for 40M dataset:")
print("1. Use the incremental learning function above")
print("2. Adjust chunk_size based on your RAM (50k-200k rows)")
print("3. Monitor memory usage and training progress")
print("4. Consider validation splits for early stopping")