In [1]:
# %% [markdown]
# # Feature Engineering - Customer Churn Prediction
# 
# This notebook focuses on creating new features and preparing the data for modeling.

# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the imported libraries - consider narrowing it once the notebook is stable.
warnings.filterwarnings('ignore')

# Plot styling: start from matplotlib's 'default' style, then apply seaborn's
# "husl" palette on top of it.
# NOTE(review): keep this order - presumably applying the style afterwards
# would reset the colour cycle that set_palette configures; confirm before
# reordering these two lines.
plt.style.use('default')
sns.set_palette("husl")

# %%
# Load the cleaned data
# In practice, this would come from the previous notebook
#
# `src` is a project-local package. When the kernel's working directory is not
# the folder that contains `src/` (for example when the notebook is launched
# from a sub-folder), `from src...` fails with
# "ModuleNotFoundError: No module named 'src'".
# Walk up from the working directory until a folder containing `src/` is found
# and put it on sys.path before importing. If no such folder exists, nothing is
# changed and the import below fails exactly as before.
import sys
from pathlib import Path

for _candidate_root in (Path.cwd(), *Path.cwd().parents):
    if (_candidate_root / 'src').is_dir():
        if str(_candidate_root) not in sys.path:
            sys.path.insert(0, str(_candidate_root))
        break

from src.data_processing import DataProcessor
from src.feature_engineering import FeatureEngineer

# Initialize and load data.
# clean_data() takes no argument here, so it presumably works on data the
# processor retained from load_data() - TODO confirm against DataProcessor.
processor = DataProcessor()
df = processor.load_data()
df = processor.clean_data()

print("Dataset shape:", df.shape)
df.head()

# %% [markdown]
# ## 1. Exploratory Data Analysis for Feature Engineering

# %%
# List every column currently in the frame together with its dtype
print("Current features:", df.dtypes, sep="\n")

# %%
# Distribution of each raw numeric column, one histogram panel per column
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Pair panels with columns left-to-right instead of indexing by position
for ax, feature in zip(axes, numerical_features):
    ax.hist(df[feature], bins=30, alpha=0.7, color='skyblue', edgecolor='black')
    ax.set(title=f'Distribution of {feature}', xlabel=feature, ylabel='Frequency')

plt.tight_layout()
plt.show()

# %%
# Analyze categorical features: mean of the 'Churn' column per category value,
# one bar chart per feature.
categorical_features = ['gender', 'Partner', 'Dependents', 'PhoneService',
                        'InternetService', 'Contract', 'PaymentMethod']

# Size the grid from the number of features so that none is silently skipped
# when the list grows. With the 7 features above this is still a 3x3 grid
# with figsize (20, 15).
n_cols = 3
n_rows = max(1, -(-len(categorical_features) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5 * n_rows), squeeze=False)
axes = axes.ravel()

bar_colors = ['#A23B72', '#2E86AB', '#F18F01', '#C73E1D']

for ax, feature in zip(axes, categorical_features):
    # Calculate churn rates by category, highest rate first
    churn_by_category = df.groupby(feature)['Churn'].mean().sort_values(ascending=False)
    positions = range(len(churn_by_category))

    # Plot
    bars = ax.bar(positions, churn_by_category.values,
                  color=bar_colors[:len(churn_by_category)])
    ax.set_title(f'Churn Rate by {feature}', fontsize=12, fontweight='bold')
    ax.set_xlabel(feature)
    ax.set_ylabel('Churn Rate')
    ax.set_xticks(positions)
    ax.set_xticklabels(churn_by_category.index, rotation=45)

    # Add value labels on bars
    for bar, value in zip(bars, churn_by_category.values):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
                f'{value:.2%}', ha='center', va='bottom', fontweight='bold')

# Hide the panels that did not receive a feature; otherwise they render as
# empty framed axes at the end of the grid.
for unused_ax in axes[len(categorical_features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

# %% [markdown]
# ## 2. Create New Engineered Features

# %%
# Build the project's feature-engineering helper
engineer = FeatureEngineer()

# %%
# Derive the engineered columns from the cleaned frame
df_engineered = engineer.create_new_features(df)

# Columns expected to have been added by create_new_features; selecting them
# below raises a KeyError if any of them is missing from the result.
new_features = [
    'TenureGroup',
    'MonthlySpendCategory',
    'AvgChargePerTenure',
    'HasMultipleServices',
]
print("New features created:")
print(df_engineered.loc[:, new_features].head())

# %%
# Analyze the new features
def _plot_churn_rate_bars(ax, rates, labels, title, color):
    """Draw a churn-rate bar chart on ``ax`` with a percentage above each bar.

    Parameters
    ----------
    ax : matplotlib Axes
        Panel to draw on.
    rates : pandas.Series
        Mean of the 'Churn' column per group; one bar is drawn per entry.
    labels : sequence of str
        Tick labels, one per entry of ``rates`` and in the same order.
    title : str
        Panel title.
    color : str
        Bar colour, passed straight to ``Axes.bar``.
    """
    ax.bar(labels, rates.values, color=color)
    ax.set_title(title, fontweight='bold')
    ax.set_ylabel('Churn Rate')
    # String labels are placed at x = 0, 1, 2, ... so the enumerate index is
    # the bar's x position.
    for position, rate in enumerate(rates.values):
        ax.text(position, rate + 0.01, f'{rate:.2%}',
                ha='center', va='bottom', fontweight='bold')


fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Tenure Group analysis
tenure_churn = df_engineered.groupby('TenureGroup')['Churn'].mean()
_plot_churn_rate_bars(axes[0, 0], tenure_churn, tenure_churn.index.astype(str),
                      'Churn Rate by Tenure Group', '#2E86AB')

# Monthly Spend Category analysis
spend_churn = df_engineered.groupby('MonthlySpendCategory')['Churn'].mean()
_plot_churn_rate_bars(axes[0, 1], spend_churn, spend_churn.index.astype(str),
                      'Churn Rate by Monthly Spend Category', '#A23B72')

# Avg Charge Per Tenure
axes[1, 0].scatter(df_engineered['AvgChargePerTenure'], df_engineered['Churn'],
                   alpha=0.5, color='#F18F01')
axes[1, 0].set_xlabel('Average Charge Per Tenure Month')
axes[1, 0].set_ylabel('Churn (0=No, 1=Yes)')
axes[1, 0].set_title('Churn vs Average Monthly Charge', fontweight='bold')

# Has Multiple Services
service_churn = df_engineered.groupby('HasMultipleServices')['Churn'].mean()
# Derive the tick labels from the actual group keys instead of assuming that
# exactly two groups exist in a fixed order; a missing group would otherwise
# mislabel the remaining bar or raise a shape-mismatch error.
# Assumes the flag is encoded as 0/1 (or False/True); any other key falls back
# to its string form.
service_label_by_flag = {0: 'No Multiple Services', 1: 'Has Multiple Services'}
service_labels = [service_label_by_flag.get(flag, str(flag)) for flag in service_churn.index]
_plot_churn_rate_bars(axes[1, 1], service_churn, service_labels,
                      'Churn Rate by Multiple Services', '#C73E1D')

plt.tight_layout()
plt.show()

# %% [markdown]
# ## 3. Feature Correlation Analysis

# %%
# Correlate the raw numeric columns, the two numeric engineered columns and the
# target with each other
numerical_features_extended = [*numerical_features, 'AvgChargePerTenure', 'HasMultipleServices']
correlation_columns = [*numerical_features_extended, 'Churn']
correlation_matrix = df_engineered[correlation_columns].corr()

plt.figure(figsize=(10, 8))
# heatmap draws on the current axes and returns it, so the title can be set on
# the returned object
heatmap_ax = sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
)
heatmap_ax.set_title('Feature Correlation Matrix', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# %% [markdown]
# ## 4. Prepare Data for Modeling

# %%
# Separate the target column from the predictor columns
y = df_engineered['Churn']
X = df_engineered.drop(columns=['Churn'])

print("Feature set shape:", X.shape)
print("Target variable distribution:")
print(y.value_counts())

# The mean of the target column is reported as the churn rate
churn_rate = y.mean()
print("\nChurn rate:", f"{churn_rate:.2%}")

# %%
# Column groups handed to the preprocessing step.
# NOTE: this rebinds `categorical_features` and `numerical_features`, which
# earlier cells defined with shorter lists for plotting.
categorical_features = (
    # categorical columns already present in the cleaned data
    ['gender', 'Partner', 'Dependents', 'PhoneService',
     'MultipleLines', 'InternetService', 'OnlineSecurity',
     'OnlineBackup', 'DeviceProtection', 'TechSupport',
     'StreamingTV', 'StreamingMovies', 'Contract',
     'PaperlessBilling', 'PaymentMethod']
    # engineered categorical columns
    + ['TenureGroup', 'MonthlySpendCategory']
)

numerical_features = (
    # raw numeric columns
    ['tenure', 'MonthlyCharges', 'TotalCharges']
    # engineered numeric columns
    + ['AvgChargePerTenure', 'HasMultipleServices']
)

print("Categorical features:", len(categorical_features))
print("Numerical features:", len(numerical_features))

# %%
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones (first level of each category dropped).
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = (
    'cat',
    OneHotEncoder(drop='first', handle_unknown='ignore'),
    categorical_features,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Fit on X and transform it in one pass.
# NOTE(review): the transformers are fitted on every row of X, before the
# train/test split performed further down - confirm this is acceptable for the
# modeling phase.
X_processed = preprocessor.fit_transform(X)

# Recover the column names produced by the one-hot encoder
cat_encoder = preprocessor.named_transformers_['cat']
cat_feature_names = cat_encoder.get_feature_names_out(categorical_features)

# Numeric names come first because the 'num' step is listed first above
all_feature_names = [*numerical_features, *cat_feature_names]

print(f"Original features: {X.shape[1]}")
print(f"After preprocessing: {X_processed.shape[1]}")
print(f"Final feature names: {len(all_feature_names)}")

# %%
# Display some of the transformed features.
# The "importance" values here are random placeholders, not model output. They
# are drawn from a seeded local generator so the table is identical on every
# Restart & Run All and the global NumPy random state is left untouched.
_placeholder_rng = np.random.default_rng(42)
feature_importance_placeholder = pd.DataFrame({
    'feature': all_feature_names,
    'importance_placeholder': _placeholder_rng.random(len(all_feature_names))
}).sort_values('importance_placeholder', ascending=False)

print("Top 20 features (placeholder for model importance):")
feature_importance_placeholder.head(20)

# %% [markdown]
# ## 5. Feature Importance Analysis (Preliminary)

# %%
# Preliminary look at feature importance using a default-ish Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows, stratified on the target; the held-out part is not
# used further in this cell.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# fit() returns the estimator itself, so construction and training can be chained
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Pair each encoded column name with its importance, most important first
importance_table = pd.DataFrame({
    'feature': all_feature_names,
    'importance': rf.feature_importances_,
})
feature_importance = importance_table.sort_values('importance', ascending=False)

# Horizontal bar chart of the 20 highest-ranked columns
top_features = feature_importance.head(20)
plt.figure(figsize=(12, 8))
importance_ax = sns.barplot(data=top_features, x='importance', y='feature', palette='viridis')
importance_ax.set_title('Top 20 Most Important Features (Random Forest)', fontsize=16, fontweight='bold')
importance_ax.set_xlabel('Feature Importance')
plt.tight_layout()
plt.show()

# %%
# Show the ten highest-ranked rows as text
print("Top 10 most important features:")
print(top_features.head(10))

# %% [markdown]
# ## 6. Save Processed Data

# %%
# Save the processed data for modeling
import joblib
from pathlib import Path

# Bundle the transformed matrix, the target, the encoded column names and the
# fitted preprocessor into a single object.
processed_data = {
    'X_processed': X_processed,
    'y': y,
    'feature_names': all_feature_names,
    'preprocessor': preprocessor
}

# The path is relative to the kernel's working directory. joblib.dump does not
# create missing folders, so make sure the target directory exists first;
# otherwise a fresh checkout fails with FileNotFoundError.
PROCESSED_DATA_FILE = 'data/processed/processed_data.pkl'
Path(PROCESSED_DATA_FILE).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(processed_data, PROCESSED_DATA_FILE)
print("Processed data saved successfully!")

# %%
# Summary of feature engineering
print("=== FEATURE ENGINEERING SUMMARY ===")

# Column count of the frame held by the processor
original_column_count = processor.df.shape[1]
print(f"📊 Original dataset: {original_column_count} features")

engineered_count = len(new_features)
print(f"🔧 Engineered features: {engineered_count} new features created")

encoded_column_count = X_processed.shape[1]
print(f"🎯 Final feature set: {encoded_column_count} features after encoding")

# The lines below are static text; they are not computed from the data
print("📈 Key insights:")
for insight_line in (
    "   - Tenure Group shows clear churn patterns",
    "   - Monthly Spend Category correlates with churn",
    "   - Customers with multiple services have lower churn",
):
    print(insight_line)
print("💾 Data saved for modeling phase")

# %%
# Pointer to the follow-up notebook
print("\nNext: Proceed to 03_model_training.ipynb for model development and optimization.")

ModuleNotFoundError: No module named 'src'