
## 💡 **Why the `telco_analysis/` Package Structure is Better**

**✅ Proper Python Package**: Creates a true importable package with `__init__.py`

**✅ Namespace Management**: All your code lives under `telco_analysis.module_name`

**✅ Future-Proof**: Easily extensible for multiple analysis types

**✅ Professional Standard**: Industry best practice for Python projects

**✅ Testing-Friendly**: Pytest can easily discover and import your package

**✅ Distribution-Ready**: Can be packaged and installed with pip later

### **Don't Forget the `__init__.py` Files!**

**`src/telco_analysis/__init__.py`**:
```python
"""
Telco Customer Churn Analysis Package

A comprehensive toolkit for analyzing customer churn in telecommunications data.
"""

__version__ = "0.1.0"
__author__ = "Your Name"

# Import key functions for easy access
from .data_utils import load_raw_data, validate_data_schema
from .preprocessing import clean_total_charges, prepare_features_target
from .model_utils import create_baseline_model, train_model, evaluate_model

# Public API: the names exported by `from telco_analysis import *`.
# Each entry must match a name imported above; a listed name that is not
# defined in this module makes the star-import raise AttributeError.
__all__ = [
    'load_raw_data', 
    'validate_data_schema',
    'clean_total_charges',
    'prepare_features_target', 
    'create_baseline_model',
    'train_model',
    'evaluate_model'
]
```

This allows cleaner imports later:
```python
from telco_analysis import load_raw_data, create_baseline_model
```

Instead of:
```python  
from telco_analysis.data_utils import load_raw_data
from telco_analysis.model_utils import create_baseline_model
```



---

### **`src/telco_analysis/data_utils.py`**
```python
"""
Data loading and basic utilities for the telco churn project.
"""

import pandas as pd
import numpy as np
from pathlib import Path
import logging

# Setup logging
# NOTE: basicConfig runs as an import-time side effect and configures the
# root logger at INFO level (it is a no-op if the root logger already has
# handlers). The module-level logger below is named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def load_raw_data(filepath='data/raw/telco_customer_churn.csv'):
    """
    Read the raw telco churn CSV into a dataframe.

    Parameters:
    -----------
    filepath : str
        Location of the CSV file to read

    Returns:
    --------
    pd.DataFrame
        The dataset exactly as parsed by ``pd.read_csv``

    Raises:
    -------
    FileNotFoundError
        If ``filepath`` does not exist (logged, then re-raised)
    Exception
        Any other read/parse failure (logged, then re-raised unchanged)
    """
    try:
        df = pd.read_csv(filepath)
    except FileNotFoundError:
        # Missing file gets its own message so the path is easy to spot.
        logger.error(f"File not found: {filepath}")
        raise
    except Exception as e:
        # Log for visibility, but never swallow: the caller decides what to do.
        logger.error(f"Error loading data: {str(e)}")
        raise
    else:
        logger.info(f"Data loaded successfully. Shape: {df.shape}")
        return df

def validate_data_schema(df, expected_columns=None):
    """
    Check that every expected column is present in the dataframe.

    Extra columns in ``df`` are ignored; only missing ones cause a failure.

    Parameters:
    -----------
    df : pd.DataFrame
        Dataset to check
    expected_columns : list, optional
        Column names that must be present. When omitted, the 21-column
        telco churn layout listed below is used.

    Returns:
    --------
    bool
        True when no expected column is missing, False otherwise
        (the missing names are logged at ERROR level)
    """
    if expected_columns is None:
        expected_columns = [
            'customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
            'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
            'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
            'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
            'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'
        ]

    # Set difference: whatever is expected but not among the actual columns.
    missing_cols = set(expected_columns).difference(df.columns)

    if not missing_cols:
        logger.info("Data schema validation passed")
        return True

    logger.error(f"Missing columns: {missing_cols}")
    return False

def save_processed_data(df, filename, folder='data/processed'):
    """
    Write a dataframe to ``<folder>/<filename>`` as CSV, without the index.

    The output folder (including any missing parents) is created first, and
    an existing file with the same name is overwritten.

    Parameters:
    -----------
    df : pd.DataFrame
        Dataset to write
    filename : str
        File name of the CSV inside ``folder``
    folder : str
        Directory that receives the file
    """
    output_dir = Path(folder)
    output_dir.mkdir(parents=True, exist_ok=True)

    filepath = output_dir / filename
    df.to_csv(filepath, index=False)
    logger.info(f"Data saved to: {filepath}")
```

### **`src/telco_analysis/preprocessing.py`**
```python
"""
Data preprocessing functions for the telco churn project.
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import logging

# Module-level logger named after this module; no handlers are configured here.
logger = logging.getLogger(__name__)

def clean_total_charges(df):
    """
    Return a copy of the dataframe with a numeric TotalCharges column.

    Values that cannot be parsed as numbers become NaN and are then replaced
    by the same row's MonthlyCharges. The input dataframe is not modified.

    Parameters:
    -----------
    df : pd.DataFrame
        Raw dataframe containing 'TotalCharges' and 'MonthlyCharges'

    Returns:
    --------
    pd.DataFrame
        New dataframe with TotalCharges converted and gaps filled
    """
    # assign() works on a copy, so the caller's dataframe stays untouched.
    df = df.assign(
        TotalCharges=pd.to_numeric(df['TotalCharges'], errors='coerce')
    )

    # Unparseable entries are presumably new customers (tenure=0) with no
    # total billed yet; one month's charge is used as the stand-in value.
    mask = df['TotalCharges'].isna()
    fallback = df.loc[mask, 'MonthlyCharges']
    df.loc[mask, 'TotalCharges'] = fallback

    logger.info(f"Cleaned TotalCharges column. Fixed {mask.sum()} missing values.")
    return df

def prepare_features_target(df):
    """
    Split a cleaned dataframe into features and a binary churn target.

    Parameters:
    -----------
    df : pd.DataFrame
        Cleaned dataframe containing 'customerID' and 'Churn'

    Returns:
    --------
    tuple
        (X, y, numeric_features, categorical_features) where X is ``df``
        without the id and target columns, y is 1 for Churn == 'Yes' and 0
        otherwise, and the two lists name the numeric / categorical columns
    """
    # The id carries no signal and the target must not leak into the features.
    features = df.drop(columns=['customerID', 'Churn'])
    target = df['Churn'].eq('Yes').astype(int)

    # The three continuous columns are always listed; SeniorCitizen is
    # already a 0/1 flag, so it joins the numeric group when present.
    numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
    if 'SeniorCitizen' in features.columns:
        numeric_features = numeric_features + ['SeniorCitizen']

    # Everything that is not numeric is treated as categorical.
    categorical_features = [
        name for name in features.columns
        if name not in numeric_features
    ]

    logger.info(f"Features prepared. Numeric: {len(numeric_features)}, Categorical: {len(categorical_features)}")
    return features, target, numeric_features, categorical_features

def create_preprocessing_pipeline(numeric_features, categorical_features):
    """
    Build the column-wise preprocessor used ahead of the classifier.

    Numeric columns are standard-scaled and categorical columns are one-hot
    encoded. Columns named in neither list are handled by ColumnTransformer's
    default ``remainder`` setting.

    Parameters:
    -----------
    numeric_features : list
        Names of the columns to scale
    categorical_features : list
        Names of the columns to one-hot encode

    Returns:
    --------
    sklearn.compose.ColumnTransformer
        Unfitted preprocessor with a 'num' and a 'cat' transformer
    """
    # drop='first' removes one indicator per feature (avoids multicollinearity);
    # handle_unknown='ignore' keeps transform() from failing on unseen values.
    # NOTE(review): I believe this combination is only accepted by newer
    # scikit-learn releases, and that an unseen category is then encoded as
    # all zeros, i.e. the same vector as the dropped first category.
    # Confirm this is intended for the installed version.
    encoder = OneHotEncoder(
        drop='first',
        handle_unknown='ignore'
    )

    column_steps = [
        ('num', StandardScaler(), numeric_features),
        ('cat', encoder, categorical_features),
    ]
    column_transformer = ColumnTransformer(transformers=column_steps)

    logger.info("Preprocessing pipeline created")
    return column_transformer

def split_data(X, y, test_size=0.2, random_state=42):
    """
    Make a stratified train/test split of features and target.

    Parameters:
    -----------
    X : pd.DataFrame
        Features
    y : pd.Series
        Target variable; also used as the stratification key
    test_size : float
        Proportion of rows held out for testing
    random_state : int
        Seed that makes the split reproducible

    Returns:
    --------
    tuple
        (X_train, X_test, y_train, y_test)
    """
    # Stratifying on y keeps the churn rate comparable in both partitions.
    split_options = {
        'test_size': test_size,
        'random_state': random_state,
        'stratify': y,
    }
    X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

    logger.info(f"Data split completed. Train: {len(X_train)}, Test: {len(X_test)}")
    logger.info(f"Train churn rate: {y_train.mean():.3f}, Test churn rate: {y_test.mean():.3f}")

    return X_train, X_test, y_train, y_test
```

### **`src/telco_analysis/model_utils.py`**
```python
"""
Model training and evaluation utilities.
"""

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    roc_auc_score, average_precision_score, 
    classification_report, confusion_matrix
)
import joblib
import json
from pathlib import Path
import logging

# Module-level logger named after this module; no handlers are configured here.
logger = logging.getLogger(__name__)

def create_baseline_model(preprocessor, random_state=42):
    """
    Assemble the baseline pipeline: preprocessing, then logistic regression.

    Parameters:
    -----------
    preprocessor : sklearn.compose.ColumnTransformer
        Preprocessing step placed in front of the classifier
    random_state : int
        Seed passed to the classifier

    Returns:
    --------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps 'preprocessor' and 'classifier'
    """
    classifier = LogisticRegression(
        random_state=random_state,
        max_iter=1000
    )
    steps = [
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ]
    pipeline = Pipeline(steps)

    logger.info("Baseline model pipeline created")
    return pipeline

def train_model(model, X_train, y_train):
    """
    Fit the given pipeline on the training data.

    Parameters:
    -----------
    model : sklearn.pipeline.Pipeline
        Pipeline to fit; it is fitted in place
    X_train : pd.DataFrame
        Training features
    y_train : pd.Series
        Training target

    Returns:
    --------
    sklearn.pipeline.Pipeline
        The same ``model`` object that was passed in, now fitted
    """
    # fit() mutates the model; its return value is deliberately not used,
    # so the caller always gets back the object it handed in.
    model.fit(X_train, y_train)

    logger.info("Model training completed")
    return model

def evaluate_model(model, X_test, y_test):
    """
    Score a fitted model on held-out data.

    Parameters:
    -----------
    model : sklearn.pipeline.Pipeline
        Fitted model exposing ``predict`` and ``predict_proba``
    X_test : pd.DataFrame
        Test features
    y_test : pd.Series
        Test target

    Returns:
    --------
    dict
        Keys 'roc_auc', 'pr_auc', 'classification_report' (as a dict) and
        'confusion_matrix' (as a nested list)
    """
    # Hard labels feed the report and confusion matrix; the ranking metrics
    # use column 1 of predict_proba (assumed to be the positive/churn class).
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    roc_auc = roc_auc_score(y_test, y_pred_proba)
    pr_auc = average_precision_score(y_test, y_pred_proba)
    report = classification_report(y_test, y_pred, output_dict=True)
    # tolist() converts the array into plain nested lists.
    matrix = confusion_matrix(y_test, y_pred).tolist()

    results = dict(
        roc_auc=roc_auc,
        pr_auc=pr_auc,
        classification_report=report,
        confusion_matrix=matrix,
    )

    logger.info(f"Model evaluation completed. ROC-AUC: {roc_auc:.3f}, PR-AUC: {pr_auc:.3f}")
    return results

def save_model_artifacts(model, results, model_path='models/baseline_pipeline.joblib', 
                        metadata_path='models/model_metadata.json'):
    """
    Persist a trained model and its evaluation results to disk.

    The model is written with joblib and the results as indented JSON.
    Parent directories of both paths are created when missing.

    Parameters:
    -----------
    model : sklearn.pipeline.Pipeline
        Trained model to save
    results : dict
        Evaluation results; must be JSON-serializable
    model_path : str
        Destination file for the model
    metadata_path : str
        Destination file for the JSON metadata
    """
    # Ensure both destination folders exist before anything is written.
    for destination in (model_path, metadata_path):
        Path(destination).parent.mkdir(parents=True, exist_ok=True)

    # Model first ...
    joblib.dump(model, model_path)
    logger.info(f"Model saved to: {model_path}")

    # ... then the metrics that describe it.
    with open(metadata_path, 'w') as metadata_file:
        json.dump(results, metadata_file, indent=2)
    logger.info(f"Model metadata saved to: {metadata_path}")

def load_model(model_path='models/baseline_pipeline.joblib'):
    """
    Load a previously saved model pipeline from disk.

    Parameters:
    -----------
    model_path : str
        Path of the joblib file to load

    Returns:
    --------
    sklearn.pipeline.Pipeline
        The deserialized model pipeline
    """
    # NOTE(review): joblib.load is pickle-based as far as I know, so loading
    # a file can execute arbitrary code. Only load artifacts from trusted
    # sources.
    pipeline = joblib.load(model_path)
    logger.info(f"Model loaded from: {model_path}")
    return pipeline
```

### **`src/telco_analysis/config.py`** (Alternative to separate configs/ folder)
```python
"""
Configuration settings for the telco churn model.
"""

# Locations of the raw input and the processed train/test CSVs
DATA_PATHS = dict(
    raw_data='data/raw/telco_customer_churn.csv',
    processed_train='data/processed/train_clean.csv',
    processed_test='data/processed/test_clean.csv',
)

# Where the fitted pipeline and its metrics are stored
MODEL_PATHS = dict(
    baseline_model='models/baseline_pipeline.joblib',
    model_metadata='models/model_metadata.json',
)

# Split and estimator settings.
# Note: because of the nested 'logistic_regression' entry, this dict cannot be
# unpacked with ** into a function that only accepts test_size/random_state.
MODEL_CONFIG = dict(
    test_size=0.2,
    random_state=42,
    logistic_regression=dict(
        max_iter=1000,
        random_state=42,
    ),
)

# Column roles in the dataset
FEATURE_CONFIG = dict(
    numeric_features=['tenure', 'MonthlyCharges', 'TotalCharges', 'SeniorCitizen'],
    categorical_features=[
        'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
        'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
        'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
        'PaperlessBilling', 'PaymentMethod',
    ],
    target_column='Churn',
    id_column='customerID',
)

# Acceptance thresholds for a trained model
VALIDATION_CONFIG = dict(
    min_roc_auc=0.70,  # Minimum acceptable ROC-AUC
    min_pr_auc=0.50,   # Minimum acceptable PR-AUC
    max_train_test_diff=0.05,  # Max difference in performance between train/test
)
```

### **`04_baseline_model.ipynb` Structure**
```python
# Cell 1: Setup and Imports
import sys
sys.path.append('src')

# Now import from your package
from telco_analysis.data_utils import load_raw_data, validate_data_schema, save_processed_data
from telco_analysis.preprocessing import clean_total_charges, prepare_features_target, create_preprocessing_pipeline, split_data
from telco_analysis.model_utils import create_baseline_model, train_model, evaluate_model, save_model_artifacts
from telco_analysis.config import DATA_PATHS, MODEL_PATHS, MODEL_CONFIG, FEATURE_CONFIG

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Cell 2: Load and Prepare Data
df = load_raw_data(DATA_PATHS['raw_data'])
# validate_data_schema only returns False on failure; stop here instead of
# letting a later cell fail with a less obvious KeyError.
if not validate_data_schema(df):
    raise ValueError(
        "Raw data failed schema validation; see the logged 'Missing columns' error"
    )
df_clean = clean_total_charges(df)

# Cell 3: Feature Engineering and Splitting
X, y, numeric_features, categorical_features = prepare_features_target(df_clean)
# Pass only the settings split_data accepts. MODEL_CONFIG also holds a nested
# 'logistic_regression' dict, so split_data(X, y, **MODEL_CONFIG) would raise
# TypeError (unexpected keyword argument).
X_train, X_test, y_train, y_test = split_data(
    X, y,
    test_size=MODEL_CONFIG['test_size'],
    random_state=MODEL_CONFIG['random_state'],
)

# Cell 4: Create and Train Model
preprocessor = create_preprocessing_pipeline(numeric_features, categorical_features)
model = create_baseline_model(preprocessor, MODEL_CONFIG['random_state'])
model = train_model(model, X_train, y_train)

# Cell 5: Evaluate and Save
results = evaluate_model(model, X_test, y_test)
save_model_artifacts(model, results, MODEL_PATHS['baseline_model'], MODEL_PATHS['model_metadata'])

# Cell 6: Display Results
print(f"ROC-AUC Score: {results['roc_auc']:.3f}")
print(f"PR-AUC Score: {results['pr_auc']:.3f}")
```
