# Model Training Notebook

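This notebook is a worked example of training machine learning models with the AI/ML Project Template: it loads processed data, trains and compares two classifiers, inspects the best one, saves it, and optionally tunes hyperparameters. Run the cells from top to bottom on a fresh kernel.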

In [None]:
# Import required libraries
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score

# Import project modules
import sys
sys.path.append('../src')

from data.preprocessing import preprocess_data
from models.training import train_model
from models.evaluation import evaluate_model

# Load configuration
# The path is resolved relative to the kernel's working directory.
CONFIG_PATH = '../config/development.yaml'

# Pass the encoding explicitly so the read does not depend on the platform's
# locale default (YAML files are conventionally UTF-8).
with open(CONFIG_PATH, 'r', encoding='utf-8') as config_file:
    # safe_load returns None for an empty document; fall back to an empty
    # mapping so later lookups on `config` do not trip over None.
    config = yaml.safe_load(config_file) or {}

print('Configuration loaded successfully!')

In [None]:
# Load and preprocess data
# Replace these two placeholders with the actual path to your processed data
# file and the name of your target column. They are defined once here so the
# feature/target split below cannot get out of sync.
DATA_PATH = '../data/processed/your_processed_data.csv'
TARGET_COLUMN = 'target_column'

df = pd.read_csv(DATA_PATH)

# Fail early with an actionable message if the placeholder was not replaced
# (or the column is misspelled) instead of a bare KeyError from pandas.
if TARGET_COLUMN not in df.columns:
    raise KeyError(
        f'Target column {TARGET_COLUMN!r} not found in {DATA_PATH}; '
        f'available columns: {list(df.columns)}'
    )

# Separate features and target
X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]

# Split data into training and testing sets
# 80/20 split; the fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f'Training set size: {X_train.shape}')
print(f'Test set size: {X_test.shape}')

In [None]:
# Initialize models
# Both estimators are seeded so repeated runs produce the same fit.
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42)
}

# Train and evaluate models
# `results` maps model name -> fitted estimator plus its hold-out accuracy and
# 5-fold cross-validation mean/std; later cells read it to pick the best model.
results = {}

for name, model in models.items():
    print(f'\nTraining {name}...')

    # Train model on the full training split
    model.fit(X_train, y_train)

    # Make predictions on the held-out test split
    y_pred = model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Cross-validation on the training split only, so the test split stays unseen.
    # cross_val_score fits clones of the estimator, leaving `model` as fitted above.
    cv_scores = cross_val_score(model, X_train, y_train, cv=5)
    cv_mean = cv_scores.mean()
    cv_std = cv_scores.std()

    # Store results
    results[name] = {
        'model': model,
        'accuracy': accuracy,
        'cv_mean': cv_mean,
        'cv_std': cv_std
    }

    print(f'{name} Accuracy: {accuracy:.4f}')
    # The spread is reported as two standard deviations around the CV mean.
    print(f'{name} CV Score: {cv_mean:.4f} (+/- {cv_std * 2:.4f})')

In [None]:
# Compare model performance
# Guard against running this cell before any model has been trained; without
# it the selection below would fail with a far less helpful error.
if not results:
    raise ValueError('No trained models to compare; run the training cell first.')

print('\nModel Comparison:')
print('================')

for name, result in results.items():
    print(f'{name}:')
    print(f'  Accuracy: {result["accuracy"]:.4f}')
    print(f'  CV Score: {result["cv_mean"]:.4f} (+/- {result["cv_std"] * 2:.4f})')

# Pick the entry with the highest test accuracy. max() returns the first of
# several tied entries, so ties resolve to the earliest-trained model. Unlike a
# running "> 0" comparison, this also selects a model whose accuracy is 0.0, so
# best_model / best_model_name are always defined for the cells below.
best_model_name, best_result = max(
    results.items(), key=lambda item: item[1]['accuracy']
)
best_model = best_result['model']
best_accuracy = best_result['accuracy']

print(f'\nBest Model: {best_model_name} with accuracy {best_accuracy:.4f}')

In [None]:
# Detailed evaluation of the best model
# Re-predict on the test split with the selected estimator so the report and
# the confusion matrix below are guaranteed to describe the same predictions.
best_y_pred = best_model.predict(X_test)

print(f'\nDetailed Evaluation for {best_model_name}:')
print('====================================')

# Per-class precision, recall, F1 and support
print('\nClassification Report:')
print(classification_report(y_test, best_y_pred))

# Rows are true labels, columns are predicted labels
print('\nConfusion Matrix:')
print(confusion_matrix(y_test, best_y_pred))

In [None]:
# Feature importance (for tree-based models)
# Only estimators that expose `feature_importances_` are handled; for any other
# best model this cell does nothing.
if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)

    # Slice the top 10 once and reuse it for both the table and the chart.
    top_features = feature_importance.head(10)

    print('\nFeature Importance:')
    print(top_features)

    # Plot feature importance
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(top_features['feature'], top_features['importance'])
    ax.set_xlabel('Importance')
    ax.set_title('Top 10 Feature Importances')
    # barh draws the first row at the bottom; flip so the most important
    # feature appears at the top of the chart.
    ax.invert_yaxis()
    plt.show()

In [None]:
# Save the best model
OUTPUT_DIR = '../models'

# joblib.dump and DataFrame.to_csv do not create missing directories, so make
# sure the output folder exists before writing anything into it.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# File name is the model's display name lower-cased with spaces as underscores,
# e.g. 'Random Forest' -> 'random_forest_model.pkl'.
model_slug = best_model_name.lower().replace(' ', '_')
model_path = f'{OUTPUT_DIR}/{model_slug}_model.pkl'

# NOTE: the file is a pickle; only load it back from a trusted location.
joblib.dump(best_model, model_path)

print(f'\nBest model saved to: {model_path}')

# Save results to a file
# One row per model; the fitted estimator itself is left out of the table.
results_df = pd.DataFrame([
    {
        'model': name,
        'accuracy': result['accuracy'],
        'cv_mean': result['cv_mean'],
        'cv_std': result['cv_std']
    }
    for name, result in results.items()
])

results_path = f'{OUTPUT_DIR}/training_results.csv'
results_df.to_csv(results_path, index=False)
print(f'Training results saved to: {results_path}')

In [None]:
# Hyperparameter tuning example (optional)
print('\nPerforming hyperparameter tuning for Random Forest...')

# Define parameter grid
# 3 x 3 x 3 = 27 candidate combinations; with cv=5 below that is 135 fits,
# so this cell can take a while on larger datasets.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1  # run the candidate fits in parallel on all available cores
)

# Fit GridSearchCV on the training split only; the test split stays untouched
# until the final check below.
grid_search.fit(X_train, y_train)

# Get best parameters and score
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best cross-validation score: {grid_search.best_score_:.4f}')

# Evaluate best model on the held-out test split
best_tuned_model = grid_search.best_estimator_
best_tuned_pred = best_tuned_model.predict(X_test)
tuned_accuracy = accuracy_score(y_test, best_tuned_pred)

print(f'Tuned model test accuracy: {tuned_accuracy:.4f}')

In [None]:
# Final summary of the run
print('\nModel training completed!')
print('=========================')
print(f'Best model: {best_model_name}')
print(f'Best accuracy: {best_accuracy:.4f}')

# The tuning cell is optional; only report its result if that cell was run
# (at the top level of a cell, locals() is the notebook's global namespace).
if 'tuned_accuracy' in locals():
    print(f'Tuned model accuracy: {tuned_accuracy:.4f}')