In [1]:
%pwd

'/Users/a/Documents/DataScience_World/ML10_end_to_end/dsproject/CompleteDSproject/research'

In [2]:
import os
os.chdir("../")
%pwd


'/Users/a/Documents/DataScience_World/ML10_end_to_end/dsproject/CompleteDSproject'

In [3]:
from dataclasses import dataclass
from pathlib import Path
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
from src.datascience import logger
from src.datascience.utils.common import read_yaml, create_directories, save_json
from ensure import ensure_annotations
from box import ConfigBox

@dataclass
class ModelTrainerConfig:
    """Settings for the model-training stage.

    Instances are built by ``ConfigurationManager.get_model_trainer_config``
    and read by ``ModelTrainer``.
    """
    # Output directory of the stage; ModelTrainer writes "metrics.json" here.
    root_dir: Path
    # CSV file that ModelTrainer loads with pandas.read_csv.
    train_data_path: Path
    # Destination file for the fitted model (written with joblib.dump).
    model_path: Path
    # Name of the label column; dropped from the features and used as y.
    target_column: str
    # Passed as `test_size` to train_test_split, i.e. it sizes the held-out
    # validation split, not the training split.
    train_test_ratio: float
    # Seed passed to both train_test_split and RandomForestClassifier.
    random_state: int
    # Keyword arguments unpacked into RandomForestClassifier(**model_params).
    model_params: dict

In [4]:
from src.datascience.constants import *
from src.datascience.utils.common import read_yaml, create_directories

In [5]:
from dataclasses import dataclass
from pathlib import Path

class ConfigurationManager:
    """Load the project's YAML files and build per-stage configuration objects.

    This is a plain class on purpose: it declares no dataclass fields and has
    its own ``__init__``, so a ``@dataclass`` decorator would only add a
    field-less ``__eq__`` (every instance comparing equal to every other),
    set ``__hash__`` to ``None`` and generate a misleading empty ``__repr__``.
    """

    def __init__(
        self,
        config_filepath: Path = Path("config/config.yaml"),
        params_filepath: Path = Path("params.yaml"),
        schema_filepath: Path = Path("schema.yaml")
    ):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.

        The default paths are relative, so they resolve against the current
        working directory.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Assemble the configuration for the model-training stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            ModelTrainerConfig: populated from the ``model_trainer`` section of
            the main config, with the training data path taken from the
            ``data_transformation`` section.
        """
        config = self.config.model_trainer

        # Hyperparameters are read from the `model_trainer.model_params`
        # section of the main config; `self.params` is not consulted here.
        # Only these four keys are forwarded to the classifier.
        model_params = {
            "n_estimators": config.model_params.n_estimators,
            "max_depth": config.model_params.max_depth,
            "min_samples_split": config.model_params.min_samples_split,
            "min_samples_leaf": config.model_params.min_samples_leaf
        }

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=Path(config.root_dir),
            # Training consumes the output of the data-transformation stage.
            train_data_path=Path(self.config.data_transformation.transformed_data_path),
            model_path=Path(config.root_dir) / "model.joblib",
            target_column=config.target_column,
            train_test_ratio=config.train_test_ratio,
            random_state=config.random_state,
            model_params=model_params
        )

        return model_trainer_config

In [6]:
class ModelTrainer:
    """Train a random-forest classifier and persist the model and its metrics."""

    def __init__(self, config: ModelTrainerConfig):
        """Keep the training configuration for later use by ``train``.

        Args:
            config: Paths, split settings and hyperparameters for the run.
        """
        self.config = config

    def train(self):
        """Fit the classifier, evaluate it on a held-out split and save results.

        Steps: read the CSV at ``config.train_data_path``, separate the target
        column, split into train/validation sets, fit a
        ``RandomForestClassifier``, compute validation metrics and feature
        importances, then write ``metrics.json`` under ``config.root_dir`` and
        dump the fitted model to ``config.model_path``.

        Returns:
            dict: with keys ``"accuracy"``, ``"classification_report"``,
            ``"confusion_matrix"`` and ``"feature_importance"``.

        Raises:
            Exception: any failure is logged and re-raised unchanged.
        """
        try:
            # Load transformed data
            logger.info("Loading transformed data")
            data = pd.read_csv(self.config.train_data_path)

            # Split features and target (`columns=` already selects the axis,
            # so no separate `axis` argument is needed).
            X = data.drop(columns=[self.config.target_column])
            y = data[self.config.target_column]

            # Split data into training and validation sets;
            # `train_test_ratio` is used as the validation (test) size.
            X_train, X_val, y_train, y_val = train_test_split(
                X, y,
                test_size=self.config.train_test_ratio,
                random_state=self.config.random_state
            )

            logger.info("Training Random Forest model")
            rf_classifier = RandomForestClassifier(
                **self.config.model_params,
                random_state=self.config.random_state
            )

            rf_classifier.fit(X_train, y_train)

            # Make predictions on validation set
            logger.info("Making predictions on validation set")
            y_pred = rf_classifier.predict(X_val)

            # Calculate metrics
            accuracy = accuracy_score(y_val, y_pred)
            classification_rep = classification_report(y_val, y_pred, output_dict=True)
            conf_matrix = confusion_matrix(y_val, y_pred)

            # Collect metrics in a JSON-friendly form
            metrics = {
                "accuracy": float(accuracy),  # Convert numpy float to Python float
                "classification_report": classification_rep,
                "confusion_matrix": conf_matrix.tolist()
            }

            # Feature importances, most important first
            feature_importance = pd.DataFrame({
                'feature': X.columns,
                'importance': rf_classifier.feature_importances_
            }).sort_values('importance', ascending=False)

            metrics['feature_importance'] = feature_importance.to_dict('records')

            # Create output directories if they don't exist. `model_path` is a
            # separate config field, so its folder is ensured independently of
            # `root_dir`.
            os.makedirs(self.config.root_dir, exist_ok=True)
            Path(self.config.model_path).parent.mkdir(parents=True, exist_ok=True)

            # Save metrics and model
            metrics_path = Path(self.config.root_dir) / "metrics.json"
            save_json(metrics_path, metrics)

            joblib.dump(rf_classifier, self.config.model_path)

            logger.info(f"Model training completed. Accuracy: {accuracy:.4f}")

            return metrics

        except Exception as e:
            logger.error(f"Error in model training: {str(e)}")
            # Bare `raise` re-raises the active exception as-is.
            raise



In [7]:
# Run the model-training stage end to end: load configuration, build the
# trainer, train, and report the validation accuracy.
try:
    logger.info("Starting model training pipeline")
    config_manager = ConfigurationManager()
    model_trainer_config = config_manager.get_model_trainer_config()
    model_trainer = ModelTrainer(model_trainer_config)

    metrics = model_trainer.train()
    logger.info("Model training completed successfully")
    logger.info(f"Model accuracy: {metrics['accuracy']:.4f}")

except Exception as e:
    logger.error(f"Error in model training: {str(e)}")
    # Bare `raise` re-raises the active exception as-is so the cell still fails.
    raise


[2025-01-08 14:29:11,040: INFO: 2670371872: Starting model training pipeline]
[2025-01-08 14:29:11,105: INFO: common: YAML file loaded successfully from: config/config.yaml]
[2025-01-08 14:29:11,109: INFO: common: YAML file loaded successfully from: params.yaml]
[2025-01-08 14:29:11,113: INFO: common: YAML file loaded successfully from: schema.yaml]
[2025-01-08 14:29:11,114: INFO: common: Created directory at: artifacts]
[2025-01-08 14:29:11,115: INFO: common: Created directory at: artifacts/model_trainer]
[2025-01-08 14:29:11,116: INFO: 3941991833: Loading transformed data]
[2025-01-08 14:29:11,139: INFO: 3941991833: Training Random Forest model]
[2025-01-08 14:29:11,356: INFO: 3941991833: Making predictions on validation set]
[2025-01-08 14:29:11,389: INFO: common: JSON file saved successfully at: artifacts/model_trainer/metrics.json]
[2025-01-08 14:29:11,457: INFO: 3941991833: Model training completed. Accuracy: 0.3400]
[2025-01-08 14:29:11,458: INFO: 2670371872: Model training comp