In [None]:
# Simple Random Forest Model - Train and Predict
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_squared_error

In [None]:
class SimpleRandomForestPredictor:
    """
    Minimal Random Forest regressor wrapper with an optional log1p target transform.

    Numeric feature columns are standardized and object/category columns are
    one-hot encoded inside a single scikit-learn ``Pipeline``. When
    ``use_log_transform`` is True the model is trained on ``log1p(y)`` and
    predictions are mapped back to the original scale with ``expm1``.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of trees passed to ``RandomForestRegressor``.
    max_depth : int or None, default=10
        Maximum tree depth passed to ``RandomForestRegressor``.
    random_state : int, default=42
        Seed passed to ``RandomForestRegressor``.
    use_log_transform : bool, default=True
        Whether to train on ``log1p(y)`` and invert predictions with ``expm1``.

    Attributes
    ----------
    pipeline : sklearn.pipeline.Pipeline or None
        The fitted preprocessing + model pipeline; ``None`` until ``fit`` succeeds.
    """

    def __init__(self, n_estimators=100, max_depth=10, random_state=42, use_log_transform=True):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.random_state = random_state
        self.use_log_transform = use_log_transform
        self.pipeline = None

    def fit(self, X_train, y_train):
        """
        Train the pipeline on the full training set.

        Parameters
        ----------
        X_train : pandas.DataFrame
            Feature frame. Column roles are inferred from dtypes via
            ``select_dtypes``: numeric columns are scaled, object/category
            columns are one-hot encoded.
        y_train : array-like
            Regression target. When ``use_log_transform`` is True every value
            must be greater than -1 so that ``log1p`` is defined.

        Returns
        -------
        SimpleRandomForestPredictor
            ``self``, to allow call chaining.

        Raises
        ------
        ValueError
            If ``use_log_transform`` is True and the target contains values <= -1.
        """
        print("🌲 Training Random Forest model...")

        # Apply log transformation to target if enabled
        if self.use_log_transform:
            # log1p yields NaN / -inf for values <= -1; fail early with a clear
            # message instead of letting the estimator reject the target later.
            target_values = np.asarray(y_train, dtype=float)
            if np.any(target_values <= -1):
                raise ValueError(
                    "log1p transformation requires all target values to be > -1; "
                    "pass use_log_transform=False or clean the target first."
                )
            print("   ✅ Applying log1p transformation to target variable")
            # np.log1p returns a new object, so the caller's y_train is not modified.
            y_train_transformed = np.log1p(y_train)
            print(f"   Target range after log1p: [{y_train_transformed.min():.4f}, {y_train_transformed.max():.4f}]")
        else:
            y_train_transformed = y_train.copy()

        # Identify column types
        numerical_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
        categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

        print(f"Numerical features: {numerical_cols}")
        print(f"Categorical features: {categorical_cols}")

        # Columns that match neither selector (e.g. bool, datetime) are not routed
        # to any transformer below and are therefore dropped by ColumnTransformer's
        # default remainder handling; surface that instead of losing them silently.
        selected_cols = set(numerical_cols) | set(categorical_cols)
        ignored_cols = [col for col in X_train.columns if col not in selected_cols]
        if ignored_cols:
            print(f"   Warning: columns with unsupported dtypes are ignored by the model: {ignored_cols}")

        # Create preprocessing pipeline
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numerical_cols),
                # handle_unknown='ignore' lets predict() accept categories that did
                # not appear in training: they are encoded as all zeros (the same
                # encoding as the dropped first category) instead of raising.
                ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), categorical_cols)
            ])

        # Create full pipeline
        self.pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('regressor', RandomForestRegressor(
                n_estimators=self.n_estimators,
                max_depth=self.max_depth,
                random_state=self.random_state,
                n_jobs=-1
            ))
        ])

        # Fit the model on the (possibly transformed) target
        self.pipeline.fit(X_train, y_train_transformed)
        print("✅ Model training completed!")

        return self

    def predict(self, X_test):
        """
        Predict on ``X_test`` and return values on the original target scale.

        Parameters
        ----------
        X_test : pandas.DataFrame
            Features passed unchanged to the fitted pipeline's ``predict``.

        Returns
        -------
        numpy.ndarray
            Pipeline predictions; passed through ``expm1`` when
            ``use_log_transform`` is True.

        Raises
        ------
        ValueError
            If ``fit`` has not been called yet.
        """
        if self.pipeline is None:
            raise ValueError("Model not trained yet! Call fit() first.")

        # Predictions are in the model's training space: log1p space when the
        # transform is enabled, original space otherwise.
        raw_predictions = self.pipeline.predict(X_test)

        # Apply inverse transformation if log transform was used
        if self.use_log_transform:
            predictions = np.expm1(raw_predictions)
            print("   ✅ Applied expm1 inverse transformation to predictions")
        else:
            predictions = raw_predictions

        return predictions

    def evaluate(self, X_test, y_test):
        """
        Predict on ``X_test`` and report metrics against ``y_test`` on the original scale.

        Parameters
        ----------
        X_test : pandas.DataFrame
            Features to predict on.
        y_test : array-like
            True target values on the original (untransformed) scale.

        Returns
        -------
        dict
            ``'mape'`` (in percent), ``'r2'``, ``'rmse'`` and ``'predictions'``.
        """
        predictions = self.predict(X_test)

        # Calculate metrics on original scale; sklearn returns MAPE as a fraction,
        # so it is scaled to a percentage here.
        mape = mean_absolute_percentage_error(y_test, predictions) * 100
        r2 = r2_score(y_test, predictions)
        rmse = np.sqrt(mean_squared_error(y_test, predictions))

        print("Test Set Performance (Original Scale):")
        print(f"MAPE: {mape:.2f}%")
        print(f"R² Score: {r2:.4f}")
        print(f"RMSE: {rmse:.2f}")

        return {
            'mape': mape,
            'r2': r2,
            'rmse': rmse,
            'predictions': predictions
        }

In [None]:
# Example usage with log transformation:
# 
# # Load your data
# train_df = pd.read_csv('train_data.csv')
# test_df = pd.read_csv('test_data.csv')
# 
# # Prepare features and target
# target_column = 'estimated_loss'  # Replace with your target column name
# X_train = train_df.drop(columns=[target_column])
# y_train = train_df[target_column]
# X_test = test_df.drop(columns=[target_column]) if target_column in test_df.columns else test_df
# y_test = test_df[target_column] if target_column in test_df.columns else None
# 
# # Train model with log transformation (default)
# model = SimpleRandomForestPredictor(n_estimators=100, max_depth=10, random_state=42, use_log_transform=True)
# model.fit(X_train, y_train)
# 
# # Make predictions (automatically applies inverse transform: expm1)
# predictions = model.predict(X_test)
# 
# # If you have test labels, evaluate performance
# if y_test is not None:
#     results = model.evaluate(X_test, y_test)
#     print(f"Predictions shape: {predictions.shape}")
# else:
#     print(f"Predictions: {predictions[:5]}...")  # Show first 5 predictions
# 
# # Note: Predictions are now in original scale (inverse transformed)