# Phase 1: Import Libraries

In [None]:
# Phase 1: Import Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt

# Enable inline plotting for Jupyter Notebook
%matplotlib inline


# Phase 2: Load and Preprocess Data

In [None]:
# Phase 2: Load and Preprocess Data
def load_and_preprocess_data(filepath, target_column="target"):
    """
    Load a CSV dataset and return min-max scaled features plus the target.

    Parameters:
        filepath (str): Path to the CSV file containing the dataset.
        target_column (str): Name of the column to predict. Defaults to
            "target", the column name this notebook's dataset uses.

    Returns:
        tuple: (X_scaled, y) where X_scaled is the ndarray produced by
        MinMaxScaler.fit_transform over every non-target column and y is
        the target column as a pandas Series.

    Raises:
        KeyError: If target_column is not a column of the loaded CSV.
    """
    data = pd.read_csv(filepath)
    # "Unnamed: 0" is dropped when present; errors="ignore" makes this a
    # no-op for files that do not contain such a column.
    data = data.drop(columns=["Unnamed: 0"], errors="ignore")

    # Fail early with a message that names the missing column and the file,
    # rather than relying on the terser error raised by DataFrame.drop.
    if target_column not in data.columns:
        raise KeyError(
            f"Target column {target_column!r} not found in {filepath!r}; "
            f"available columns: {list(data.columns)}"
        )

    # Separate features and target variable
    y = data[target_column]
    X = data.drop(columns=[target_column])

    # Scale features to the range [0, 1].
    # NOTE(review): the scaler is fit on every row loaded here. If the caller
    # splits into train/validation/test afterwards, the held-out rows have
    # already influenced the scaling ranges (data leakage). Fitting on the
    # training split only would avoid that, but it would change what this
    # function returns, so it is left as-is.
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    return X_scaled, y

# Load the dataset from a hard-coded absolute path and keep the scaled
# features (X) and the target (y) as notebook-level variables for later cells.
# NOTE(review): the /content/ prefix looks like a Colab path — adjust it when
# running this notebook elsewhere.
X, y = load_and_preprocess_data('/content/example_data/diabetes.csv')


# Phase 3: Split the Data

In [None]:
# Phase 3: Split the Data
def split_data(X, y, holdout_size=0.3, test_fraction=0.5, random_state=42):
    """
    Split the data into training, validation, and testing sets.

    The split is done in two steps: first a hold-out portion is carved off
    the full data, then that hold-out portion is divided into validation
    and test sets. With the defaults this gives roughly 70% training,
    15% validation and 15% test.

    Parameters:
        X (ndarray): Scaled feature data.
        y (Series): Target variable.
        holdout_size (float): Fraction of all samples set aside for
            validation and testing combined. Defaults to 0.3.
        test_fraction (float): Fraction of the hold-out portion that becomes
            the test set; the remainder becomes the validation set.
            Defaults to 0.5.
        random_state (int): Seed passed to both train_test_split calls so
            the partition is reproducible. Defaults to 42.

    Returns:
        tuple: (X_train, X_val, X_test, y_train, y_val, y_test).
    """
    # Step 1: training set versus a temporary hold-out set.
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=holdout_size, random_state=random_state
    )
    # Step 2: divide the hold-out set into validation and test sets.
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=test_fraction, random_state=random_state
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

# Split the data into train / validation / test partitions; these six
# notebook-level variables are reused by the training and evaluation cells.
X_train, X_val, X_test, y_train, y_val, y_test = split_data(X, y)


# Phase 4: Train Random Forest Model

In [None]:

def train_random_forest(X_train, y_train, X_val, y_val):
    """
    Fit a Random Forest regressor and score it on the validation split.

    Parameters:
        X_train (ndarray): Training features.
        y_train (Series): Training target variable.
        X_val (ndarray): Validation features.
        y_val (Series): Validation target variable.

    Returns:
        tuple: (model, val_mae) — the fitted RandomForestRegressor and its
        Mean Absolute Error on the validation set.
    """
    # 100 trees with a fixed seed so repeated runs give the same forest.
    forest = RandomForestRegressor(n_estimators=100, random_state=42)
    forest.fit(X_train, y_train)

    # Score on data the forest has not been fitted on, and report it.
    val_mae = mean_absolute_error(y_val, forest.predict(X_val))
    print(f"Random Forest Validation MAE: {val_mae:.4f}")

    return forest, val_mae

# Train the Random Forest model on the training split; rf_mae is its MAE on
# the validation split and is plotted against lr_mae in a later cell.
rf_model, rf_mae = train_random_forest(X_train, y_train, X_val, y_val)


# Phase 5: Train Linear Regression Model

In [None]:
# Phase 5: Train Linear Regression Model
def train_linear_regression(X_train, y_train, X_val, y_val):
    """
    Fit an ordinary Linear Regression and score it on the validation split.

    Parameters:
        X_train (ndarray): Training features.
        y_train (Series): Training target variable.
        X_val (ndarray): Validation features.
        y_val (Series): Validation target variable.

    Returns:
        tuple: (model, val_mae) — the fitted LinearRegression and its
        Mean Absolute Error on the validation set.
    """
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)

    # Score on data the model has not been fitted on, and report it.
    val_mae = mean_absolute_error(y_val, regressor.predict(X_val))
    print(f"Linear Regression Validation MAE: {val_mae:.4f}")

    return regressor, val_mae

# Train the Linear Regression model on the training split; lr_mae is its MAE
# on the validation split and is plotted against rf_mae in a later cell.
lr_model, lr_mae = train_linear_regression(X_train, y_train, X_val, y_val)


# Phase 6: Compare Models on Validation Set

In [None]:
# Phase 6: Compare Models on Validation Set
def plot_model_comparison(rf_mae, lr_mae):
    """
    Draw a two-bar chart of the validation MAE of both models.

    Parameters:
        rf_mae (float): Random Forest Validation MAE.
        lr_mae (float): Linear Regression Validation MAE.
    """
    # Labels, heights and colours are kept in matching order: one entry per bar.
    labels = ['Random Forest', 'Linear Regression']
    heights = [rf_mae, lr_mae]
    bar_colors = ['skyblue', 'salmon']

    plt.figure(figsize=(8, 4))
    plt.bar(labels, heights, color=bar_colors)
    plt.title('Validation MAE Comparison')
    plt.ylabel('Mean Absolute Error')
    plt.show()

# Plot the validation MAE of the Random Forest and Linear Regression models
# side by side, using the scores computed in the two training cells.
plot_model_comparison(rf_mae, lr_mae)


# Phase 7: Evaluate Models on Test Set

In [None]:
# Phase 7: Evaluate Models on Test Set
def evaluate_model(model, X_test, y_test):
    """
    Evaluate a fitted model on the test set and print its error metrics.

    Parameters:
        model: A fitted estimator exposing predict() (this notebook passes
            its RandomForestRegressor and LinearRegression models).
        X_test (ndarray): Test features.
        y_test (Series): Test target variable.

    Returns:
        tuple: (test_mae, test_mse) — the Mean Absolute Error and the
        Mean Squared Error of the model's predictions on the test set.
    """
    predictions = model.predict(X_test)
    test_mae = mean_absolute_error(y_test, predictions)
    test_mse = mean_squared_error(y_test, predictions)
    print(f"Test MAE: {test_mae:.4f}, Test MSE: {test_mse:.4f}")
    return test_mae, test_mse

# Evaluate Random Forest on test set. evaluate_model prints the metrics
# itself; the (MAE, MSE) tuple it returns is not stored here.
print("\nRandom Forest Model Evaluation on Test Data:")
evaluate_model(rf_model, X_test, y_test)

# Evaluate Linear Regression on test set, using the same test split so the
# two printed results are directly comparable. The returned tuple is again
# not stored.
print("\nLinear Regression Model Evaluation on Test Data:")
evaluate_model(lr_model, X_test, y_test)


In [None]:
# Phase 8: Refining Models and Saving with joblib

In [None]:
## Save Random Forest

In [None]:
# Phase 8: Refine and Save the Random Forest Model
from sklearn.model_selection import GridSearchCV
import joblib

# Hyperparameter tuning with GridSearchCV
def refine_random_forest(X_train, y_train):
    """
    Tune a Random Forest regressor with an exhaustive grid search.

    Every combination of the hyperparameters below is scored with 3-fold
    cross-validation on the training data, using negative Mean Absolute
    Error as the selection criterion. The chosen parameters are printed.

    Parameters:
        X_train (ndarray): Training features.
        y_train (Series): Training target variable.

    Returns:
        RandomForestRegressor: The best Random Forest model after tuning.
    """
    # Candidate values per hyperparameter; 3 * 3 * 3 * 3 = 81 combinations.
    search_space = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }
    base_forest = RandomForestRegressor(random_state=42)

    # n_jobs=-1 lets the search use all available CPU cores.
    search = GridSearchCV(
        estimator=base_forest,
        param_grid=search_space,
        cv=3,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    print(f"Best Random Forest Parameters: {search.best_params_}")
    return search.best_estimator_

# Refine Random Forest: run the grid search on the training split only.
# NOTE(review): the refined model is not scored on the validation or test
# split anywhere in the visible notebook — confirm whether that is intended.
rf_model_refined = refine_random_forest(X_train, y_train)

# Save the refined Random Forest model. The path is relative, so the file is
# written to the current working directory.
joblib.dump(rf_model_refined, 'random_forest_model.pkl')
print("Random Forest model saved as 'random_forest_model.pkl'")


## Save Linear Regression


In [None]:
# Phase 9: Refine and Save the Linear Regression Model
from sklearn.linear_model import Ridge

# Refine Linear Regression using Ridge regularization
def refine_linear_regression(X_train, y_train, alpha=1.0):
    """
    Fit a Ridge (L2-regularized) linear regression on the training data.

    "Refine" here means replacing plain Linear Regression with Ridge; no
    hyperparameter search is performed — the model is fitted once with the
    given alpha.

    Parameters:
        X_train (ndarray): Training features.
        y_train (Series): Training target variable.
        alpha (float): Regularization strength passed to Ridge.
            Defaults to 1.0.

    Returns:
        Ridge: The Ridge regression model after fitting.
    """
    ridge_model = Ridge(alpha=alpha)
    ridge_model.fit(X_train, y_train)
    return ridge_model

# Refine Linear Regression: fit the Ridge variant on the training split.
# NOTE(review): the refined model is not scored on the validation or test
# split anywhere in the visible notebook — confirm whether that is intended.
lr_model_refined = refine_linear_regression(X_train, y_train)

# Save the refined Linear Regression model. The object written is the Ridge
# model returned above, despite the "linear_regression" file name. The path
# is relative, so the file is written to the current working directory.
joblib.dump(lr_model_refined, 'linear_regression_model.pkl')
print("Linear Regression model saved as 'linear_regression_model.pkl'")
