# train_Model.py Overview

## Train and Evaluate the Model with Cross-Validation

In this block, I:
- Define the feature set and target variable.
- Split the data into training and test sets.
- Use 5‑fold cross-validation on the training set to evaluate the model’s performance.
- Train a Random Forest Regressor and evaluate it on the held‑out test set.


## 1) Load Processed Data

In this block, we load the cleaned and processed data from a CSV file that was exported from the EDA notebook. This file (e.g., `processed_pr_data.csv`) should contain all the necessary columns such as `is_closed`, `num_labels`, `body_length`, and `time_to_merge_hours`.

In [5]:
import pandas as pd

def load_processed_data():
    """
    Load the preprocessed GitHub PR data from a CSV file.
    """
    data_path = "../data/processed_pr_data.csv"
    df = pd.read_csv(data_path)
    print(f"[INFO] Loaded processed data with {df.shape[0]} rows and {df.shape[1]} columns.")
    return df

# Example usage:
df = load_processed_data()

[INFO] Loaded processed data with 91 rows and 13 columns.


## 2) Train and Evaluate the Model with Cross‑Validation

Using the processed data, we:
- Define our feature set and target variable.
- Split the data into training and test sets.
- Use 5‑fold cross-validation on the training set to get a robust performance estimate.
- Train a Random Forest Regressor and evaluate it on the test set.


In [10]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import pandas as pd

def train_and_evaluate(df):
    """
    Process the DataFrame to extract numeric features, then split the data,
    train a Random Forest Regressor using 5-fold cross-validation, and evaluate its performance.
    
    Numeric features extracted:
      - title_length: length of the PR title.
      - body_length: length of the PR body text.
      - num_labels: count of labels.
      - is_closed: binary indicator of whether the PR is closed.
    
    Target variable:
      - time_to_merge_hours
    """
    # Extract numeric features from text columns:
    # We assume df has columns "title", "body", "labels", "state", and "time_to_merge_hours"
    df["title_length"] = df["title"].apply(lambda x: len(x) if isinstance(x, str) else 0)
    df["body_length"] = df["body"].apply(lambda x: len(x) if isinstance(x, str) else 0)
    
    # For labels, count the number of labels (assuming it's a list)
    def count_labels(label_list):
        if isinstance(label_list, list):
            return len(label_list)
        return 0
    df["num_labels"] = df["labels"].apply(count_labels)
    
    # Binary feature: is_closed (1 if closed, else 0)
    df["is_closed"] = df["state"].apply(lambda x: 1 if isinstance(x, str) and x.lower() == "closed" else 0)
    
    # Define the numeric feature set and target variable.
    feature_cols = ["title_length", "body_length", "num_labels", "is_closed"]
    target_col = "time_to_merge_hours"
    
    # Drop rows with missing numeric feature or target values.
    df_model = df.dropna(subset=feature_cols + [target_col]).copy()
    print(f"[INFO] Final training dataset size: {len(df_model)} rows.")
    
    X = df_model[feature_cols]
    y = df_model[target_col]
    
    # Convert features to numeric (should already be numeric, but this is extra safety)
    X = X.apply(pd.to_numeric, errors='coerce')
    
    # Split data: 80% training, 20% testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Define the Random Forest Regressor
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    
    # Perform 5-fold cross-validation on the training set
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(rf, X_train, y_train, cv=kfold, scoring='neg_mean_squared_error')
    cv_mse = -cv_scores.mean()
    cv_rmse = np.sqrt(cv_mse)
    print(f"[INFO] Cross-Validation RMSE (5-fold): {cv_rmse:.2f}")
    
    # Train the model on the full training set
    rf.fit(X_train, y_train)
    
    # Evaluate the model on the test set
    y_pred = rf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    
    print("[INFO] Test Set Performance:")
    print(f"  MSE:  {mse:.2f}")
    print(f"  MAE:  {mae:.2f}")
    print(f"  RMSE: {rmse:.2f}")
    
    return rf

rf_model = train_and_evaluate(df)

[INFO] Final training dataset size: 91 rows.
[INFO] Cross-Validation RMSE (5-fold): 38.01
[INFO] Test Set Performance:
  MSE:  807.70
  MAE:  24.21
  RMSE: 28.42
