In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from skopt import BayesSearchCV
from skopt.space import Integer, Real, Categorical
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid theme for every figure drawn below.
sns.set(style="whitegrid")

# Read the raw records from disk.
df = pd.read_csv('noviceV2.csv')

# Column to predict, followed by the predictor columns fed to the model.
target = 'Match Length'
features = [
    'Big County',
    'Big Employer',
    'Big Enrollment: Record Type',
    'Big Days Acceptance to Match',
    'Big Days Interview to Acceptance',
    'Big Days Interview to Match',
    'Big Contact: Marital Status',
    'Little Difference Between Dates',
]

# Only rows with a known target can be used for supervised training;
# missing values in the feature columns are left in place here.
df_clean = df.dropna(subset=[target])
X, y = df_clean[features], df_clean[target]

# Hold out 20% of the rows for the final evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.2
)

# Route every feature to exactly one preprocessing branch.
# Numeric columns are matched with the generic 'number' selector instead of
# the exact 'int64'/'float64' dtypes, and every remaining column is treated as
# categorical. Listing exact dtypes left columns of any other dtype (e.g.
# int32, bool, category) in neither group, and a ColumnTransformer built from
# these two groups silently discards unlisted columns (its default
# `remainder` is 'drop').
numerical_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(exclude='number').columns

# Numeric branch: fill gaps with the column median, then standardize.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown='ignore' keeps transform from failing on categories
# that were not seen during fit.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own column group: 'num' first, then 'cat'.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
    ]
)

# Random forest regressor with a fixed seed.
rf = RandomForestRegressor(random_state=1)

# Search space handed to the Bayesian optimizer. The 'regressor__' prefix
# routes each entry to the pipeline step named 'regressor'.
param_space = {
    # Number of trees in the forest.
    'regressor__n_estimators': Integer(low=50, high=200),
    # Maximum depth of each tree.
    'regressor__max_depth': Integer(low=5, high=75),
    # Minimum samples required to split an internal node.
    'regressor__min_samples_split': Integer(low=2, high=20),
    # Minimum samples required at a leaf.
    'regressor__min_samples_leaf': Integer(low=1, high=20),
    # Features considered per split; 1.0 is used in place of the former 'auto'.
    'regressor__max_features': Categorical(categories=['sqrt', 'log2', 1.0]),
}

# Chain preprocessing and the forest so both are fitted as a single estimator.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', rf),
])

# Bayesian hyperparameter search over `param_space`: 100 candidate settings,
# each scored with 5-fold cross-validation on negated MSE (so higher is
# better), using 2 parallel jobs and a fixed seed.
bayes_search = BayesSearchCV(
    model,
    param_space,
    scoring='neg_mean_squared_error',
    cv=5,
    n_iter=100,
    n_jobs=2,
    random_state=1,
)

# Run the search on the training split only; the test split stays untouched.
bayes_search.fit(X_train, y_train)

# Report the winning configuration found by the search.
best_params = bayes_search.best_params_
print("Best Hyperparameters:", best_params)

# Score the best pipeline from the search on the held-out test split.
best_model = bayes_search.best_estimator_
y_pred = best_model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f"Mean Squared Error with Best Model: {mse:.2f}")
print(f"R^2 Score with Best Model: {r2:.2f}")

# 5-fold cross-validation of the best pipeline. The scorer returns negated
# MSE, so the mean is sign-flipped for display.
# NOTE(review): this runs on the full X/y, which includes the test rows and the
# rows the hyperparameters were tuned on — confirm that is intended.
cv_scores = cross_val_score(
    best_model, X, y, scoring='neg_mean_squared_error', cv=5
)
print(f"Cross-Validation MSE: {-cv_scores.mean():.2f} (±{cv_scores.std():.2f})")

# Plot the 20 largest feature importances reported by the fitted regressor.
fitted_regressor = best_model.named_steps['regressor']
if hasattr(fitted_regressor, 'feature_importances_'):
    importances = fitted_regressor.feature_importances_

    # Rebuild the transformed column names in the preprocessor's output order:
    # the numeric columns first (names unchanged), then the one-hot columns.
    # The 'cat' branch is looked up by name rather than by position. When there
    # are no categorical features, ColumnTransformer leaves that branch
    # unfitted, so asking its encoder for names would raise NotFittedError;
    # in that case it contributes no output columns.
    if len(categorical_features) > 0:
        onehot = (
            best_model.named_steps['preprocessor']
            .named_transformers_['cat']
            .named_steps['onehot']
        )
        onehot_names = onehot.get_feature_names_out(categorical_features)
    else:
        onehot_names = np.array([], dtype=object)
    feature_names = np.concatenate([numerical_features, onehot_names])

    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)

    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=importance_df.head(20), palette='viridis')
    plt.title('Top 20 Feature Importances', fontsize=16)
    plt.xlabel('Importance', fontsize=14)
    plt.ylabel('Feature', fontsize=14)
    plt.show()

