In [91]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.multioutput import MultiOutputRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

In [53]:
#Separating features and target variables
data = pd.read_csv('dummy_data.csv')
features = data.iloc[:, 3:-3]
curve_x0 = data['curve_x0']
curve_k = data['curve_k']

In [54]:
string_columns = features.select_dtypes(include=['object','bool']).columns

# Identify numeric columns
numeric_columns = features.select_dtypes(exclude=['object','bool']).columns

# Instantiate OneHotEncoder
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Use ColumnTransformer to apply OneHotEncoder to string columns and pass through numeric columns
ct = ColumnTransformer(
    [('ohe', ohe, string_columns)],  # Apply OneHotEncoder to string and boolean columns
    remainder='passthrough'  # Pass through numeric columns
)

# Perform one-hot encoding
features_encoded = ct.fit_transform(features)

# Convert the encoded features back to a DataFrame
features_encoded = pd.DataFrame(features_encoded)


In [95]:
# Define models: can try to change parameters if needed
models = {
    'Ridge Regression': (Ridge(), {'alpha': [0.1, 1.0, 10.0]}),
    'Lasso Regression': (Lasso(), {'alpha': [0.1, 1.0, 10.0]}),
    'Elastic Net': (ElasticNet(), {'alpha': [0.1, 1.0, 10.0], 'l1_ratio': [0.1, 0.5, 0.9]}),
    'SVR': (SVR(), {'C': [0.1, 1.0, 10.0], 'gamma': ['scale', 'auto']}),
    'Random Forest': (RandomForestRegressor(), {'n_estimators': [100, 200, 300], 'max_depth': [None, 10, 20]})
}

# Test different amounts of folds for cross_validation
num_folds = [2, 3, 4, 5]

# Perform cross-validation for each model, each number of folds, and each set of hyperparameters on curve_x0
best_model = None
best_score = float('-inf')
for model_name, (model, param_grid) in models.items():
    for fold in num_folds:
        kf = KFold(n_splits=fold, shuffle=True, random_state=41)
        
        # Hyperparameter tuning using GridSearchCV
        grid_search = GridSearchCV(model, param_grid, cv=kf, scoring='neg_mean_squared_error')
        grid_search.fit(features_encoded, curve_x0)
        
        # Get the best model and its score
        best_estimator = grid_search.best_estimator_
        best_score_for_fold = -grid_search.best_score_
        
        
        # Update the best model if the current model has a better score
        if best_score_for_fold > best_score:
            best_model = best_estimator
            best_score = best_score_for_fold
            best_num_folds = fold

print("Best Model (curve_x0):", best_model)
print("Best CV Score (curve_x0):", best_score)
print("Number of Folds for Best Model (curve_x0):", best_num_folds)

Best Model (curve_x0): Ridge(alpha=0.1)
Best CV Score (curve_x0): 13507.518118672162
Number of Folds for Best Model (curve_x0): 2


In [96]:
# Everything as before but on curve_k
best_model = None
best_score = float('-inf')
for model_name, (model, param_grid) in models.items():
    for fold in num_folds:
        kf = KFold(n_splits=fold, shuffle=True, random_state=41)
        
        # Hyperparameter tuning using GridSearchCV
        grid_search = GridSearchCV(model, param_grid, cv=kf, scoring='neg_mean_squared_error')
        grid_search.fit(features_encoded, curve_k)
        
        # Get the best model and its score
        best_estimator = grid_search.best_estimator_
        best_score_for_fold = -grid_search.best_score_
        
        
        # Update the best model if the current model has a better score
        if best_score_for_fold > best_score:
            best_model = best_estimator
            best_score = best_score_for_fold
            best_num_folds = fold

print("Best Model (curve_k):", best_model)
print("Best CV Score (curve_k):", best_score)
print("Number of Folds for Best Model (curve_k):", best_num_folds)

Best Model (curve_k): RandomForestRegressor(max_depth=10, n_estimators=200)
Best CV Score (curve_k): 8.356652360502297e-06
Number of Folds for Best Model (curve_k): 2


In [99]:
#Multiregressor that uses both curve_x0 and curve_k
y = data[['curve_x0','curve_k']]
models = {
    'Ridge Regression': (Ridge(), {'estimator__alpha': [0.1, 1.0, 10.0]}),
    'Lasso Regression': (Lasso(), {'estimator__alpha': [0.1, 1.0, 10.0]}),
    'Elastic Net': (ElasticNet(), {'estimator__alpha': [0.1, 1.0, 10.0], 'estimator__l1_ratio': [0.1, 0.5, 0.9]}),
    'SVR': (SVR(), {'estimator__C': [1, 10, 100], 'estimator__gamma': [0.1, 1.0, 10.0]}),
    'Random Forest': (RandomForestRegressor(), {'estimator__n_estimators': [50, 100, 200]})
}
best_model = None
best_score = float('-inf')
best_num_folds = None
for model_name, (model, param_grid) in models.items():
    for fold in num_folds:
        kf = KFold(n_splits=fold, shuffle=True, random_state=42)
        
        # MultiOutputRegressor to handle multiple target variables
        mor = MultiOutputRegressor(model)
        
        # Hyperparameter tuning using GridSearchCV
        grid_search = GridSearchCV(mor, param_grid, cv=kf, scoring='neg_mean_squared_error')
        grid_search.fit(features_encoded, y)
        
        # Get the best model and its score
        best_estimator = grid_search.best_estimator_
        best_score_for_fold = -grid_search.best_score_
        
        
        # Update the best model if the current model has a better score
        if best_score_for_fold > best_score:
            best_model = best_estimator
            best_score = best_score_for_fold
            best_num_folds = fold

print("Best Model (MultiRegression):", best_model)
print("Best CV Score (MultiRegression):", best_score)
print("Number of Folds for Best Model(MultiRegression):", best_num_folds)

Best Model (MultiRegression): MultiOutputRegressor(estimator=RandomForestRegressor(n_estimators=200))
Best CV Score (MultiRegression): 5872.725052652666
Number of Folds for Best Model(MultiRegression): 4
