In [29]:
import json
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectFromModel

In [30]:
# Exploratory look at the UI parameter file through pandas: the top-level JSON
# keys become columns and the keys nested under them become the row index.
data = pd.read_json("algoparams_from_ui.json")
print(type(data))
# Convert the frame back into a nested dict of the form {column: {index: value}}.
json_data = data.to_dict()
# NOTE(review): this prints the entire nested configuration; printing only the
# keys would keep the cell output readable.
print(json_data)
# NOTE(review): `data` and `json_data` are both rebound by a later cell (to the
# CSV frame and to the json.load result), so these objects are exploratory only.
data

<class 'pandas.core.frame.DataFrame'>
{'session_name': {'algorithms': 'test', 'feature_generation': 'test', 'feature_handling': 'test', 'feature_reduction': 'test', 'hyperparameters': 'test', 'metrics': 'test', 'probability_calibration': 'test', 'session_info': 'test', 'target': 'test', 'train': 'test', 'weighting_stratergy': 'test'}, 'session_description': {'algorithms': 'test', 'feature_generation': 'test', 'feature_handling': 'test', 'feature_reduction': 'test', 'hyperparameters': 'test', 'metrics': 'test', 'probability_calibration': 'test', 'session_info': 'test', 'target': 'test', 'train': 'test', 'weighting_stratergy': 'test'}, 'design_state_data': {'algorithms': {'RandomForestClassifier': {'model_name': 'Random Forest Classifier', 'is_selected': False, 'min_trees': 10, 'max_trees': 30, 'feature_sampling_statergy': 'Default', 'min_depth': 20, 'max_depth': 30, 'min_samples_per_leaf_min_value': 5, 'min_samples_per_leaf_max_value': 50, 'parallelism': 0}, 'RandomForestRegressor': {'m

Unnamed: 0,session_name,session_description,design_state_data
algorithms,test,test,{'RandomForestClassifier': {'model_name': 'Ran...
feature_generation,test,test,"{'linear_interactions': [['petal_length', 'sep..."
feature_handling,test,test,{'sepal_length': {'feature_name': 'sepal_lengt...
feature_reduction,test,test,"{'feature_reduction_method': 'Tree-based', 'nu..."
hyperparameters,test,test,"{'stratergy': 'Grid Search', 'shuffle_grid': T..."
metrics,test,test,"{'optomize_model_hyperparameters_for': 'AUC', ..."
probability_calibration,test,test,{'probability_calibration_method': 'Sigmoid - ...
session_info,test,test,"{'project_id': '1', 'experiment_id': 'kkkk-11'..."
target,test,test,"{'prediction_type': 'Regression', 'target': 'p..."
train,test,test,"{'policy': 'Split the dataset', 'time_variable..."


In [31]:
# Show the top-level sections of the configuration dict and their contents.
top_level_keys = [*json_data]
print("Keys:", top_level_keys)
print("Values:", [json_data[key] for key in top_level_keys])

Keys: ['session_name', 'session_description', 'design_state_data']
Values: [{'algorithms': 'test', 'feature_generation': 'test', 'feature_handling': 'test', 'feature_reduction': 'test', 'hyperparameters': 'test', 'metrics': 'test', 'probability_calibration': 'test', 'session_info': 'test', 'target': 'test', 'train': 'test', 'weighting_stratergy': 'test'}, {'algorithms': 'test', 'feature_generation': 'test', 'feature_handling': 'test', 'feature_reduction': 'test', 'hyperparameters': 'test', 'metrics': 'test', 'probability_calibration': 'test', 'session_info': 'test', 'target': 'test', 'train': 'test', 'weighting_stratergy': 'test'}, {'algorithms': {'RandomForestClassifier': {'model_name': 'Random Forest Classifier', 'is_selected': False, 'min_trees': 10, 'max_trees': 30, 'feature_sampling_statergy': 'Default', 'min_depth': 20, 'max_depth': 30, 'min_samples_per_leaf_min_value': 5, 'min_samples_per_leaf_max_value': 50, 'parallelism': 0}, 'RandomForestRegressor': {'model_name': 'Random For

In [32]:
def parse_json(json_file):
    """Load a JSON document from disk and return the decoded Python object.

    Parameters
    ----------
    json_file : str or path-like
        Path of the JSON file to read.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the document (a dict for the UI
        parameter file used in this notebook).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the content is not valid JSON.
    """
    # JSON is UTF-8 by specification; relying on the platform default encoding
    # (e.g. cp1252 on Windows) can corrupt or reject non-ASCII content.
    with open(json_file, encoding='utf-8') as file:
        return json.load(file)

def load_data(csv_file):
    """Read the CSV file at ``csv_file`` and return it as a pandas DataFrame."""
    return pd.read_csv(csv_file)

# Locations of the UI-exported configuration and of the dataset to model.
json_file, csv_file = 'algoparams_from_ui.json', 'iris.csv'

# Rebind the notebook-level names: `json_data` becomes the plain dict returned
# by json.load and `data` becomes the DataFrame read from the CSV file.
json_data = parse_json(json_file)
data = load_data(csv_file)

In [33]:
# Sanity check: show the type of the object produced by parse_json.
print(type(json_data))

<class 'dict'>


In [34]:
def _resolve_fill_value(column, feature_details):
    """Return the value used to fill gaps in ``column``, or None for no imputation."""
    if 'missing_values' in feature_details:
        if feature_details['missing_values'] != 'Impute':
            return None
        impute_with = feature_details.get('impute_with')
        if impute_with == 'custom':
            return feature_details['impute_value']
        if impute_with != 'Average of values':
            return None
    # Either 'Average of values' was requested or no policy was configured at
    # all, in which case the mean is the default strategy.
    if pd.api.types.is_numeric_dtype(column):
        return column.mean()
    # A mean is undefined for non-numeric data; fall back to the most frequent value.
    most_frequent = column.mode(dropna=True)
    return most_frequent.iloc[0] if not most_frequent.empty else None


def preprocess_data(data, json_data):
    """Select the configured features/target and impute missing feature values.

    The previous implementation handed the whole per-feature strategy dict to a
    single constant-value ``SimpleImputer`` as ``fill_value``, so the configured
    strategies were never applied. Imputation is now done per column.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw dataset containing the target and every selected feature column.
        It is not modified.
    json_data : dict
        Parsed configuration. Reads ``design_state_data['target']['target']``
        and, for each entry of ``design_state_data['feature_handling']``,
        ``is_selected`` and ``feature_details`` (``missing_values``,
        ``impute_with``, ``impute_value``).

    Returns
    -------
    X : numpy.ndarray
        Selected feature columns, in ``feature_handling`` order, with missing
        values filled: the column mean for 'Average of values' or when no
        ``missing_values`` policy is present (most frequent value for
        non-numeric columns), or ``impute_value`` for 'custom'.
    y : pandas.Series
        The target column, unchanged.

    Raises
    ------
    KeyError
        If a selected feature or the target is not a column of ``data``.
    """
    design = json_data['design_state_data']
    target = design['target']['target']
    handling = design['feature_handling']
    features = [feature for feature, details in handling.items() if details['is_selected']]

    # NOTE(review): if the target column is itself marked `is_selected` in
    # feature_handling it also ends up in X (target leakage) -- confirm against
    # the configuration before trusting downstream scores.
    X = data[features].copy()  # copy so the caller's frame is left untouched
    y = data[target]

    for feature in features:
        column = X[feature]
        if not column.isna().any():
            continue
        fill_value = _resolve_fill_value(column, handling[feature]['feature_details'])
        if fill_value is not None:
            X[feature] = column.fillna(fill_value)

    # Downstream cells index X positionally (X[:, i]), so return a NumPy array.
    return X.to_numpy(), y

In [35]:
# Build the feature matrix X and target y from the CSV frame and the parsed config.
X, y = preprocess_data(data, json_data)

In [36]:
def feature_reduction(X, y, json_data, random_state=None):
    """Label-encode text features, then apply the configured feature reduction.

    Parameters
    ----------
    X : numpy.ndarray of shape (n_samples, n_selected_features)
        Feature matrix whose columns follow the order of the selected entries
        (``is_selected`` true) of ``feature_handling``, i.e. the layout returned
        by ``preprocess_data``. The caller's array is not modified.
    y : array-like
        Target values; used by the 'Corr with Target' and 'Tree-based' methods.
    json_data : dict
        Parsed configuration. Reads ``design_state_data['feature_reduction']``
        (``feature_reduction_method``, ``num_of_features_to_keep``,
        ``num_of_trees``, ``depth_of_trees``) and
        ``design_state_data['feature_handling']`` (``is_selected``,
        ``feature_variable_type``).
    random_state : int or None, default None
        Seed forwarded to the random forest of the 'Tree-based' method so the
        selected feature set can be made reproducible.

    Returns
    -------
    numpy.ndarray
        The encoded and, depending on the method, reduced feature matrix. An
        unrecognised method name returns the encoded matrix without reduction.
    """
    design = json_data['design_state_data']
    reduction_cfg = design['feature_reduction']
    reduction_method = reduction_cfg['feature_reduction_method']

    # Column positions must be counted over the *selected* features only: X has
    # no column for deselected entries, so indexing against every
    # feature_handling key points at the wrong (or a non-existent) column.
    selected = [(feature, details) for feature, details in design['feature_handling'].items()
                if details['is_selected']]
    categorical_indices = [index for index, (_, details) in enumerate(selected)
                           if details['feature_variable_type'] == 'text']

    # Work on a copy so encoding does not silently rewrite the caller's array.
    X = X.copy()
    for index in categorical_indices:
        X[:, index] = LabelEncoder().fit_transform(X[:, index])

    if reduction_method == 'Corr with Target':
        k = int(reduction_cfg['num_of_features_to_keep'])
        selector = SelectKBest(score_func=f_regression, k=k)
        X = selector.fit_transform(X, y)
    elif reduction_method == 'Tree-based':
        num_of_trees = int(reduction_cfg['num_of_trees'])
        depth_of_trees = int(reduction_cfg['depth_of_trees'])
        model = RandomForestRegressor(n_estimators=num_of_trees, max_depth=depth_of_trees,
                                      random_state=random_state)
        model.fit(X, y)
        # prefit=True: reuse the forest fitted above instead of refitting it.
        selector = SelectFromModel(model, prefit=True)
        X = selector.transform(X)
    elif reduction_method == 'PCA':
        n_components = int(reduction_cfg['num_of_features_to_keep'])
        pca = PCA(n_components=n_components)
        X = pca.fit_transform(X)

    # 'No Reduction' (and any unrecognised method) falls through to here.
    return X

In [37]:
# Encode/reduce the feature matrix. NOTE(review): X is rebound to the result, so
# re-running this cell alone feeds the already reduced matrix back in; re-run the
# preprocess_data cell first.
X = feature_reduction(X, y, json_data)

In [38]:
def build_models(json_data, random_state=None):
    """Create estimators and hyperparameter grids for the selected algorithms.

    Only 'LinearRegression' and 'RandomForestRegressor' are supported; any other
    selected algorithm is skipped.

    Parameters
    ----------
    json_data : dict
        Parsed configuration. Reads ``design_state_data['algorithms']``; each
        entry needs ``is_selected`` and, for the random forest, ``min_trees``,
        ``max_trees``, ``min_depth``, ``max_depth``,
        ``min_samples_per_leaf_min_value`` and ``min_samples_per_leaf_max_value``.
        ``parallelism`` is optional; a missing or zero value maps to
        ``n_jobs=None``.
    random_state : int or None, default None
        Seed passed to the random forest estimator for reproducible fits.

    Returns
    -------
    list of (str, estimator, dict)
        ``(algorithm name, unfitted estimator, parameter grid)`` tuples. Grid
        keys are prefixed with ``model__`` to address a pipeline step named
        ``'model'``.
    """
    models = []
    for algo, details in json_data['design_state_data']['algorithms'].items():
        if not details['is_selected']:
            continue

        # A falsy parallelism (0 or absent) means "library default", i.e. None.
        n_jobs = details.get('parallelism') or None

        if algo == 'LinearRegression':
            model = LinearRegression()
            # `normalize` was removed from LinearRegression in scikit-learn 1.2;
            # keeping it in the grid makes the search fail with an invalid
            # parameter error, so it is no longer searched over.
            params = {
                'model__fit_intercept': [True, False],
                'model__n_jobs': [n_jobs],
            }
        elif algo == 'RandomForestRegressor':
            model = RandomForestRegressor(random_state=random_state)
            # Ranges are inclusive of the configured maxima. The grid is the
            # full Cartesian product of the three ranges, which can be large.
            params = {
                'model__n_estimators': range(details['min_trees'], details['max_trees'] + 1),
                'model__max_depth': range(details['min_depth'], details['max_depth'] + 1),
                'model__min_samples_leaf': range(details['min_samples_per_leaf_min_value'],
                                                 details['min_samples_per_leaf_max_value'] + 1),
                'model__n_jobs': [n_jobs],
            }
        else:
            continue

        models.append((algo, model, params))

    return models

In [39]:
# Collect (name, estimator, parameter grid) tuples for the algorithms selected in the config.
models = build_models(json_data)

In [40]:
def evaluate_models(X, y, models, cv=5, scoring='neg_mean_squared_error'):
    """Grid-search each model and print its best parameters and scores.

    Parameters
    ----------
    X : array-like
        Feature matrix used for the cross-validated search.
    y : array-like
        Target values.
    models : iterable of (str, estimator, dict)
        ``(name, estimator, parameter grid)`` tuples; grid keys must address a
        pipeline step named ``'model'`` (``model__<param>``).
    cv : int or cross-validation splitter, default 5
        Passed to ``GridSearchCV`` as ``cv``.
    scoring : str or callable, default 'neg_mean_squared_error'
        Passed to ``GridSearchCV`` as ``scoring``.

    Returns
    -------
    None
        Results are printed, one block per model.
    """
    for name, model, params in models:
        pipeline = Pipeline(steps=[('model', model)])
        grid_search = GridSearchCV(pipeline, params, cv=cv, scoring=scoring)
        grid_search.fit(X, y)

        # The refitted best estimator has seen all of X, so the two metrics
        # below are in-sample and optimistic; the cross-validated best_score_
        # is the out-of-sample estimate and is reported alongside them.
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X)

        mse = mean_squared_error(y, y_pred)
        r2 = r2_score(y, y_pred)

        print(f"Model: {name}")
        print(f"Best Parameters: {grid_search.best_params_}")
        print(f"Best CV Score ({scoring}): {grid_search.best_score_}")
        print(f"Mean Squared Error (training data): {mse}")
        print(f"R2 Score (training data): {r2}")
        print()

In [41]:
# Run the grid search for every configured model and print the results.
evaluate_models(X, y, models)

Model: RandomForestRegressor
Best Parameters: {'model__max_depth': 22, 'model__min_samples_leaf': 5, 'model__n_estimators': 12, 'model__n_jobs': None}
Mean Squared Error: 0.001107690445757586
R2 Score: 0.9980853413523937

