In [27]:
import pandas as pd
import numpy as np
import json
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor, ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet, SGDClassifier, SGDRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")

# Different json files for different **models** bold text

In [3]:
f = open("/content/drive/MyDrive/Random/algoparams_from_ui.json")
og_json = json.load(f)

In [23]:
f = open("/content/drive/MyDrive/Random/knn_algo.json")
knn_json = json.load(f)

In [5]:
f = open("/content/drive/MyDrive/Random/Elastic_algo.json")
elastic_json = json.load(f)

In [6]:
f = open("/content/drive/MyDrive/Random/GBT_algo.json")
gbt_json = json.load(f)

In [42]:
f = open("/content/drive/MyDrive/Random/Neural_algo.json")
neural_json = json.load(f)

In [9]:
true = True
false = False

# Main function to parse json and create model

In [44]:
def create_model(data, df):
    root = data["design_state_data"]
    target = root["target"]["target"]
    if root["target"]["prediction_type"]=="Regression":
        scoring = "neg_mean_squared_error"
    elif root["target"]["prediction_type"]=="Classification":
        scoring = "accuracy"
    else:
        raise AttributeError("Wrong Prediction Type")
    dependent = []
    maps = {}
    param_grid = {}
    for col in root["feature_handling"].keys():
        if root["feature_handling"][col]["feature_variable_type"]=="text" and root["feature_handling"][col]["is_selected"]==True:
            temp = {}
            for ind,val in enumerate(df[col].unique()):
                temp[val]=ind
            maps[col]=(temp)
            df[col] = df[col].map(temp)
    for i in df.columns:
        if i != target and root["feature_handling"][col]["is_selected"]==True:
            dependent.append(i)
    print(f"Dependent Variables: {dependent}")
    print("-------------------------------------")
    print((f"Target Variable: {target}"))
    x_train, x_test, y_train, y_test = train_test_split(df[dependent],df[target],test_size=0.2)
    algo = []
    for i in root["algorithms"]:
        if root["algorithms"][i]["is_selected"]==True:
            algo.append(i)
    for algo, details in root['algorithms'].items():
        if details['is_selected']:
            if details.get('parallelism')==None or details.get('parallelism')==0:
                n_jobs=-1
            else:
                n_jobs=details.get('parallelism')
            if algo == 'RandomForestClassifier' or algo == 'RandomForestRegressor':
                if algo == "RandomForestClassifier":
                    model = RandomForestClassifier(n_jobs=n_jobs)
                else:
                    model = RandomForestRegressor(n_jobs=n_jobs)
                param_grid.update({
                    'n_estimators': [details['min_trees'], details['max_trees']],
                    'max_depth': range(details['min_depth'], details['max_depth'] + 1),
                    'min_samples_split': [2],
                    'min_samples_leaf': range(details['min_samples_per_leaf_min_value'], details['min_samples_per_leaf_max_value'] + 1)
                })
            elif algo == "GBTClassifier" or algo == "GBTRegressor":
                if algo == "GBTClassifier":
                    model = GradientBoostingClassifier()
                else:
                    model = GradientBoostingRegressor()
                param_grid.update({
                    'n_estimators': details['num_of_BoostingStages'],
                    'subsample': [details['min_subsample'], details['max_subsample']],
                    'min_samples_split': [2],
                    'min_samples_leaf': [details['min_subsample'], details['max_subsample']],
                    'max_depth': range(details['min_depth'], details['max_depth'] + 1),
                    'learning_rate': [0.1],  # Set to a value if needed
                    'min_impurity_decrease': [0],  # Set to a value if needed
                })
            elif algo=="LinearRegression":
                model = LinearRegression()
                param_grid.update({
                    'fit_intercept': [True, False],
                    'normalize': [True, False],
                })
            elif algo=="LogisticRegression":
                model = LogisticRegression()
                param_grid.update({
                    'penalty': ['l1', 'l2'],
                    'dual': [False],  # Set to a value if needed
                    'fit_intercept': [True, False],
                    'intercept_scaling': [1],  # Set to a value if needed
                    'class_weight': [None, 'balanced'],
                    'solver': ['liblinear', 'saga'],
                    'max_iter': details['max_iter'],
                    'C': [details['min_regparam'], details['max_regparam']]
                })

            elif algo=="RidgeRegression":
                model = Ridge()
                param_grid.update({
                    'alpha': [details['min_regparam'], details['max_regparam']],
                    'fit_intercept': [True, False],
                    'normalize': [True, False],
                    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
                    'max_iter': [details['min_iter'], details['max_iter']]
                })

            elif algo=="LassoRegression":
                model = Lasso()
                param_grid.update({
                    'alpha': [details['min_regparam'], details['max_regparam']],
                    'fit_intercept': [True, False],
                    'normalize': [True, False],
                    'precompute': [True, False],
                    'positive': [False],  # Set to a value if needed
                    'selection': ['cyclic', 'random'],
                    'max_iter': [details['min_iter'], details['max_iter']]
                })

            elif algo=="ElasticNetRegression":
                model = ElasticNet()
                param_grid.update({
                    'alpha': [details['min_regparam'], details['max_regparam']],
                    'l1_ratio': [details['min_elasticnet'], details['max_elasticnet']],
                    'fit_intercept': [True, False],
                    'precompute': [True, False],
                    'positive': [False],  # Set to a value if needed
                    'selection': ['cyclic', 'random'],
                    'max_iter': [details['min_iter'], details['max_iter']],
                    'tol': [1e-4],  # Set to a value if needed
                })
            elif algo=="xg_boost":
                model = xgb.XGBClassifier() if details['use_gradient_boosted_tree'] else xgb.XGBRegressor()
                param_grid.update({
                    'max_depth': details['max_depth_of_tree'],
                    'learning_rate': details['learningRate'],
                    'reg_alpha': details['l1_regularization'],
                    'reg_lambda': details['l2_regularization'],
                    'gamma': details['gamma'],
                    'min_child_weight': details['min_child_weight'],
                    'subsample': details['sub_sample'],
                    'colsample_bytree': details['col_sample_by_tree'],
                    'tree_method': details['tree_method'],
                    'n_estimators': details['max_num_of_trees'],
                })
            elif algo=="DecisionTreeRegressor" or algo=="DecisionTreeClassifier":
                if algo=="DecisionTreeRegressor":
                    model = DecisionTreeRegressor()
                else:
                    model = DecisionTreeClassifier()
                param_grid.update({
                    'max_depth': details['max_depth'],
                    'min_samples_split': [2],  # You can change this as needed
                    'min_samples_leaf': details['min_samples_per_leaf'],
                    'criterion': ['gini' if details['use_gini'] else 'entropy'],
                    'splitter': ['best'] if details['use_best'] else ['random'],
                })
            elif algo=="SVM":
                model = SVC()
                param_grid.update({
                    'C': details['c_value'],
                    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
                    'gamma': ['scale', 'auto'] if details['auto'] else details['custom_gamma_values'],
                    'shrinking': [True],
                    'tol': [details['tolerance']],
                    'max_iter': [details['max_iterations']],
                })
            elif algo=="SGD":
                model = SGDClassifier(loss='log' if details['use_logistics'] else 'hinge')
                param_grid.update({
                    'alpha': details['alpha_value'],
                    'penalty': ['l1', 'l2', 'elasticnet'],
                    'l1_ratio': details['alpha_value'] if details['use_elastic_net_regularization'] else [0],
                    'max_iter': details['max_iterations'] if details['max_iterations'] else [1000],
                    'tol': [details['tolerance']],
                })
            elif algo=="KNN":
                model = KNeighborsClassifier(weights='distance' if details['distance_weighting'] else 'uniform',
                                 algorithm="auto",
                                 p=details['p_value'])
                param_grid.update({
                    'n_neighbors': details['k_value'],
                })

            elif algo=="extra_random_trees":
                model = ExtraTreesClassifier(n_jobs=n_jobs)
                param_grid.update({
                    'n_estimators': details['num_of_trees'],
                    'max_depth': details['max_depth'],
                    'min_samples_leaf': details['min_samples_per_leaf'],
                })
            elif algo=="neural_network":
                model = MLPClassifier()
                param_grid.update({
                    'hidden_layer_sizes': details['hidden_layer_sizes'],
                    'activation': [details['activation']],
                    'alpha': [details['alpha_value']],
                    'max_iter': [details['max_iterations']],
                    'tol': [details['convergence_tolerance']],
                    'early_stopping': [details['early_stopping']],
                    'solver': [details['solver']],
                    'shuffle': [details['shuffle_data']],
                    'learning_rate_init': [details['initial_learning_rate']],
                    'batch_size': ['auto' if details['automatic_batching'] else None],
                    'beta_1': [details['beta_1']],
                    'beta_2': [details['beta_2']],
                    'epsilon': [details['epsilon']],
                    'power_t': [details['power_t']],
                    'momentum': [details['momentum']],
                    'nesterovs_momentum': [details['use_nesterov_momentum']],
                })


            else:
                raise AttributeError

    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=scoring,  # You can use a different scoring metric
        cv=5,  # Number of folds for cross-validation
        n_jobs=-1 , # Use all available CPU cores for parallel processing
        verbose=2
    )
    print(f"Original parameter for Grid Search: {param_grid}")
    print("-------------------------------------")
    print("  ")
    grid_search.fit(x_train, y_train)
    best_params = grid_search.best_estimator_
    print(f"Best parameters are: {best_params}")
    print("-------------------------------------")
    print("  ")
    best_model = model.set_params(**grid_search.best_params_)  # Set best parameters
    best_model.fit(x_train, y_train)
    from sklearn.metrics import accuracy_score
    y_pred = best_model.predict(x_test)  # Replace best_model with your trained model

    # Assuming y_test is the true target values for the test set
    try:
        accuracy = accuracy_score(y_test, y_pred)
        print("Accuracy:", accuracy)
    except:
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print("Mean Squared Error:", mse)
        print("-------------------------------------")
        print("  ")
        print("R-squared:", r2)
    return best_model

Data loading

In [11]:
df = pd.read_csv("/content/drive/MyDrive/Random/iris.csv")

## Random Forest Regressor on original json

In [16]:
create_model(og_json,df)

Dependent Variables: ['sepal_length', 'sepal_width', 'petal_length', 'species']
-------------------------------------
Target Variable: petal_width
Original parameter for Grid Search: {'n_estimators': [10, 20], 'max_depth': range(20, 26), 'min_samples_split': [2], 'min_samples_leaf': range(5, 11)}
-------------------------------------
  
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best parameters are: RandomForestRegressor(max_depth=21, min_samples_leaf=6, n_estimators=20,
                      n_jobs=-1)
-------------------------------------
  
Mean Squared Error: 0.0336062537541425
-------------------------------------
  
R-squared: 0.9335319348217118


## Different models on different json files

In [24]:
create_model(knn_json,df)

Dependent Variables: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
-------------------------------------
Target Variable: species
Original parameter for Grid Search: {'n_neighbors': [78]}
-------------------------------------
  
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best parameters are: KNeighborsClassifier(n_neighbors=78, weights='distance')
-------------------------------------
  
Accuracy: 0.9


In [45]:
create_model(elastic_json,df)

Dependent Variables: ['sepal_length', 'sepal_width', 'petal_length', 'species']
-------------------------------------
Target Variable: petal_width
Original parameter for Grid Search: {'alpha': [0.5, 0.8], 'l1_ratio': [0.5, 0.8], 'fit_intercept': [True, False], 'precompute': [True, False], 'positive': [False], 'selection': ['cyclic', 'random'], 'max_iter': [30, 50], 'tol': [0.0001]}
-------------------------------------
  
Fitting 5 folds for each of 64 candidates, totalling 320 fits
Best parameters are: ElasticNet(alpha=0.5, fit_intercept=False, max_iter=30)
-------------------------------------
  
Mean Squared Error: 0.07486363835419937
-------------------------------------
  
R-squared: 0.8688877492872416


In [28]:
create_model(gbt_json,df)

Dependent Variables: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
-------------------------------------
Target Variable: species
Original parameter for Grid Search: {'n_estimators': [67, 89], 'subsample': [1, 2], 'min_samples_split': [2], 'min_samples_leaf': [1, 2], 'max_depth': range(5, 8), 'learning_rate': [0.1], 'min_impurity_decrease': [0]}
-------------------------------------
  
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters are: GradientBoostingClassifier(max_depth=7, min_impurity_decrease=0,
                           min_samples_leaf=2, n_estimators=67, subsample=1)
-------------------------------------
  
Accuracy: 0.9666666666666667


In [43]:
create_model(neural_json,df)

Dependent Variables: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
-------------------------------------
Target Variable: species
Original parameter for Grid Search: {'hidden_layer_sizes': [67, 89], 'activation': ['relu'], 'alpha': [0.001], 'max_iter': [200], 'tol': [0], 'early_stopping': [True], 'solver': ['adam'], 'shuffle': [True], 'learning_rate_init': [0.001], 'batch_size': ['auto'], 'beta_1': [0.9], 'beta_2': [0.9], 'epsilon': [1e-06], 'power_t': [0], 'momentum': [0], 'nesterovs_momentum': [False]}
-------------------------------------
  
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best parameters are: MLPClassifier(alpha=0.001, beta_2=0.9, early_stopping=True, epsilon=1e-06,
              hidden_layer_sizes=89, momentum=0, nesterovs_momentum=False,
              power_t=0, tol=0)
-------------------------------------
  
Accuracy: 0.9666666666666667
