In [749]:
# import packages
import pandas as pd
import numpy as np
import json
!pip install striprtf
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
!pip install xgboost
from xgboost import XGBClassifier, XGBRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, f1_score, roc_auc_score,accuracy_score,auc
from striprtf.striprtf import rtf_to_text
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import StratifiedKFold



In [750]:
# loading the iris data
iris = pd.read_csv("/content/iris.csv")

# **Data Preprocessing**

In [751]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [752]:
iris.shape

(150, 5)

In [753]:
iris.nunique()

Unnamed: 0,0
sepal_length,35
sepal_width,23
petal_length,43
petal_width,22
species,3


In [754]:
iris.isnull().sum()

Unnamed: 0,0
sepal_length,0
sepal_width,0
petal_length,0
petal_width,0
species,0


**Loading the JSON file**

In [755]:
# Lopading the JSON file
import json
from striprtf.striprtf import rtf_to_text

rtf_file = "algoparams_from_ui.json.rtf"

with open(rtf_file, 'r') as file:
    rtf_content = file.read()

# Convert RTF to plain text
plain_text = rtf_to_text(rtf_content)

# Assuming the valid JSON is enclosed within curly braces {}, extract it
start_index = plain_text.find('{')
end_index = plain_text.rfind('}') + 1  # Include the closing brace

# Extract the JSON string
json_string = plain_text[start_index:end_index]

json_data = json.loads(json_string)

import pprint
pprint.pprint(json_data)



{'design_state_data': {'algorithms': {'DecisionTreeClassifier': {'is_selected': False,
                                                                 'max_depth': 7,
                                                                 'min_depth': 4,
                                                                 'min_samples_per_leaf': [12,
                                                                                          6],
                                                                 'model_name': 'Decision '
                                                                               'Tree',
                                                                 'use_best': True,
                                                                 'use_entropy': True,
                                                                 'use_gini': False,
                                                                 'use_random': True},
                                      'Decisio

### **Imputing Missing Values**

In [756]:
# feature handling

feature_handling = json_data["design_state_data"]["feature_handling"]

# Apply missing value imputation
for feature, details in feature_handling.items():
    if details["is_selected"] and "missing_values" in details["feature_details"]:
        missing_values_action = details["feature_details"]["missing_values"]
        impute_with = details["feature_details"]["impute_with"]
        impute_value = details["feature_details"]["impute_value"]

        if missing_values_action == "Impute":
            if impute_with == "Average of values":
                iris[feature] = iris[feature].fillna(iris[feature].mean())
            elif impute_with == "custom":
                iris[feature] = iris[feature].fillna(impute_value)


### **Extract Target and Feature data**

In [757]:
# Extract target column and regression type
target_target = json_data["design_state_data"]["target"]["target"]
target_type = json_data["design_state_data"]["target"]["type"]
print(target_target)
print(target_type)

petal_width
regression


In [758]:
# Extract the target variable and selected features from the JSON
selected_features = [feature for feature, details in json_data["design_state_data"]["feature_handling"].items() if details["is_selected"]]

In [759]:
# Remove the target column from selected features (not part of features to reduce)
selected_features = [feature for feature in selected_features if feature != target_target]
print(selected_features)

['sepal_length', 'sepal_width', 'petal_length', 'species']


In [760]:
# Extract target and feature data
X = iris[selected_features]
Y = iris[target_target]

### **Feature extraction**

In [761]:
# Perform one-hot encoding
encoded_species = pd.get_dummies(iris['species'], prefix='species')

# Convert encoded columns to integers (1/0)
encoded_species = encoded_species.astype(int)

# Concatenate and drop original column
X = pd.concat([iris, encoded_species], axis=1)
X.drop('species', axis=1, inplace=True)

X.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species_Iris-setosa,species_Iris-versicolor,species_Iris-virginica
0,5.1,3.5,1.4,0.2,1,0,0
1,4.9,3.0,1.4,0.2,1,0,0
2,4.7,3.2,1.3,0.2,1,0,0
3,4.6,3.1,1.5,0.2,1,0,0
4,5.0,3.6,1.4,0.2,1,0,0


**Defining different extraction processes**

In [762]:
# Function to perform correlation with target feature
def correlation_with_target(X, Y):
    correlations = X.corrwith(Y).abs()  # Get absolute correlation values
    sorted_features = correlations.sort_values(ascending=False)  # Sort by correlation
    return sorted_features.head(min(num_of_features_to_keep,7)).index.tolist()


# Function to perform Tree-based feature selection
def tree_based_feature_selection(X, Y, num_of_features_to_keep, num_of_trees, depth_of_trees):
    from sklearn.ensemble import RandomForestRegressor
    model = RandomForestRegressor(n_estimators=num_of_trees, max_depth=depth_of_trees, random_state=42)
    model.fit(X, Y)
    importances = model.feature_importances_
    # Get indices of the top N features based on importance
    indices = importances.argsort()[-min(num_of_features_to_keep,7):][::-1]
    return indices

# Function to perform PCA for feature reduction
def pca_feature_reduction(X):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    pca = PCA(n_components=num_of_features_to_keep)
    X_pca = pca.fit_transform(X_scaled)

    # Return the names of the components as selected features
    return [f"PCA_{i+1}" for i in range(num_of_features_to_keep)]


**Finding the feature reduction method**

In [763]:
feature_reduction = json_data["design_state_data"]["feature_reduction"]

# Apply feature reduction based on the method specified in the JSON
if feature_reduction['No Reduction']["is_selected"] == True:
    reduction_method = 'No Reduction' # No reduction, keep all selected features
elif feature_reduction['Correlation with target']["is_selected"] == True:
    reduction_method = 'Correlation with target'
elif feature_reduction['Tree-based']["is_selected"] == True:
    reduction_method = 'Tree-based'
elif feature_reduction['Principal Component Analysis']["is_selected"] == True:
    reduction_method = 'Principal Component Analysis'
else:
    print("Unknown feature reduction command")

print(reduction_method)

Correlation with target


In [764]:
# Get the minimum number of features to keep
num_of_features_to_keep = int(json_data["design_state_data"]["feature_reduction"][reduction_method]["num_of_features_to_keep"])
print(num_of_features_to_keep)

4


**Applying the feature reduction method**

In [765]:
# Apply feature reduction based on the method specified in the JSON
if feature_reduction['No Reduction']["is_selected"] == True:
    reduced_features = X.columns
elif feature_reduction['Correlation with target']["is_selected"] == True:
    reduced_features = correlation_with_target(X, Y)
elif feature_reduction['Tree-based']["is_selected"] == True:
    reduced_features = tree_based_feature_selection(X, Y)
elif feature_reduction['Principal Component Analysis']["is_selected"] == True:
    reduced_features = pca_feature_reduction(X)
else:
    print("Unknown feature reduction command")

print(reduced_features)

['petal_width', 'petal_length', 'species_Iris-setosa', 'sepal_length']


## **Model Development**

**Loading all models**

In [766]:
def get_model(model_name,target_type):
    if model_name == "LinearRegression":
        return LinearRegression()
    elif model_name == "LogisticRegression":
        return LogisticRegression()
    elif model_name == "RidgeRegression":
        return Ridge()
    elif model_name == "LassoRegression":
        return Lasso()
    elif model_name == "ElasticNetRegression":
        return ElasticNet()
    elif model_name == "RandomForestClassifier":
        return RandomForestClassifier()
    elif model_name == "RandomForestRegressor":
        return RandomForestRegressor()
    elif model_name == "GBTClassifier":
        return GradientBoostingClassifier()
    elif model_name == "GBTRegressor":
        return GradientBoostingRegressor()
    elif model_name == "SVM":
        if target_type == 'classification':
            return SVC()
        elif target_type == 'regression':
            return SVR()
    elif model_name == "DecisionTreeClassifier":
        return DecisionTreeClassifier()
    elif model_name == "DecisionTreeRegressor":
        return DecisionTreeRegressor()
    elif model_name == "KNN":
        if target_type == 'classification':
            return KNeighborsClassifier()
        elif target_type == 'regression':
            return KNeighborsRegressor()
    elif model_name == "xg_boost":
        if target_type == 'classification':
            return XGBClassifier()
        elif target_type == 'regression':
            return XGBRegressor()
    elif model_name == "neural_network":
        if target_type == 'classification':
            return MLPClassifier()
        elif target_type == 'regression':
            return MLPRegressor()
    elif model_name == "SGD":
        if target_type == 'classification':
            return SGDClassifier()
        elif target_type == 'regression':
            return SGDRegressor()
    elif model_name == 'extra_random_trees':
        if target_type == 'classification':
            return ExtraTreesClassifier()
        elif target_type == 'regression':
            return ExtraTreesRegressor()
    else:
        print("Unknown model name")

**Checking for selected models and performing train test split**

In [767]:
# Extracting the selected models according
selected_models = [model for model, details in json_data['design_state_data']['algorithms'].items()
                     if details['is_selected']]

# Display selected regression models
print(selected_models)

['RandomForestRegressor', 'GBTRegressor', 'LinearRegression', 'RidgeRegression', 'LassoRegression', 'ElasticNetRegression', 'xg_boost', 'DecisionTreeRegressor', 'SVM', 'SGD', 'KNN', 'extra_random_trees', 'neural_network']


In [768]:
# train test splitting the data
train_ratio = json_data["design_state_data"]["train"]["train_ratio"]
random_seed = json_data["design_state_data"]["train"]["random_seed"]
print(train_ratio)
print(random_seed)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=train_ratio, random_state=random_seed)


0.8
42


### **Parameter grid corresponding to each model**

In [769]:
# Function to generate parameter grids
def generate_param_grid(model_name, model_details):
    param_grid = {}

    if model_name == "RandomForestClassifier" or model_name == "RandomForestRegressor":
        param_grid = {
            "n_estimators": [model_details['min_trees'], model_details['max_trees']],
            "max_depth": [model_details['min_depth'], model_details['max_depth']],
            "min_samples_split": [model_details['min_samples_per_leaf_min_value'], model_details['min_samples_per_leaf_max_value']],
            "n_jobs": [model_details['parallelism'] if model_details['parallelism'] != 0 else -1 ],
        }
    elif model_name == "GBTClassifier" or model_name == "GBTRegressor":
        param_grid = {
            "n_estimators": model_details['num_of_BoostingStages'],
            "learning_rate": [model_details.get('learningRate', 0.1)],
            "max_depth": [model_details['min_depth'], model_details['max_depth']],
            "min_samples_split": [model_details['min_subsample'], model_details['max_subsample']],
            "max_features": [model_details['fixed_number']],
        }
    elif model_name == "LinearRegression" or model_name == "LogisticRegression":
          param_brid = {
            "max_iter": [model_details['min_iter'], model_details['max_iter']],
            #"tol": [0.0001],
            "fit_intercept": [True, False],
            "normalize": [True, False] if model_name != "LogisticRegression" else [False],
       }
    elif model_name == "RidgeRegression" or model_name == "LassoRegression":
         param_grid = {
            "alpha": [model_details['min_regparam'], model_details['max_regparam']],
            "max_iter": [model_details['min_iter'], model_details['max_iter']],
            #"tol": [0.0001],
            "fit_intercept": [True, False],
         }
    elif model_name == "ElasticNetRegression":
        param_grid = {
            "alpha": [model_details['min_regparam'], model_details['max_regparam']],
            "l1_ratio": [model_details['min_elasticnet'], model_details['max_elasticnet']],
            "max_iter": [model_details['min_iter'], model_details['max_iter']],
            #"tol": [model_details['tolerance'] if model_details['tolerance'] != 0 else 1e-3 ],
            "fit_intercept": [True, False],
            #"normalize": [True, False],
        }
    elif model_name == "DecisionTreeClassifier" or model_name == "DecisionTreeRegressor":
        param_grid = {
            "max_depth": [model_details['min_depth'], model_details['max_depth']],
            "min_samples_split": [model_details['min_samples_per_leaf'][0], model_details['min_samples_per_leaf'][1]],
            "splitter": ['best', 'random'],
        }
    elif model_name == "SVM":
        param_grid = {
            "C": model_details['c_value'],
            "kernel": ['linear', 'rbf', 'poly', 'sigmoid'],
            "gamma": [model_details['custom_gamma_values']],
            "max_iter": [model_details['max_iterations']],
            "tol": [model_details['tolerance'] if model_details['tolerance'] != 0 else 1e-3 ],
        }
    elif model_name == "SGD":
        param_grid = {
            "alpha": model_details['alpha_value'],
            "max_iter": [10],
            "tol":  [model_details['tolerance'] if model_details['tolerance'] != 0 else 1e-3 ],
            "penalty": ['l1', 'l2', 'elasticnet'],
        }
    elif model_name == "KNN":
        param_grid = {
            "n_neighbors": model_details['k_value'],
            "weights": ['uniform', 'distance'],
            "algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
            "p": [model_details['p_value'] if model_details['p_value'] != 0 else 1 ],
        }
    elif model_name == "extra_random_trees":
        param_grid = {
            "n_estimators": model_details['num_of_trees'],
            "max_depth": model_details['max_depth'],
            "min_samples_split": model_details['min_samples_per_leaf'],
            "max_features": ['sqrt', 'log2'],
            "n_jobs": [model_details['parallelism'] if model_details['parallelism'] != 0 else -1 ],
        }
    elif model_name == "xg_boost":
        param_grid = {
            "n_estimators":  [model_details['max_num_of_trees'] if model_details['max_num_of_trees'] != 0 else 50 ],
            "learning_rate": model_details['learningRate'],
            "max_depth": model_details['max_depth_of_tree'],
            "gamma": model_details['gamma'],
            "subsample": model_details['sub_sample'],
            "colsample_bytree": model_details['col_sample_by_tree'],
            "min_child_weight": model_details['min_child_weight'],
            "early_stopping_rounds": [model_details['early_stopping_rounds']],
            "n_jobs": [model_details['parallelism'] if model_details['parallelism'] != 0 else -1 ],
        }
    elif model_name == "neural_network":
      # Provide a default value for 'activation' if it's missing or invalid
         param_grid = {
            "hidden_layer_sizes": [tuple(model_details['hidden_layer_sizes'])],
            "activation": [model_details['activation'] if model_details['activation'] != "" else "relu" ],
            "solver": [model_details['solver'] if model_details['solver'] in ['adam', 'sgd', 'lbfgs'] else "adam" ],
            "alpha": [model_details['alpha_value'] if model_details['alpha_value'] != 0 else 0.001 ],
            "learning_rate_init": [model_details['initial_learning_rate'] if model_details['initial_learning_rate'] != 0 else 0.001 ],
            "max_iter": [model_details['max_iterations'] if model_details['max_iterations'] != 0 else 200 ],
        }
    return param_grid

In [770]:
# Printing parameter grids
for model_name in selected_models:
    print(model_name)
    print(generate_param_grid(model_name, json_data['design_state_data']['algorithms'][model_name]))

RandomForestRegressor
{'n_estimators': [10, 20], 'max_depth': [20, 25], 'min_samples_split': [5, 10], 'n_jobs': [-1]}
GBTRegressor
{'n_estimators': [67, 89], 'learning_rate': [0.1], 'max_depth': [5, 7], 'min_samples_split': [2, 3], 'max_features': [22]}
LinearRegression
{}
RidgeRegression
{'alpha': [0.5, 0.8], 'max_iter': [30, 50], 'fit_intercept': [True, False]}
LassoRegression
{'alpha': [0.5, 0.8], 'max_iter': [30, 50], 'fit_intercept': [True, False]}
ElasticNetRegression
{'alpha': [0.5, 0.8], 'l1_ratio': [0.5, 0.8], 'max_iter': [30, 50], 'fit_intercept': [True, False]}
xg_boost
{'n_estimators': [50], 'learning_rate': [0.01, 0.3], 'max_depth': [3, 10], 'gamma': [4], 'subsample': [0.67], 'colsample_bytree': [0.67], 'min_child_weight': [67], 'early_stopping_rounds': [2], 'n_jobs': [-1]}
DecisionTreeRegressor
{'max_depth': [4, 7], 'min_samples_split': [12, 6], 'splitter': ['best', 'random']}
SVM
{'C': [566, 79], 'kernel': ['linear', 'rbf', 'poly', 'sigmoid'], 'gamma': [True], 'max_iter'

### **Implementing Hyperparameter tuning**

In [771]:
# hyperparameter tuning
search_method = json_data['design_state_data']['hyperparameters']['search_method']
if  json_data["design_state_data"]["train"]["k_fold"] == False:
  cv = None
else:
  cv = KFold(n_splits=5, shuffle=True, random_state=42)

print(search_method)
print(cv)
print(json_data["design_state_data"]["train"]["k_fold"] )

Grid Search
KFold(n_splits=5, random_state=42, shuffle=True)
True


### **Finding the best model**

In [772]:
# Creating grids and finding the best model
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Initialize variables to store the best model and performance
best_model = None
best_score = float('inf')  # For regression, lower MSE is better
best_model_name = ""

for model_name in selected_models:
    if model_name != 'xg_boost':
        grid_search = GridSearchCV(
            estimator = get_model(model_name, target_type),
            param_grid = generate_param_grid(model_name, json_data['design_state_data']['algorithms'][model_name]),
            cv=cv,
            scoring='accuracy' if target_type == 'classification' else 'neg_mean_squared_error',
            n_jobs=-1,
            verbose=1
        )
        grid_search.fit(X_train, Y_train)
        best_estimator = grid_search.best_estimator_
        predictions = best_estimator.predict(X_test)
        mse = mean_squared_error(Y_test, predictions)

        print(f"{model_name} - Best Parameters:\n{grid_search.best_params_}")
        print(f"{model_name} - Mean Squared Error on Test Data: {mse}\n")

        # Check if this model is the best so far
        if mse < best_score:  # For regression, lower MSE is better
            best_score = mse
            best_model_name = model_name

    else:
        # For 'xg_boost' models
        if target_type == "classification":
            model = XGBClassifier()
        else:
            model = XGBRegressor()

        model.fit(X_train, Y_train)
        predictions = model.predict(X_test)
        mse = mean_squared_error(Y_test, predictions)

        print(f"{model_name} - Mean Squared Error on Test Data: {mse}\n")

        # Check if this model is the best so far
        if mse < best_score:  # For regression, lower MSE is better
            best_score = mse
            best_model_name = model_name

# After all models are evaluated, print the best model's details
print(f"\nBest Model: {best_model_name}")
print(f"Best Model's Mean Squared Error:{best_score}\n")


Fitting 5 folds for each of 8 candidates, totalling 40 fits
RandomForestRegressor - Best Parameters:
{'max_depth': 25, 'min_samples_split': 5, 'n_estimators': 20, 'n_jobs': -1}
RandomForestRegressor - Mean Squared Error on Test Data: 0.00041115191995780276

Fitting 5 folds for each of 8 candidates, totalling 40 fits
GBTRegressor - Best Parameters:
{'learning_rate': 0.1, 'max_depth': 7, 'max_features': 22, 'min_samples_split': 2, 'n_estimators': 67}
GBTRegressor - Mean Squared Error on Test Data: 0.0009874067789847976

Fitting 5 folds for each of 1 candidates, totalling 5 fits
LinearRegression - Best Parameters:
{}
LinearRegression - Mean Squared Error on Test Data: 7.979127759596318e-32

Fitting 5 folds for each of 8 candidates, totalling 40 fits
RidgeRegression - Best Parameters:
{'alpha': 0.5, 'fit_intercept': False, 'max_iter': 30}
RidgeRegression - Mean Squared Error on Test Data: 0.0004816757219879755

Fitting 5 folds for each of 8 candidates, totalling 40 fits
LassoRegression - B