# In [45]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.svm import SVC, SVR
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, f1_score
import matplotlib.pyplot as plt
import warnings
import os
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier

warnings.filterwarnings('ignore')

# Load JSON configuration
# The location defaults to the original hard-coded path, but it can be
# overridden through the ALGORITHM_CONFIG_PATH environment variable so the
# script is not tied to a single machine.
config_path = os.environ.get(
    'ALGORITHM_CONFIG_PATH',
    'C:\\Users\\91844\\Desktop\\algorithm1.json',
)
if not os.path.isfile(config_path):
    raise FileNotFoundError(f"Configuration file not found: {config_path}")

# Decode explicitly instead of relying on the platform default encoding;
# 'utf-8-sig' also accepts files saved with a leading byte-order mark.
with open(config_path, encoding='utf-8-sig') as json_file:
    config = json.load(json_file)

# Read the dataset whose path is stored in the configuration's session info.
# A missing file is reported up front with an explicit FileNotFoundError.
dataset_path = config['design_state_data']['session_info']['dataset']
dataset_exists = os.path.isfile(dataset_path)
if not dataset_exists:
    raise FileNotFoundError(f"Dataset file not found: {dataset_path}")

df = pd.read_csv(dataset_path)

# Replace the string labels of the 'species' column (when the dataset has
# one) with the integer codes produced by a LabelEncoder.
species_column = 'species'
if species_column in df.columns:
    species_encoder = LabelEncoder()
    df[species_column] = species_encoder.fit_transform(df[species_column])

# Pull the target description and the per-feature handling rules out of the
# configuration:
#   target_column   - name of the column to predict
#   prediction_type - the configured task type for that target
#   features        - mapping of feature name -> handling settings
target_settings = config['design_state_data']['target']
target_column = target_settings['target']
prediction_type = target_settings['type']
features = config['design_state_data']['feature_handling']

# Impute missing values, in place, for every selected numerical feature.
# Features that are not selected are skipped; 'text' features are recognised
# by the configuration but have no preprocessing yet, so they are left as-is.
supported_strategies = ('mean', 'median', 'most_frequent', 'constant')
for feature_name, feature_information in features.items():
    if not feature_information['is_selected']:
        continue

    if feature_information['feature_variable_type'] != 'numerical':
        # Includes 'text' features: add text preprocessing here if required.
        continue

    feature_details = feature_information['feature_details']
    strategy_used = feature_details['impute_with'].lower()
    if strategy_used == 'average of values':
        # The configuration's wording for SimpleImputer's 'mean' strategy.
        strategy_used = 'mean'
    elif strategy_used not in supported_strategies:
        raise ValueError(f"Invalid imputation strategy '{strategy_used}' for feature '{feature_name}'")

    # fill_value is only consulted by SimpleImputer for the 'constant' strategy.
    impute_value = feature_details.get('impute_value')
    imputer = SimpleImputer(strategy=strategy_used, fill_value=impute_value)
    df[feature_name] = imputer.fit_transform(df[[feature_name]])

# Separate the predictors (X: every column except the target) from the
# target values (y).
X = df.drop(target_column, axis=1)
y = df.loc[:, target_column]

# Feature reduction (if applicable)
# With the 'Tree-based' method a random forest is fitted on all predictors
# and only the most important ones (by feature_importances_) are kept in X.
reduction_config = config['design_state_data']['feature_reduction']
if reduction_config['feature_reduction_method'] == 'Tree-based':
    # Rank with a forest that matches the task: a regressor cannot be fitted
    # meaningfully on class labels, so classification uses the classifier.
    if prediction_type == 'classification':
        forest_class = RandomForestClassifier
    else:
        forest_class = RandomForestRegressor
    model = forest_class(
        n_estimators=int(reduction_config['num_of_trees']),
        max_depth=int(reduction_config['depth_of_trees'])
    )
    model.fit(X, y)
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]  # most important first

    # Clamp the request to the number of available columns so that asking for
    # more features than exist keeps them all instead of raising IndexError.
    requested_features = int(reduction_config['num_of_features_to_keep'])
    num_features = max(0, min(requested_features, X.shape[1]))
    selected_features = [X.columns[i] for i in indices[:num_features]]
    X = X[selected_features]

# Model selection and hyperparameter tuning
#
# For every algorithm flagged `is_selected` in the configuration, build the
# hyperparameter grid from its settings, tune it with a 5-fold GridSearchCV
# and store the refitted best pipeline, its parameters and its score in
# `results`, keyed by algorithm name.
algorithms = config['design_state_data']['algorithms']
results = {}

# Entries that cover both tasks (SVM, SGD, KNN, ExtraRandomTrees,
# NeuralNetwork) pick their estimator variant and metric from the configured
# prediction type, so the estimator and the metric always agree.
is_classification = prediction_type == 'classification'

for algorithm_name, algo_information in algorithms.items():
    if not algo_information['is_selected']:
        continue

    if algorithm_name in ('RandomForestRegressor', 'RandomForestClassifier'):
        parameter_grid = {
            'model__n_estimators': range(algo_information['min_trees'], algo_information['max_trees'] + 1),
            'model__max_depth': range(algo_information['min_depth'], algo_information['max_depth'] + 1),
            'model__min_samples_leaf': range(algo_information['min_samples_per_leaf_min_value'], algo_information['min_samples_per_leaf_max_value'] + 1)
        }
        if algorithm_name == 'RandomForestRegressor':
            model = RandomForestRegressor()
            scorer = 'r2'  # Use r2 for regression
        else:
            model = RandomForestClassifier()
            scorer = 'f1_weighted'  # Use F1 score for classification

    elif algorithm_name in ('GBTRegressor', 'GBTClassifier'):
        parameter_grid = {
            'model__n_estimators': algo_information['num_of_BoostingStages'],
            'model__learning_rate': algo_information['learningRate'],
            'model__subsample': np.linspace(algo_information['min_subsample'], algo_information['max_subsample'], 5),
            'model__min_samples_split': range(algo_information['min_stepsize'], algo_information['max_stepsize'] + 1),
            'model__max_depth': range(algo_information['min_depth'], algo_information['max_depth'] + 1)
        }
        if algorithm_name == 'GBTRegressor':
            model = GradientBoostingRegressor()
            scorer = 'r2'  # Use r2 for regression
        else:
            model = GradientBoostingClassifier()
            scorer = 'f1_weighted'  # Use F1 score for classification

    elif algorithm_name == 'DecisionTreeRegressor':
        # The split criterion is left at the estimator default (mean squared
        # error). Naming it explicitly as 'mse' is rejected by scikit-learn
        # 1.2+, where that alias was removed.
        parameter_grid = {
            'model__max_depth': range(algo_information['min_depth'], algo_information['max_depth'] + 1),
            'model__min_samples_leaf': algo_information['min_samples_per_leaf']
        }
        model = DecisionTreeRegressor()
        scorer = 'r2'  # Use r2 for regression

    elif algorithm_name == 'DecisionTreeClassifier':
        parameter_grid = {
            'model__max_depth': range(algo_information['min_depth'], algo_information['max_depth'] + 1),
            'model__min_samples_leaf': algo_information['min_samples_per_leaf'],
            'model__criterion': ['entropy' if algo_information['use_entropy'] else 'gini']
        }
        model = DecisionTreeClassifier()
        scorer = 'f1_weighted'  # Use F1 score for classification

    elif algorithm_name == 'SVM':
        parameter_grid = {
            'model__C': algo_information['c_value'],
            'model__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
            'model__gamma': ['scale', 'auto'] if algo_information['auto'] else algo_information['custom_gamma_values'],
            'model__tol': [algo_information['tolerance']],
            'model__max_iter': [algo_information['max_iterations']]
        }
        # The task, not the kernel flag, decides between SVC and SVR; an F1
        # metric cannot score a regressor's continuous predictions.
        model = SVC() if is_classification else SVR()
        scorer = 'f1_weighted' if is_classification else 'r2'

    elif algorithm_name == 'SGD':
        parameter_grid = {
            'model__penalty': ['l1', 'l2', 'elasticnet'],
            'model__alpha': algo_information['alpha_value'],
            'model__tol': [algo_information['tolerance']],
            'model__max_iter': [algo_information['max_iterations']],
            'model__l1_ratio': np.linspace(algo_information['min_elasticnet'], algo_information['max_elasticnet'], 5)
        }
        if is_classification:
            # 'log' was renamed to 'log_loss' in scikit-learn 1.1 and the old
            # spelling was removed in 1.3.
            parameter_grid['model__loss'] = ['log_loss' if algo_information['use_logistics'] else 'modified_huber']
            model = SGDClassifier()
            scorer = 'f1_weighted'
        else:
            # 'log_loss' and 'modified_huber' are classification losses that
            # SGDRegressor rejects, so the regressor keeps its default loss.
            model = SGDRegressor()
            scorer = 'r2'

    elif algorithm_name == 'KNN':
        parameter_grid = {
            'model__n_neighbors': algo_information['k_value'],
            'model__weights': ['distance' if algo_information['distance_weighting'] else 'uniform'],
            'model__algorithm': [algo_information['neighbour_finding_algorithm']],
            'model__p': [algo_information['p_value']],
            'model__n_jobs': [algo_information['parallelism']]
        }
        model = KNeighborsClassifier() if is_classification else KNeighborsRegressor()
        scorer = 'f1_weighted' if is_classification else 'r2'

    elif algorithm_name == 'ExtraRandomTrees':
        parameter_grid = {
            'model__n_estimators': algo_information['num_of_trees'],
            'model__max_features': ['sqrt', 'log2'],
            'model__max_depth': algo_information['max_depth'],
            'model__min_samples_leaf': algo_information['min_samples_per_leaf'],
            'model__n_jobs': [algo_information['parallelism']]
        }
        model = ExtraTreesClassifier() if is_classification else ExtraTreesRegressor()
        scorer = 'f1_weighted' if is_classification else 'r2'

    elif algorithm_name == 'NeuralNetwork':
        parameter_grid = {
            'model__hidden_layer_sizes': algo_information['hidden_layer_sizes'],
            'model__activation': [algo_information['activation']],
            'model__alpha': [algo_information['alpha_value']],
            'model__max_iter': [algo_information['max_iterations']],
            'model__tol': [algo_information['convergence_tolerance']],
            'model__early_stopping': [algo_information['early_stopping']],
            'model__solver': [algo_information['solver']],
            'model__shuffle': [algo_information['shuffle_data']],
            'model__learning_rate_init': [algo_information['initial_learning_rate']],
            'model__beta_1': [algo_information['beta_1']],
            'model__beta_2': [algo_information['beta_2']],
            'model__epsilon': [algo_information['epsilon']],
            'model__power_t': [algo_information['power_t']],
            'model__momentum': [algo_information['momentum']],
            'model__nesterovs_momentum': [algo_information['use_nesterov_momentum']]
        }
        if algo_information['automatic_batching']:
            parameter_grid['model__batch_size'] = ['auto']
        # Otherwise batch_size is left at the estimator default: no explicit
        # size is read from the configuration here, and the string 'None' is
        # not a valid batch size.
        model = MLPClassifier() if is_classification else MLPRegressor()
        scorer = 'f1_weighted' if is_classification else 'r2'

    else:
        # No estimator is wired up for this name. Skip it rather than refit
        # whatever model and grid the previous iteration left behind.
        print(f"Skipping unsupported algorithm '{algorithm_name}'")
        continue

    pipeline = Pipeline([
        ('imputer', SimpleImputer()),
        ('model', model)
    ])

    grid_search = GridSearchCV(estimator=pipeline, param_grid=parameter_grid, cv=5, n_jobs=-1, scoring=scorer)
    grid_search.fit(X, y)
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X)

    # Report the metric that matches the scorer used for tuning, so the
    # estimator, the tuning objective and the reported score always agree.
    # NOTE: this score is computed on the same data the model was fitted on
    # (in-sample); grid_search.best_score_ holds the cross-validated value.
    if scorer == 'r2':
        score = r2_score(y, y_pred)
    else:
        score = f1_score(y, y_pred, average='weighted')

    results[algorithm_name] = {
        'Best Model': best_model,
        'Best Parameters': grid_search.best_params_,
        'Best Score': score
    }

# Report, for each tuned algorithm, its name, its best score and the
# hyperparameters that achieved it, followed by blank lines as a separator.
for model_name in results:
    metrics = results[model_name]
    report_lines = (
        f"Model: {model_name}",
        f"Best Score: {metrics['Best Score']}",
        f"Best Parameters: {metrics['Best Parameters']}",
        "\n",
    )
    for report_line in report_lines:
        print(report_line)
    


# Cell output:
# Model: RandomForestRegressor
# Best Score: 0.9661141355459152
# Best Parameters: {'model__max_depth': 23, 'model__min_samples_leaf': 5, 'model__n_estimators': 10}


