# First, load the JSON configuration file in Python and parse it

In [None]:
import json
# Parse the UI-exported algorithm configuration into `data`.
# JSON files are UTF-8 by specification; naming the encoding explicitly keeps
# the read from depending on the platform's default locale encoding.
with open("algoparams_from_ui.json", encoding="utf-8") as json_file:
    data = json.load(json_file)

# Import the required Python libraries

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import (
    GradientBoostingClassifier,
    GradientBoostingRegressor,
    RandomForestClassifier,
    RandomForestRegressor,
)
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import (
    ElasticNet,
    Lasso,
    LinearRegression,
    LogisticRegression,
    Ridge,
)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# Q1  Read the target variable and type of regression to be run

In [None]:
# Pull the target column name and the prediction type out of the "target"
# section of the configuration in a single pass.
target, prediction_type = (
    data["design_state_data"]["target"][field] for field in ("target", "type")
)


# Q2 Read the features (which are column names in the csv) and figure out what missing imputation needs to be applied and apply that to the columns loaded in a dataframe

In [None]:
# Load the dataset and fill missing values as configured for each feature.
session_info = data["design_state_data"]["session_info"]
# The original looked up a Windows path as a key of session_info, written as a
# normal string literal; "\U" starts a unicode escape there, so the file did
# not even parse. The path survives below as a raw-string fallback.
# NOTE(review): assumes the JSON stores the CSV location under "dataset" — confirm.
dataset_path = session_info.get("dataset", r"C:\Users\Ak7shy\Desktop\iris.csv")
df = pd.read_csv(dataset_path)

for feature, details in data["design_state_data"]["feature_handling"].items():
    # NOTE(review): the imputation options may be nested under
    # "feature_details" in some configs — confirm; flat entries are used as-is.
    options = details.get("feature_details", details)
    # Skip features that are absent from the CSV or not marked for imputation
    # instead of failing with a KeyError.
    if feature not in df.columns or options.get("missing_values") != "Impute":
        continue
    if options.get("impute_with") == "Average of values":
        fill_value = df[feature].mean()
    else:
        # Any other strategy falls back to the explicitly configured value.
        fill_value = options["impute_value"]
    df[feature] = df[feature].fillna(fill_value)

# Q3 Compute feature reduction based on input. See the screenshot below where there can be No Reduction, Corr with Target, Tree-based, PCA. Please make sure you write code so that all options can work. If we rerun your code with a different Json it should work if we switch No Reduction to say PCA.

In [None]:
# Reduce the feature set according to the "feature_reduction" config section.
# Supported methods: "Tree-based", "Corr with Target" and "PCA"; any other
# value (e.g. "No Reduction") leaves `df` untouched.
reduction_config = data["design_state_data"]["feature_reduction"]
reduction_method = reduction_config["feature_reduction_method"]

if reduction_method in ("Tree-based", "Corr with Target", "PCA"):
    # All three methods need numeric input, so non-numeric feature columns are
    # left out of the reduced frame.
    X = df.drop(columns=[target]).select_dtypes(include="number")
    y = df[target]
    # Clamp the request so asking for more features than exist cannot fail.
    # NOTE(review): assumes "Corr with Target" uses the same
    # "num_of_features_to_keep" key as the other methods — confirm.
    num_of_features_to_keep = min(
        int(reduction_config["num_of_features_to_keep"]), X.shape[1]
    )

    if reduction_method == "Tree-based":
        num_of_trees = int(reduction_config["num_of_trees"])
        depth_of_trees = int(reduction_config["depth_of_trees"])
        # Rank features by the impurity-based importances of a random forest
        # built with the configured number and depth of trees.
        forest_cls = (
            RandomForestRegressor
            if prediction_type == "Regression"
            else RandomForestClassifier
        )
        forest = forest_cls(
            n_estimators=num_of_trees,
            max_depth=depth_of_trees,
            random_state=0,  # keep the selection reproducible across reruns
        )
        forest.fit(X, y)
        importances = pd.Series(forest.feature_importances_, index=X.columns)
        selected_features = list(importances.nlargest(num_of_features_to_keep).index)
        X_reduced = X[selected_features]
    elif reduction_method == "Corr with Target":
        # Keep the features with the largest absolute correlation with the
        # target; this needs a numeric target column.
        correlations = X.corrwith(y).abs()
        selected_features = list(correlations.nlargest(num_of_features_to_keep).index)
        X_reduced = X[selected_features]
    else:  # "PCA"
        pca = PCA(n_components=num_of_features_to_keep)
        selected_features = [f"PC{i + 1}" for i in range(num_of_features_to_keep)]
        # Reuse X's index so the components stay row-aligned with the target
        # when the two are concatenated below.
        X_reduced = pd.DataFrame(
            pca.fit_transform(X), columns=selected_features, index=X.index
        )

    df = pd.concat([X_reduced, y], axis=1)


# Q4  Parse the JSON and make the model objects (using sklearn) that can handle what is required in the “prediction_type” specified in the JSON (See #1 where “prediction_type” is specified). Keep in mind not to pick models that don’t apply for the prediction_type specified
 

In [None]:
# Registry of candidate estimators, keyed first by prediction type and then by
# the algorithm name expected in the JSON "algorithms" section.
models = {
    "Regression": {
        "LinearRegression": LinearRegression(),
        "Ridge": Ridge(),
        "Lasso": Lasso(),
        "ElasticNet": ElasticNet(),
        "RandomForestRegressor": RandomForestRegressor(),
        "GradientBoostingRegressor": GradientBoostingRegressor(),
        "SVR": SVR(),
        "KNeighborsRegressor": KNeighborsRegressor(),
        "MLPRegressor": MLPRegressor(),
        "DecisionTreeRegressor": DecisionTreeRegressor(),
    },
    "Classification": {
        "LogisticRegression": LogisticRegression(),
        "RandomForestClassifier": RandomForestClassifier(),
        "GradientBoostingClassifier": GradientBoostingClassifier(),
        "SVC": SVC(),
        "KNeighborsClassifier": KNeighborsClassifier(),
        "MLPClassifier": MLPClassifier(),
        "DecisionTreeClassifier": DecisionTreeClassifier(),
    },
}

# Fail with a readable message rather than a bare KeyError when the config
# asks for a prediction type that has no registered estimators.
if prediction_type not in models:
    raise ValueError(
        f"Unsupported prediction type {prediction_type!r}; "
        f"expected one of {sorted(models)}"
    )
available_models = models[prediction_type]

# Keep only algorithms that are switched on in the config AND registered for
# the configured prediction type; everything else is ignored. An entry without
# an "is_selected" flag counts as not selected.
selected_models = [
    (model_name, available_models[model_name])
    for model_name, model_details in data["design_state_data"]["algorithms"].items()
    if model_details.get("is_selected") and model_name in available_models
]


# Q5 Run the fit and predict on each model – keep in mind that you need to do hyperparameter tuning, i.e., use GridSearchCV
 

In [None]:
# Tune, fit and evaluate every selected model; `results` maps each model name
# to its best parameters, best cross-validation score and evaluation metrics.
results = {}

# The feature matrix and target are the same for every model: build them once.
X_all = df.drop(columns=[target])
y_true = df[target]
# Mean squared error is only meaningful for regressors, so any other
# prediction type is tuned on accuracy instead.
scoring = "neg_mean_squared_error" if prediction_type == "Regression" else "accuracy"

for model_name, model in selected_models:
    # NOTE(review): create_pipeline and get_model_metrics are not defined in
    # the visible part of this file — confirm they exist before this cell runs.
    pipeline = create_pipeline(model, prediction_type)
    # NOTE(review): the same grid is handed to every model; GridSearchCV needs
    # its keys to be valid parameter names of `pipeline` — confirm the JSON
    # layout matches.
    hyperparameters = data["design_state_data"]["hyperparameters"]
    grid_search = GridSearchCV(pipeline, hyperparameters, scoring=scoring, cv=5)
    grid_search.fit(X_all, y_true)

    # Predictions are made on the rows the model was fitted on, so the metrics
    # below are in-sample figures, not held-out estimates.
    y_pred = grid_search.predict(X_all)

    results[model_name] = {
        "best_params": grid_search.best_params_,
        "best_score": grid_search.best_score_,
        # Record the scorer so the score can be labelled correctly later.
        "scoring": scoring,
        **get_model_metrics(y_true, y_pred, prediction_type),
    }

# Print the required metrics for each model

In [None]:
# Print a short report for every tuned model.
for model_name, metrics in results.items():
    # GridSearchCV's best score is expressed in the units of its scorer; under
    # "neg_mean_squared_error" that is the *negated* MSE, so the label names
    # the scorer instead of claiming a plain MSE. Entries that do not record
    # their scorer fall back to the one the tuning step uses.
    scoring_name = metrics.get("scoring", "neg_mean_squared_error")
    print(f"Model: {model_name}")
    print(f"Best Parameters: {metrics['best_params']}")
    print(f"Best Score ({scoring_name}): {metrics['best_score']}")
    # Only report metrics that were actually computed for this model, so a
    # missing key does not abort the whole report.
    for label, key in (
        ("Mean Squared Error", "mean_squared_error"),
        ("Mean Absolute Error", "mean_absolute_error"),
        ("R-squared Score", "r2_score"),
    ):
        if key in metrics:
            print(f"{label}: {metrics[key]}")
    print("-------------")