In [9]:
import numpy as np
import pandas as pd

# `headers` holds the attribute names parsed from the .names file. It lives at
# module level because later cells rebuild column lists from it; the guard
# keeps an already-populated value when this cell is re-run.
try:
    headers
except NameError:
    headers = []

def import_raw_data() -> pd.DataFrame:
    """Load the raw communities table and strip the non-predictive columns.

    Column names are taken from every line of ``Datasets/communities.names``
    that starts with ``@attribute`` (second whitespace-separated token) and
    are stored in the module-level ``headers`` list. The data file is read
    with those names, the five identifier columns are dropped and the ``'?'``
    placeholder is replaced by NaN.

    Returns:
        pd.DataFrame: the table without ``state``, ``county``, ``community``,
        ``communityname`` and ``fold``; ``'?'`` entries are NaN. Other values
        are left exactly as pandas parsed them.
    """
    global headers
    path1 = 'Datasets/communities.data'
    path2 = 'Datasets/communities.names'
    with open(path2, 'r') as file:
        headers = [line.split()[1] for line in file if line.startswith('@attribute')]
    data = pd.read_csv(path1, sep=',', names=headers)
    data = data.drop(columns=['state', 'county', 'community', 'communityname', 'fold'])
    # np.NAN was removed in NumPy 2.0; np.nan works on every NumPy version.
    return data.replace('?', np.nan)

['state', 'county', 'community', 'communityname', 'fold', 'population', 'householdsize', 'racepctblack', 'racePctWhite', 'racePctAsian', 'racePctHisp', 'agePct12t21', 'agePct12t29', 'agePct16t24', 'agePct65up', 'numbUrban', 'pctUrban', 'medIncome', 'pctWWage', 'pctWFarmSelf', 'pctWInvInc', 'pctWSocSec', 'pctWPubAsst', 'pctWRetire', 'medFamInc', 'perCapInc', 'whitePerCap', 'blackPerCap', 'indianPerCap', 'AsianPerCap', 'OtherPerCap', 'HispPerCap', 'NumUnderPov', 'PctPopUnderPov', 'PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore', 'PctUnemployed', 'PctEmploy', 'PctEmplManu', 'PctEmplProfServ', 'PctOccupManu', 'PctOccupMgmtProf', 'MalePctDivorce', 'MalePctNevMarr', 'FemalePctDiv', 'TotalPctDiv', 'PersPerFam', 'PctFam2Par', 'PctKids2Par', 'PctYoungKids2Par', 'PctTeen2Par', 'PctWorkMomYoungKids', 'PctWorkMom', 'NumIlleg', 'PctIlleg', 'NumImmig', 'PctImmigRecent', 'PctImmigRec5', 'PctImmigRec8', 'PctImmigRec10', 'PctRecentImmig', 'PctRecImmig5', 'PctRecImmig8', 'PctRecImmig10', 'PctSpeakEnglOn

In [None]:
# Build the cleaned raw table and persist it so later sections can reload it from disk.
data = import_raw_data()
data.to_csv("data.csv", index=False)

# DATA IMPUTED BY MICE

In [None]:
from copy import copy

# Work on a copy so the raw frame is not touched by the imputation steps.
data_mice = copy(data)
# The last column is the target; every column before it is a feature.
mice_X, mice_y = data_mice.iloc[:, :-1], data_mice.iloc[:, -1]

In [None]:
# IterativeImputer is experimental: the enable_* import has to come first.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import time

# Imputed values are clipped to [0, 1] through min_value / max_value.
# NOTE(review): presumably every feature is already scaled to [0, 1] — confirm against the data set.
imputer = IterativeImputer(max_iter=500, random_state=42, tol=0.001, min_value=0, max_value=1)
# Wall-clock timing of the fit, printed below in seconds.
time_start = time.time()
imputed_values = imputer.fit_transform(mice_X)
time_stop = time.time()
print(f"Exec time: {time_stop-time_start}")

In [None]:
# The imputer returns a bare array: wrap it, re-attach the target, then restore
# the column names from `headers` (minus the identifier columns that were dropped).
kept_column_names = [
    name for name in headers
    if name not in ('state', 'county', 'community', 'communityname', 'fold')
]
data_mice = pd.DataFrame(imputed_values)
data_mice['ViolentCrimesPerPop'] = mice_y
data_mice.columns = kept_column_names

In [None]:
# Persist the MICE-imputed table (features + target) for the "READING IN ALL DATA" section.
data_mice.to_csv("data_mice.csv", index=False)

In [None]:
# Features are all columns but the last; the last column is the target.
mice_X, mice_y = data_mice.iloc[:, :-1], data_mice.iloc[:, -1]

In [None]:
# TOP 30 MICE FEATURES

In [None]:
# Rank every feature by the absolute value of its correlation with the target
# and keep the 30 strongest ones.
correlation_matrix = data_mice.corr().abs()
target_correlation = correlation_matrix['ViolentCrimesPerPop'].drop(labels=['ViolentCrimesPerPop'])
sorted_features = target_correlation.sort_values(ascending=False)
top_30_mice_cols = sorted_features.index[:30].tolist()
print("Top 30 features with highest correlation to target:", top_30_mice_cols)
top_30_mice_X = mice_X.loc[:, top_30_mice_cols]
top_30_mice_X

In [None]:
# Persist the 30 selected MICE features (no target column).
top_30_mice_X.to_csv("top_30_mice_X.csv", index=False)

# DATA IMPUTED BY AUTOENCODER

In [None]:
from copy import copy

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Use the public tensorflow.keras paths throughout: `keras.src` is a private
# package layout, and mixing it with tensorflow.keras objects is fragile.
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1_l2

# Float32 copy of the raw table; NaN marks the values to impute.
auto = copy(data.astype(np.float32))
# The last column is the target; every column before it is a feature.
auto_X = auto.iloc[:,:-1]
auto_y = auto.iloc[:,-1]

In [None]:
# Float32 array of the features; not used again in this cell.
auto_X_array: np.ndarray = auto_X.astype(np.float32).to_numpy()

# Boolean matrix, True where a feature value is missing (same content as `is_missing` below).
mask = pd.isna(auto_X).to_numpy()

# Missing entries become 0 so the network can be fed a dense matrix.
auto_X_filled = auto_X.fillna(0)

auto_X_filled_array: np.ndarray = auto_X_filled.to_numpy()

# NOTE(review): inputs and reconstruction targets both contain the zero fill, so the
# network is also trained to output 0 at missing positions — confirm this is intended.
X_train, X_val = train_test_split(auto_X_filled_array, test_size=0.2, random_state=42)
input_dim = X_train.shape[1]
encoding_dim = 50

# Single-hidden-layer autoencoder: input -> 50 ReLU units (regularized, dropout) -> input_dim sigmoids.
input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='relu', activity_regularizer=l1_l2(l1=1e-5, l2=1e-4))(input_layer)
encoded = Dropout(0.3)(encoded)
# Sigmoid keeps every reconstructed value inside (0, 1).
# NOTE(review): assumes the features are scaled to [0, 1] — TODO confirm.
decoded = Dense(input_dim, activation='sigmoid')(encoded)

autoencoder = Model(input_layer, decoded)

autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Stop once val_loss has not improved for 10 epochs and roll back to the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

autoencoder.fit(X_train, X_train,
                epochs=1000,
                batch_size=256,
                shuffle=True,
                validation_data=(X_val, X_val),
                callbacks=[early_stopping])

# Reconstruct every row, then take the reconstruction only where the original value was missing.
predicted = autoencoder.predict(auto_X_filled_array)
is_missing = pd.isna(auto_X)

auto_X_imputed = np.where(is_missing, predicted, auto_X_filled_array)

In [None]:
# Wrap the imputed array, restore the feature names from `headers`
# (identifier columns and the target excluded), then append the target last.
excluded_names = ('state', 'county', 'community', 'communityname', 'fold', 'ViolentCrimesPerPop')
data_auto = pd.DataFrame(auto_X_imputed)
data_auto.columns = [name for name in headers if name not in excluded_names]
data_auto['ViolentCrimesPerPop'] = auto_y
data_auto

In [None]:
# Persist the autoencoder-imputed table (features + target).
data_auto.to_csv("data_auto.csv", index=False)

In [None]:
# Features are all columns but the last; the last column is the target.
auto_X, auto_y = data_auto.iloc[:, :-1], data_auto.iloc[:, -1]

# TOP 30 AUTOENCODER FEATURES

In [None]:
# Rank every feature by the absolute value of its correlation with the target
# and keep the 30 strongest ones.
correlation_matrix = data_auto.corr().abs()
target_correlation = correlation_matrix['ViolentCrimesPerPop'].drop(labels=['ViolentCrimesPerPop'])
sorted_features = target_correlation.sort_values(ascending=False)
top_30_auto_cols = sorted_features.index[:30].tolist()

print("Top 30 features with highest correlation to target:", top_30_auto_cols)
top_30_auto_X = auto_X.loc[:, top_30_auto_cols]
top_30_auto_X

In [None]:
# Persist the 30 selected autoencoder features (no target column).
top_30_auto_X.to_csv("top_30_auto_X.csv", index=False)

# PCA for MICE  

In [None]:
# train_test_split is imported here but not used in this cell.
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize the MICE-imputed features, then project them onto 33 principal components.
# NOTE(review): 33 is hard-coded; presumably read off an explained-variance plot — confirm.
sc = StandardScaler()
mice_X_scaled = sc.fit_transform(mice_X)
pca = PCA(n_components= 33)
pca_mice_X = pca.fit_transform(mice_X_scaled)
pca_mice_X

In [None]:
# Persist the PCA projection; the DataFrame wrapper gives the components integer column names.
pd.DataFrame(pca_mice_X).to_csv("X_pca_mice.csv", index=False)

# PCA for AUTOENCODER

In [None]:
# train_test_split is imported here but not used in this cell.
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize the autoencoder-imputed features, then project them onto 33 principal components.
# `sc` and `pca` are rebound here, replacing the objects fitted on the MICE data.
sc = StandardScaler()
auto_X_scaled = sc.fit_transform(auto_X)
pca = PCA(n_components= 33)
pca_auto_X = pca.fit_transform(auto_X_scaled)
pca_auto_X

In [None]:
# Persist the PCA projection; the DataFrame wrapper gives the components integer column names.
pd.DataFrame(pca_auto_X).to_csv("X_pca_auto.csv", index=False)

# READING IN ALL DATA

In [None]:
import pandas as pd

# Raw table: its last column is the shared regression target `y`.
data = pd.read_csv("data.csv")
y = data.iloc[:, -1]

# MICE-imputed table, split into features / target.
data_mice = pd.read_csv("data_mice.csv")
mice_X, mice_y = data_mice.iloc[:, :-1], data_mice.iloc[:, -1]

# Autoencoder-imputed table, split into features / target.
data_auto = pd.read_csv("data_auto.csv")
auto_X, auto_y = data_auto.iloc[:, :-1], data_auto.iloc[:, -1]

# Reduced feature sets written by the sections above (features only).
top_30_mice_X = pd.read_csv("top_30_mice_X.csv")
top_30_auto_X = pd.read_csv("top_30_auto_X.csv")
pca_mice_X = pd.read_csv("X_pca_mice.csv")
pca_auto_X = pd.read_csv("X_pca_auto.csv")

# DATA ANALYSIS

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation of the MICE-imputed features, drawn as a heatmap.
corrmat = mice_X.corr()
fig = plt.figure(figsize = (16, 12))

# vmax caps the colour scale at 0.8.
sns.heatmap(corrmat, vmax = 0.8)
plt.show()

In [None]:
from sklearn.decomposition import PCA

# Fit a full PCA (all components kept) to see how the variance accumulates.
# NOTE(review): this fit uses the unscaled features, while the PCA that is
# saved to disk runs on StandardScaler output — confirm which one the choice
# of n_components should be based on.
pca = PCA()
pca = pca.fit(mice_X)

cumulative_explained_variance = pca.explained_variance_ratio_.cumsum()
# Entry i is the variance explained by the first i + 1 components, so the
# x-axis has to start at 1 to match its "Number of Components" label.
component_counts = range(1, len(cumulative_explained_variance) + 1)

plt.figure(figsize=(10, 6))
plt.plot(component_counts, cumulative_explained_variance, marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance by Different Principal Components')
plt.grid(True)
plt.show()

# BEST HYPERPARAM SEARCH

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNetCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from hyperopt import hp

# Search space per estimator class. An empty dict means "no tunable
# parameters": the estimator is evaluated once with its defaults.
hyper_params = {
    LinearRegression: {},

    RANSACRegressor: {
        'min_samples': hp.choice('min_samples', [0.1, 0.5]),
        'max_trials': hp.choice('max_trials', [50, 100, 200]),
        'loss': hp.choice('loss', ['squared_error', 'absolute_error']),
        'residual_threshold': hp.choice('residual_threshold', [5, 10, 15]),
    },

    Lasso: {
        'alpha': hp.choice('alpha', [0.001, 0.01, 0.1, 1, 10]),
        'max_iter': hp.choice('max_iter', [1000, 5000, 10000]),
        'tol': hp.choice('tol', [0.0001, 0.001]),
    },

    Ridge: {
        'alpha': hp.choice('alpha', [0.001, 0.01, 0.1, 1, 10]),
        'solver': hp.choice('solver', ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']),
    },

    ElasticNetCV: {
        'l1_ratio': hp.choice('l1_ratio', [0.2, 0.5, 0.8]),
        'max_iter': hp.choice('max_iter', [1000, 5000]),
    },

    # SVR is currently excluded from the search:
    # SVR: {
    #     'C': hp.choice('C', [0.1, 1, 10, 100]),
    #     'gamma': hp.choice('gamma', ['scale', 'auto']),
    #     'kernel': hp.choice('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
    #     'epsilon': hp.choice('epsilon', [0.01, 0.1, 0.2]),
    # },

    RandomForestRegressor: {
        'n_estimators': hp.choice('n_estimators', [100, 200, 500]),
        'max_features': hp.choice('max_features', ['log2', 'sqrt']),
        'max_depth': hp.choice('max_depth', [None, 10, 20, 30]),
        'min_samples_split': hp.choice('min_samples_split', [2, 5, 10]),
        'min_samples_leaf': hp.choice('min_samples_leaf', [1, 2, 4]),
    },

    XGBRegressor: {
        'learning_rate': hp.choice('learning_rate', [0.01, 0.1, 0.2]),
        'n_estimators': hp.choice('n_estimators', [100, 200, 300]),
        'max_depth': hp.choice('max_depth', [3, 6, 10]),
        'min_child_weight': hp.choice('min_child_weight', [1, 3, 5]),
        'subsample': hp.choice('subsample', [0.7, 0.8, 0.9]),
        'colsample_bytree': hp.choice('colsample_bytree', [0.7, 0.8, 0.9]),
    },
}

hyper_params

In [None]:
import pandas as pd

# One row per estimator, one column per data set. Every result cell starts as
# the string "nan", which the search cells read as "nothing recorded yet".
# The frame is built in a single call: no row-by-row growth, and no loop
# variable called `model` is left behind in the notebook namespace.
result_columns = ["model", "mice_X", "auto_X", "top_30_mice_X", "top_30_auto_X", "pca_mice_X", "pca_auto_X"]
results: pd.DataFrame = pd.DataFrame(
    [[estimator_cls.__name__] + ["nan"] * (len(result_columns) - 1) for estimator_cls in hyper_params],
    columns=result_columns,
).astype(object)  # object dtype so a cell can later hold a dict of metrics
results

In [None]:
from joblib import dump
# Written twice: joblib keeps the Python objects stored in the cells, the CSV is a plain-text view.
dump(results, "results.joblib")
results.to_csv("results.csv", index=False)

In [None]:
import pandas as pd
from joblib import load
# Reload the stored results table.
# NOTE: joblib files are pickle-based; only load files you created yourself.
results: pd.DataFrame = load("results.joblib")
results

In [None]:
def change_value_at(data: pd.DataFrame, col_name_of_row, row_name, col_to_change, new_value):
    """Overwrite one cell of ``data`` in place.

    The row is the first one whose ``col_name_of_row`` value equals
    ``row_name``; the column is ``col_to_change``. Raises IndexError when no
    row matches.
    """
    row_matches = (data[col_name_of_row] == row_name).to_numpy()
    first_label = data.index[row_matches][0]
    data.at[first_label, col_to_change] = new_value

def get_value_at(data: pd.DataFrame, col_name_of_row, row_name, col_to_change):
    """Return one cell of ``data``.

    The row is the first one whose ``col_name_of_row`` value equals
    ``row_name``; the column is ``col_to_change``. Raises IndexError when no
    row matches.
    """
    row_matches = (data[col_name_of_row] == row_name).to_numpy()
    first_label = data.index[row_matches][0]
    return data.at[first_label, col_to_change]

In [None]:
# Every feature matrix that the model searches below iterate over, keyed by
# the name of the matching column in `results`.
X_datas = {
    "mice_X": mice_X,
    "auto_X": auto_X,
    "top_30_mice_X": top_30_mice_X,
    "top_30_auto_X": top_30_auto_X,
    "pca_mice_X": pca_mice_X,
    "pca_auto_X": pca_auto_X
}

# Quick check of which columns each feature matrix contains.
for v in X_datas.values():
    print(v.columns)

In [None]:
from hyperopt import fmin, tpe, hp, space_eval
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

from joblib import dump, load

import warnings

# NOTE(review): this hides every warning, including solver convergence warnings.
warnings.filterwarnings("ignore")

# hyperopt needs a non-empty space; estimators without tunable parameters get
# this placeholder and are then built with their defaults.
PLACEHOLDER_SPACE = {".": "."}
DEFAULT_MAX_EVALS = 500
# Estimators for which a long search is pointless (nothing to tune) or too slow.
max_evals_exceptions = {
    "SVR" : 2,
    "LinearRegression": 1,
}


def make_objective(estimator, estimator_name, data_name, X_train, X_val, y_train, y_val):
    """Build the hyperopt objective for one (estimator, data set) pair.

    The returned function fits ``estimator`` with the sampled parameters,
    scores it on the validation split and returns the MSE. Whenever the MSE
    beats the value stored in ``results`` (or nothing is stored yet), the
    fitted model, its parameters, metrics and predictions are written into
    ``results`` and the table is persisted to ``results.joblib`` and
    ``results.csv``.
    """
    def objective(params: dict):
        # "." is the placeholder key: it means "no parameters, use defaults".
        model = estimator() if "." in params else estimator(**params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        mse = mean_squared_error(y_val, y_pred)
        mae = mean_absolute_error(y_val, y_pred)
        r2 = r2_score(y_val, y_pred)

        # A stored result is always a dict; anything else (the initial "nan"
        # placeholder) means no result has been recorded yet.
        previous = get_value_at(results, "model", estimator_name, data_name)
        is_first = not isinstance(previous, dict)
        if is_first or previous["mse"] > mse:
            if not is_first:
                print("NEW_BEST!")
                print(data_name)
                print(estimator_name)
            print("Params:")
            print(params)
            print(f"MAE: {mae}")
            print(f"MSE: {mse}")
            print(f"R2: {r2}")
            info = {
                "model": model,
                "params": params,
                "mse": mse,
                "mae": mae,
                "r2": r2,
                "y_pred": y_pred,
            }
            change_value_at(results, "model", estimator_name, data_name, info)
            dump(results, 'results.joblib')
            results.to_csv("results.csv", index=False)
        return mse

    return objective


for data_name, X_data in X_datas.items():
    # Same split (seed 42) for every estimator so their scores are comparable.
    X_train, X_val, y_train, y_val = train_test_split(X_data, y, test_size=0.25, random_state=42)

    for estimator, space in hyper_params.items():
        estimator_name = estimator.__name__
        print(data_name)
        print(estimator_name)

        max_evals = max_evals_exceptions.get(estimator_name, DEFAULT_MAX_EVALS)

        best = fmin(fn=make_objective(estimator, estimator_name, data_name,
                                      X_train, X_val, y_train, y_val),
                    space=space if len(space) > 0 else PLACEHOLDER_SPACE,
                    algo=tpe.suggest,
                    max_evals=max_evals,
                    verbose=True)
results

In [None]:
# Final save of the results table: joblib keeps the stored objects, the CSV is a plain-text view.
dump(results, 'results.joblib')
results.to_csv("results.csv", index=False)

In [None]:
def _metrics_label(cell):
    """Render a stored result dict as 'r2:<r2> mse:<mse>'; pass other cells through."""
    if isinstance(cell, dict):
        return 'r2:'+str(cell.get('r2'))+" mse:"+str(cell.get('mse'))
    return cell

# Same layout as `results`, with every result dict replaced by its metric summary.
results_metrics = pd.DataFrame(results.apply(lambda column: column.apply(_metrics_label)))
results_metrics.to_csv("results_metrics.csv", index=False)
results_metrics

In [None]:
def _params_of(cell):
    """Return the 'params' entry of a stored result dict; pass other cells through."""
    if isinstance(cell, dict):
        return cell.get('params')
    return cell

# Same layout as `results`, with every result dict replaced by its hyper-parameters.
results_params = pd.DataFrame(results.apply(lambda column: column.apply(_params_of)))
results_params.to_csv("results_params.csv", index=False)
results_params

In [None]:
def _r2_of(cell):
    """Return the 'r2' entry of a stored result dict as a float; pass other cells through."""
    if isinstance(cell, dict):
        return float(cell.get('r2'))
    return cell

# Same layout as `results`, with every result dict replaced by its R2 score.
results_r2 = pd.DataFrame(results.apply(lambda column: column.apply(_r2_of)))
results_r2.to_csv("results_r2.csv", index=False)
results_r2

In [None]:
# Best R2 per data set: column-wise maximum over all estimators.
results_best_r2 = pd.DataFrame(results_r2[[col for col in results.columns if col != "model"]].max())
# The frame is indexed by data-set name. Writing it with index=False dropped
# those labels and left an unlabeled column of numbers, so keep the index.
results_best_r2.to_csv("results_best_r2.csv", index_label="dataset")
results_best_r2

In [None]:
# Text report: for every data set, list each estimator with its best
# hyper-parameters and the R2 it reached.
for col in results_params.columns:
    if col == "model":
        continue
    print(col, ":\n")
    for model in results_params["model"]:
        best_params_for_cell = get_value_at(results_params, "model", model, col)
        best_r2_for_cell = get_value_at(results, "model", model, col)["r2"]
        print(model)
        print(best_params_for_cell)
        print("R2:",best_r2_for_cell)
        print()
    print("\n")

# LIGHTGBM

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
import lightgbm as lgb
from hyperopt import fmin, tpe, hp, space_eval
from sklearn.model_selection import train_test_split
from joblib import dump, load

warnings.filterwarnings("ignore", category=UserWarning)


estimator_name = lgb.__name__

# The objective looks this row up on every evaluation, so it must exist before
# the search starts; without it the first lookup raises IndexError. The check
# keeps the cell idempotent and preserves results from earlier runs.
if estimator_name not in results["model"].values:
    results.loc[len(results)] = [estimator_name] + ["nan"] * (len(results.columns) - 1)

lightgbm_space = {
    'learning_rate': hp.loguniform('learning_rate', -5, 0),
    'num_leaves': hp.choice('num_leaves', [20, 30, 40, 50]),
    'max_depth': hp.choice('max_depth', [5, 10, 15, 20]),
    'min_child_weight': hp.uniform('min_child_weight', 0, 1),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'n_estimators': hp.choice('n_estimators', [100, 200, 300]),
    # 'verbose_eval' was dropped: it is not a LightGBM parameter.
    'verbose' : -1
}


def make_lightgbm_objective(data_name, X_train, X_val, y_train, y_val):
    """Build the hyperopt objective for LightGBM on one data set.

    The returned function trains a booster with the sampled parameters, scores
    it on the validation split and returns the MSE. A result that beats the
    one stored in ``results`` (or the first result) is written into
    ``results`` and persisted to ``results.joblib`` / ``results.csv``.
    """
    def objective(params: dict):
        # `n_estimators` is an alias of num_iterations: when it is present in
        # the params, lgb.train uses it instead of its num_boost_round
        # argument. Pass it explicitly so the number of rounds is visible, and
        # work on a copy so the stored `params` stay untouched.
        train_params = dict(params)
        num_boost_round = train_params.pop("n_estimators")

        train_data = lgb.Dataset(X_train, label=y_train)
        model = lgb.train(train_params, train_data, num_boost_round=num_boost_round)
        # No early stopping is used, so all trained iterations are applied.
        y_pred = model.predict(X_val)

        mse = mean_squared_error(y_val, y_pred)
        mae = mean_absolute_error(y_val, y_pred)
        r2 = r2_score(y_val, y_pred)

        # A stored result is always a dict; anything else means "none yet".
        previous = get_value_at(results, "model", estimator_name, data_name)
        is_first = not isinstance(previous, dict)
        if is_first or previous["mse"] > mse:
            if not is_first:
                print("NEW_BEST!")
                print(data_name)
                print(estimator_name)
            print("Params:")
            print(params)
            print(f"MAE: {mae}")
            print(f"MSE: {mse}")
            print(f"R2: {r2}")
            info = {
                "model": model,
                "params": params,
                "mse": mse,
                "mae": mae,
                "r2": r2,
                "y_pred": y_pred,
            }
            change_value_at(results, "model", estimator_name, data_name, info)
            dump(results, 'results.joblib')
            results.to_csv("results.csv", index=False)
        return mse

    return objective


for data_name, X_data in X_datas.items():
    X_train, X_val, y_train, y_val = train_test_split(X_data, y, test_size=0.25, random_state=42)
    print(data_name)
    print(estimator_name)

    max_evals = 500

    best = fmin(fn=make_lightgbm_objective(data_name, X_train, X_val, y_train, y_val),
                space=lightgbm_space,
                algo=tpe.suggest,
                max_evals=max_evals,
                verbose=True)
results

mice_X :
MSE: 0.017265604090334365
R^2: 0.6449182172952841
auto_X :
MSE: 0.017607750523569988
R^2: 0.6378816858873053
top_30_mice_X :
MSE: 0.018909421471191505
R^2: 0.6111117195334994
top_30_auto_X :
MSE: 0.018264735392587623
R^2: 0.6243702351856526
pca_mice_X :
MSE: 0.01872665901810104
R^2: 0.6148703842935224
pca_auto_X :
MSE: 0.01872039032458797
R^2: 0.6149993052890581


# CATBoost

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from hyperopt import fmin, tpe, hp
from joblib import dump

estimator_name = CatBoostRegressor.__name__

# The objective looks this row up on every evaluation, so it must exist before
# the search starts; without it the first lookup raises IndexError. The check
# keeps the cell idempotent and preserves results from earlier runs.
if estimator_name not in results["model"].values:
    results.loc[len(results)] = [estimator_name] + ["nan"] * (len(results.columns) - 1)

# NOTE(review): `bagging_temperature` and `subsample` are both set while no
# bootstrap_type is given — confirm that CatBoost accepts the combination.
catboost_space = {
    'learning_rate': hp.loguniform('learning_rate', -5, 0),
    'depth': hp.choice('depth', [4, 6, 8, 10]),
    'l2_leaf_reg': hp.uniform('l2_leaf_reg', 1, 10),
    'bagging_temperature': hp.uniform('bagging_temperature', 0, 1),
    'border_count': hp.choice('border_count', [32, 64, 128, 256]),
    'iterations': hp.choice('iterations', [500, 1000, 1500]),
    'random_strength': hp.uniform('random_strength', 0, 1),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bylevel': hp.uniform('colsample_bylevel', 0.5, 1),
    'min_child_samples': hp.choice('min_child_samples', [1, 5, 10, 20]),
    'custom_metric': ['RMSE'],
    'eval_metric': 'RMSE',
    'verbose': False
}


def make_catboost_objective(data_name, X_train, X_val, y_train, y_val):
    """Build the hyperopt objective for CatBoost on one data set.

    The returned function fits a CatBoostRegressor with the sampled
    parameters, scores it on the validation split and returns the MSE. A
    result that beats the one stored in ``results`` (or the first result) is
    written into ``results`` and persisted to ``results.joblib`` /
    ``results.csv``.
    """
    def objective(params: dict):
        model = CatBoostRegressor(**params)
        # NOTE(review): the validation split is also passed as eval_set, so
        # the reported score is presumably optimistic — confirm the intent.
        model.fit(X_train, y_train, eval_set=(X_val, y_val))
        y_pred = model.predict(X_val)

        mse = mean_squared_error(y_val, y_pred)
        mae = mean_absolute_error(y_val, y_pred)
        r2 = r2_score(y_val, y_pred)

        # A stored result is always a dict; anything else means "none yet".
        previous = get_value_at(results, "model", estimator_name, data_name)
        is_first = not isinstance(previous, dict)
        if is_first or previous["mse"] > mse:
            if not is_first:
                print("NEW_BEST!")
                print(data_name)
                print(estimator_name)
            print("Params:")
            print(params)
            print(f"MAE: {mae}")
            print(f"MSE: {mse}")
            print(f"R2: {r2}")
            info = {
                "model": model,
                "params": params,
                "mse": mse,
                "mae": mae,
                "r2": r2,
                "y_pred": y_pred,
            }
            change_value_at(results, "model", estimator_name, data_name, info)
            dump(results, 'results.joblib')
            results.to_csv("results.csv", index=False)
        return mse

    return objective


for data_name, X_data in X_datas.items():
    X_train, X_val, y_train, y_val = train_test_split(X_data, y, test_size=0.25, random_state=42)
    print(data_name)
    print(estimator_name)

    max_evals = 500

    best = fmin(fn=make_catboost_objective(data_name, X_train, X_val, y_train, y_val),
                space=catboost_space,
                algo=tpe.suggest,
                max_evals=max_evals,
                verbose=True)
results

# STANDARD NEURAL NETWORK

In [None]:
# Public tensorflow.keras paths instead of the private `keras.src` layout.
from tensorflow.keras.layers import Dense, Dropout, LeakyReLU
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.models import Sequential
from kerastuner.tuners import RandomSearch
import tensorflow as tf

# (units_min, units_max) of the four LeakyReLU hidden layers, in order.
HIDDEN_UNIT_RANGES = [(55, 65), (40, 50), (31, 41), (56, 66)]


def build_model(hp):
    """Build and compile the regression network for one KerasTuner trial.

    Architecture: a fixed 30-unit ReLU input layer, four hidden layers whose
    width and LeakyReLU slope are tuned, and a single linear output unit.

    Args:
        hp: the KerasTuner HyperParameters object of the current trial.

    Returns:
        The compiled Sequential model (MSE loss, MAE metric).
    """
    model = Sequential()
    model.add(Dense(units=hp.Int('units', min_value=30, max_value=30, step=1), activation='relu', input_shape=(mice_X.shape[1],)))

    # Every layer gets its own hyperparameter names ('units1'..'units4',
    # 'alpha1'..'alpha4'). The fourth layer previously reused 'units3', so its
    # 56-66 range was never searched. The slope step is 0.01: a step of 1 is
    # larger than the 0.01-0.05 range and leaves only the minimum to sample.
    for layer_no, (units_min, units_max) in enumerate(HIDDEN_UNIT_RANGES, start=1):
        slope = hp.Float(f'alpha{layer_no}', min_value=0.01, max_value=0.05, step=0.01)
        model.add(Dense(units=hp.Int(f'units{layer_no}', min_value=units_min, max_value=units_max, step=1),
                        activation=LeakyReLU(alpha=slope)))

    model.add(Dense(1))
    # from this but 0.805 [30 55 51 26]
    # from mlp reg [58, 45, 36, 61]
    lr_scheduler= ExponentialDecay(
        0.00001,
        decay_steps=10000,
        decay_rate=0.9,
    )


    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr_scheduler),
                  loss='mean_squared_error',
                  metrics=['mean_absolute_error'])
    return model

# NOTE(review): if 'my_dir/nn_model_v7' already holds trials from the previous
# search space, the tuner presumably resumes from them — use a fresh
# project_name (or overwrite=True) so the corrected space is searched.
tuner = RandomSearch(
    build_model,
    objective='val_mean_absolute_error',
    max_trials=100,
    executions_per_trial=1,
    directory='my_dir',
    project_name='nn_model_v7'
)

tuner.search(mice_X, mice_y, epochs=300, validation_split=0.25, verbose=2)

In [None]:
h = tuner.get_best_hyperparameters()[0]
# Split the data the tuner was trained on. `X_data` is only the leftover loop
# variable of an earlier search cell (the last entry of X_datas), whose column
# count need not match the network's input layer.
# NOTE(review): tuner.search used validation_split=0.25, not this random
# split, so X_val here may overlap the rows the network was trained on.
X_train, X_val, y_train, y_val = train_test_split(mice_X, mice_y, test_size=0.25, random_state=42)

print("The best hyperparameters are:")
print(f" - Units in the first dense layer: {h.get('units')}")
print(f" - Units in the second dense layer: {h.get('units1')}")
print(f" - Units in the third dense layer: {h.get('units2')}")
print(f" - Units in the fourth dense layer: {h.get('units3')}")
# 'alpha1' is the LeakyReLU slope of the second dense layer, not a learning rate.
print(f" - LeakyReLU alpha of the second dense layer: {h.get('alpha1')}")

In [None]:
# At notebook level `model` is not a fitted network: the models built in
# build_model and in the search objectives are locals, and the only top-level
# bindings of `model` are loop variables. Score the tuner's best network.
model = tuner.get_best_models(num_models=1)[0]
# Evaluate on a split of the data the tuner was trained on, so the input width matches.
# NOTE(review): tuner.search used validation_split=0.25, not this random
# split, so X_val may overlap the rows the network was trained on.
X_train, X_val, y_train, y_val = train_test_split(mice_X, mice_y, test_size=0.25, random_state=42)
y_pred = model.predict(X_val)

mse = mean_squared_error(y_val, y_pred)
print(f"MSE: {mse}")

r2 = r2_score(y_val, y_pred)
print(f"R^2: {r2}")

# MLP REGRESSOR

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from joblib import dump, load


from hyperopt import fmin, tpe, hp, space_eval


import warnings

# NOTE(review): this hides every warning, including solver convergence warnings.
warnings.filterwarnings("ignore")


estimators_params = {}


# 'layer_number' and 'nn1'..'nn4' are search-space helpers, not MLPRegressor
# arguments; to_estimator_kwargs() folds them into hidden_layer_sizes.
# NOTE(review): with solver='lbfgs', 'early_stopping', 'validation_fraction'
# and 'shuffle' presumably have no effect — confirm against the sklearn docs.
estimators_params[MLPRegressor] ={
    'layer_number': hp.choice('layer_number', range(4,5)),
    'nn1' : hp.choice("nn1", range(53,63,1)),
    'nn2' : hp.choice("nn2", range(40,50,1)),
    'nn3' : hp.choice("nn3", range(31,41,1)),
    'nn4' : hp.choice("nn4", range(56,67,1)),
    'activation': hp.choice('activation', ['relu', 'tanh']),
    'solver': 'lbfgs',
    'alpha': hp.uniform('alpha', 0.17, 0.22),
    'max_iter': hp.choice('max_iter', range(1200, 1300, 10)),
    'tol': hp.uniform('tol', 1e-6, 1e-3),
    'shuffle': hp.choice('shuffle', [True, False]),
    'early_stopping': True,
    'validation_fraction': 0.25,
}


def to_estimator_kwargs(params: dict) -> dict:
    """Turn one sampled point of the search space into estimator kwargs.

    Removes 'layer_number' and every 'nn<i>' key and replaces them with
    'hidden_layer_sizes' (the first ``layer_number`` layer widths). The input
    dict is not modified.
    """
    kwargs = dict(params)
    layer_number = kwargs.pop("layer_number")
    widths = {i: kwargs.pop("nn" + str(i)) for i in range(6) if "nn" + str(i) in kwargs}
    kwargs["hidden_layer_sizes"] = [widths[i] for i in range(1, layer_number + 1)]
    return kwargs


# Best single evaluation over all data sets; an existing value survives a
# re-run of this cell so an interrupted search can be resumed.
try:
    best_combo
except NameError:
    best_combo = None

# Collected across ALL data sets (it used to be reset per data set, which
# left only the last data set's entry for the final selection).
best_estimators = {}

for data_name, X_data in X_datas.items():
    X_train, X_val, y_train, y_val = train_test_split(X_data, y, test_size=0.25, random_state=42)
    print(data_name)

    for estimator, space in estimators_params.items():
        print(estimator)

        def objective(params: dict):
            """Fit one sampled configuration and return its validation MSE."""
            global best_combo

            model = estimator(**to_estimator_kwargs(params))
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)
            mse = mean_squared_error(y_val, y_pred)
            print(f"MSE: {mse}")
            mae = mean_absolute_error(y_val, y_pred)
            print(f"MAE:  {mae}")
            r2 = r2_score(y_val, y_pred)
            print(f"R2:  {r2}")
            if best_combo is None or best_combo["mse"] > mse:
                is_improvement = best_combo is not None
                best_combo = {
                    "model": model,
                    "mse" : mse,
                    "mae" : mae,
                    "r2" : r2,
                    "y_pred": y_pred,
                    "data_name": data_name,  # which feature matrix produced it
                }
                if is_improvement:
                    print("NEW_BEST_COMBO!")
                    print(f"MSE: {mse}")
                    print(f"MAE: {mae}")
                    print(f"R2: {r2}")
                # Checkpoint every new best (including the first one).
                dump(best_combo, 'best_combo_temp2.joblib')
            return mse

        best = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=10000,
                    verbose=True)

        print("BEST:")
        # space_eval returns the raw search-space point, which still contains
        # 'layer_number' / 'nn<i>'; MLPRegressor rejects those, so convert first.
        best_params = to_estimator_kwargs(space_eval(space, best))
        best_estimator = estimator(**best_params)
        best_estimator.fit(X_train, y_train)
        y_pred_est = best_estimator.predict(X_val)
        mae_est = mean_absolute_error(y_val, y_pred_est)
        mse_est = mean_squared_error(y_val, y_pred_est)
        r2_est = r2_score(y_val, y_pred_est)
        print(f"MAE: {mae_est}")
        print(f"MSE: {mse_est}")
        print(f"R2: {r2_est}")
        print("BEST:",best_params)
        best_estimators[mse_est] = (best_estimator, best_params)

# Lowest refit MSE over every data set and estimator.
best_mse = min(best_estimators.keys())
best_regressor= (best_estimators[best_mse],  best_mse)
print(best_combo)
dump(best_combo, 'best_combo_new3.joblib')
print(best_regressor)
dump(best_regressor, 'best_regressor_new3.joblib')

In [None]:
# Constructor parameters of the best model found during the MLP search.
print(best_combo["model"].get_params())

In [None]:
# ((estimator, params), mse) of the refit configuration with the lowest MSE.
print(best_regressor)

In [None]:
def measure(model, X, y):
    """Score ``model`` on ``(X, y)``.

    Returns a dict with the keys "MAE", "MSE" and "R2", computed from
    ``model.predict(X)`` against ``y``.
    """
    preds = model.predict(X)
    scorers = (
        ("MAE", mean_absolute_error),
        ("MSE", mean_squared_error),
        ("R2", r2_score),
    )
    return {label: scorer(y, preds) for label, scorer in scorers}

In [None]:
from joblib import load
# NOTE(review): no cell in this notebook writes "best_combo_temp_005400.joblib" — confirm where the file comes from.
# joblib files are pickle-based; only load files you created yourself.
print(load("best_combo_temp_005400.joblib"))

In [None]:
from joblib import load
# Score the checkpointed model on whatever X_val / y_val currently hold.
# NOTE(review): X_val / y_val come from the last split that ran before this cell — confirm they match the data the model was trained on.
measure(load("best_combo_temp_005400.joblib")["model"], X_val, y_val)

In [None]:
# NOTE(review): no cell in this notebook writes 'mlp_regressor_model.joblib' — confirm where the file comes from.
print(load('mlp_regressor_model.joblib'))