In [1]:
import kaleido
import pandas as pd
import numpy as np
import plotly.graph_objects as go
######## Search space #########
import sys
sys.path.append("../")
from search_space.RegNet import RegNet
from search_space.utils import create_widths_plot, scatter_results
# Initialize an empty list to store dictionaries
def results_to_df(path, name):
    data = []
    # Open the text file
    with open(path, 'r') as file:
        lines = file.readlines()
        # Initialize an empty dictionary to store data for each block
        block_data = {}
        for line in lines:
            # If the line contains dashes, it indicates the end of a block
            if '-------------------------' in line:
                # If block_data is not empty, add it to the list of data dictionaries
                if block_data:
                    data.append(block_data)
                    # Reset block_data for the next block
                    block_data = {}
            elif 'best_acc' in line:
                continue
            else:
                # Split the line by ':'
                #print(line)
                key, value = line.strip().split(': ')
                # Store the key-value pair in the block_data dictionary
                block_data[key] = value

    # Create a DataFrame from the list of dictionaries
    df = pd.DataFrame(data)

    # Convert columns to appropriate data types if needed
    df['epoch'] = df['epoch'].astype(int)
    df['lr'] = df['lr'].astype(float)
    df['train_acc'] = df['train_acc'].astype(float)
    df['train_loss'] = df['train_loss'].astype(float)
    df['test_acc'] = df['test_acc'].astype(float)
    df['test_acc_top5'] = df['test_acc_top5'].astype(float)
    df['test_loss'] = df['test_loss'].astype(float)
    df['epoch_time'] = df['epoch_time'].astype(float)
    df=df.assign(name=name)
    
    return df

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json
import os

metadata_dict={}
metadata_dict["Caitie"]={'num_classes': 4,
                         'input_shape': [49260, 3, 64, 64],
                         'codename': 'Caitie',
                         'benchmark': 47.008,
                         }
metadata_dict["Adaline"]={'num_classes': 20,
                         'input_shape': [50000, 3, 28, 28],
                         'codename': 'Adaline',
                         'benchmark': 89.85}
metadata_dict["Chester"]={'input_shape': [49998, 12, 8, 8],
                         'codename': 'Chester',
                         'benchmark': 57.826,
                         'num_classes': 3}
metadata_dict["Gutenberg"]={'input_shape': [45000, 1, 27, 18],
                         'codename': 'Gutenberg',
                         'benchmark': 40.98,
                         'num_classes': 6}
metadata_dict["LaMelo"]={'num_classes': 10,
                         'codename': 'LaMelo',
                         'input_shape': [50000, 1, 24, 24],
                         'benchmark': 85.2}
metadata_dict["Mateo"]={'num_classes': 10,
                         'input_shape': [50000, 3, 28, 28],
                         'codename': 'Mateo',
                         'benchmark': 90.87}
metadata_dict["Sadie"]={'input_shape': [50000, 3, 64, 64],
                         'codename': 'Sadie',
                         'benchmark': 80.33,
                         'num_classes': 10}
def get_predictor_data_from_folder(studies):

    
    train_data=[]
    for study_name in studies: 
        

        metadata=metadata_dict[study_name]
        rg=RegNet(metadata,
                        W0=[16, 120, 8],
                        WA=[16, 64, 8],
                        WM=[2.05,2.9,0.05],
                        D=[8,22,1], 
                        G=[8,8,8], 
                        base_config=f"../configs/search_space/config.yaml")
        data=[]
        study_folder=f"/home/woody/iwb3/iwb3021h/NAS_COMPETITION_RESULTS/classifier_train/{study_name}"
        subjects= os.listdir(study_folder)

        subjects=[subject for subject in subjects if os.path.isdir(f"{study_folder}/{subject}")]
        print(subjects)
        for name in subjects:
            try:
                test_acc=results_to_df(f"{study_folder}/{name}/worklog.txt", f"{name}")[["epoch","test_acc","name"]]
                #test_acc=test_acc.rename(columns={"test_acc":"acc","test_loss":"test_acc"})
                #print(test_acc.columns)
                if len(test_acc)>49:
                    test_acc_piv = test_acc[["epoch","test_acc","name"]].pivot(index='name', columns='epoch', values='test_acc').add_prefix("epoch_").reset_index()
                    test_acc_piv["best_acc"]=test_acc["test_acc"].max()
                    
                    model, info=rg.load_model(config_file=f"{study_folder}/{name}/config.yaml")
                    test_acc_piv["gen"]=1
                    test_acc_piv["num_stages"]=info["num_stages"]
                    test_acc_piv["params"]=info["params"]
                    test_acc_piv["WA"]=info["WA"]
                    test_acc_piv["W0"]=info["W0"]
                    test_acc_piv["WM"]=info["WM"]
                    test_acc_piv["DEPTH"]=info["DEPTH"]
                    test_acc_piv["num_classes"]=metadata["num_classes"]
                    test_acc_piv["num_channels"]=metadata["input_shape"][1]
                    test_acc_piv["benchmark"]=metadata["benchmark"]
                    
                    #print(info)
                    data.append(test_acc_piv)
            except:
                    pass

        data=pd.concat(data)
        train_data.append(data)
    train_data=pd.concat(train_data)
    return train_data
    

In [3]:
import json


def get_predictor_data(tests_folder,studies):
    train_data=[]
    for study_name in studies: 
        #study_name="tests_LaMelo_13_06_2024_02_11"
        file_path=f"{tests_folder}/{study_name}/{study_name}.evonas"
        with open(file_path, 'r') as file:
            results = json.load(file)

        individuals_df=pd.read_json(results["results"]).sort_values("name")[["name","generation"]]
        data_models=pd.read_json(results["results"])[["name","num_stages","params","WA","W0","WM","DEPTH", "best_acc"]]
        data_models["num_classes"]=results["metadata"]["num_classes"]
        data_models["num_channels"]=results["metadata"]["input_shape"][1]
        data_models["benchmark"]=results["metadata"]["benchmark"]
        print(study_name)
        print(results["metadata"])

        data=[]
        for index, row in individuals_df.iterrows():
            name=row["name"]
            generation=row["generation"]
            try:
                test_acc=results_to_df(f"{tests_folder}/{study_name}/Generation_{generation}/{name}/worklog.txt", f"{name}")[["epoch","test_acc","name"]]
                #test_acc=test_acc.rename(columns={"test_loss":"test_acc"})
                test_acc_piv = test_acc.pivot(index='name', columns='epoch', values='test_acc').add_prefix("epoch_").reset_index()
                test_acc_piv["gen"]=generation
                data.append(pd.merge(test_acc_piv, data_models, on="name"))
            except:
                pass
        data=pd.concat(data)
        train_data.append(data)
    train_data=pd.concat(train_data)
    return train_data

In [4]:
test_dataset="Mateo"
corr_scores={}
accuracies={}
test_accuracies={}
epochs_results={}

In [27]:
train_data=get_predictor_data_from_folder(studies=["LaMelo","Caitie", "Adaline","Gutenberg","Sadie", "Chester", "Mateo"])
test_data=get_predictor_data_from_folder(studies=["Caitie"])

['dangerous_degu', 'didactic_armadillo', 'tiny_sturgeon', 'jolly_buzzard', 'great_prawn', 'illustrious_trout', 'notorious_sawfly', 'merry_lizard', 'outstanding_mantis', 'fluffy_boa', 'logical_hornet', 'great_coyote', 'discerning_beluga', 'monumental_galago', 'colossal_hedgehog', 'fragrant_skylark', 'vivid_owl', 'wandering_silkworm', 'jasmine_dugong', 'ultramarine_caiman', 'astute_barracuda', 'silver_ammonite', 'foamy_rat', 'camouflaged_kittiwake', 'quick_prawn', 'angelic_coati', 'tested_nuthatch', 'prompt_dog', 'organic_lorikeet', 'convivial_jellyfish', 'enlightened_impala', 'scarlet_grasshopper', 'jade_baboon', 'adaptable_jacamar', 'real_caterpillar', 'poised_frog', 'cornflower_salamander', 'curious_viper', 'elated_pig', 'gleaming_hawk', 'fluorescent_antelope', 'antique_anteater', 'inventive_jellyfish', 'curly_emu', 'auburn_sturgeon', 'fat_parakeet', 'unique_grasshopper', 'demonic_hornet', 'crafty_hyrax', 'orange_ara', 'thundering_seahorse', 'lime_hyena', 'vanilla_ara', 'peculiar_dorm

In [28]:
# Standardize data
standardize=True
from sklearn.preprocessing import StandardScaler
if standardize:
    def standard_scaler(group):
        scaler = StandardScaler()
        #cols=list(group.columns[1:])
        cols=["num_stages","params","WA","W0","WM","DEPTH"]
        #cols.remove("best_acc")
        #cols.remove("benchmark")
        #cols.remove("gen")
        group[cols] = scaler.fit_transform(group[cols])

        return group
    train_data = train_data.groupby('benchmark').apply(standard_scaler)
    train_data=train_data.reset_index(drop=True)

    test_data = test_data.groupby('benchmark').apply(standard_scaler)
    test_data=test_data.reset_index(drop=True)

  train_data = train_data.groupby('benchmark').apply(standard_scaler)
  test_data = test_data.groupby('benchmark').apply(standard_scaler)


In [29]:
import pandas as pd
import numpy as np
from itertools import combinations

# Initialize an empty list to hold the final data
total_data = []

# Iterate over unique benchmark scores
for benchmark_score in train_data['benchmark'].unique():
    filtered_df = train_data[train_data['benchmark'] == benchmark_score].reset_index(drop=True)
    pairs = np.array(list(combinations(filtered_df.index, 2)))
    
    # Extract constant columns
    num_classes = filtered_df["num_classes"].iloc[0]
    num_channels = filtered_df["num_channels"].iloc[0]

    # Prepare list to collect combined data
    combined_data = []

    for idx1, idx2 in pairs:
        row1 = filtered_df.loc[idx1]
        row2 = filtered_df.loc[idx2]

        combined_row = {f'epoch_{i}_A': row1[f'epoch_{i}'] for i in [5,10,15,20]}
        combined_row.update({f'epoch_{i}_B': row2[f'epoch_{i}'] for i in [5,10,15,20]})
        combined_row.update({
            'name_A': row1['name'],
            'name_B': row2['name'],
            'num_stages_A': row1['num_stages'],
            'params_A': row1['params'],
            'WA_A': row1['WA'],
            'W0_A': row1['W0'],
            'WM_A': row1['WM'],
            'DEPTH_A': row1['DEPTH'],
            'gen_A': row1['gen'],
            'best_acc_A': row1['best_acc'],
            'num_stages_B': row2['num_stages'],
            'params_B': row2['params'],
            'WA_B': row2['WA'],
            'W0_B': row2['W0'],
            'WM_B': row2['WM'],
            'DEPTH_B': row2['DEPTH'],
            'gen_B': row2['gen'],
            'best_acc_B': row2['best_acc'],
            'label': row1['best_acc'] - row2['best_acc'],
            'benchmark': benchmark_score,
            'num_classes': num_classes,
            'num_channels': num_channels
        })

        combined_data.append(combined_row)

    combined_df = pd.DataFrame(combined_data)
    #### Scale differences ###############
    #scaler = MinMaxScaler(feature_range=(-1, 1))
    #scaler= StandardScaler()
    #combined_df['label'] = scaler.fit_transform(combined_df[['label']])

    total_data.append(combined_df)

# Concatenate all dataframes
total_data = pd.concat(total_data).reset_index(drop=True)



In [30]:
import pandas as pd
import numpy as np
from itertools import combinations

# Initialize an empty list to hold the final data
total_data_test = []

# Iterate over unique benchmark scores
for benchmark_score in test_data['benchmark'].unique():
    filtered_df = test_data[test_data['benchmark'] == benchmark_score].reset_index(drop=True)
    pairs = np.array(list(combinations(filtered_df.index, 2)))
    
    # Extract constant columns
    num_classes = filtered_df["num_classes"].iloc[0]
    num_channels = filtered_df["num_channels"].iloc[0]

    # Prepare list to collect combined data
    combined_data = []

    for idx1, idx2 in pairs:
        row1 = filtered_df.loc[idx1]
        row2 = filtered_df.loc[idx2]

        combined_row = {f'epoch_{i}_A': row1[f'epoch_{i}'] for i in [5, 10, 15, 20]}
        combined_row.update({f'epoch_{i}_B': row2[f'epoch_{i}'] for i in [5, 10, 15, 20]})
        combined_row.update({
            'name_A': row1['name'],
            'name_B': row2['name'],
            'num_stages_A': row1['num_stages'],
            'params_A': row1['params'],
            'WA_A': row1['WA'],
            'W0_A': row1['W0'],
            'WM_A': row1['WM'],
            'DEPTH_A': row1['DEPTH'],
            'gen_A': row1['gen'],
            'best_acc_A': row1['best_acc'],
            'num_stages_B': row2['num_stages'],
            'params_B': row2['params'],
            'WA_B': row2['WA'],
            'W0_B': row2['W0'],
            'WM_B': row2['WM'],
            'DEPTH_B': row2['DEPTH'],
            'gen_B': row2['gen'],
            'best_acc_B': row2['best_acc'],
            'label': row1['best_acc'] - row2['best_acc'],
            'benchmark': benchmark_score,
            'num_classes': num_classes,
            'num_channels': num_channels
        })

        combined_data.append(combined_row)

    combined_df = pd.DataFrame(combined_data)
    
    #### Scale differences ###############
    #scaler= StandardScaler()
    #combined_df['label'] = scaler.fit_transform(combined_df[['label']])
    
    total_data_test.append(combined_df)

# Concatenate all dataframes
total_data_test = pd.concat(total_data_test).reset_index(drop=True)




In [31]:
total_data.num_classes.unique()

array([ 6,  4,  3, 10, 20])

In [32]:
#scaler = StandardScaler()
#total_data[['num_classes', 'benchmark', "num_channels"]]=scaler.fit_transform(total_data[['num_classes', 'benchmark', "num_channels"]])

In [33]:
#total_data_test[['num_classes', 'benchmark', "num_channels"]]=scaler.transform(total_data_test[['num_classes', 'benchmark', "num_channels"]])

In [34]:
cols_train=[  
            'num_stages_A', 'params_A', 'WA_A', 'W0_A','WM_A', 'DEPTH_A',

        'num_stages_B', 'params_B', 'WA_B', 'W0_B', 'WM_B', 'DEPTH_B',#]
        'num_classes', 'benchmark', "num_channels"]
#cols_train=[  
#        'num_stages_A', 'WA_A', 'W0_A', "params_A",
#       'WM_A', 'DEPTH_A',
        
#        'num_stages_B', 'WA_B', 'W0_B', "params_B",
#       'WM_B', 'DEPTH_B',"num_classes", "benchmark", "num_channels"]
cols_test=["label"]
X=total_data[cols_train]
y=total_data[cols_test]
gens=[1]
X_test_new=total_data_test[(total_data_test["gen_B"].isin(gens)) & (total_data_test["gen_A"].isin(gens))][cols_train]
y_test_new=total_data_test[(total_data_test["gen_B"].isin(gens)) & (total_data_test["gen_A"].isin(gens))][cols_test]

In [36]:
rfr_regressor=load(f'regressors/rfr_model_50.joblib')
y_pred = rfr_regressor.predict(X_test_new)
error=mean_squared_error(y_test_new, y_pred)
print(f"Error: {error}")

print("\n")

spearman_corr, tau_distance = get_correlations(y_pred)
print("########################")
print(spearman_corr)
print(tau_distance)

Error: 3.6794170858572763


########################
0.9374260066837506
0.7972295903330384


In [35]:
sgdr_regressor=load(f'regressors/sgdr_model_50.joblib')
y_pred = sgdr_regressor.predict(X_test_new)
error=mean_squared_error(y_test_new, y_pred)
print(f"Error: {error}")

print("\n")

spearman_corr, tau_distance = get_correlations(y_pred)
print("########################")
print(spearman_corr)
print(tau_distance)

Error: 29.274740104521737


########################
0.7339237812645176
0.5540819333922782


In [11]:
X

Unnamed: 0,num_stages_A,params_A,WA_A,W0_A,WM_A,DEPTH_A,num_stages_B,params_B,WA_B,W0_B,WM_B,DEPTH_B,num_classes,benchmark,num_channels
0,0.940647,2.398599,1.113748,-0.252080,0.293049,1.245631,-0.299767,-0.434987,-0.349570,-0.863182,0.879148,0.325781,6,40.98,1
1,0.940647,2.398599,1.113748,-0.252080,0.293049,1.245631,-0.299767,-0.620090,1.601520,0.664574,-1.074514,-1.513921,6,40.98,1
2,0.940647,2.398599,1.113748,-0.252080,0.293049,1.245631,0.940647,-0.760254,-0.837343,-1.168734,-1.660612,-0.594070,6,40.98,1
3,0.940647,2.398599,1.113748,-0.252080,0.293049,1.245631,0.940647,0.468918,-0.349570,-1.168734,0.879148,1.475594,6,40.98,1
4,0.940647,2.398599,1.113748,-0.252080,0.293049,1.245631,-0.299767,-0.764783,-1.325116,-1.779836,1.074514,0.095818,6,40.98,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49384,1.127668,1.727307,1.595256,0.209848,-1.001272,0.938035,1.127668,0.376374,0.055599,-1.220932,-0.181117,1.158317,10,90.87,3
49385,1.127668,1.727307,1.595256,0.209848,-1.001272,0.938035,1.127668,-0.393226,-0.970839,-1.220932,-1.001272,0.938035,10,90.87,3
49386,-0.343203,-0.324661,0.055599,-0.076308,0.023921,0.056906,1.127668,0.376374,0.055599,-1.220932,-0.181117,1.158317,10,90.87,3
49387,-0.343203,-0.324661,0.055599,-0.076308,0.023921,0.056906,1.127668,-0.393226,-0.970839,-1.220932,-1.001272,0.938035,10,90.87,3


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor

from scipy.stats import spearmanr
from scipy.stats import kendalltau

def kendall_tau_distance(list1, list2):
    tau, _ = kendalltau(list1, list2)
    return tau
def spearman_rank_correlation(list1, list2):
    corr, _ = spearmanr(list1, list2)
    return corr

In [13]:
def get_correlations_epochs(total_data_test, gens):
    cols_train=[ 'epoch_5_A', 'epoch_10_A', 'epoch_15_A', 'epoch_20_A', 
                'num_stages_A', 'params_A', 'WA_A', 'W0_A','WM_A', 'DEPTH_A',
             'epoch_5_B', 'epoch_10_B', 'epoch_15_B',  'epoch_20_B',
            'num_stages_B', 'params_B', 'WA_B', 'W0_B', 'WM_B', 'DEPTH_B',
            'num_classes', 'benchmark', "num_channels"]
    cols_test=["label"]
    X_test_new=total_data_test[(total_data_test["gen_B"].isin(gens)) & (total_data_test["gen_A"].isin(gens))][cols_train]
    y_test_new=total_data_test[(total_data_test["gen_B"].isin(gens)) & (total_data_test["gen_A"].isin(gens))][cols_test]
    
    accs={}
    corrs={}
    results={}
    for epoch in [5,10,15,20]:
        y_epoch_10_pred= (X_test_new[f'epoch_{epoch}_A'] - X_test_new[f'epoch_{epoch}_B']).values

        #accuracy = accuracy_score(y_test_new, y_epoch_10_pred)
        #accs[epoch]=accuracy
        
        ranking_test_df=total_data_test[(total_data_test["gen_B"].isin(gens)) & (total_data_test["gen_A"].isin(gens))][["name_A","name_B", "label", "gen_A","gen_B"]]
        ranking_test_df["pred_10_epochs"]=y_epoch_10_pred

            # Create a new column by applying the function to each row
        ranking_target={}
        ranking_10={}
        for ind in list(ranking_test_df.name_A.unique())+list(ranking_test_df.name_B.unique()):
            ranking_target[ind]=0
            ranking_10[ind]=0
        for index, row in ranking_test_df.iterrows():
            
            ranking_target[row["name_A"]]=ranking_target[row["name_A"]]+row[f"label"]
            ranking_target[row["name_B"]]=ranking_target[row["name_B"]]-row[f"label"]
            
            ranking_10[row["name_A"]]=ranking_10[row["name_A"]]+row["pred_10_epochs"]
            ranking_10[row["name_B"]]=ranking_10[row["name_B"]]-row["pred_10_epochs"]

        ranking_target_df=pd.DataFrame([ranking_target]).T.rename(columns={0:"score"}).sort_values(by="score", ascending=False)
        ranking_10_df=pd.DataFrame([ranking_10]).T.rename(columns={0:"score"}).sort_values(by="score", ascending=False)

            
        ######## Correlation #########
        # Example usage
        target =ranking_target_df.index.tolist()

        list_to_compare_10 = ranking_10_df.index.tolist()
        # Convert the lists to ranks
        target_ranks = {k: i for i, k in enumerate(target)}
        list_to_compare_ranks_10 = [target_ranks[x] for x in list_to_compare_10]
        # Calculate Spearman rank correlation
        spearman_corr_10 = spearman_rank_correlation(list_to_compare_ranks_10, list(range(len(target))))
        print(f'Spearman rank correlation: {spearman_corr_10}')
        # Calculate Kendall Tau distance
        tau_distance_10 = kendall_tau_distance(list_to_compare_ranks_10, list(range(len(target))))
        print(f'Kendall Tau distance: {tau_distance_10}')
        corrs[epoch]=(spearman_corr_10, tau_distance_10)
    
    results["correlation"]=corrs
    return results
    
def get_correlations(y_pred):
    ranking_test_df=total_data_test[(total_data_test["gen_B"].isin(gens)) & (total_data_test["gen_A"].isin(gens))][["name_A","name_B", "label", "gen_A","gen_B"]]
    ranking_test_df["y_pred"]=y_pred

    # Create a new column by applying the function to each row
    ranking_target={}
    ranking_predict={}
    for ind in list(ranking_test_df.name_A.unique())+list(ranking_test_df.name_B.unique()):
        ranking_target[ind]=0
        ranking_predict[ind]=0
    for index, row in ranking_test_df.iterrows():
        ranking_target[row["name_A"]]=ranking_target[row["name_A"]]+row[f"label"]
        ranking_target[row["name_B"]]=ranking_target[row["name_B"]]-row[f"label"]

        ranking_predict[row["name_A"]]=ranking_predict[row["name_A"]]+row[f"y_pred"]
        ranking_predict[row["name_B"]]=ranking_predict[row["name_B"]]-row[f"y_pred"]
    
    ranking_predict_df=pd.DataFrame([ranking_predict]).T.rename(columns={0:"score"}).sort_values(by="score", ascending=False)

    ranking_target_df=pd.DataFrame([ranking_target]).T.rename(columns={0:"score"}).sort_values(by="score", ascending=False)
    
    ######## Correlation #########
    # Example usage
    target =ranking_target_df.index.tolist()
    list_to_compare = ranking_predict_df.index.tolist()

    #list_to_compare = ranking_10_df.index.tolist()
    # Convert the lists to ranks
    target_ranks = {k: i for i, k in enumerate(target)}
    list_to_compare_ranks = [target_ranks[x] for x in list_to_compare]
    # Calculate Spearman rank correlation
    spearman_corr = spearman_rank_correlation(list_to_compare_ranks, list(range(len(target))))
    #print(f'Spearman rank correlation: {spearman_corr}')
    # Calculate Kendall Tau distance
    tau_distance = kendall_tau_distance(list_to_compare_ranks, list(range(len(target))))
    #print(f'Kendall Tau distance: {tau_distance}')

    return spearman_corr, tau_distance

In [14]:
if test_dataset not in epochs_results:
    epochs_results[test_dataset] = []
epochs_results[test_dataset].append(get_correlations_epochs(total_data_test, gens))

Spearman rank correlation: 0.6151677199805542
Kendall Tau distance: 0.4299719887955181
Spearman rank correlation: 0.6663309952080005
Kendall Tau distance: 0.48543417366946773
Spearman rank correlation: 0.6522050142370996
Kendall Tau distance: 0.4697478991596638
Spearman rank correlation: 0.7379123550246544
Kendall Tau distance: 0.5456582633053221


In [15]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
# Define classifiers
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

regressors = {
    #"Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    #"XGBoost": xgb.XGBRegressor(random_state=42),
    "SGD": SGDRegressor(random_state=42)
    
}

# Parameter grids
param_grids = {
    "Gradient Boosting": {
        'n_estimators': [100, 300,500],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 4, 5]
    },
    "Random Forest": {
        'n_estimators': [100, 300,500,1000],
        'max_depth': [3, 5, 7, 10]
    },
    "XGBoost": {
        'n_estimators': [100, 300,500, 1000],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 4, 5, 6]
    },
    "SGD": {
    'loss': ['squared_error', 'huber', 'epsilon_insensitive'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
    'max_iter': [1000, 2000, 3000],
    'eta0': [0.01, 0.1, 1]
}
}

# Perform grid search for each classifier
best_estimators = {}
for rg_name in regressors:
    grid_search = GridSearchCV(estimator=regressors[rg_name], param_grid=param_grids[rg_name],
                               cv=3, n_jobs=-1, verbose=0, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)
    best_estimators[rg_name] = grid_search.best_estimator_



  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

In [16]:
from sklearn.metrics import mean_squared_error
if test_dataset not in corr_scores:
    corr_scores[test_dataset] = []
if test_dataset not in test_accuracies:
    test_accuracies[test_dataset] = []
corr_gen={}
test_acc_gen={}
for rg_name, rg in best_estimators.items():
    y_pred = rg.predict(X_test_new)
    print(f"Results for {rg_name}:")
    error=mean_squared_error(y_test_new, y_pred)
    print(f"Error: {error}")

    print("\n")
    
    spearman_corr, tau_distance = get_correlations(y_pred)
    print("########################")
    print(spearman_corr)
    print(tau_distance)
    corr_gen[rg_name]=(spearman_corr, tau_distance)
    test_acc_gen[rg_name]=(error)
corr_scores[test_dataset].append(corr_gen)
test_accuracies[test_dataset].append(test_acc_gen)

Results for Random Forest:
Error: 0.19279313653762223


########################
0.9094242655739981
0.7509803921568626
Results for SGD:
Error: 0.9848967261143201


########################
0.9111535523300227
0.7507002801120448


In [55]:
from sklearn.metrics import mean_squared_error
params= {'alpha': 0.001,
 'average': False,
 'early_stopping': False,
 'epsilon': 0.1,
 'eta0': 0.01,
 'fit_intercept': True,
 'l1_ratio': 0.15,
 'learning_rate': 'optimal',
 'loss': 'squared_error',
 'max_iter': 1000,
 'n_iter_no_change': 5,
 'penalty': 'elasticnet',
 'power_t': 0.25,
 'random_state': 42,
 'shuffle': True,
 'tol': 0.001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}
rg=SGDRegressor(**params)
rg.fit(X, y)

y_pred = rg.predict(X_test_new)
error=mean_squared_error(y_test_new, y_pred)
print(f"Error: {error}")

print("\n")

spearman_corr, tau_distance = get_correlations(y_pred)
print("########################")
print(spearman_corr)
print(tau_distance)


  y = column_or_1d(y, warn=True)


Error: 0.26627109350817985


########################
0.9021668171400792
0.7392156862745097


In [17]:
rg.get_params()

{'alpha': 0.001,
 'average': False,
 'early_stopping': False,
 'epsilon': 0.1,
 'eta0': 0.1,
 'fit_intercept': True,
 'l1_ratio': 0.15,
 'learning_rate': 'adaptive',
 'loss': 'squared_error',
 'max_iter': 1000,
 'n_iter_no_change': 5,
 'penalty': 'l2',
 'power_t': 0.25,
 'random_state': 42,
 'shuffle': True,
 'tol': 0.001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [18]:
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from joblib import dump, load
import os
os.makedirs(f"regressors", exist_ok=True)

dump(best_estimators["Random Forest"], f'regressors/rfr_model_50.joblib')
dump(best_estimators["SGD"], f'regressors/sgdr_model_50.joblib')

['regressors/sgdr_model_50.joblib']