In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
######## Search space #########
import sys
sys.path.append("../")
from search_space.RegNet import RegNet
from search_space.utils import create_widths_plot, scatter_results
# Initialize an empty list to store dictionaries
def results_to_df(path, name):
    data = []
    # Open the text file
    with open(path, 'r') as file:
        lines = file.readlines()
        # Initialize an empty dictionary to store data for each block
        block_data = {}
        for line in lines:
            # If the line contains dashes, it indicates the end of a block
            if '-------------------------' in line:
                # If block_data is not empty, add it to the list of data dictionaries
                if block_data:
                    data.append(block_data)
                    # Reset block_data for the next block
                    block_data = {}
            elif 'best_acc' in line:
                continue
            else:
                # Split the line by ':'
                #print(line)
                key, value = line.strip().split(': ')
                # Store the key-value pair in the block_data dictionary
                block_data[key] = value

    # Create a DataFrame from the list of dictionaries
    df = pd.DataFrame(data)

    # Convert columns to appropriate data types if needed
    df['epoch'] = df['epoch'].astype(int)
    df['lr'] = df['lr'].astype(float)
    df['train_acc'] = df['train_acc'].astype(float)
    df['train_loss'] = df['train_loss'].astype(float)
    df['test_acc'] = df['test_acc'].astype(float)
    df['test_acc_top5'] = df['test_acc_top5'].astype(float)
    df['test_loss'] = df['test_loss'].astype(float)
    df['epoch_time'] = df['epoch_time'].astype(float)
    df=df.assign(name=name)
    
    return df

  from .autonotebook import tqdm as notebook_tqdm


In [174]:
import json

tests_folder="/home/woody/iwb3/iwb3021h/NAS_COMPETITION_RESULTS"
studies_train=["tests_LaMelo_13_06_2024_02_11", 
               #"tests_LaMelo_11_06_2024_19_48",
               #"tests_Caitie_12_06_2024_08_42",
               #"tests_Caitie_13_06_2024_00_05", 
               #"tests_Adaline_13_06_2024_00_08",
               #"tests_Adaline_12_06_2024_03_18",
               #"tests_Sadie_12_06_2024_08_38",
               #"tests_Sadie_13_06_2024_00_10",
               #"tests_Chester_13_06_2024_09_31",
               #"tests_Chester_12_06_2024_08_32",
               #"tests_Gutenberg_12_06_2024_23_37"
              ]
def get_predictor_data(tests_folder,studies):
    train_data=[]
    for study_name in studies: 
        #study_name="tests_LaMelo_13_06_2024_02_11"
        file_path=f"{tests_folder}/{study_name}/{study_name}.evonas"
        with open(file_path, 'r') as file:
            results = json.load(file)

        individuals_df=pd.read_json(results["results"]).sort_values("name")[["name","generation"]]
        data_models=pd.read_json(results["results"])[["name","num_stages","params","WA","W0","WM","DEPTH", "best_acc"]]
        data_models["num_classes"]=results["metadata"]["num_classes"]
        data_models["benchmark"]=results["metadata"]["benchmark"]
        print(study_name)
        print(results["metadata"])

        data=[]
        for index, row in individuals_df.iterrows():
            name=row["name"]
            generation=row["generation"]
            test_acc=results_to_df(f"{tests_folder}/{study_name}/Generation_{generation}/{name}/worklog.txt", f"{name}")[["epoch","test_acc","name"]]
            test_acc_piv = test_acc.pivot(index='name', columns='epoch', values='test_acc').add_prefix("epoch_").reset_index()
            test_acc_piv["gen"]=generation
            data.append(pd.merge(test_acc_piv, data_models, on="name"))
        data=pd.concat(data)
        train_data.append(data)
    train_data=pd.concat(train_data)
    return train_data

In [175]:
train_data=get_predictor_data(tests_folder,studies_train)

  individuals_df=pd.read_json(results["results"]).sort_values("name")[["name","generation"]]
  data_models=pd.read_json(results["results"])[["name","num_stages","params","WA","W0","WM","DEPTH", "best_acc"]]


tests_LaMelo_13_06_2024_02_11
{'num_classes': 10, 'codename': 'LaMelo', 'input_shape': [50000, 1, 24, 24], 'benchmark': 85.2, 'time_remaining': 7199.416898488998, 'rand_augment': False, 'train_config_path': 'our_submission/configs/train/vanilla_generation_adam.yaml', 'mode': 'NAS'}


In [38]:
studies_test=["tests_Gutenberg_13_06_2024_09_35"]
#tests_folder="/home/woody/iwb3/iwb3021h/NAS_COMPETITION_RESULTS/full_training"
test_data=get_predictor_data(tests_folder,studies_test)

  individuals_df=pd.read_json(results["results"]).sort_values("name")[["name","generation"]]
  data_models=pd.read_json(results["results"])[["name","num_stages","params","WA","W0","WM","DEPTH", "best_acc"]]


tests_Gutenberg_13_06_2024_09_35
{'input_shape': [45000, 1, 27, 18], 'codename': 'Gutenberg', 'benchmark': 40.98, 'num_classes': 6, 'time_remaining': 7199.527697086334, 'rand_augment': False, 'train_config_path': 'our_submission/configs/train/vanilla_generation_adam.yaml', 'mode': 'NAS'}


In [5]:
test_data.columns

Index(['name', 'epoch_1', 'epoch_2', 'epoch_3', 'epoch_4', 'epoch_5',
       'epoch_6', 'epoch_7', 'epoch_8', 'epoch_9', 'epoch_10', 'epoch_11',
       'epoch_12', 'epoch_13', 'epoch_14', 'epoch_15', 'epoch_16', 'epoch_17',
       'epoch_18', 'epoch_19', 'epoch_20', 'gen', 'num_stages', 'params', 'WA',
       'W0', 'WM', 'DEPTH', 'best_acc', 'num_classes', 'benchmark'],
      dtype='object')

In [7]:
cols_train=[ 'epoch_1', 'epoch_2', 'epoch_3', 'epoch_4', 'epoch_5',
       'epoch_6', 'epoch_7', 'epoch_8', 'epoch_9', 'epoch_10', 'num_stages', 'params', 'WA', 'W0',
       'WM', 'DEPTH', 'num_classes', 'benchmark']
#cols_train=[ 'epoch_1', 'epoch_2', 'epoch_3', 'epoch_4', 'epoch_5', 'epoch_6', 'epoch_7','num_stages', 'params', 'WA', 'W0',
#       'WM', 'DEPTH', 'num_classes', 'benchmark']
#cols_train=[ 'epoch_1', 'epoch_2','epoch_3', 'epoch_4','num_stages', 'params', 'WA', 'W0',
#       'WM', 'DEPTH', 'num_classes', 'benchmark']

cols_test=['epoch_11',
       'epoch_12', 'epoch_13', 'epoch_14', 'epoch_15', 'epoch_16', 'epoch_17',
       'epoch_18', 'epoch_19', 'epoch_20']
cols_test=["best_acc"]
X=train_data[cols_train]
y=train_data[cols_test]
gens=[1]

X_test_new=test_data[test_data["gen"].isin(gens)][cols_train]
y_test_new=test_data[test_data["gen"].isin(gens)][cols_test]

In [179]:
train_data[train_data.gen==3].DEPTH.value_counts()

DEPTH
17    4
18    2
14    2
11    2
10    2
22    2
8     2
9     1
15    1
20    1
13    1
Name: count, dtype: int64

In [180]:
test_data[test_data.gen==3].DEPTH.value_counts()

DEPTH
12    2
19    2
20    2
17    2
10    2
14    2
13    1
16    1
15    1
11    1
22    1
8     1
21    1
9     1
Name: count, dtype: int64

In [173]:
test_data[test_data.gen==3].DEPTH.value_counts()

DEPTH
12    2
19    2
20    2
17    2
10    2
14    2
13    1
16    1
15    1
11    1
22    1
8     1
21    1
9     1
Name: count, dtype: int64

# Performance predictor

Gradient Boosting Machines (GBMs) (e.g., XGBoost, LightGBM):
RandomForest

In [10]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Sample data
# X should include first 10 epochs' accuracy and model parameters
# y should include the accuracy of the subsequent 10 epochs
#X = np.random.rand(100, 15)  # 100 samples, 10 accuracies + 5 parameters
#y = np.random.rand(100, 10)  # 100 samples, next 10 epochs' accuracy

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Standardize the data
#scaler = StandardScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)

# Train a RandomForestRegressor
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)

# Predict and evaluate
y_pred_rf = rf_regressor.predict(X_test)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
print(f'RandomForestRegressor RMSE: {rmse_rf}')



  return fit_method(estimator, *args, **kwargs)


RandomForestRegressor RMSE: 0.9052349272426432


In [11]:
# Predict and evaluate
#X_test_new = scaler.transform(X_test_new)

y_pred_rf = rf_regressor.predict(X_test_new)
rmse_rf = np.sqrt(mean_squared_error(y_test_new, y_pred_rf))
print(f'RandomForestRegressor RMSE: {rmse_rf}')


RandomForestRegressor RMSE: 13.511911125947334


In [17]:
y_test_new.reset_index(drop=True).sort_values("best_acc", ascending=False)

Unnamed: 0,best_acc
8,42.88
0,42.5
6,42.41
15,42.37
19,42.33
5,42.23
11,42.19
16,41.9
4,41.83
17,41.81


In [19]:
pd.DataFrame(y_pred_rf, columns=["pred_acc"]).sort_values("pred_acc", ascending=False)

Unnamed: 0,pred_acc
17,56.5878
11,55.6209
19,55.6166
8,55.6025
5,55.3889
16,55.362
18,55.3368
15,55.2496
4,55.187
7,55.1822


In [301]:
np.corrcoef(y_pred_rf[:][0:60],y_test_new.values[:][0:60].reshape(-1))

array([[1.        , 0.43816592],
       [0.43816592, 1.        ]])

In [303]:
np.corrcoef(X_test_new["epoch_10"].values[:20].reshape(-1),y_test_new[:20].values[:].reshape(-1))

array([[1.        , 0.90872616],
       [0.90872616, 1.        ]])

In [123]:
y_pred_rf

array([59.719 , 58.936 , 56.4102, 60.0944, 59.9151, 60.3432, 60.3657,
       61.5988, 60.8167, 59.7897, 59.6178, 60.3493, 59.7265, 59.0941,
       56.9798, 61.0983, 61.577 , 60.5617, 61.8244, 61.6281, 60.7059,
       61.0587, 61.5172, 61.6237, 60.298 , 60.2908, 61.3481, 61.0361,
       61.2968, 61.2947, 60.2082, 62.1292, 61.1402, 61.8315, 60.477 ,
       60.1731, 60.2328, 60.4864, 62.0043, 61.3174, 60.997 , 61.1983,
       61.2318, 60.3409, 62.127 , 61.6775, 60.9963, 61.133 , 61.3531,
       60.9014, 61.0975, 60.2715, 61.6547, 59.5361, 61.6507, 60.9652,
       60.5947, 61.6178, 60.2765, 60.58  ])

In [8]:
# Train a Support Vector Regressor
svr = SVR()
param_grid = {
    'C': [0.1, 1, 10],
    'epsilon': [0.01, 0.1, 1],
    'kernel': ['linear', 'poly', 'rbf']
}
param_grid = {
    'C': [0.1],
    'epsilon': [0.01],
    'kernel': [ 'poly']
}
grid_search = GridSearchCV(svr, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Predict and evaluate
best_svr = grid_search.best_estimator_
y_pred_svr = best_svr.predict(X_test)
rmse_svr = np.sqrt(mean_squared_error(y_test, y_pred_svr))
print(f'Support Vector Regressor RMSE: {rmse_svr}')


Support Vector Regressor RMSE: 16.52822026363254


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [9]:
y_pred_rf = best_svr.predict(X_test_new)
rmse_rf = np.sqrt(mean_squared_error(y_test_new, y_pred_rf))
print(f'RandomForestRegressor RMSE: {rmse_rf}')

RandomForestRegressor RMSE: 42.694856840907796


In [None]:
print("yes")

In [24]:
# Install scikit-learn if not already installed
#%pip install -q scikit-learn

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Assuming X_train, X_test, y_train, y_test are already defined

# Initialize and fit the GradientBoostingRegressor
gb_regressor = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=42)
gb_regressor.fit(X_train, y_train)

# Predict on the test set
y_pred_gb = gb_regressor.predict(X_test)

# Calculate RMSE
rmse_gb = np.sqrt(mean_squared_error(y_test, y_pred_gb))
print(f'GradientBoostingRegressor RMSE: {rmse_gb}')


  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


GradientBoostingRegressor RMSE: 1.3756102173972162


In [25]:
y_pred_rf = gb_regressor.predict(X_test_new)
rmse_rf = np.sqrt(mean_squared_error(y_test_new, y_pred_rf))
print(f'GradientBoostingRegressor RMSE: {rmse_rf}')

RandomForestRegressor RMSE: 14.966455878098962


In [22]:
y_pred_rf

array([56.32557 , 56.43364 , 56.237415, 56.43364 , 56.32557 , 56.43364 ,
       56.43364 , 56.43364 , 56.43364 , 56.43364 , 56.516373, 56.516373,
       56.43364 , 56.43364 , 56.516373, 56.32557 , 56.32557 , 56.516373,
       56.516373, 56.32557 ], dtype=float32)

In [26]:
pd.DataFrame(y_pred_rf, columns=["pred_acc"]).sort_values("pred_acc", ascending=False)

Unnamed: 0,pred_acc
10,56.6729
18,56.6729
17,56.6729
11,56.6729
14,56.6729
3,56.594088
6,56.594088
7,56.594088
8,56.594088
9,56.594088


In [288]:
import numpy as np

In [289]:
np.corrcoef(y_pred_rf[:],y_test_new.values[:].reshape(-1))

array([[ 1.        , -0.04096569],
       [-0.04096569,  1.        ]])

In [212]:
np.corrcoef(X_test_new["epoch_7"].values.reshape(-1)[:],y_test_new.values.reshape(-1)[:])

array([[1.        , 0.83109927],
       [0.83109927, 1.        ]])

# Ranking classifier

In [39]:
import pandas as pd
from itertools import combinations


In [40]:
total_data=[]

for benchmark_score in train_data['benchmark'].unique():
    filtered_df = train_data[(train_data['benchmark'] == benchmark_score)].reset_index(drop=True)
    pairs = list(combinations(filtered_df.index, 2))
    combined_data = []

    for idx1, idx2 in pairs:
        row1 = filtered_df.loc[idx1]
        row2 = filtered_df.loc[idx2]
        combined_row = {
            'name_A': row1['name'],
            'name_B': row2['name'],
            'epoch_1_A': row1['epoch_1'],
            'epoch_2_A': row1['epoch_2'],
            'epoch_3_A': row1['epoch_3'],
            'epoch_4_A': row1['epoch_4'],
            'epoch_5_A': row1['epoch_5'],
            'epoch_6_A': row1['epoch_6'],
                        'epoch_7_A': row1['epoch_7'],
                       'epoch_8_A': row1['epoch_8'],
                       'epoch_9_A': row1['epoch_9'],
                       'epoch_10_A': row1['epoch_10'],
            'num_stages_A': row1['num_stages'],
            'params_A': row1['params'],
            'WA_A': row1['WA'],
            'W0_A': row1['W0'],
            'WM_A': row1['WM'],
            'DEPTH_A': row1['DEPTH'],
            "gen_A":row1["gen"],
            'best_acc_A': row1['best_acc'],
            'epoch_1_B': row2['epoch_1'],
            'epoch_2_B': row2['epoch_2'],
            'epoch_3_B': row2['epoch_3'],
            'epoch_4_B': row2['epoch_4'],
            'epoch_5_B': row2['epoch_5'],
            'epoch_6_B': row2['epoch_6'],
            'epoch_7_B': row2['epoch_7'],
                       'epoch_8_B': row2['epoch_8'],
                       'epoch_9_B': row2['epoch_9'],
                       'epoch_10_B': row2['epoch_10'],
            'num_stages_B': row2['num_stages'],
            'params_B': row2['params'],
            'WA_B': row2['WA'],
            'W0_B': row2['W0'],
            'WM_B': row2['WM'],
            'DEPTH_B': row2['DEPTH'],
            "gen_B":row2["gen"],
            'best_acc_B': row2['best_acc'],
            'label': 1 if row1['best_acc'] > row2['best_acc'] else 0
        }

        combined_data.append(combined_row)

    combined_df = pd.DataFrame(combined_data)
    combined_df["benchmark"]=benchmark_score
    combined_df["num_classes"]=filtered_df["num_classes"].unique()[0]
    
    total_data.append(combined_df)
total_data=pd.concat(total_data)


In [41]:
total_data_test=[]

for benchmark_score in test_data['benchmark'].unique():
    filtered_df = test_data[(test_data['benchmark'] == benchmark_score)].reset_index(drop=True)
    pairs = list(combinations(filtered_df.index, 2))

    combined_data = []

    for idx1, idx2 in pairs:
        row1 = filtered_df.loc[idx1]
        row2 = filtered_df.loc[idx2]
        combined_row = {
            'name_A': row1['name'],
            'name_B': row2['name'],
            'epoch_1_A': row1['epoch_1'],
            'epoch_2_A': row1['epoch_2'],
            'epoch_3_A': row1['epoch_3'],
            'epoch_4_A': row1['epoch_4'],
            'epoch_5_A': row1['epoch_5'],
            'epoch_6_A': row1['epoch_6'],
                       'epoch_7_A': row1['epoch_7'],
                       'epoch_8_A': row1['epoch_8'],
                       'epoch_9_A': row1['epoch_9'],
                       'epoch_10_A': row1['epoch_10'],
            'num_stages_A': row1['num_stages'],
            'params_A': row1['params'],
            'WA_A': row1['WA'],
            'W0_A': row1['W0'],
            'WM_A': row1['WM'],
            'DEPTH_A': row1['DEPTH'],
            'gen_A': row1['gen'],
            'best_acc_A': row1['best_acc'],
            'epoch_1_B': row2['epoch_1'],
            'epoch_2_B': row2['epoch_2'],
            'epoch_3_B': row2['epoch_3'],
            'epoch_4_B': row2['epoch_4'],
            'epoch_5_B': row2['epoch_5'],
            'epoch_6_B': row2['epoch_6'],
                       'epoch_7_B': row2['epoch_7'],
                       'epoch_8_B': row2['epoch_8'],
                       'epoch_9_B': row2['epoch_9'],
                       'epoch_10_B': row2['epoch_10'],
            'num_stages_B': row2['num_stages'],
            'params_B': row2['params'],
            'WA_B': row2['WA'],
            'W0_B': row2['W0'],
            'WM_B': row2['WM'],
            'DEPTH_B': row2['DEPTH'],
            'gen_B': row2['gen'],
            'best_acc_B': row2['best_acc'],
            'label': 1 if row1['best_acc'] > row2['best_acc'] else 0
        }

        combined_data.append(combined_row)

    combined_df = pd.DataFrame(combined_data)
    combined_df["benchmark"]=benchmark_score
    combined_df["num_classes"]=filtered_df["num_classes"].unique()[0]
    
    total_data_test.append(combined_df)
total_data_test=pd.concat(total_data_test)


In [42]:
total_data.num_classes.value_counts()

num_classes
10    14280
4      7140
20     7140
3      7140
Name: count, dtype: int64

In [150]:
cols_train=[ 'epoch_1_A', 'epoch_2_A', 'epoch_3_A', 'epoch_4_A', 'epoch_5_A', 'epoch_6_A', 'epoch_7_A',
            'epoch_8_A','epoch_9_A','epoch_10_A',
        'num_stages_A', 'params_A', 'WA_A', 'W0_A',
       'WM_A', 'DEPTH_A',
        'num_stages_B', 'params_B', 'WA_B', 'W0_B', 'epoch_4_B', 'epoch_5_B', 'epoch_6_B', 'epoch_7_B',
        'epoch_8_B','epoch_9_B','epoch_10_B',
       'WM_B', 'DEPTH_B', 'num_classes', 'benchmark']
cols_train=[ 
        'num_stages_A', 'params_A', 'WA_A', 'W0_A', 
       'WM_A', 'DEPTH_A', 
        'num_stages_B', 'params_B', 'WA_B', 'W0_B',
       'WM_B', 'DEPTH_B',"num_classes", "benchmark"]
#cols_train=[ 'epoch_1', 'epoch_2', 'epoch_3', 'epoch_4', 'epoch_5', 'epoch_6', 'epoch_7','num_stages', 'params', 'WA', 'W0',
#       'WM', 'DEPTH', 'num_classes', 'benchmark']
#cols_train=[ 'epoch_1', 'epoch_2','epoch_3', 'epoch_4','num_stages', 'params', 'WA', 'W0',
#       'WM', 'DEPTH', 'num_classes', 'benchmark']

cols_test=["label"]
X=total_data[cols_train]
y=total_data[cols_test]
gens=[2]
X_test_new=total_data_test[(total_data_test["gen_B"].isin(gens)) & (total_data_test["gen_A"].isin(gens))][cols_train]

y_test_new=total_data_test[(total_data_test["gen_B"].isin(gens)) & (total_data_test["gen_A"].isin(gens))][cols_test]

In [147]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
y_epoch_10_pred= (X_test_new['epoch_10_A'] > X_test_new['epoch_10_B']).astype(int).values
#y_epoch_10_pred=np.repeat(0,len(y_test_new))
# Calculate accuracy
accuracy = accuracy_score(y_test_new, y_epoch_10_pred)
print(f"Accuracy: {accuracy}")

# Detailed classification report
print("Classification Report:")
print(classification_report(y_test_new, y_epoch_10_pred))

# Confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test_new, y_epoch_10_pred))

KeyError: 'epoch_10_B'

In [128]:
len(y_epoch_10_pred)

190

In [149]:
ranking_test_df=total_data_test[(total_data_test["gen_B"].isin(gens)) & (total_data_test["gen_A"].isin(gens))][["name_A","name_B", "label", "gen_A","gen_B"]]
ranking_test_df["pred_10_epochs"]=y_epoch_10_pred


In [163]:
ranking_test_df["y_pred"]=y_pred

In [54]:
test_data[test_data.gen==1].sort_values("best_acc", ascending=False)[["name","best_acc"]]

Unnamed: 0,name,best_acc
0,demonic_pheasant,42.88
0,amaranth_python,42.5
0,dandelion_turkey,42.41
0,spectral_dolphin,42.37
0,wakeful_rottweiler,42.33
0,cherry_nightingale,42.23
0,masterful_gecko,42.19
0,thundering_mastodon,41.9
0,careful_degu,41.83
0,tidy_newt,41.81


In [164]:


# Create a new column by applying the function to each row
ranking_target={}
ranking_10={}
ranking_predict={}
for ind in list(ranking_test_df.name_A.unique())+list(ranking_test_df.name_B.unique()):
    ranking_target[ind]=0
    ranking_predict[ind]=0
    ranking_10[ind]=0
for index, row in ranking_test_df.iterrows():
    if row["label"]==1:
        ranking_target[row["name_A"]]=ranking_target[row["name_A"]]+1
    else:
        ranking_target[row["name_B"]]=ranking_target[row["name_B"]]+1

    if row["pred_10_epochs"]==1:
        ranking_10[row["name_A"]]=ranking_10[row["name_A"]]+1
    else:
        ranking_10[row["name_B"]]=ranking_10[row["name_B"]]+1

    if row["y_pred"]==1:
        ranking_predict[row["name_A"]]=ranking_predict[row["name_A"]]+1
    else:
        ranking_predict[row["name_B"]]=ranking_predict[row["name_B"]]+1

ranking_predict_df=pd.DataFrame([ranking_predict]).T.rename(columns={0:"score"}).sort_values(by="score", ascending=False)
    
ranking_target_df=pd.DataFrame([ranking_target]).T.rename(columns={0:"score"}).sort_values(by="score", ascending=False)
ranking_10_df=pd.DataFrame([ranking_10]).T.rename(columns={0:"score"}).sort_values(by="score", ascending=False)

In [165]:
ranking_predict_df

Unnamed: 0,score
classy_seriema,19
fragrant_gazelle,18
frisky_baboon,17
hilarious_herring,16
notorious_trogon,15
wealthy_limpet,14
tireless_muskrat,13
discreet_lionfish,12
monumental_bison,11
gay_cow,9


In [167]:
ranking_target_df

Unnamed: 0,score
fragrant_gazelle,19
classy_seriema,18
discreet_lionfish,17
greedy_jackrabbit,16
notorious_trogon,15
wealthy_limpet,14
frisky_baboon,13
gay_cow,12
monumental_bison,11
hilarious_herring,10


In [111]:
ranking_10_df

Unnamed: 0,score
classy_seriema,19
fragrant_gazelle,18
hilarious_herring,17
swinging_quoll,16
discreet_lionfish,15
energetic_terrier,14
monumental_bison,13
frisky_baboon,12
kickass_ibex,11
wealthy_limpet,10


In [169]:
from scipy.stats import spearmanr
from scipy.stats import kendalltau

def kendall_tau_distance(list1, list2):
    tau, _ = kendalltau(list1, list2)
    return tau
def spearman_rank_correlation(list1, list2):
    corr, _ = spearmanr(list1, list2)
    return corr
# Example usage
target =ranking_target_df.index.tolist()
list_to_compare = ranking_predict_df.index.tolist()
#list_to_compare = ranking_10_df.index.tolist()
# Convert the lists to ranks
target_ranks = {k: i for i, k in enumerate(target)}
list_to_compare_ranks = [target_ranks[x] for x in list_to_compare]

# Calculate Spearman rank correlation
spearman_corr = spearman_rank_correlation(list_to_compare_ranks, list(range(len(target))))
print(f'Spearman rank correlation: {spearman_corr}')
# Calculate Kendall Tau distance
tau_distance = kendall_tau_distance(list_to_compare_ranks, list(range(len(target))))
print(f'Kendall Tau distance: {tau_distance}')

Spearman rank correlation: 0.8285714285714285
Kendall Tau distance: 0.6210526315789474


In [151]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier


In [152]:
#scaler = StandardScaler()
#X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X_train, y_train)


  return fit_method(estimator, *args, **kwargs)


In [153]:
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.9081232492997199
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.89      0.90      1654
           1       0.91      0.92      0.92      1916

    accuracy                           0.91      3570
   macro avg       0.91      0.91      0.91      3570
weighted avg       0.91      0.91      0.91      3570

Confusion Matrix:
[[1471  183]
 [ 145 1771]]


In [154]:
#X_test_new= scaler.transform(X_test_new)
y_pred = clf.predict(X_test_new)

# Calculate accuracy
accuracy = accuracy_score(y_test_new, y_pred)
print(f"Accuracy: {accuracy}")

# Detailed classification report
print("Classification Report:")
print(classification_report(y_test_new, y_pred))

# Confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test_new, y_pred))


Accuracy: 0.7842105263157895
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.72      0.75        86
           1       0.78      0.84      0.81       104

    accuracy                           0.78       190
   macro avg       0.78      0.78      0.78       190
weighted avg       0.78      0.78      0.78       190

Confusion Matrix:
[[62 24]
 [17 87]]


In [161]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize the Gradient Boosting model
gb_clf = GradientBoostingClassifier(n_estimators=100,random_state=42)

# Train the model
gb_clf.fit(X_train, y_train)

# Make predictions
y_pred = gb_clf.predict(X_test)

# Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


  y = column_or_1d(y, warn=True)


Accuracy: 0.8204481792717087
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.76      0.80      1654
           1       0.81      0.87      0.84      1916

    accuracy                           0.82      3570
   macro avg       0.82      0.82      0.82      3570
weighted avg       0.82      0.82      0.82      3570

Confusion Matrix:
[[1264  390]
 [ 251 1665]]


In [162]:

#X_test_new=total_data_test[cols_train]
#y_test_new=total_data_test[cols_test]
y_pred = gb_clf.predict(X_test_new)
#y_pred=best_gb_clf.predict(X_test_new)

# Evaluate the model
print(f"Accuracy: {accuracy_score(y_test_new, y_pred)}")
print("Classification Report:")
print(classification_report(y_test_new, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test_new, y_pred))


Accuracy: 0.8052631578947368
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.78      0.78        86
           1       0.82      0.83      0.82       104

    accuracy                           0.81       190
   macro avg       0.80      0.80      0.80       190
weighted avg       0.81      0.81      0.81       190

Confusion Matrix:
[[67 19]
 [18 86]]


In [153]:
# Best parameters
print("Best parameters found: ", grid_search.best_params_)

Best parameters found:  {'learning_rate': 0.2, 'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
