In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime as dt
import os
import textwrap
import seaborn as sns

# General-purpose palette; individual entries are picked by index in the plots.
COLORS = [
    "#4F6272",
    "#B7C3F3",
    "#DD7596",
    "#8EB897",
    "#f6bd60",
    "#e76f51",
    "#2a9d8f",
    "#c77dff",
    "#f7d6e0",
]

# Palette indexed by the line plots (e.g. rads[2], rads[3]).
rads = [
    "#ea5545",
    "#e1a692",
    "#DD7596",
    "#9A3B3B",
    "#b30000",
]

# Note: the first and last entries are the same colour.
maroon = [
    "#900c3f",
    "#c70039",
    "#900c3f",
]

# Note: despite the name, this list holds 22 colours (it ends with white and black).
COLORS_20 = [
    "#e6194b", "#3cb44b", "#ffe119", "#4363d8",
    "#f58231", "#911eb4", "#46f0f0", "#f032e6",
    "#bcf60c", "#fabebe", "#008080", "#e6beff",
    "#9a6324", "#fffac8", "#800000", "#aaffc3",
    "#808000", "#ffd8b1", "#000075", "#808080",
    "#ffffff", "#000000",
]

# Same value as COLORS[0]; the (misspelled) name is kept because other code may reference it.
semi_balck = "#4F6272"


In [None]:
# --- Experiment configuration -------------------------------------------------
# Root folders: remote results volume and the local project checkout.
results_path = "/Volumes/noamaz/modularity/xor/"
local_base_path = '/Users/noamazmon/PycharmProjects/network_modularity'
task_name = 'XOR'
task = "xor"
num_layers = "4"
# Timestamp taken once per run; used as a prefix for saved plot file names.
time_str = dt.now().strftime("%Y-%m-%d-%H-%M-%S")
res_folder = f"{task}_{num_layers}_layers"
plot_path = f"{local_base_path}/plots/xor_multi_arch/{res_folder}"
first_analsis_res_folder = f"{results_path}/{res_folder}/first_analysis_results"
first_analsis_file_name = "2023-11-16-14-52-18_all_results_from_teach_archs_results_with_motifs_5000_ep_no_duplicates.csv"
lgbm_folder_name = 'exp_2023-11-16-17-38-02_nice_features'

lgb_res_path = f"{results_path}/{res_folder}/lightgbm_feature_selection/{lgbm_folder_name}"
lgb_res_nn_path = f"{lgb_res_path}/teach_archs_regression_feature_selection_results"
n_features = 5

# --- Feature-correlation inputs -----------------------------------------------
# Both CSVs live in the same folder, so both paths are derived from the
# configuration above. The values file used to be a hard-coded absolute path,
# which would silently keep pointing at the XOR / 4-layer data if `task` or
# `num_layers` were changed.
feature_correlation_folder = f"{results_path}/{res_folder}/feature_correlation"
corrlated_features_values_csv_name = '2023-11-17-11-50-45_all_data.csv'
correlated_features_csv_name = '2023-11-17-11-50-45_feature_correlation.csv'

# Table whose columns are looked up by feature name when drawing the scatter plots.
corrlated_features_values = pd.read_csv(f"{feature_correlation_folder}/{corrlated_features_values_csv_name}")
# Feature-by-feature table (first CSV column is the row index); cells are read
# as `correlated_features[feature_a].loc[feature_b]`.
correlated_features = pd.read_csv(f"{feature_correlation_folder}/{correlated_features_csv_name}",
                                  index_col=0)

In [None]:
# No figure is created in this cell; this only renders figures that are still open.
plt.show()

# Feature-value table written next to the LightGBM feature-selection results
# (`lgb_res_path`); the CSV's saved index column ("Unnamed: 0") is discarded.
orignaly_chosen_features_values = pd.read_csv(
    f"{lgb_res_path}/{n_features}_feature_values.csv"
).drop(columns="Unnamed: 0")

# Every column except the last one is treated as a selected feature name.
originaly_selected_feature_names = orignaly_chosen_features_values.columns[:-1].tolist()

# Feature-by-feature correlation table, indexed by feature name.
correlated_features = pd.read_csv(
    f"{results_path}/{res_folder}/feature_correlation/{correlated_features_csv_name}",
    index_col=0,
)

# For each selected feature: remove its own row from its correlation column and
# keep the label of the largest remaining value.
corrlated_selected_feature_names = [
    correlated_features[selected_name].drop(selected_name).idxmax()
    for selected_name in originaly_selected_feature_names
]

In [None]:
# Scatter every selected feature against the feature most correlated with it,
# one panel per pair, laid out on a two-column grid. Gaussian jitter is added
# to both coordinates before plotting.

# Jitter settings. A seeded generator makes the saved figure reproducible
# across kernel restarts (the global NumPy RNG was previously used unseeded).
JITTER_SEED = 0
X_JITTER_SCALE = 0.2
Y_JITTER_SCALE = 0.5
jitter_rng = np.random.default_rng(JITTER_SEED)

feature_pairs = list(zip(originaly_selected_feature_names, corrlated_selected_feature_names))

# NOTE: despite its name this is the number of grid ROWS (two panels per row).
num_columns = int(np.ceil(len(originaly_selected_feature_names) / 2))
# squeeze=False keeps `axs` two-dimensional even when there is a single row;
# otherwise one or two features would yield a 1-D array and break the layout code.
fig, axs = plt.subplots(num_columns, 2, figsize=(12, 16), squeeze=False)

# NOTE(review): not used anywhere in this cell; kept only in case a later cell
# reads them - confirm and remove.
min_x = 1.0
max_x = 0

# Row-major flattening: panels are filled left-to-right, top-to-bottom.
panels = axs.ravel()
for ax, (origin_f_name, coor_f_name) in zip(panels, feature_pairs):
    correlation_val = round(correlated_features[origin_f_name].loc[coor_f_name], 3)
    x_data = corrlated_features_values[origin_f_name]
    y_data = corrlated_features_values[coor_f_name]
    ax.scatter(
        x_data + jitter_rng.normal(scale=X_JITTER_SCALE, size=x_data.shape[0]),
        y_data + jitter_rng.normal(scale=Y_JITTER_SCALE, size=y_data.shape[0]),
        color=COLORS[2], s=0.2,
    )
    # Feature names use underscores; show them as wrapped, space-separated labels.
    ax.set_xlabel(textwrap.fill(origin_f_name.replace('_', ' '), width=30,
                                break_long_words=False), fontsize=12)
    ax.set_ylabel(textwrap.fill(coor_f_name.replace('_', ' '), width=30,
                                break_long_words=False), fontsize=12)
    ax.set_title(f'Correlation value {correlation_val}')
    ax.tick_params(axis='both', labelsize=12)

# With an odd number of features the last grid cell has nothing to show; hide
# it instead of leaving an empty frame in the figure.
for spare_ax in panels[len(feature_pairs):]:
    spare_ax.set_visible(False)

fig.suptitle("The distribution of selected features as a function of the feature most correlated with it - noise added")
fig.tight_layout()
# savefig does not create missing folders, so make sure the target exists.
os.makedirs(plot_path, exist_ok=True)
fig.savefig(
    f"{plot_path}/{time_str}_{task}_{n_features}_noise_correlated_features_dist.png")
plt.show()

In [None]:
def _load_feature_selection_results(results_dir):
    """Collect the per-feature-count "best" result CSVs found in ``results_dir``.

    Only files whose name contains ``best.csv`` are read. The number of
    features is parsed from the text between ``only_`` and ``_features`` in the
    file name and stored in a new ``num_features`` column.

    Parameters
    ----------
    results_dir : str
        Folder holding the result CSV files.

    Returns
    -------
    pandas.DataFrame
        All matching files stacked together and sorted by ``num_features``.

    Raises
    ------
    FileNotFoundError
        If the folder contains no matching file.
    """
    frames = []
    for file_name in os.listdir(results_dir):
        if 'best.csv' not in file_name:
            continue
        frame = pd.read_csv(f"{results_dir}/{file_name}")
        frame['num_features'] = int(file_name.split('only_')[1].split('_features')[0])
        frames.append(frame)
    if not frames:
        raise FileNotFoundError(f"No 'best.csv' result files found in {results_dir}")
    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(frames, ignore_index=True).sort_values('num_features')


lgb_res_small_nn_path = f"{lgb_res_path}/teach_archs_regression_feature_selection_results_small_ann"
# NOTE(review): not used anywhere in this cell; kept only in case a later cell
# reads it - confirm and remove.
num_features = []

feature_selection_small_ann_res = _load_feature_selection_results(lgb_res_small_nn_path)
# NOTE(review): `feature_selection_res` is used by the comparison figure below
# but was never defined in this notebook, so the cell failed with a NameError on
# a fresh kernel. It is loaded from `lgb_res_nn_path` on the assumption that
# the big-ANN folder uses the same file layout as the small-ANN folder - confirm.
feature_selection_res = _load_feature_selection_results(lgb_res_nn_path)

# savefig does not create missing folders, so make sure the target exists.
os.makedirs(plot_path, exist_ok=True)

# --- Figure 1: small ANN, train vs. test MAPE per number of features ----------
fig, ax = plt.subplots()
ax.plot(feature_selection_small_ann_res['num_features'], feature_selection_small_ann_res['best mape train'],
        label='train', color=rads[2])
ax.plot(feature_selection_small_ann_res['num_features'], feature_selection_small_ann_res['best mape test'],
        label='test', color=rads[3])
ax.set_xlabel('number of features', fontsize=12)
ax.set_ylabel('Mean absolute percentage error', fontsize=12)
ax.legend()
ax.set_title(
    f"Prediction of an ANN based on structural features {task.capitalize()}",
    wrap=True, fontsize=12)
ax.tick_params(axis='both', labelsize=11)
fig.tight_layout()
fig.savefig(
    f"{plot_path}/{time_str}_{task}_nn_feature_selection_r2_mape_small_ann.png")
plt.show()

# --- Figure 2: small vs. big ANN, one panel for train and one for test --------
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(7, 6), sharex='all', sharey='all')
mape_columns = ('best mape train', 'best mape test')
compared_results = (feature_selection_res, feature_selection_small_ann_res)
# Overall MAPE range across both networks and both splits; it sets the extent
# of the dashed marker drawn at the chosen number of features.
max_y = max(results[column].max() for results in compared_results for column in mape_columns)
min_y = min(results[column].min() for results in compared_results for column in mape_columns)

for panel, mape_column, panel_title in ((ax1, 'best mape train', 'train data'),
                                        (ax2, 'best mape test', 'test data')):
    panel.vlines(x=n_features, ymin=min_y, ymax=max_y, color='grey', linestyles='--', alpha=0.8)
    panel.plot(feature_selection_small_ann_res['num_features'], feature_selection_small_ann_res[mape_column],
               label='small ANN', color=rads[2])
    panel.plot(feature_selection_res['num_features'], feature_selection_res[mape_column],
               label='big ANN', color=rads[3])
    panel.set_xlabel('number of features', fontsize=12)
    panel.set_title(panel_title)
    # Set tick sizes per axes: plt.xticks / plt.yticks only affect the current axes.
    panel.tick_params(axis='both', labelsize=11)
    panel.legend()
# The y axis is shared, so only the left panel carries the label.
ax1.set_ylabel('Mean absolute percentage error', fontsize=12)

fig.suptitle(
    f"Prediction of an ANN based on structural features {task.capitalize()}",
    wrap=True, fontsize=16)
fig.tight_layout()
fig.savefig(
    f"{plot_path}/{time_str}_{task}_nn_feature_selection_r2_mape_big_vs_smal_ann.png")
plt.show()
