In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import seaborn as sns
from time import time
from CBFV import composition
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, accuracy_score
from sklearn.preprocessing import StandardScaler, normalize, OrdinalEncoder
from sklearn.model_selection import learning_curve, RandomizedSearchCV, cross_validate, GroupKFold
import shap
import random
import difflib

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Global configuration: random seed and project directory layout.
RNG_SEED = 8
# Seed both NumPy's global RNG and the stdlib `random` module (imported above)
# so that every source of randomness used by this notebook is reproducible.
np.random.seed(RNG_SEED)
random.seed(RNG_SEED)

# All project folders are siblings of the directory the notebook is run from,
# i.e. they live in the parent of the current working directory.
curr_dir = os.getcwd()
PROJECT_ROOT = os.path.dirname(curr_dir)
DATA_PATH = os.path.join(PROJECT_ROOT, 'Data')
MODELS_PATH = os.path.join(PROJECT_ROOT, 'Models')
WEIGHTS_PATH = os.path.join(PROJECT_ROOT, 'Weights')
ASSETS_PATH = os.path.join(PROJECT_ROOT, 'Assets')
RESULTS_PATH = os.path.join(PROJECT_ROOT, 'Results')

In [3]:
# Load the train / validation / test splits from DATA_PATH.
# Paths are built with os.path.join (as in the config cell) rather than by
# concatenating a hard-coded "/" separator.
df_train = pd.read_csv(os.path.join(DATA_PATH, "intermetallics_train7.csv"))
df_val = pd.read_csv(os.path.join(DATA_PATH, "intermetallics_val7.csv"))
df_test = pd.read_csv(os.path.join(DATA_PATH, "intermetallics_test7.csv"))

In [4]:
# Preview the first rows of the training split as a quick sanity check of the load.
df_train.head()

Unnamed: 0,material_id,formula,jml_bp_mult_atom_rad,jml_hfus_add_bp,jml_elec_aff_mult_voro_coord,jml_mol_vol_subs_atom_mass,jml_is_halogen,jml_atom_mass_subs_first_ion_en,jml_row,jml_mol_vol_mult_atom_mass,...,SumCoNi,SumMnFeCo,SumMnFeNi,SumMnFeCr,SumCrMnCo,SumCrFeCo,SumCrMnNi,SumCrFeNi,SumCrCoNi,SumFeNiCo
0,mp-1217137,Ti2CrTe4,3623.666667,2588.508433,5.541333,-63.044367,0.0,68.28629,4.333333,1165.310894,...,0.0,0.0,0.0,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857,0.0
1,mp-1203616,Y6Al43Cr4,4702.6,3115.143667,4.928667,-43.591163,0.0,49.63797,4.0,804.398496,...,0.0,0.0,0.0,0.075472,0.075472,0.075472,0.075472,0.075472,0.075472,0.0
2,mp-1022649,YMg6Cr,4220.766667,2638.8029,3.228667,-41.36565,0.0,48.192302,4.0,827.883367,...,0.0,0.0,0.0,0.125,0.125,0.125,0.125,0.125,0.125,0.0
3,mp-2132,CrN,2085.942,1510.786775,3.32,-22.6164,0.0,22.351095,3.0,282.791261,...,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.5,0.5,0.0
4,mp-1190957,GdCrB4,4677.666667,3555.936233,2.588333,-62.8457,0.0,66.280923,4.666667,1184.222364,...,0.0,0.0,0.0,0.166667,0.166667,0.166667,0.166667,0.166667,0.166667,0.0


In [5]:
# Show the column labels of the training split.
df_train.columns

Index(['material_id', 'formula', 'jml_bp_mult_atom_rad', 'jml_hfus_add_bp',
       'jml_elec_aff_mult_voro_coord', 'jml_mol_vol_subs_atom_mass',
       'jml_is_halogen', 'jml_atom_mass_subs_first_ion_en', 'jml_row',
       'jml_mol_vol_mult_atom_mass',
       ...
       'SumCoNi', 'SumMnFeCo', 'SumMnFeNi', 'SumMnFeCr', 'SumCrMnCo',
       'SumCrFeCo', 'SumCrMnNi', 'SumCrFeNi', 'SumCrCoNi', 'SumFeNiCo'],
      dtype='object', length=1599)

In [None]:
# Pairwise string similarity between column *names* (difflib ratio in [0, 1]),
# shown as a heatmap. similarity_matrix.loc[col1, col2] is the ratio computed
# with col1 as the first sequence and col2 as the second.
columns = df_train.columns
n_columns = len(columns)

# SequenceMatcher caches its analysis of the *second* sequence, so fix seq2 once
# per column and vary seq1, instead of building a new matcher for every pair.
# The resulting ratios are identical to SequenceMatcher(None, col1, col2).ratio().
matcher = difflib.SequenceMatcher(None)
ratios_by_col2 = []
for col2 in columns:
    matcher.set_seq2(col2)
    ratios_for_col2 = []
    for col1 in columns:
        matcher.set_seq1(col1)
        ratios_for_col2.append(matcher.ratio())
    ratios_by_col2.append(ratios_for_col2)

# Rows of ratios_by_col2 are indexed by col2; transpose so rows are col1.
similarity_matrix = pd.DataFrame(
    np.array(ratios_by_col2, dtype=float).reshape(n_columns, n_columns).T,
    index=columns,
    columns=columns
)

# Per-cell annotations and grid lines are only legible for small matrices; with
# on the order of a thousand columns they would mean millions of text labels
# and lines on a 10x8 figure, so they are drawn only below this size.
max_annotated_columns = 30
show_cell_details = n_columns <= max_annotated_columns

# Plot the heatmap
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    similarity_matrix,
    annot=show_cell_details,
    cmap='coolwarm',
    linewidths=0.5 if show_cell_details else 0,
    ax=ax
)
ax.set_title('Heatmap of Similarity Between Column Names')
plt.show()