# Imports

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Project folders, all resolved against the current working directory.
DATA_PATH, MODELS_PATH, WEIGHTS_PATH, ASSETS_PATH, RESULTS_PATH = (
    os.path.join(os.getcwd(), folder_name)
    for folder_name in ('Data', 'Models', 'Weights', 'Assets', 'Results')
)

In [None]:
# Load the featurized intermetallics dataset from disk.
dataset_csv = r"C:\Users\joeya\Documents\Fokwa Group\ML_tutorial\intermetallics\intermetallics_Fulldataset_featurized7.csv"
df = pd.read_csv(dataset_csv)

# Replace the three categorical columns with the ordinal codes produced by
# the encoder (fit and transform in a single step on the same columns).
categorical_columns = ["crystal_system", "symbol", "point_group"]
enc = OrdinalEncoder()
df[categorical_columns] = enc.fit_transform(df[categorical_columns])

# Placeholder for the prediction targets; it is reassigned further down
# before it is used.
y = pd.DataFrame()

# Input data: an independent deep copy of the encoded dataset.
X = df.copy(deep=True)

In [None]:
# Separate the dataset into model inputs (X) and the prediction target (y).
# A deep copy is used so that `df` itself keeps the target column.
X = df.copy(deep=True)

# pop() removes the target column from X and hands it back, so X holds the
# remaining columns and y holds the target values.
y = X.pop('total_magnetization_normalized_formula_units')

print(f'Shape of X: {X.shape}')
print(f'Shape of y: {y.shape}')

In [None]:
# Fix the seed (both NumPy's global state and the splitter's random_state)
# so that the split below is reproducible.
RNG_SEED = 8
np.random.seed(RNG_SEED)

# Row-wise split: 20 % of the rows are held out as the test portion.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=RNG_SEED,
    test_size=0.20,
)

# Report the resulting shapes, one per line.
print(X_train.shape, X_test.shape, sep='\n')

In [None]:
# Checking if there are multiple entries with the same formula, i.e. whether
# the row-wise split above can put one formula into both subsets.
train_formula_column = X_train['formula']
test_formula_column = X_test['formula']

num_rows = len(X_train)
num_unique_formulae = len(train_formula_column.unique())

print(f'There are in total {num_rows} rows in the X_train DataFrame.')
print(f'But there are only {num_unique_formulae} unique formulae!\n')

# Per-formula row counts for each subset.
print('Unique formulae and their number of occurances in the X_train DataFrame:')
print(train_formula_column.value_counts(), '\n')
print('Unique formulae and their number of occurances in the X_test DataFrame:')
print(test_formula_column.value_counts())

In [None]:
# Need to manually split by formula: every unique formula is assigned to
# exactly one of the train / validation / test subsets.

unique_formulae = X['formula'].unique()
print(f'{len(unique_formulae)} unique formulae:\n{unique_formulae}')

# Set a random seed to ensure reproducibility across runs
np.random.seed(seed=RNG_SEED)

# Working pool of formulae that have not been assigned to a subset yet.
# NOTE: this name is reassigned below, so after this cell it holds only the
# formulae that are left over for training.
all_formulae = unique_formulae.copy()

# Define the proportional size of the dataset split
val_size = 0.20
test_size = 0.10
train_size = 1 - val_size - test_size

# Calculate the number of formulae in each dataset split. The training count
# is derived as the remainder: rounding it independently can disagree with
# what is actually left over (e.g. 8 formulae -> 2 + 1 + 6 = 9).
num_val_samples = int(round(val_size * len(unique_formulae)))
num_test_samples = int(round(test_size * len(unique_formulae)))
num_train_samples = len(unique_formulae) - num_val_samples - num_test_samples

# Randomly choose the formulae for the validation dataset, and remove those
# from the pool. Membership is tested against a set (constant-time lookups)
# instead of scanning the NumPy array once per formula; the order of the
# remaining pool is preserved, so the random draws are unchanged.
val_formulae = np.random.choice(
    all_formulae, size=num_val_samples, replace=False)
val_lookup = set(val_formulae)
all_formulae = [f for f in all_formulae if f not in val_lookup]

# Randomly choose the formulae for the test dataset, and remove those from
# the pool in the same way.
test_formulae = np.random.choice(
    all_formulae, size=num_test_samples, replace=False)
test_lookup = set(test_formulae)
all_formulae = [f for f in all_formulae if f not in test_lookup]

# The remaining formulae will be used for the training dataset
train_formulae = list(all_formulae)


print('Number of training formulae:', len(train_formulae))
print('Number of validation formulae:', len(val_formulae))
print('Number of testing formulae:', len(test_formulae))

In [None]:

# Split the original dataset into the train/validation/test datasets using
# the formulae lists built in the previous cell.
df_train = df[df['formula'].isin(train_formulae)]
df_val = df[df['formula'].isin(val_formulae)]
df_test = df[df['formula'].isin(test_formulae)]

print(f'train dataset shape: {df_train.shape}')
print(f'validation dataset shape: {df_val.shape}')
print(f'test dataset shape: {df_test.shape}\n')

print(df_train.head(), '\n')
print(df_val.head(), '\n')
print(df_test.head(), '\n')

# Check: collect the formulae that actually ended up in each subset.
train_formulae = set(df_train['formula'].unique())
val_formulae = set(df_val['formula'].unique())
test_formulae = set(df_test['formula'].unique())

# Save the list of unique formulae across all 3 data subsets.
# The list is rebuilt from `df` here: by this point `all_formulae` has been
# reduced to the leftover training formulae, so saving it would silently
# omit every validation and test formula from all_formulae.csv.
# NOTE(review): the column header is kept as-is for compatibility with any
# existing reader, although the column holds formula strings, not counts.
all_formulae_df = pd.DataFrame(
    df['formula'].unique(), columns=['Number of formulae'])
all_formulae_df.to_csv(
    'C:/Users/joeya/Documents/Fokwa Group/ML_tutorial/intermetallics/tables&lists/all_formulae.csv', index=False)

# Leakage check: every pairwise intersection should be empty if no formula
# is shared between subsets.
common_formulae1 = train_formulae.intersection(test_formulae)
common_formulae2 = train_formulae.intersection(val_formulae)
common_formulae3 = test_formulae.intersection(val_formulae)

print(
    f'# of common formulae in intersection 1: {len(common_formulae1)}; common formulae: {common_formulae1}')
print(
    f'# of common formulae in intersection 2: {len(common_formulae2)}; common formulae: {common_formulae2}')
print(
    f'# of common formulae in intersection 3: {len(common_formulae3)}; common formulae: {common_formulae3}')

In [None]:
# Saving these splits into csv files inside DATA_PATH.
# Create the target directory first: to_csv does not create missing parent
# directories and would otherwise fail on a fresh checkout.
os.makedirs(DATA_PATH, exist_ok=True)

train_path = os.path.join(
    DATA_PATH, 'intermetallics_train7.csv')
val_path = os.path.join(
    DATA_PATH, 'intermetallics_val7.csv')
test_path = os.path.join(
    DATA_PATH, 'intermetallics_test7.csv')

# index=False keeps the DataFrame index out of the written files.
df_train.to_csv(train_path, index=False)
df_val.to_csv(val_path, index=False)
df_test.to_csv(test_path, index=False)

In [None]:
# %% Analyzing magnetization distribution in training and validation sets

# Stack the validation rows underneath the training rows.
df_train_new = pd.concat([df_train, df_val], axis=0)
print(df_train_new.shape)

# Summary statistics of the target column of the combined frame.
magnetization = df_train_new['total_magnetization_normalized_formula_units']
print(magnetization.describe())

plt.rcParams.update({'font.size': 10, 'figure.dpi': 600})

# Histogram with a logarithmic count axis.
# NOTE(review): the title says "Train Set", but the plotted data is the
# concatenation of df_train and df_val.
plt.hist(magnetization, bins=132, log=True,
         color='blue', edgecolor='black')
plt.xlabel('Magnetization ($\u03bc_B$/ f.u.)')
plt.ylabel('log(counts)')
# `plot1` stays bound to the value returned by plt.title, as before.
plot1 = plt.title('Frequency of magnetization values in Train Set')

# Write the figure twice: once with a transparent background, once without.
plt.savefig(r'C:\Users\joeya\Documents\Fokwa Group\ML_tutorial\intermetallics\figures\different_training_sets\trainSet7_magn_hist_transparent.png',
            dpi=600, bbox_inches='tight', transparent=True)
plt.savefig(r'C:\Users\joeya\Documents\Fokwa Group\ML_tutorial\intermetallics\figures\different_training_sets\trainSet7_New_magn_hist.png',
            dpi=600, bbox_inches='tight')