<a href="https://colab.research.google.com/github/aghosh92/Cation-Ordering-ML/blob/main/SissoRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook has been prepared by Dennis P. Trujillo and Ayana Ghosh.

Email: dptru10@gmail.com 

Email: research.aghosh@gmail.com

It shows the following:

1. How the SISSO (Sure Independence Screening and Sparsifying Operator) approach can be implemented in a regression setting to find the best combination of non-linear features with respect to the target, using Matminer and Automatminer.

Link to all datasets: https://doi.org/10.5281/zenodo.6570994

Install packages

In [None]:
!pip install -q matminer 
!pip install -q automatminer 

Import essential packages

In [None]:
import os
import pandas as pd
import numpy as np
from itertools import combinations
from sklearn.linear_model import Lasso
from matminer.featurizers.function import FunctionFeaturizer
from automatminer import DataCleaner

In [None]:
# Read the dataset straight from the Zenodo record. Malformed rows are skipped
# and only the first 20 rows are kept for the rest of the notebook.
DATA_URL = "https://zenodo.org/record/6570994/files/Model_V_VI_VII.csv?download=1"
df = pd.read_csv(DATA_URL, on_bad_lines='skip').head(n=20)
df.columns

Index(['Index', 'Compound name', 'A site', 'A'site', 'B site', 'B'site', 'C_A',
       'C_A'', 'Cn_A', 'Cn_A'', 'C_B', 'C_B'', 'Cn_B', 'Cn_B'', 'r_Asite',
       'r_A'site', 'r_Bsite', 'r_B'site', 'Avg_AA'', 'Avg_BB'', 'TF',
       'Fermi_energy', '_cellength_a', '_cellength_b', '_cellength_c',
       '_cell_angle_alpha', '_cell_angle_beta', '_cell_angle_gamma',
       '_cell_volume', 'space_groupS', 'Space_group_NoS', 'space_group_HS',
       'Space_group_No_HS', 'Total_mag_Outcar', 'mag_B', 'mag_B'', 'mag_O',
       'A_s', 'A_p', 'A_d', 'A'_s', 'A'_p', 'A'_d', 'B_s', 'B_p', 'B_d',
       'B'_s', 'B'_p', 'B'_d', 'O_s', 'O_p', 'dis_x_A1', 'dis_x_A2',
       'dis_y_A1', 'dis_y_A2', 'dis_x_A'1', 'dis_x_A'2', 'dis_y_A'1',
       'dis_y_A'2', '|x_A_dis|', '|y_A_dis|', '|x_A'_dis|', '|y_A'_dis|',
       '|A_dis|', '|A'_dis|', 'tilt_angle', 'rot_angle', 'Energy', 'Energy_C',
       'Energy_R', 'Energy_diff', 'Energy_diff(meV)', 'Energy_5-atom_unitcell',
       'Q_R+', 'Q_T', 'Q_AFE_(A)', 'Q_

In [None]:
# Name of the regression target column and the hand-picked input columns,
# written with the labels exactly as they appear in the CSV header.
target = 'Energy_diff'
selected_feature_list = [
    "C_B",
    "r_B'site",
    "B'_p",
    "B'_d",
    "|y_A_dis|",
    "dis_y_A'2",
    "dis_y_A'1",
    "_cell_volume",
    "r_Asite",
    "_cellength_a",
]
df_x = df[selected_feature_list]

In [None]:
# FunctionFeaturizer does not work with bars or quotes in column names, so the
# columns of df_x are relabelled positionally with expression-safe names.
safe_names = [
    'C_B',
    'r_B_prime_site',
    'B_prime_p',
    'B_prime_d',
    '_bar_y_A_dis_bar_',
    'dis_y_A_prime_2',
    'dis_y_A_prime_1',
    '_cell_volume',
    'r_Asite',
    '_cellength_a',
]
df_x.columns = safe_names
# From here on the feature list refers to the renamed columns.
selected_feature_list = df_x.columns

In [None]:
def get_data(selected_feature_list):
    """Expand the chosen columns into functionalized features and split off the target.

    Relies on the notebook-level names ``df_x`` (feature columns), ``df``
    (source table) and ``target`` (target column label).

    Parameters
    ----------
    selected_feature_list : list-like of str
        Column labels of ``df_x`` that are handed to the FunctionFeaturizer.

    Returns
    -------
    P : numpy.ndarray
        Values of the ``target`` column.
    df_combined : pandas.DataFrame
        Generated feature columns that survived the NaN/inf filter, without
        the original input columns and without the target column.

    Notes
    -----
    As a side effect the filtered table (generated features plus target) is
    written to 'functionalized_data.csv' in the working directory.
    """
    inputs = df_x[selected_feature_list]

    # multi_feature_depth=2 with np.sum as combo function produces summed
    # expressions of the input columns in addition to single-column ones.
    featurizer = FunctionFeaturizer(multi_feature_depth=2, combo_function=np.sum)
    featurizer.set_n_jobs(4)
    featurizer = featurizer.fit(inputs)
    expanded = featurizer.featurize_dataframe(inputs, selected_feature_list)

    # Attach the target (aligned on the DataFrame index) before cleaning so it
    # ends up in the CSV alongside the generated features.
    expanded[target] = df[target]

    # Any column containing NaN or +/-inf is discarded as a whole; the raw
    # input columns are removed so only generated features and target remain.
    cleaned = (
        expanded
        .replace([np.inf, -np.inf], np.nan)
        .dropna(axis=1)
        .drop(columns=selected_feature_list)
    )
    cleaned.to_csv('functionalized_data.csv')

    P = cleaned[target].values
    features_only = cleaned.drop(columns=[target])
    return P, features_only

In [None]:
# Build the functionalized feature table, then keep both the raw array and the
# matching column names for the LASSO step.
print('generating functionalized data...')
P, df_D = get_data(selected_feature_list)
D = df_D.values
features_list = list(df_D.columns)

generating functionalized data...


FunctionFeaturizer: 100%|██████████| 20/20 [02:21<00:00,  7.09s/it]


In [None]:
def lasso_fit(lam, P, D, feature_list, **lasso_kwargs):
    """Fit a LASSO model and report which features keep a non-zero coefficient.

    Parameters
    ----------
    lam : float
        L1 regularisation strength, passed to ``Lasso`` as ``alpha``.
    P : array-like
        Target values, passed as ``y`` to ``Lasso.fit``.
    D : array-like
        Feature matrix, passed as ``X`` to ``Lasso.fit``. It is used as given;
        no standardisation is applied inside this function.
    feature_list : sequence of str
        Feature names, indexed by the positions of the non-zero coefficients
        (presumably one name per column of ``D`` -- verify against the caller).
    **lasso_kwargs
        Optional extra keyword arguments forwarded unchanged to ``Lasso``
        (for example ``max_iter`` or ``tol`` when the solver reports that it
        did not converge). Omitting them keeps the estimator's defaults.

    Returns
    -------
    coef : numpy.ndarray
        The fitted ``coef_`` array.
    selected_features : list of str
        Entries of ``feature_list`` whose coefficient is non-zero.
    """
    lasso = Lasso(alpha=lam, **lasso_kwargs)
    lasso.fit(D, P)
    coef = lasso.coef_

    # Map the positions of the non-zero coefficients back to feature names.
    selected_indices = coef.nonzero()[0]
    selected_features = [feature_list[i] for i in selected_indices]

    # The previous in-sample prediction was computed but never used (no RMSE
    # was derived from it), so it has been removed.
    return coef, selected_features

In [None]:
# L1 penalty strength handed to lasso_fit as `lam`.
alpha = 0.2

coef, selected_features = lasso_fit(
    lam=alpha,
    P=P,
    D=D,
    feature_list=features_list,
)

  positive)


In [None]:
# Summarise the fit: how many features survived, and which of them carry the
# largest coefficient magnitudes.
n_selected = len(selected_features)
print("alpha: %.3f\t dimension of descriptor: %s" % (alpha, n_selected))

# Pair every selected feature name with the absolute value of its coefficient.
coef_magnitudes = np.abs(coef[coef.nonzero()])
feature_table = pd.DataFrame(
    {
        'features': np.array(selected_features),
        'abs(nonzero_coefs_LASSO)': coef_magnitudes,
    }
)
lasso_features = feature_table.sort_values(
    by='abs(nonzero_coefs_LASSO)',
    ascending=False,
)
print(lasso_features.head(n=10))

# Persist the full ranking (not only the top ten rows shown above).
lasso_features.to_csv('lasso_equations.csv')

alpha: 0.200	 dimension of descriptor: 535
                                      features  abs(nonzero_coefs_LASSO)
447      exp(_cellength_a) + 1/dis_y_A_prime_2                  0.003061
489      exp(_cellength_a) + 1/dis_y_A_prime_1                  0.002405
402      1/dis_y_A_prime_2 + 1/dis_y_A_prime_1                  0.001851
254         exp(B_prime_d) + 1/dis_y_A_prime_2                  0.001529
303         exp(B_prime_d) + exp(_cellength_a)                  0.001145
262         exp(B_prime_d) + 1/dis_y_A_prime_1                  0.000270
4                              _cell_volume**2                  0.000039
302            exp(B_prime_d) + 1/log(r_Asite)                  0.000039
442     1/log(r_Asite) + dis_y_A_prime_2**(-2)                  0.000036
403  1/dis_y_A_prime_2 + dis_y_A_prime_1**(-2)                  0.000033
