In [None]:
# Installing required python libraries

!pip install pandas==2.2.3
!pip install numpy==2.1.3
!pip install matplotlib==3.9.2
!pip install seaborn==0.13.2
!pip install scikit-learn==1.5.2

In [14]:
# Importing python libraries
import pandas as pd

In [15]:
# Load the single-oxide dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("Datasets/single_oxides.csv")
# Display 5 randomly chosen rows. No random_state is passed, so the rows shown differ between runs.
data.sample(5)

Unnamed: 0,sample,crystal_system,a,b,c,alpha,beta,gamma,cbm,vbm,band_gap
421,WO3,Orthorhombic,7.678,7.734,7.764,90.0,90.0,90.0,1.6338,0.3566,1.2772
563,Nd2O3,Hexagonal,6.165,6.165,3.217,90.0,90.0,120.0,1.9394,0.3691,1.5703
308,TiO2,Monoclinic,4.813,5.041,4.872,90.0,90.0,99.86,6.1769,3.9452,2.2317
220,PtO2,Trigonal,3.135,3.135,4.167,90.0,90.0,120.0,3.7877,2.1778,1.6099
560,CrO3,Orthorhombic,4.863,4.863,3.097,90.0,90.0,119.45,-0.47,-1.9134,1.4434


In [16]:
data.describe() # Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns

Unnamed: 0,a,b,c,alpha,beta,gamma,cbm,vbm,band_gap
count,769.0,769.0,769.0,769.0,769.0,769.0,769.0,769.0,769.0
mean,6.825186,7.419311,9.003382,89.128934,88.891495,90.966294,4.168422,1.665676,2.502746
std,3.374361,3.335529,4.588286,15.701099,15.463876,21.646661,2.50924,2.230998,1.105912
min,2.661,2.661,2.661,8.56,8.56,8.56,-4.8272,-6.1055,1.0035
25%,4.275,5.211,5.971,88.08,89.65,88.86,2.4109,0.3065,1.5102
50%,5.72,6.798,8.536,90.0,90.0,90.0,4.208,1.7885,2.286
75%,9.382,9.535,10.594,91.24,90.16,98.08,6.0089,3.3186,3.4687
max,40.343,52.349,52.661,158.77,160.07,165.0,12.8669,9.0935,5.0


In [17]:
data.crystal_system.unique() # Crystal systems present in the data

array(['Triclinic', 'Trigonal', 'Orthorhombic', 'Cubic', 'Tetragonal',
       'Hexagonal', 'Monoclinic'], dtype=object)

In [18]:
# Preview the first five rows of the raw data before any processing.
data.head()

Unnamed: 0,sample,crystal_system,a,b,c,alpha,beta,gamma,cbm,vbm,band_gap
0,BeO,Triclinic,8.894,9.895,9.133,89.11,92.04,89.15,6.5452,2.0609,4.4843
1,Na2O,Trigonal,6.054,6.054,3.569,90.0,90.0,120.0,3.0836,1.4576,1.626
2,MgO,Triclinic,9.741,10.461,8.981,87.33,92.04,86.4,5.2468,3.3239,1.9229
3,PbO,Orthorhombic,4.728,5.621,6.101,90.0,90.0,90.0,6.0606,3.9041,2.1565
4,OF2,Orthorhombic,3.15,5.411,9.581,90.0,90.0,90.0,-0.1324,-2.2565,2.1241


In [19]:
# Processing and Standardising the data
def calculate_a_mod(x):
    """Return ``10 * x`` divided by a threshold-dependent constant.

    Parameters
    ----------
    x : number
        Input value. In this notebook the function is applied to the
        ``band_gap`` column.

    Returns
    -------
    float
        ``(10 * x) / 1.976`` when ``x < 3``, otherwise ``(10 * x) / 2.987``.

    Notes
    -----
    NOTE(review): the origin of the threshold 3 and the divisors 1.976 and
    2.987 is not stated in the notebook; confirm and document their source.
    """
    # Values strictly below 3 use the smaller divisor; everything else
    # (including exactly 3) uses the larger one.
    divisor = 1.976 if x < 3 else 2.987
    return (10 * x) / divisor
def calculate_b_mod(crystal_system, a_mod):
    """Derive the modified ``b`` value from ``a_mod`` and the crystal system.

    Parameters
    ----------
    crystal_system : str
        Crystal system name; the comparison is case-sensitive.
    a_mod : number
        Modified ``a`` value.

    Returns
    -------
    number
        ``a_mod`` unchanged for 'Cubic', 'Tetragonal', 'Trigonal' and
        'Hexagonal'; ``1.2 * a_mod`` for any other crystal system.
    """
    same_as_a = ('Cubic', 'Tetragonal', 'Trigonal', 'Hexagonal')
    if crystal_system not in same_as_a:
        # NOTE(review): the 1.2 factor is not explained in the notebook.
        return 1.2 * a_mod
    return a_mod
def calculate_c_mod(crystal_system, a_mod, c):
    """Derive the modified ``c`` value for a row.

    Parameters
    ----------
    crystal_system : str
        Crystal system name; the comparison is case-sensitive.
    a_mod : number
        Modified ``a`` value.
    c : number
        Original ``c`` value of the row.

    Returns
    -------
    number
        ``a_mod`` for 'Cubic' and 'Trigonal' systems; the original ``c``
        for every other crystal system.
    """
    uses_a_mod = crystal_system in ('Cubic', 'Trigonal')
    return a_mod if uses_a_mod else c

# Derive the modified lattice columns row by row.
# NOTE: this cell mutates `data` in place (adds *_mod columns, drops a/b/c/cbm/vbm),
# so it cannot be re-run without first re-running the cell that loads `data`.
# NOTE(review): a_mod is computed from band_gap, which is also the target `y`
# below, so the target leaks into the feature matrix -- confirm this is intended.
data['a_mod'] = data['band_gap'].apply(calculate_a_mod)
data["b_mod"] = data.apply(lambda row: calculate_b_mod(row["crystal_system"], row["a_mod"]), axis=1)
data["c_mod"] = data.apply(lambda row: calculate_c_mod(row["crystal_system"], row["a_mod"], row['c']), axis=1)

# The original lattice lengths and band-edge columns are replaced by the *_mod columns.
columns = ['a', 'b', 'c', 'cbm', 'vbm']
data.drop(columns, axis=1, inplace=True)

# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Everything except the target and the two label columns is treated as a feature.
columns = ['band_gap', 'crystal_system', 'sample']
X = data.drop(columns, axis=1)
y = data['band_gap']

scaler = StandardScaler()
scaled_features = scaler.fit_transform(X)

# Rename the derived columns by name rather than by position so a change in the
# CSV column order cannot silently mislabel features, and reuse X's index so the
# index-aligned concat below pairs each scaled row with the row it came from
# even when `data` does not carry a default 0..n-1 index.
feature_names = X.rename(columns={'a_mod': 'a', 'b_mod': 'b', 'c_mod': 'c'}).columns
features = pd.DataFrame(scaled_features, columns=feature_names, index=X.index)

# Final layout: identifiers, scaled features, then the unscaled target.
single_oxide_data = pd.concat([data['sample'], data['crystal_system'], features, data['band_gap']], axis=1)

In [20]:
# Preview the first five rows of the standardised dataset.
single_oxide_data.head()

Unnamed: 0,sample,crystal_system,alpha,beta,gamma,a,b,c,band_gap
0,BeO,Triclinic,-0.001207,0.203736,-0.083961,1.523715,1.744298,-0.125478,4.4843
1,Na2O,Trigonal,0.055514,0.07173,1.342129,-0.716784,-1.015149,-0.325365,1.626
2,MgO,Triclinic,-0.114648,0.203736,-0.211084,-0.220553,-0.042716,-0.159078,1.9229
3,PbO,Orthorhombic,0.055514,0.07173,-0.044668,0.169879,0.357285,-0.795704,2.1565
4,OF2,Orthorhombic,0.055514,0.07173,-0.044668,0.115727,0.301806,-0.026448,2.1241


In [21]:
# Write the standardised dataset to disk; index=False keeps the row index out of the CSV.
single_oxide_data.to_csv("Datasets/single_oxide_standardised_data.csv", index=False)