In [216]:
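# Install the required Python libraries (versions pinned; uncomment to run).
# `%pip` installs into the environment of the running kernel.

# %pip install pandas==2.2.3
# %pip install numpy==2.1.3
# %pip install matplotlib==3.9.2
# %pip install seaborn==0.13.2
# %pip install scikit-learn==1.5.2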

In [217]:
# Importing python libraries
import pandas as pd

In [218]:
# Load the doped-oxide data set (path is relative to the notebook's working directory).
df = pd.read_csv("Datasets/doped_oxides.csv")
# Preview five randomly chosen rows; the fixed seed makes the preview identical
# on every Restart & Run All instead of changing with each execution.
df.sample(5, random_state=0)

Unnamed: 0,band_gap,sample,crystal_size,ssa,a,c,base,primary_dopant,secondary_dopant,primary_conc,secondary_conc,base_conc,crystal_system,reference
79,3.02,ZnO/glass,320.0,,3.258,5.208,Zn,,,0.0,0.0,100.0,wurtzite,https://www.sciencedirect.com/science/article/...
73,2.1,CdS-doped ZnO,368.1,,3.31,5.36,Zn,CdS,,,0.0,100.0,wurtzite,https://www.sciencedirect.com/science/article/...
18,2.91,V-doped TiO2,,203.0,3.7882,9.4949,Ti,V,,,0.0,100.0,anatase,https://www.sciencedirect.com/science/article/...
107,3.265,1 at% Sn-doped ZnO,382.3,,3.25,5.21,Zn,Sn,,1.0,0.0,99.0,wurtzite,https://www.sciencedirect.com/science/article/...
76,2.94,Zn0.9 Ni0.0 O,130.0,,3.231,5.184,Zn,Ni,,10.0,0.0,90.0,wurtzite,https://www.sciencedirect.com/science/article/...


In [219]:
# Drop the secondary-dopant, base-concentration and reference columns.
# `df` is rebound (no inplace=True) and errors='ignore' skips columns that are
# already gone, so re-running this cell no longer raises KeyError.
df = df.drop(
    columns=['secondary_dopant', 'secondary_conc', 'base_conc', 'reference'],
    errors='ignore',
)

In [220]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
# The differing counts show which numeric columns contain missing values.
df.describe()

Unnamed: 0,band_gap,crystal_size,ssa,a,c,primary_conc
count,138.0,113.0,63.0,138.0,138.0,126.0
mean,3.075116,263.130354,107.524921,3.478503,7.170904,5.432937
std,0.283845,180.74731,59.722137,0.305026,2.195856,8.650441
min,2.1,56.7,1.93,2.598,4.98,0.0
25%,2.9125,139.0,63.96,3.2434,5.196,0.1375
50%,3.19,210.0,88.14,3.31,5.26052,2.625
75%,3.2615,320.0,150.755,3.7822,9.4817,5.0
max,3.76,899.2,246.0,3.834,11.442,50.0


In [221]:
# Column dtypes and non-null counts, used to spot columns that need imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138 entries, 0 to 137
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   band_gap        138 non-null    float64
 1   sample          138 non-null    object 
 2   crystal_size    113 non-null    float64
 3   ssa             63 non-null     float64
 4   a               138 non-null    float64
 5   c               138 non-null    float64
 6   base            138 non-null    object 
 7   primary_dopant  108 non-null    object 
 8   primary_conc    126 non-null    float64
 9   crystal_system  138 non-null    object 
dtypes: float64(6), object(4)
memory usage: 10.9+ KB


In [222]:
# Rows without a recorded primary-dopant concentration are set to 0.0.
# NOTE(review): this also applies to doped samples whose concentration is simply
# not reported — confirm that 0.0 is the intended placeholder for those.
conc_is_missing = df['primary_conc'].isna()
df['primary_conc'] = df['primary_conc'].mask(conc_is_missing, 0.0)

In [223]:
# Rows without a primary dopant get the placeholder label 'X'.
dopant_is_missing = df['primary_dopant'].isna()
df['primary_dopant'] = df['primary_dopant'].mask(dopant_is_missing, 'X')

In [224]:
# Build the final `data` table in three steps:
#   1. replace crystal_size / ssa / c for selected rows with band_gap-derived values,
#   2. z-score the columns listed in `columns_to_scale`,
#   3. rename the columns and drop the rutile rows.
from sklearn.preprocessing import StandardScaler

# Work on a copy so `df` keeps its loaded values and this cell is safe to re-run.
# Overwriting `df` in place made the cell non-idempotent: the "low ssa" replacement
# (49 * band_gap / 0.962) exceeds 100 whenever band_gap > ~1.96, so on a second
# execution those rows fell into the "high ssa" branch and changed value again.
df_derived = df.copy()
band_gap = df_derived['band_gap']
is_zn = df_derived['base'] == 'Zn'
is_ti = df_derived['base'] == 'Ti'

# NOTE(review): the numeric constants below are not explained anywhere in the visible
# notebook, and these steps overwrite existing values with numbers computed from
# band_gap. If band_gap is later used as a prediction target this leaks it into the
# features — confirm the intent and document the source of the constants.

# Every Zn-based row gets its crystal_size replaced; other rows keep theirs.
df_derived['crystal_size'] = df_derived['crystal_size'].mask(
    is_zn, (198 * band_gap) / 0.973
)

# Ti-based rows with a known ssa are replaced according to which side of 100 they
# fall on. Comparisons with NaN are False, so rows with missing ssa stay NaN.
ssa_before = df_derived['ssa']
ti_high_ssa = is_ti & (ssa_before > 100)
ti_low_ssa = is_ti & (ssa_before <= 100)
df_derived['ssa'] = (
    ssa_before
    .mask(ti_high_ssa, (98 * band_gap) / 0.962)
    .mask(ti_low_ssa, (49 * band_gap) / 0.962)
)

# Rows with a primary-dopant concentration below 3 get the lattice parameter c replaced.
df_derived['c'] = df_derived['c'].mask(
    df_derived['primary_conc'] < 3, (1.732 * band_gap) / 0.891
)

# Standardise (zero mean, unit variance) the selected columns.
# crystal_size is not in this list, so it stays in its original units.
# NOTE(review): confirm that leaving crystal_size unscaled is intended.
columns_to_scale = ['ssa', 'a', 'c', 'primary_conc']
scaler = StandardScaler()
standardized_values = scaler.fit_transform(df_derived[columns_to_scale])
standardized_df = pd.DataFrame(
    standardized_values,
    columns=[col + '_standardized' for col in columns_to_scale],
    # Reuse the source index so the concat below aligns row-for-row even if the
    # frame no longer has a default RangeIndex.
    index=df_derived.index,
)

# Swap the raw columns for their standardised versions (appended at the end).
data = pd.concat([df_derived.drop(columns=columns_to_scale), standardized_df], axis=1)

# Rename by label rather than by position so a change in column order cannot
# silently mislabel a column.
data = data.rename(columns={
    'primary_dopant': 'dopant',
    'ssa_standardized': 'ssa',
    'a_standardized': 'a',
    'c_standardized': 'c',
    'primary_conc_standardized': 'dopant_conc',
})

# Exclude rutile samples from the final table. The scaler above was fitted while
# these rows were still present, so they contribute to the mean/std used.
data = data[data['crystal_system'] != 'rutile']

In [225]:
# Preview the first rows of the processed table: ssa, a, c and dopant_conc are
# standardised, while band_gap and crystal_size remain in their original units.
data.head()

Unnamed: 0,band_gap,sample,crystal_size,base,dopant,crystal_system,ssa,a,c,dopant_conc
0,3.122,pure TiO2,190.0,Ti,X,anatase,-0.685318,0.942674,-0.430367,-0.592368
1,3.087,1.0% Cu−3.5% In−TiO2,80.0,Ti,In,anatase,-0.712864,0.978868,1.598636,-0.174409
2,2.43,5 mol % Cu-doped TiO2 (700°C),,Ti,Cu,anatase,0.682544,0.959126,-0.460148,0.004716
3,2.51,5 mol % Cu-doped TiO2 (600°C),,Ti,Cu,anatase,0.808469,1.044675,1.580099,0.004716
4,2.28,"5 mol % Cu, 15 mol % Zr co-doped TiO2 (700 °C)",,Ti,Zr,anatase,0.446435,1.169709,2.782602,1.198884


In [226]:
# Save the processed table as CSV; index=False keeps the row index out of the file.
data.to_csv('Datasets/doped_oxide_standardised_data.csv', index=False)