In [1]:
import pandas as pd

# Load the raw dataset from the sibling raw-data folder (path is relative to
# the notebook's working directory).
# NOTE(review): the rendered output shows an 'Unnamed: 0' column, i.e. the
# file's unnamed first column is kept as a regular column here.
data = pd.read_csv(filepath_or_buffer='../0_raw_data/dataset_ms2_c.csv')

# Bare last expression so the notebook renders the loaded frame.
data

Unnamed: 0,Ti,V,Fe,Co,Ni,Zr,Mo,Sn,W,CP,...,MS2_morph,CP_morph,SSA,C,Cation,Anion,P_low,P_high,CD,Cs
0,1,0,0,0,0,0,0,0,0,1,...,irregular nanoparticles,supported,289.4,6.0,K,OH,-0.5,0.5,1.0,97.0
1,1,0,0,0,0,0,0,0,0,1,...,irregular nanoparticles,supported,301.8,6.0,K,OH,-0.5,0.5,1.0,102.0
2,1,0,0,0,0,0,0,0,0,1,...,irregular nanoparticles,supported,319.2,6.0,K,OH,-0.5,0.5,1.0,113.0
3,0,1,0,0,0,0,0,0,0,0,...,nanosheets,0,5.5,6.0,K,OH,0.0,0.5,1.0,152.0
4,0,1,0,0,0,0,0,0,0,0,...,nanosheets,0,5.5,6.0,K,OH,0.0,0.5,2.0,145.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
777,0,0,0,0,0,0,0,0,1,1,...,flower-like clusters,supported,10.8,1.0,K,OH,0.0,0.5,1.0,600.0
778,0,0,0,0,0,0,0,0,1,1,...,flower-like clusters,supported,10.8,1.0,K,OH,0.0,0.5,2.0,527.0
779,0,0,0,0,0,0,0,0,1,1,...,flower-like clusters,supported,10.8,1.0,K,OH,0.0,0.5,4.0,476.0
780,0,0,0,0,0,0,0,0,1,1,...,flower-like clusters,supported,10.8,1.0,K,OH,0.0,0.5,8.0,391.0


In [2]:
# Manual One-Hot coding
# The Ti, V, Fe, Co, Ni, Zr, Mo, Sn, W and CP columns are indicator columns
# (0/1 in the rows displayed above); cast them to Boolean values.
boolean_columns = ['Ti', 'V', 'Fe', 'Co', 'Ni', 'Zr', 'Mo', 'Sn', 'W', 'CP']
indicator_flags = data[boolean_columns].astype(bool)
data[boolean_columns] = indicator_flags

# CM_type column: collapse the raw labels into the groups below.
#   0. NoCP                 - pure MS2 (the raw '0' placeholder is renamed
#                             further down in this cell, not here)
#   1. CNF                  - includes CNF and CC
#   2. CNT                  - includes CNT and MWCNT
#   3. G-based              - includes Graphene, GO and RGO
#   4. derived carbon-based - carbon obtained by carbonization of organic
#                             precursors
# Keys are the group labels, values the raw labels merged into each group.
cm_type_groups = {
    'derived carbon-based': ['derived carbon', 'derived carbon & CNT'],
    'CNT': ['CNT', 'MWCNT'],
    'CNF': ['CNF', 'CC'],
    'G-based': ['Graphene', 'GO', 'RGO'],
}

# Exact-value (not substring) replacement, applied one group at a time.
for group_label, raw_labels in cm_type_groups.items():
    data['CM_type'] = data['CM_type'].replace(raw_labels, group_label)

# CM_morph column: collapse the raw labels into the groups below.
#   0. NoCP       - pure MS2 (the raw '0' placeholder is renamed further
#                   down in this cell, not here)
#   1. 0D         - includes 0D QDs
#   2. 1D fibers  - includes 1D fibers (label left as-is)
#   3. 1D tubes   - includes 1D tubes (label left as-is)
#   4. 2D         - includes 2D nanosheets (few layers, thick < 10 nm)
#   5. 3D porous  - includes 3D porous and 3D networks
#   6. 3D special - includes nanoparticles, nanospheres, rods, tubes and
#                   flower-like
#
# The category list writes the quantum-dot label with a digit zero
# ('0D QDs') while the original code only matched the letter-O spelling
# ('OD QDs'). Both spellings are mapped so the '0D' group is populated
# whichever one the CSV actually uses.
cm_morph_groups = {
    '0D': ['OD QDs', '0D QDs'],
    '2D': ['2D nanosheets'],
    '3D porous': ['3D porous', '3D networks'],
    '3D special': ['nanoparticles', 'nanospheres', 'rods', 'tubes', 'flower-like'],
}

# Exact-value (not substring) replacement, so e.g. '1D tubes' is not touched
# by the 'tubes' entry.
for group_label, raw_labels in cm_morph_groups.items():
    data['CM_morph'] = data['CM_morph'].replace(raw_labels, group_label)


# MS2_morph column: collapse the raw labels into the groups below.
#   0. bulk                    - no regular shape
#   1. nanosheets              - flake, not few-layer sheet
#   2. flower-like clusters    - aggregated from few-layer nanosheets
#   3. irregular nanoparticles - particles less than 1 μm in size, but the
#                                size and dispersion degree are not uniform
#   4. nanoparticles           - includes nanoparticles, octahedron, cubes,
#                                rods and spheres; the size and dispersion
#                                are uniform
#   5. hollow morph            - includes hollow spheres, hollow rods, hollow
#                                cubes, hollow nanoparticles, double shell
#                                spheres, yolk-shell spheres and core-shell
#                                nanoparticles
#
# 'bulk', 'nanosheets', 'flower-like clusters', 'irregular nanoparticles' and
# 'nanoparticles' are already the final labels, so they need no replacement
# (the previous label-to-itself replacements were no-ops and are dropped).
ms2_morph_groups = {
    'hollow morph': ['hollow spheres', 'hollow rods', 'hollow cubes', 'hollow nanoparticles',
                     'double shell spheres', 'yolk-shell spheres', 'core-shell nanoparticles'],
    'nanoparticles': ['octahedron', 'cubes', 'rods', 'spheres'],
}

# Exact-value (not substring) replacement: 'hollow spheres' and 'spheres' are
# distinct values, so the two groups cannot interfere with each other.
for group_label, raw_labels in ms2_morph_groups.items():
    data['MS2_morph'] = data['MS2_morph'].replace(raw_labels, group_label)

# These categorical features will be One-Hot encoded later, so the '0'
# placeholder in CM_type, CM_morph and CP_morph is replaced with the explicit
# 'NoCP' label.
# The value to replace is passed as a list: a set literal is not one of the
# `to_replace` types documented for DataFrame.replace.
replace_values = ['0']
categorical_columns = ['CM_type', 'CM_morph', 'CP_morph']
data[categorical_columns] = data[categorical_columns].replace(replace_values, 'NoCP')

# Bare last expression so the notebook renders the cleaned frame.
data

Unnamed: 0,Ti,V,Fe,Co,Ni,Zr,Mo,Sn,W,CP,...,MS2_morph,CP_morph,SSA,C,Cation,Anion,P_low,P_high,CD,Cs
0,True,False,False,False,False,False,False,False,False,True,...,irregular nanoparticles,supported,289.4,6.0,K,OH,-0.5,0.5,1.0,97.0
1,True,False,False,False,False,False,False,False,False,True,...,irregular nanoparticles,supported,301.8,6.0,K,OH,-0.5,0.5,1.0,102.0
2,True,False,False,False,False,False,False,False,False,True,...,irregular nanoparticles,supported,319.2,6.0,K,OH,-0.5,0.5,1.0,113.0
3,False,True,False,False,False,False,False,False,False,False,...,nanosheets,NoCP,5.5,6.0,K,OH,0.0,0.5,1.0,152.0
4,False,True,False,False,False,False,False,False,False,False,...,nanosheets,NoCP,5.5,6.0,K,OH,0.0,0.5,2.0,145.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
777,False,False,False,False,False,False,False,False,True,True,...,flower-like clusters,supported,10.8,1.0,K,OH,0.0,0.5,1.0,600.0
778,False,False,False,False,False,False,False,False,True,True,...,flower-like clusters,supported,10.8,1.0,K,OH,0.0,0.5,2.0,527.0
779,False,False,False,False,False,False,False,False,True,True,...,flower-like clusters,supported,10.8,1.0,K,OH,0.0,0.5,4.0,476.0
780,False,False,False,False,False,False,False,False,True,True,...,flower-like clusters,supported,10.8,1.0,K,OH,0.0,0.5,8.0,391.0


In [4]:
# Build the export frame without touching `data`: drop the CP indicator
# column, round SSA to one decimal place and Cs to zero decimal places.
data_output = data.drop(columns='CP').assign(
    SSA=lambda frame: frame['SSA'].round(1),
    Cs=lambda frame: frame['Cs'].round(0),
)

# Write the same CSV (row index omitted) next to this notebook and into the
# EDA folder.
for output_path in ('dataset_all.csv', '../2_EDA/dataset_all.csv'):
    data_output.to_csv(output_path, index=False)