# 读取数据

In [1]:
import pandas as pd

# Load the first version of the dataset from the Excel workbook.
data_1 = pd.read_excel(io='dataset_procession_ver_1.xlsx')

# 人工特征处理

为了降低数据集在 One-Hot 编码后的特征维度，先将各分类特征的取值进一步归并为更少的类别。

## CM_type 列

In [2]:
# CM_type column: merge the raw labels into four coarser groups.
#   1. derived carbon-based: derived carbon, derived carbon & CNT
#   2. CNT:                  CNT, MWCNT
#   3. CNF:                  CNF, CC
#   4. G-based:              Graphene, GO, RGO
cm_type_groups = {
    'derived carbon-based': ['derived carbon', 'derived carbon & CNT'],
    'CNT': ['CNT', 'MWCNT'],
    'CNF': ['CNF', 'CC'],
    'G-based': ['Graphene', 'GO', 'RGO'],
}

# Apply the groups one after another, in the order listed above.
cm_type = data_1['CM_type']
for group_label, raw_labels in cm_type_groups.items():
    cm_type = cm_type.replace(raw_labels, group_label)
data_1['CM_type'] = cm_type

# Save the result as the next dataset version.
data_1.to_csv('dataset_procession_ver_2.csv', index=False)

## CM_morph 列

In [3]:
# Reload the dataset written by the CM_type step.
data_2 = pd.read_csv('dataset_procession_ver_2.csv')

# CM_morph column: merge the raw labels into coarser groups.
#   1. 0D:         0D QDs
#   2. 1D fibers:  1D fibers (left unchanged)
#   3. 1D tubes:   1D tubes (left unchanged)
#   4. 2D:         2D nanosheets
#   5. 3D porous:  3D porous, 3D networks
#   6. 3D special: nanoparticles, nanospheres, rods, tubes, flower-like
cm_morph_groups = {
    # The category is documented as '0D QDs' (digit zero). Accept both that
    # spelling and the letter-O variant 'OD QDs' so the group is applied
    # whichever of the two the data actually contains.
    '0D': ['0D QDs', 'OD QDs'],
    '2D': ['2D nanosheets'],
    '3D porous': ['3D porous', '3D networks'],
    # replace() matches whole cell values only, so 'tubes' here does not
    # touch the separate '1D tubes' label.
    '3D special': ['nanoparticles', 'nanospheres', 'rods', 'tubes', 'flower-like'],
}

cm_morph = data_2['CM_morph']
for group_label, raw_labels in cm_morph_groups.items():
    cm_morph = cm_morph.replace(raw_labels, group_label)
data_2['CM_morph'] = cm_morph

# Save the result as the next dataset version.
data_2.to_csv('dataset_procession_ver_3.csv', index=False)


## MS2_morph 列

In [4]:
# Reload the dataset written by the CM_morph step.
data_3 = pd.read_csv('dataset_procession_ver_3.csv')

# MS2_morph column: merge the raw labels into six groups.
#   1. bulk: bulk. Irregular bulk material (left unchanged).
#   2. nanosheets: nanosheets. Sheet-like, but thicker than 10 nm.
#   3. flower-like clusters: flower-like clusters.
#   4. irregular nanoparticles: irregular nanoparticles. Irregularly shaped
#      nanoparticles with a particle size below 1 μm.
#   5. hollow morph: hollow spheres, hollow rods, hollow cubes,
#      hollow nanoparticles, double shell spheres, yolk-shell spheres and
#      core-shell nanoparticles. Morphologies containing hollow structure.
#   6. nanoparticles: nanoparticles, octahedron, cubes, rods and spheres.
#      Uniform, regular morphology.
ms2_morph_groups = {
    # The first three groups map a label onto itself; they are listed so the
    # table mirrors the full category list above.
    'nanosheets': ['nanosheets'],
    'flower-like clusters': ['flower-like clusters'],
    'irregular nanoparticles': ['irregular nanoparticles'],
    'hollow morph': [
        'hollow spheres', 'hollow rods', 'hollow cubes', 'hollow nanoparticles',
        'double shell spheres', 'yolk-shell spheres', 'core-shell nanoparticles',
    ],
    'nanoparticles': ['nanoparticles', 'octahedron', 'cubes', 'rods', 'spheres'],
}

# Apply the groups one after another, in the order listed above.
ms2_morph = data_3['MS2_morph']
for group_label, raw_labels in ms2_morph_groups.items():
    ms2_morph = ms2_morph.replace(raw_labels, group_label)
data_3['MS2_morph'] = ms2_morph

# Save the result as the next dataset version.
data_3.to_csv('dataset_procession_ver_4.csv', index=False)

## 统计改动后的列信息

In [5]:
# Summarise the categorical columns of the version-4 dataset.
data_4 = pd.read_csv('dataset_procession_ver_4.csv')
summary_columns = ['CM_type', 'CM_morph', 'MS2_morph', 'CP_morph']

# Count the values of each column once and reuse the result when writing.
value_tables = {column: data_4[column].value_counts() for column in summary_columns}

# Write the statistics to a Markdown file, one "## <column>" section per
# column. The encoding is pinned to UTF-8 so the output does not depend on the
# platform's default encoding.
with open('dataset_procession_ver_4.md', 'w', encoding='utf-8') as f:
    for column_name, counts in value_tables.items():
        f.write(f'## {column_name}\n')
        f.write(counts.to_markdown())
        f.write('\n\n')

## 金属中心原子特征

In [6]:
# Reload the version-4 dataset.
data_4 = pd.read_csv('dataset_procession_ver_4.csv')

# Manual one-hot encoding: cast the Ti, V, Mn, Fe, Co, Ni, Zr, Mo, Sn, W and
# CP columns to booleans.
flag_columns = ('Ti', 'V', 'Mn', 'Fe', 'Co', 'Ni', 'Zr', 'Mo', 'Sn', 'W', 'CP')
data_4 = data_4.astype({column: bool for column in flag_columns})

# Save the result as the next dataset version.
data_4.to_csv('dataset_procession_ver_5.csv', index=False)

## 材料形貌特征

In [7]:
# Reload the version-5 dataset.
data_5 = pd.read_csv('dataset_procession_ver_5.csv')

# The categorical features are one-hot encoded later, so the 0 placeholder in
# CM_type, CM_morph, MS2_morph and CP_morph is replaced with None first.
#
# A column that mixes text labels with 0 is normally read back from CSV as
# strings, so the placeholder arrives as '0' rather than the integer 0.
# Match both forms so the replacement takes effect either way.
placeholder_to_none = {0: None, '0': None}
for column_name in ('CM_type', 'CM_morph', 'MS2_morph', 'CP_morph'):
    data_5[column_name] = data_5[column_name].replace(placeholder_to_none)

# Save the result as the next dataset version.
data_5.to_csv('dataset_procession_ver_6.csv', index=False)