In [1]:
import pandas as pd
def select_columns_by_suffix(df, suffix):
    """Return the columns of ``df`` whose label ends with ``suffix``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to select columns from.
    suffix : str
        Literal text a column label must end with. It is matched verbatim:
        regex metacharacters such as ``.``, ``(`` or ``+`` have no special
        meaning.

    Returns
    -------
    pandas.DataFrame
        The matching columns in their original order; a frame with zero
        columns when no label matches.
    """
    # Local import so the function does not depend on a module-level `re`.
    import re

    # Escape the suffix before anchoring it with `$`; otherwise a suffix like
    # '.b' would also match 'axb' because '.' means "any character" in a regex.
    return df.filter(regex=f'{re.escape(str(suffix))}$')

def extract_and_store_columns(csv_file, suffixes):
    """Split a CSV file into groups of columns selected by name suffix.

    For every suffix the matching columns are printed and written to
    ``'<suffix>_selected_columns.csv'``. Columns matched by no suffix are
    written to ``'unselected_columns.csv'``. All files are written relative to
    the current working directory, without the index.

    Side effect: when at least one suffix is given, the module-level name
    ``df_selected`` is set to the column-wise concatenation of all selected
    groups (keyed by suffix).

    Parameters
    ----------
    csv_file : str or path-like
        CSV file to read with ``pandas.read_csv``.
    suffixes : iterable of str
        Column-name suffixes to select.

    Returns
    -------
    dict
        Maps each suffix to the DataFrame of columns ending with it.
    """
    global df_selected

    # Read the CSV file.
    df = pd.read_csv(csv_file)

    selected_columns = {}
    for suffix in suffixes:
        selected_columns[suffix] = select_columns_by_suffix(df, suffix)
        print('********************************************************************************')
        print(f"Columns ending with '{suffix}':")
        print('********************************************************************************')
        print(selected_columns[suffix])
        selected_columns[suffix].to_csv(f'{suffix}_selected_columns.csv', index=False)

    # Build the combined frame once, after all groups are known, instead of
    # rebuilding it on every loop iteration. pd.concat rejects an empty
    # mapping, so the global is left untouched when no suffix was given.
    if selected_columns:
        df_selected = pd.concat(selected_columns, axis=1)

    # Columns that were not picked up by any suffix. A column matched by more
    # than one suffix is listed only once (dict.fromkeys keeps first-seen order).
    selected_names = list(dict.fromkeys(
        col for cols in selected_columns.values() for col in cols.columns
    ))
    unselected_columns = df.drop(columns=selected_names)

    # Save the unselected columns to a CSV file.
    unselected_columns.to_csv('unselected_columns.csv', index=False)

    return selected_columns

# Usage example: split the template CSV into one column group per suffix.
file_path = 'Inorganic-Organic-Hybrid-template.csv'
suffixes = ['InorganicFormula', 'OrganicSmiles']
selected_columns = extract_and_store_columns(file_path, suffixes)

********************************************************************************
Columns ending with 'InorganicFormula':
********************************************************************************
  Layer1_Perovskite_InorganicFormula Layer3_Perovskite_InorganicFormula
0                       (Fe2AgCu2)O3                                 Si
1                              Fe2O3                               TiO2
2                             CsPbI3                                H2O
3                               MoS2                              Fe2O3
4                           CuInGaSe                       (Fe2AgCu2)O3
********************************************************************************
Columns ending with 'OrganicSmiles':
********************************************************************************
     Layer2_OrganicSmiles    Layer4_OrganicSmiles
0               Cc1ccccc1               Cc1ccccc1
1               C=CCC=CCO          CC1=CC(Br)CCC1
2          CC1=CC

In [4]:
# Load the inorganic-formula columns from 'InorganicFormula_selected_columns.csv'.
# pd.read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapper
# is needed.
original_data = pd.read_csv('InorganicFormula_selected_columns.csv')

In [5]:
import pandas as pd

# `original_data` is expected to be the DataFrame loaded in an earlier cell.
# Split it into one single-column DataFrame per original column. Each new
# frame exposes the values under the common column name 'Name' and is stored
# under the keys 'data1', 'data2', ... following the original column order.
new_datasets = {
    'data' + str(position): pd.DataFrame({'Name': original_data[column]})
    for position, column in enumerate(original_data.columns, start=1)
}

# Print every derived dataset together with its key.
for key, value in new_datasets.items():
    print(f"{key}:\n{value}\n")

data1:
           Name
0  (Fe2AgCu2)O3
1         Fe2O3
2        CsPbI3
3          MoS2
4      CuInGaSe

data2:
           Name
0            Si
1          TiO2
2           H2O
3         Fe2O3
4  (Fe2AgCu2)O3



In [6]:
import pandas as pd
from matminer.featurizers.conversions import StrToComposition
from matminer.featurizers.composition.orbital import AtomicOrbitals
from matminer.featurizers.composition import ElementProperty
from matminer.featurizers.composition.element import ElementFraction
from pymatgen.core import Composition

# Expects `new_datasets` from an earlier cell: a dict keyed 'data1', 'data2', ...
# whose values are DataFrames holding formula strings in a 'Name' column.

# Converts the formula strings into composition objects stored in 'composition'.
str_to_comp = StrToComposition(target_col_id='composition')

# Atomic-orbital featurizer (HOMO/LUMO descriptors).
comp_to_orbital = AtomicOrbitals()

# Magpie element-property featurizer: each listed property is summarised with
# each listed statistic.
features_element_property = ['Number', 'MendeleevNumber', 'AtomicWeight', 'MeltingT',
                              'Column', 'Row', 'CovalentRadius', 'Electronegativity',
                              'NsValence', 'NpValence', 'NdValence', 'NfValence', 'NValence',
                              'NsUnfilled', 'NpUnfilled', 'NdUnfilled', 'NfUnfilled', 'NUnfilled',
                              'GSvolume_pa', 'GSbandgap', 'GSmagmom', 'SpaceGroupNumber']
stats_element_property = ['mean', 'minimum', 'maximum', 'range', 'avg_dev', 'mode']
element_property_featurizer = ElementProperty(data_source='magpie', features=features_element_property, stats=stats_element_property)

# Element-fraction featurizer.
element_fraction = ElementFraction()

# Featurized result per input dataset, keyed like `new_datasets`.
result_datasets = {}

# `inplace=False` is passed explicitly on every call: the default has differed
# between matminer releases, and in-place featurization would mutate the shared
# `new_datasets` frames, make this cell fail or change on re-run, and shift the
# positional column selections below.
for i, (key, dataset) in enumerate(new_datasets.items(), start=1):
    # Step 1: formula string -> composition. df_comp has columns ['Name', 'composition'].
    df_comp = str_to_comp.featurize_dataframe(dataset, col_id='Name', inplace=False)

    # Step 2: atomic-orbital features; keep only the numeric ones, selected by
    # label rather than by position.
    orbital_features = comp_to_orbital.featurize_dataframe(df_comp, col_id='composition', inplace=False)
    orbital_features = orbital_features[['HOMO_energy', 'LUMO_energy', 'gap_AO']]

    # Step 3: element-property features. Columns 0-1 are 'Name' and 'composition'.
    # NOTE(review): the trailing -1 also drops the last feature column; kept to
    # preserve the existing output -- confirm whether that is intended.
    element_property_features = element_property_featurizer.featurize_dataframe(df_comp, col_id='composition', inplace=False)
    element_property_features = element_property_features.iloc[:, 2:-1]

    # Step 4: element-fraction features, sliced the same way.
    # NOTE(review): same trailing -1 as above -- confirm whether that is intended.
    element_fraction_features = element_fraction.featurize_dataframe(df_comp, col_id='composition', inplace=False)
    element_fraction_features = element_fraction_features.iloc[:, 2:-1]

    # Prefix every column with the dataset number and featurizer family so the
    # names stay unique after the datasets are merged side by side.
    prefix_orbital = f'inorganic_formula_{i}_orbital_'
    orbital_features = orbital_features.add_prefix(prefix_orbital)

    prefix_element_property = f'inorganic_formula_{i}_element_property_'
    element_property_features = element_property_features.add_prefix(prefix_element_property)

    prefix_element_fraction = f'inorganic_formula_{i}_element_fraction_'
    element_fraction_features = element_fraction_features.add_prefix(prefix_element_fraction)

    # Combine the three feature groups of this dataset column-wise.
    result_datasets[key] = pd.concat([orbital_features, element_property_features, element_fraction_features], axis=1)

# Merge the per-dataset results column-wise.
merged_result = pd.concat(list(result_datasets.values()), axis=1)

# Save the merged features to a CSV file.
merged_result.to_csv('merged_result.csv', index=False)

# Show the merged result.
print(merged_result)

StrToComposition: 100%|██████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 32.55it/s]
AtomicOrbitals: 100%|████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 73.72it/s]
ElementProperty: 100%|███████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 58.55it/s]
ElementFraction: 100%|███████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 79.58it/s]
StrToComposition: 100%|██████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 75.96it/s]
AtomicOrbitals: 100%|████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 53.14it/s]
ElementProperty: 100%|███████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 68.31it/s]
ElementFraction: 100%|███████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 71.45it/s]


   inorganic_formula_1_orbital_HOMO_energy  \
0                                -0.202272   
1                                -0.295049   
2                                -0.267904   
3                                -0.153347   
4                                -0.172056   

   inorganic_formula_1_orbital_LUMO_energy  \
0                                -0.202272   
1                                -0.295049   
2                                -0.141831   
3                                -0.153347   
4                                -0.172056   

   inorganic_formula_1_orbital_gap_AO  \
0                            0.000000   
1                            0.000000   
2                            0.126073   
3                            0.000000   
4                            0.000000   

   inorganic_formula_1_element_property_MagpieData mean Number  \
0                                          22.625000             
1                                          15.200000             
2