In [2]:
import os
import numpy as np
import pandas as pd

In [9]:
import glob

# 获取当前文件夹下所有csv文件路径
csv_files = glob.glob("../csv/spin_C/*.csv")

# 创建一个空的DataFrame用于存储所有拼接的数据
combined_df = pd.DataFrame()

# 遍历所有csv文件并读取它们
for file in csv_files:
    df = pd.read_csv(file)
    try:
        # 获取所有列名并排除掉前几个无关的列
        keys = df.keys()
        filtered_keys = [col for col in keys if col not in ['canonical_smiles', 'id', 'Index', 'shielding_constants', 'log_filename', 'splitting_results']]
        # 只保留smiles和从splitting_results后面的所有列
        filtered_df = df[['smiles'] + filtered_keys]
        # 将最后一列（key=0）移动到最后一个负数和第一个正数中间
        if '0' in filtered_keys:
            filtered_keys.remove('0')
            negative_keys = [col for col in filtered_keys if col.replace('.', '', 1).lstrip('-').isdigit() and float(col) < 0]
            positive_keys = [col for col in filtered_keys if col.replace('.', '', 1).lstrip('-').isdigit() and float(col) > 0]
            reordered_keys = negative_keys + ['0'] + positive_keys
            filtered_df = df[['smiles'] + reordered_keys]
        # 拼接当前csv的内容到总DataFrame中
        combined_df = pd.concat([combined_df, filtered_df], ignore_index=True)
    except KeyError as e:
        print(f"Skipping file {file} due to missing columns: {e}")
    except ValueError as e:
        print(f"Skipping file {file} due to value error: {e}")

# 将结果保存到一个新的csv文件
# combined_df.to_csv("combined_filtered.csv", index=False)
combined_df


Unnamed: 0,smiles,-5.5,-5.4,-5.3,-5.2,-5.1,-5.0,-4.9,-4.8,-4.7,...,284.8,284.9,285.0,285.1,285.2,285.3,285.4,285.5,285.6,285.7
0,O=C(C#C)C1CC(=O)C1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,O=C(C#C)C1CC(=O)N1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,O=C(C#C)C1CC(=O)O1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,O=C(C#C)N1CC(=O)C1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,O=C(C#N)C1CC(=O)C1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128135,CCC1=NN(C)C=C1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
128136,CCC1=NN(C)C=N1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
128137,CCC1=NN(C)N=C1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
128138,CCC1=NN(C)N=N1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
list(combined_df.keys())

['smiles',
 '-5.5',
 '-5.4',
 '-5.3',
 '-5.2',
 '-5.1',
 '-5.0',
 '-4.9',
 '-4.8',
 '-4.7',
 '-4.6',
 '-4.5',
 '-4.4',
 '-4.3',
 '-4.2',
 '-4.1',
 '-4.0',
 '-3.9',
 '-3.8',
 '-3.7',
 '-3.6',
 '-3.5',
 '-3.4',
 '-3.3',
 '-3.2',
 '-3.1',
 '-3.0',
 '-2.9',
 '-2.8',
 '-2.7',
 '-2.6',
 '-2.5',
 '-2.4',
 '-2.3',
 '-2.2',
 '-2.1',
 '-2.0',
 '-1.9',
 '-1.8',
 '-1.7',
 '-1.6',
 '-1.5',
 '-1.4',
 '-1.3',
 '-1.2',
 '-1.1',
 '-1.0',
 '-0.9',
 '-0.8',
 '-0.7',
 '-0.6',
 '-0.5',
 '-0.4',
 '-0.3',
 '-0.2',
 '-0.1',
 '0',
 '0.1',
 '0.2',
 '0.3',
 '0.4',
 '0.5',
 '0.6',
 '0.7',
 '0.8',
 '0.9',
 '1.0',
 '1.1',
 '1.2',
 '1.3',
 '1.4',
 '1.5',
 '1.6',
 '1.7',
 '1.8',
 '1.9',
 '2.0',
 '2.1',
 '2.2',
 '2.3',
 '2.4',
 '2.5',
 '2.6',
 '2.7',
 '2.8',
 '2.9',
 '3.0',
 '3.1',
 '3.2',
 '3.3',
 '3.4',
 '3.5',
 '3.6',
 '3.7',
 '3.8',
 '3.9',
 '4.0',
 '4.1',
 '4.2',
 '4.3',
 '4.4',
 '4.5',
 '4.6',
 '4.7',
 '4.8',
 '4.9',
 '5.0',
 '5.1',
 '5.2',
 '5.3',
 '5.4',
 '5.5',
 '5.6',
 '5.7',
 '5.8',
 '5.9',
 '6.0',
 '6.1',


In [3]:
# ir
ir_spe = pd.read_csv('../csv/ir_spe_filtered.csv')
ir_spe

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,500.0,501.0,502.0,503.0,504.0,505.0,506.0,507.0,...,3992.0,3993.0,3994.0,3995.0,3996.0,3997.0,3998.0,3999.0,4000.0,smiles
0,0,0,0.005416,0.005429,0.005441,0.005454,0.005466,0.005479,0.005492,0.005504,...,0.005008,0.004998,0.004988,0.004978,0.004969,0.004959,0.004949,0.004940,0.004930,C
1,1,1,0.055118,0.055347,0.055578,0.055810,0.056043,0.056278,0.056515,0.056753,...,0.002362,0.002358,0.002355,0.002351,0.002348,0.002344,0.002341,0.002337,0.002334,N
2,2,2,0.003755,0.003762,0.003768,0.003775,0.003782,0.003789,0.003796,0.003803,...,0.048280,0.047874,0.047473,0.047077,0.046686,0.046300,0.045919,0.045542,0.045170,O
3,3,3,0.220005,0.221837,0.223693,0.225572,0.227475,0.229402,0.231354,0.233330,...,0.012502,0.012468,0.012435,0.012402,0.012369,0.012336,0.012303,0.012271,0.012238,C#C
4,4,4,0.039505,0.039826,0.040151,0.040481,0.040814,0.041152,0.041494,0.041840,...,0.009035,0.009009,0.008982,0.008956,0.008930,0.008903,0.008877,0.008851,0.008826,C#N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128135,129809,129809,0.237567,0.228677,0.220534,0.213059,0.206186,0.199854,0.194011,0.188611,...,0.020055,0.020016,0.019978,0.019939,0.019900,0.019862,0.019824,0.019785,0.019747,COC(C#N)C(F)(F)F
128136,129810,129810,0.180525,0.177744,0.175135,0.172686,0.170386,0.168223,0.166190,0.164278,...,0.016942,0.016909,0.016876,0.016844,0.016811,0.016778,0.016746,0.016714,0.016681,CCC(CO)C(F)(F)F
128137,129811,129811,0.436393,0.443748,0.452052,0.461350,0.471698,0.483167,0.495841,0.509818,...,0.015260,0.015231,0.015203,0.015174,0.015146,0.015118,0.015090,0.015062,0.015034,COC1CC1C(F)(F)F
128138,129812,129812,1.447464,1.334545,1.235457,1.148152,1.070928,1.002365,0.941274,0.886659,...,0.010466,0.010447,0.010427,0.010408,0.010388,0.010369,0.010350,0.010330,0.010311,CCOC(C)C(F)(F)F


In [4]:
smiles_l = pd.read_csv('/workspace/chenyize/instructmol/SLM4CRP-main/smiles-transformer-master/aligned_smiles_id_aux_task.csv')
smiles_l

Unnamed: 0,smiles,id,H_count,C_count,N_count,O_count,F_count,Alkyl_H,Alkenyl_H,Aromatic_H,...,alcohol_count,aldehyde_count,ketone_count,amine_primary_count,amine_secondary_count,amine_tertiary_count,amide_count,ester_count,ether_count,nitrile_count
0,C,0,4,1,0,0,0,4,0,0,...,0,0,0,0,0,0,0,0,0,0
1,N,1,3,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,O,2,2,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,C#C,3,2,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,C#N,4,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128135,C1C2C3C4C5OC14C5N23,128135,7,7,1,1,0,7,0,0,...,0,0,0,0,0,9,0,0,1,0
128136,C1N2C3C2C2C4OC12C34,128136,7,7,1,1,0,7,0,0,...,0,0,0,0,0,7,0,0,1,0
128137,C1N2C3C4C5C2C13CN45,128137,8,7,2,0,0,8,0,0,...,0,0,0,0,0,14,0,0,0,0
128138,C1N2C3C4C5CC13C2C45,128138,9,8,1,0,0,9,0,0,...,0,0,0,0,0,7,0,0,0,0


In [5]:
new_csv_df = smiles_l
new_smiles_order = new_csv_df[['smiles']]

# 合并并按照新 CSV 中的 smiles 列顺序对齐
aligned_df = new_smiles_order.merge(combined_df, on='smiles', how='left')
aligned_df

NameError: name 'combined_df' is not defined

In [8]:
np.save('../csv/spin_nmrc_values.npy', aligned_df.drop(columns=['smiles']).values)

In [3]:
ir_spe.rename(columns={'smiles_all': 'smiles'}, inplace=True)

In [4]:
# 去除不必要的列，只保留光谱数据部分
# 假设光谱数据从第3列开始
spectra_data = ir_spe.iloc[:, 2:-1]

# 将 DataFrame 转换为 numpy array
spectra_array = spectra_data.to_numpy()
print(spectra_array.shape)

(129817, 3501)


In [5]:
# nmr-H
nmrh_spe = pd.read_csv('../csv/smiles_nmr_H.csv')
nmrh_spe

Unnamed: 0,smiles,-2.2,-2.1500000000000004,-2.1,-2.0500000000000003,-2.0,-1.9500000000000002,-1.9000000000000001,-1.85,-1.8000000000000003,...,19.150000000000002,19.200000000000003,19.250000000000004,19.3,19.35,19.400000000000002,19.450000000000003,19.500000000000004,19.55,19.6
0,C,0.055475,0.057852,0.060384,0.063086,0.065972,0.069060,0.072368,0.075919,0.079737,...,0.000885,0.000880,0.000875,0.000871,0.000866,0.000862,0.000857,0.000853,0.000848,0.000844
1,N,0.069530,0.073402,0.077603,0.082174,0.087157,0.092604,0.098572,0.105132,0.112361,...,0.000627,0.000624,0.000620,0.000617,0.000614,0.000611,0.000608,0.000605,0.000602,0.000599
2,O,0.029873,0.031202,0.032622,0.034140,0.035766,0.037511,0.039386,0.041404,0.043580,...,0.000438,0.000436,0.000434,0.000431,0.000429,0.000427,0.000425,0.000423,0.000420,0.000418
3,C#C,0.013460,0.013859,0.014276,0.014713,0.015169,0.015647,0.016148,0.016673,0.017224,...,0.000495,0.000493,0.000490,0.000487,0.000485,0.000482,0.000479,0.000477,0.000474,0.000472
4,C#N,0.003550,0.003626,0.003704,0.003785,0.003869,0.003956,0.004045,0.004138,0.004234,...,0.000288,0.000286,0.000285,0.000283,0.000281,0.000279,0.000278,0.000276,0.000275,0.000273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130826,C1C2C3C4C5OC14C5N23,0.023682,0.024201,0.024739,0.025295,0.025871,0.026467,0.027084,0.027723,0.028387,...,0.002161,0.002148,0.002134,0.002121,0.002108,0.002095,0.002082,0.002069,0.002056,0.002044
130827,C1N2C3C2C2C4OC12C34,0.022243,0.022708,0.023188,0.023684,0.024195,0.024724,0.025270,0.025835,0.026419,...,0.002186,0.002172,0.002159,0.002145,0.002132,0.002119,0.002105,0.002092,0.002080,0.002067
130828,C1N2C3C4C5C2C13CN45,0.026143,0.026687,0.027247,0.027826,0.028423,0.029040,0.029677,0.030336,0.031017,...,0.002381,0.002367,0.002352,0.002338,0.002324,0.002310,0.002296,0.002282,0.002269,0.002255
130829,C1N2C3C4C5CC13C2C45,0.038665,0.039623,0.040618,0.041651,0.042726,0.043844,0.045008,0.046220,0.047483,...,0.002552,0.002536,0.002521,0.002506,0.002492,0.002477,0.002462,0.002448,0.002434,0.002419


In [6]:
# 找到两个 DataFrame 中共有的 'smiles' 列的值
common_smiles = set(ir_spe['smiles']).intersection(set(nmrh_spe['smiles']))

# 筛选 ir_spe 和 nmrh_spe 中只保留共有的 'smiles' 行
ir_spe_filtered = ir_spe[ir_spe['smiles'].isin(common_smiles)]
nmrh_spe_filtered = nmrh_spe[nmrh_spe['smiles'].isin(common_smiles)]

In [7]:
ir_spe_filtered

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,500.0,501.0,502.0,503.0,504.0,505.0,506.0,507.0,...,3992.0,3993.0,3994.0,3995.0,3996.0,3997.0,3998.0,3999.0,4000.0,smiles
0,0,0,0.005416,0.005429,0.005441,0.005454,0.005466,0.005479,0.005492,0.005504,...,0.005008,0.004998,0.004988,0.004978,0.004969,0.004959,0.004949,0.004940,0.004930,C
1,1,1,0.055118,0.055347,0.055578,0.055810,0.056043,0.056278,0.056515,0.056753,...,0.002362,0.002358,0.002355,0.002351,0.002348,0.002344,0.002341,0.002337,0.002334,N
2,2,2,0.003755,0.003762,0.003768,0.003775,0.003782,0.003789,0.003796,0.003803,...,0.048280,0.047874,0.047473,0.047077,0.046686,0.046300,0.045919,0.045542,0.045170,O
3,3,3,0.220005,0.221837,0.223693,0.225572,0.227475,0.229402,0.231354,0.233330,...,0.012502,0.012468,0.012435,0.012402,0.012369,0.012336,0.012303,0.012271,0.012238,C#C
4,4,4,0.039505,0.039826,0.040151,0.040481,0.040814,0.041152,0.041494,0.041840,...,0.009035,0.009009,0.008982,0.008956,0.008930,0.008903,0.008877,0.008851,0.008826,C#N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129809,129809,129809,0.237567,0.228677,0.220534,0.213059,0.206186,0.199854,0.194011,0.188611,...,0.020055,0.020016,0.019978,0.019939,0.019900,0.019862,0.019824,0.019785,0.019747,COC(C#N)C(F)(F)F
129810,129810,129810,0.180525,0.177744,0.175135,0.172686,0.170386,0.168223,0.166190,0.164278,...,0.016942,0.016909,0.016876,0.016844,0.016811,0.016778,0.016746,0.016714,0.016681,CCC(CO)C(F)(F)F
129811,129811,129811,0.436393,0.443748,0.452052,0.461350,0.471698,0.483167,0.495841,0.509818,...,0.015260,0.015231,0.015203,0.015174,0.015146,0.015118,0.015090,0.015062,0.015034,COC1CC1C(F)(F)F
129812,129812,129812,1.447464,1.334545,1.235457,1.148152,1.070928,1.002365,0.941274,0.886659,...,0.010466,0.010447,0.010427,0.010408,0.010388,0.010369,0.010350,0.010330,0.010311,CCOC(C)C(F)(F)F


In [8]:
nmrh_spe_filtered

Unnamed: 0,smiles,-2.2,-2.1500000000000004,-2.1,-2.0500000000000003,-2.0,-1.9500000000000002,-1.9000000000000001,-1.85,-1.8000000000000003,...,19.150000000000002,19.200000000000003,19.250000000000004,19.3,19.35,19.400000000000002,19.450000000000003,19.500000000000004,19.55,19.6
0,C,0.055475,0.057852,0.060384,0.063086,0.065972,0.069060,0.072368,0.075919,0.079737,...,0.000885,0.000880,0.000875,0.000871,0.000866,0.000862,0.000857,0.000853,0.000848,0.000844
1,N,0.069530,0.073402,0.077603,0.082174,0.087157,0.092604,0.098572,0.105132,0.112361,...,0.000627,0.000624,0.000620,0.000617,0.000614,0.000611,0.000608,0.000605,0.000602,0.000599
2,O,0.029873,0.031202,0.032622,0.034140,0.035766,0.037511,0.039386,0.041404,0.043580,...,0.000438,0.000436,0.000434,0.000431,0.000429,0.000427,0.000425,0.000423,0.000420,0.000418
3,C#C,0.013460,0.013859,0.014276,0.014713,0.015169,0.015647,0.016148,0.016673,0.017224,...,0.000495,0.000493,0.000490,0.000487,0.000485,0.000482,0.000479,0.000477,0.000474,0.000472
4,C#N,0.003550,0.003626,0.003704,0.003785,0.003869,0.003956,0.004045,0.004138,0.004234,...,0.000288,0.000286,0.000285,0.000283,0.000281,0.000279,0.000278,0.000276,0.000275,0.000273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130826,C1C2C3C4C5OC14C5N23,0.023682,0.024201,0.024739,0.025295,0.025871,0.026467,0.027084,0.027723,0.028387,...,0.002161,0.002148,0.002134,0.002121,0.002108,0.002095,0.002082,0.002069,0.002056,0.002044
130827,C1N2C3C2C2C4OC12C34,0.022243,0.022708,0.023188,0.023684,0.024195,0.024724,0.025270,0.025835,0.026419,...,0.002186,0.002172,0.002159,0.002145,0.002132,0.002119,0.002105,0.002092,0.002080,0.002067
130828,C1N2C3C4C5C2C13CN45,0.026143,0.026687,0.027247,0.027826,0.028423,0.029040,0.029677,0.030336,0.031017,...,0.002381,0.002367,0.002352,0.002338,0.002324,0.002310,0.002296,0.002282,0.002269,0.002255
130829,C1N2C3C4C5CC13C2C45,0.038665,0.039623,0.040618,0.041651,0.042726,0.043844,0.045008,0.046220,0.047483,...,0.002552,0.002536,0.002521,0.002506,0.002492,0.002477,0.002462,0.002448,0.002434,0.002419


In [44]:
nmrc_spe = pd.read_csv('../csv/smiles_nmr_C.csv')

In [43]:
nmrc_spe

Unnamed: 0,smiles,-5.6,-5.5,-5.3999999999999995,-5.3,-5.199999999999999,-5.1,-5.0,-4.8999999999999995,-4.8,...,284.70000000000005,284.8,284.90000000000003,285.00000000000006,285.1,285.20000000000005,285.3,285.40000000000003,285.50000000000006,285.6
0,C,1.124566,1.271555,1.069549,0.749156,0.503518,0.346196,0.247379,0.183569,0.140765,...,9.448606e-07,9.442098e-07,9.435597e-07,9.429102e-07,9.422614e-07,9.416133e-07,9.409658e-07,9.403190e-07,9.396728e-07,9.390274e-07
1,N,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
2,O,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
3,C#C,0.000025,0.000025,0.000025,0.000025,0.000026,0.000026,0.000026,0.000026,0.000026,...,3.575749e-06,3.572361e-06,3.568979e-06,3.565601e-06,3.562228e-06,3.558860e-06,3.555497e-06,3.552138e-06,3.548784e-06,3.545435e-06
4,C#N,0.000006,0.000006,0.000006,0.000006,0.000006,0.000006,0.000006,0.000006,0.000006,...,2.650510e-06,2.647453e-06,2.644402e-06,2.641356e-06,2.638315e-06,2.635279e-06,2.632249e-06,2.629224e-06,2.626204e-06,2.623189e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55042,CC(C#N)C(C)C1CN1,0.000786,0.000794,0.000803,0.000812,0.000821,0.000830,0.000839,0.000849,0.000858,...,1.042482e-05,1.041512e-05,1.040543e-05,1.039576e-05,1.038611e-05,1.037647e-05,1.036684e-05,1.035723e-05,1.034763e-05,1.033804e-05
55043,CC(C#N)C(C)C1CO1,0.000456,0.000460,0.000463,0.000467,0.000470,0.000474,0.000478,0.000481,0.000485,...,1.105262e-05,1.104216e-05,1.103171e-05,1.102129e-05,1.101088e-05,1.100048e-05,1.099010e-05,1.097973e-05,1.096938e-05,1.095905e-05
55044,CC(C#N)C(C)N1CC1,0.000661,0.000667,0.000673,0.000679,0.000685,0.000692,0.000698,0.000705,0.000711,...,1.074891e-05,1.073879e-05,1.072869e-05,1.071860e-05,1.070853e-05,1.069848e-05,1.068843e-05,1.067841e-05,1.066840e-05,1.065840e-05
55045,CC(C#N)C(N)C1CN1,0.000492,0.000497,0.000501,0.000506,0.000511,0.000515,0.000520,0.000525,0.000530,...,9.498799e-06,9.489707e-06,9.480628e-06,9.471563e-06,9.462512e-06,9.453474e-06,9.444450e-06,9.435440e-06,9.426443e-06,9.417460e-06


In [42]:
nmrc_spe_filtered = nmrc_spe[nmrc_spe['smiles'].isin(common_smiles)]
nmrc_spe_filtered

Unnamed: 0,smiles,-5.6,-5.5,-5.3999999999999995,-5.3,-5.199999999999999,-5.1,-5.0,-4.8999999999999995,-4.8,...,284.70000000000005,284.8,284.90000000000003,285.00000000000006,285.1,285.20000000000005,285.3,285.40000000000003,285.50000000000006,285.6
0,C,1.124566,1.271555,1.069549,0.749156,0.503518,0.346196,0.247379,0.183569,0.140765,...,9.448606e-07,9.442098e-07,9.435597e-07,9.429102e-07,9.422614e-07,9.416133e-07,9.409658e-07,9.403190e-07,9.396728e-07,9.390274e-07
1,N,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
2,O,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
3,C#C,0.000025,0.000025,0.000025,0.000025,0.000026,0.000026,0.000026,0.000026,0.000026,...,3.575749e-06,3.572361e-06,3.568979e-06,3.565601e-06,3.562228e-06,3.558860e-06,3.555497e-06,3.552138e-06,3.548784e-06,3.545435e-06
4,C#N,0.000006,0.000006,0.000006,0.000006,0.000006,0.000006,0.000006,0.000006,0.000006,...,2.650510e-06,2.647453e-06,2.644402e-06,2.641356e-06,2.638315e-06,2.635279e-06,2.632249e-06,2.629224e-06,2.626204e-06,2.623189e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55042,CC(C#N)C(C)C1CN1,0.000786,0.000794,0.000803,0.000812,0.000821,0.000830,0.000839,0.000849,0.000858,...,1.042482e-05,1.041512e-05,1.040543e-05,1.039576e-05,1.038611e-05,1.037647e-05,1.036684e-05,1.035723e-05,1.034763e-05,1.033804e-05
55043,CC(C#N)C(C)C1CO1,0.000456,0.000460,0.000463,0.000467,0.000470,0.000474,0.000478,0.000481,0.000485,...,1.105262e-05,1.104216e-05,1.103171e-05,1.102129e-05,1.101088e-05,1.100048e-05,1.099010e-05,1.097973e-05,1.096938e-05,1.095905e-05
55044,CC(C#N)C(C)N1CC1,0.000661,0.000667,0.000673,0.000679,0.000685,0.000692,0.000698,0.000705,0.000711,...,1.074891e-05,1.073879e-05,1.072869e-05,1.071860e-05,1.070853e-05,1.069848e-05,1.068843e-05,1.067841e-05,1.066840e-05,1.065840e-05
55045,CC(C#N)C(N)C1CN1,0.000492,0.000497,0.000501,0.000506,0.000511,0.000515,0.000520,0.000525,0.000530,...,9.498799e-06,9.489707e-06,9.480628e-06,9.471563e-06,9.462512e-06,9.453474e-06,9.444450e-06,9.435440e-06,9.426443e-06,9.417460e-06


In [10]:
raman_spe = pd.read_csv('../csv/raman.csv')

KeyError: 'smiles'

In [12]:
raman_spe.rename(columns={'smiles_all': 'smiles'}, inplace=True)
raman_spe_filtered = raman_spe[raman_spe['smiles'].isin(common_smiles)]
raman_spe_filtered

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,500.0,501.0,502.0,503.0,504.0,505.0,506.0,507.0,...,3992.0,3993.0,3994.0,3995.0,3996.0,3997.0,3998.0,3999.0,4000.0,smiles
0,0,0,0.002818,0.002822,0.002826,0.002830,0.002834,0.002838,0.002842,0.002846,...,0.008730,0.008713,0.008696,0.008679,0.008662,0.008645,0.008628,0.008611,0.008595,C
1,1,1,0.001302,0.001304,0.001307,0.001309,0.001312,0.001314,0.001316,0.001319,...,0.017296,0.017235,0.017174,0.017114,0.017054,0.016994,0.016935,0.016876,0.016817,N
2,2,2,0.000456,0.000457,0.000457,0.000458,0.000458,0.000459,0.000459,0.000460,...,0.035804,0.035543,0.035285,0.035029,0.034777,0.034527,0.034280,0.034036,0.033794,O
3,3,3,0.014142,0.014368,0.014599,0.014836,0.015079,0.015329,0.015586,0.015849,...,0.004203,0.004190,0.004177,0.004164,0.004152,0.004139,0.004126,0.004114,0.004101,C#C
4,4,4,0.000855,0.000858,0.000862,0.000865,0.000868,0.000872,0.000876,0.000879,...,0.001825,0.001820,0.001815,0.001811,0.001806,0.001801,0.001796,0.001792,0.001787,C#N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129809,129809,129809,0.058707,0.058738,0.058796,0.058878,0.058985,0.059115,0.059267,0.059440,...,0.033779,0.033711,0.033644,0.033576,0.033508,0.033441,0.033374,0.033307,0.033241,COC(C#N)C(F)(F)F
129810,129810,129810,0.064435,0.064083,0.063793,0.063558,0.063374,0.063236,0.063142,0.063087,...,0.034679,0.034608,0.034537,0.034466,0.034395,0.034325,0.034255,0.034185,0.034115,CCC(CO)C(F)(F)F
129811,129811,129811,0.165069,0.156992,0.150194,0.144489,0.139726,0.135785,0.132572,0.130012,...,0.031477,0.031411,0.031345,0.031279,0.031214,0.031149,0.031084,0.031019,0.030954,COC1CC1C(F)(F)F
129812,129812,129812,0.081207,0.078689,0.076566,0.074772,0.073257,0.071979,0.070906,0.070008,...,0.028366,0.028307,0.028248,0.028190,0.028132,0.028074,0.028016,0.027958,0.027901,CCOC(C)C(F)(F)F


In [13]:
raman_spe_filtered.to_csv('/data/chenyize/csv/raman_spe_filtered.csv', index=False)

In [14]:
zp_spe = pd.read_csv('../csv/zhipu.csv')
zp_spe

Unnamed: 0,mol_id,smiles,nce,1,2,3,4,5,6,7,...,145,146,147,148,149,150,151,152,153,M+_HighPrecision
0,0,C,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.031300
1,0,C,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.031300
2,0,C,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.031300
3,0,C,80.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.031300
4,0,C,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.031300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
649080,129816,C1C2C3C4C5OC13C2C45,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,120.057515
649081,129816,C1C2C3C4C5OC13C2C45,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,120.057515
649082,129816,C1C2C3C4C5OC13C2C45,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,120.057515
649083,129816,C1C2C3C4C5OC13C2C45,80.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,120.057515


In [16]:
# 分组并基于 `smiles` 进行处理，确保每个分子只出现一行
grouped = zp_spe.groupby('smiles')

# 初始化一个列表来存储每个分子的处理后数据
processed_data = []

# 遍历每个分子的 `smiles` 分组
for smiles, group in grouped:
    # 获取当前分子的 mol_id 和高分辨率数值（所有行的 mol_id 和高分辨率值应该相同）
    mol_id = group['mol_id'].iloc[0]
    high_res_value = group['M+_HighPrecision'].iloc[0]
    
    # 获取所有 `nce` 值
    nce_values = group['nce'].values
    
    # 获取所有低分辨率数据，按列（3 到 -1）提取并转为一维数组
    low_res_values = group.iloc[:, 3:-1].values.flatten()

    # 将所有信息拼接在一起，形成一行
    row_data = [mol_id, smiles] + nce_values.tolist() + low_res_values.tolist() + [high_res_value]
    
    # 将拼接后的数据加入结果列表
    processed_data.append(row_data)

# 定义新 DataFrame 的列名
columns = ['mol_id', 'smiles'] + [f'nce_{i+1}' for i in range(5)] + [f'low_res_{i+1}' for i in range(len(low_res_values))] + ['M+_HighPrecision']

# 创建新的 DataFrame
processed_zp_spe= pd.DataFrame(processed_data, columns=columns)
processed_zp_spe

Unnamed: 0,mol_id,smiles,nce_1,nce_2,nce_3,nce_4,nce_5,low_res_1,low_res_2,low_res_3,...,low_res_757,low_res_758,low_res_759,low_res_760,low_res_761,low_res_762,low_res_763,low_res_764,low_res_765,M+_HighPrecision
0,0,C,20.0,40.0,60.0,80.0,100.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.031300
1,3,C#C,20.0,40.0,60.0,80.0,100.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.015650
2,20,C#CC#C,20.0,40.0,60.0,80.0,100.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.015650
3,442,C#CC#CC#C,20.0,40.0,60.0,80.0,100.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74.015650
4,13303,C#CC#CC#CC#C,20.0,40.0,60.0,80.0,100.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,98.015650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129812,20951,ON=C1COCCC=C1,20.0,40.0,60.0,80.0,100.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,127.063329
129813,20956,ON=C1COCCOC1,20.0,40.0,60.0,80.0,100.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,131.058243
129814,3728,ON=C1COCOC1,20.0,40.0,60.0,80.0,100.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.042593
129815,20908,ON=C1COCOC1=N,20.0,40.0,60.0,80.0,100.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,130.037842


In [20]:
zhipu_spe_filtered = processed_zp_spe[processed_zp_spe['smiles'].isin(common_smiles)]
zhipu_spe_filtered

Unnamed: 0,mol_id,smiles,nce_1,nce_2,nce_3,nce_4,nce_5,low_res_1,low_res_2,low_res_3,...,low_res_757,low_res_758,low_res_759,low_res_760,low_res_761,low_res_762,low_res_763,low_res_764,low_res_765,M+_HighPrecision
0,0,C,20.0,40.0,60.0,80.0,100.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.031300
1,3,C#C,20.0,40.0,60.0,80.0,100.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.015650
2,20,C#CC#C,20.0,40.0,60.0,80.0,100.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.015650
3,442,C#CC#CC#C,20.0,40.0,60.0,80.0,100.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74.015650
4,13303,C#CC#CC#CC#C,20.0,40.0,60.0,80.0,100.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,98.015650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129812,20951,ON=C1COCCC=C1,20.0,40.0,60.0,80.0,100.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,127.063329
129813,20956,ON=C1COCCOC1,20.0,40.0,60.0,80.0,100.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,131.058243
129814,3728,ON=C1COCOC1,20.0,40.0,60.0,80.0,100.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.042593
129815,20908,ON=C1COCOC1=N,20.0,40.0,60.0,80.0,100.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,130.037842


In [23]:
zhipu_spe_filtered.drop(columns=['nce_1', 'nce_2', 'nce_3', 'nce_4', 'nce_5'], inplace=True)
zhipu_spe_filtered

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zhipu_spe_filtered.drop(columns=['nce_1', 'nce_2', 'nce_3', 'nce_4', 'nce_5'], inplace=True)


Unnamed: 0,mol_id,smiles,low_res_1,low_res_2,low_res_3,low_res_4,low_res_5,low_res_6,low_res_7,low_res_8,...,low_res_757,low_res_758,low_res_759,low_res_760,low_res_761,low_res_762,low_res_763,low_res_764,low_res_765,M+_HighPrecision
0,0,C,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.031300
1,3,C#C,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.015650
2,20,C#CC#C,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.015650
3,442,C#CC#CC#C,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74.015650
4,13303,C#CC#CC#CC#C,0.0,0.0,0.0,0.0,0.0,0.0,0.081366,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,98.015650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129812,20951,ON=C1COCCC=C1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,127.063329
129813,20956,ON=C1COCCOC1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,131.058243
129814,3728,ON=C1COCOC1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.042593
129815,20908,ON=C1COCOC1=N,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,130.037842


In [28]:
zhipu_spe_filtered = zhipu_spe_filtered.sort_values(by='mol_id')

In [29]:
zhipu_spe_filtered

Unnamed: 0,mol_id,smiles,low_res_1,low_res_2,low_res_3,low_res_4,low_res_5,low_res_6,low_res_7,low_res_8,...,low_res_757,low_res_758,low_res_759,low_res_760,low_res_761,low_res_762,low_res_763,low_res_764,low_res_765,M+_HighPrecision
0,0,C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.031300
88438,1,N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.026549
100635,2,O,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.010565
1,3,C#C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.015650
2400,4,C#N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.010899
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80817,129809,COC(C#N)C(F)(F)F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,139.024498
59022,129810,CCC(CO)C(F)(F)F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,142.060550
84848,129811,COC1CC1C(F)(F)F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,140.044900
71008,129812,CCOC(C)C(F)(F)F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,142.060550


In [31]:
# 删除最后一列 'M+_HighPrecision'，并保留所有其他列
df_without_highprecision = zhipu_spe_filtered.drop(columns=['M+_HighPrecision'])

Unnamed: 0,smiles,mol_id,smiles.1,low_res_1,low_res_2,low_res_3,low_res_4,low_res_5,low_res_6,low_res_7,...,low_res_756,low_res_757,low_res_758,low_res_759,low_res_760,low_res_761,low_res_762,low_res_763,low_res_764,low_res_765
0,C,0,C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88438,N,1,N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100635,O,2,O,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,C#C,3,C#C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2400,C#N,4,C#N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80817,COC(C#N)C(F)(F)F,129809,COC(C#N)C(F)(F)F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59022,CCC(CO)C(F)(F)F,129810,CCC(CO)C(F)(F)F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84848,COC1CC1C(F)(F)F,129811,COC1CC1C(F)(F)F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71008,CCOC(C)C(F)(F)F,129812,CCOC(C)C(F)(F)F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
df_without_highprecision.drop(columns=['mol_id'], inplace=True)
df_without_highprecision.to_csv('/data/chenyize/csv/lowprecision_zhipu.csv', index=False)
df_without_highprecision

Unnamed: 0,smiles,low_res_1,low_res_2,low_res_3,low_res_4,low_res_5,low_res_6,low_res_7,low_res_8,low_res_9,...,low_res_756,low_res_757,low_res_758,low_res_759,low_res_760,low_res_761,low_res_762,low_res_763,low_res_764,low_res_765
0,C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88438,N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100635,O,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,C#C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2400,C#N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80817,COC(C#N)C(F)(F)F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59022,CCC(CO)C(F)(F)F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84848,COC1CC1C(F)(F)F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71008,CCOC(C)C(F)(F)F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
zhipu_highprecision = zhipu_spe_filtered[['smiles', 'M+_HighPrecision']]

In [39]:
zhipu_highprecision
zhipu_highprecision.to_csv('/data/chenyize/csv/highprecision_zhipu.csv', index=False)