In [1]:
import pandas as pd
import numpy as np
import os
import warnings

warnings.filterwarnings("ignore")

## 工作说明

#### 一、单位强度文件在处理前都要：
* 1.将其中的空值列删掉
* 2.按照不同的日期将单位强度分开保存
* 3.删去除单位强度外的列(比如表示日期的首列)
* 4.将文件名改为英文(如：'23_base.csv')，并保存为csv


#### 二、每个样本对质量处理后的数据结构：

<img src='./Reference/form_of_mass_processed.png' width='660px'>


#### 三、代码操作说明：

* 以日期23的文件夹为例
* P = PreProcess(base, file, drop, percent_thr, top_k)：实例化一个处理器对象
* P.select_elements()：执行 **[ 空行舍弃;元素列舍弃;元素质量计算;元素质量占比计算;主要元素字典及个数统计;top K含量元素统计;文件名拼接 ]** 功能，并生成一个csv文件。


**执行过程**
```
file_list = [ 'S15_Poisson_particle.csv',
             'S16_Poisson_particle.csv',
             'S25_Poisson_particle.csv',
             'S27_Poisson_particle.csv',
             'S35_Poisson_particle.csv']

base = '23_base.csv'

drop = ['[42Ca]+ (cts)','[44Ca]+ (cts)','[56Fe]+ (cts)',]

percent_thr = 0.1

top_k = 10

for file in file_list:
    P = PreProcess(base, file, drop, percent_thr, top_k)
    P.select_elements()
```


In [12]:
class PreProcess:
    """Pre-process one particle-intensity CSV into a per-particle mass table.

    The pipeline divides every particle intensity by the unit intensity of the
    same column, sums the resulting masses per particle, normalises each element
    by that sum, and finally records which elements dominate each particle.
    ``select_elements`` runs the whole pipeline and writes the result to a CSV.
    """

    # Trailing part of a particle-intensity file name that follows the sample label.
    _PARTICLE_SUFFIX = '_Poisson_particle.csv'
    # Trailing part of the generated output file name.
    _MASS_SUFFIX = '_mass_final.csv'

    def __init__(self, unit_intensity, ptc_intensity, drop_ptc, percent_thr, top_k):
        '''
        unit_intensity: file name of the unit-intensity CSV
        ptc_intensity: file name of the particle-intensity CSV to process
        drop_ptc: list of column names to discard, e.g. ['[56Fe]+ (cts)']
        percent_thr: mass-fraction threshold an element must exceed to be
            recorded in the "components" dictionary, e.g. 0.1
        top_k: how many of the highest-fraction elements to record, e.g. 10
        '''
        self.base = unit_intensity
        self.target = ptc_intensity
        self.drop_ptc = drop_ptc
        self.percent_thr = percent_thr
        self.top_k = top_k

    @staticmethod
    def _short_names(columns):
        '''
        Strip the first character and the last eight characters from every
        column name, e.g. '[23Na]+ (cts)' -> '23Na'.
        NOTE(review): assumes every column follows the '[XX]+ (cts)' pattern.
        '''
        return [name[1:-8] for name in columns]

    def read_base(self):
        '''
        Read the unit-intensity CSV and cast every value to float.
        Returns the resulting DataFrame.
        '''
        return pd.read_csv(self.base).astype("float")

    def drop_particle(self):
        '''
        Read the particle-intensity CSV, remove the columns listed in
        ``drop_ptc`` and then remove the rows that are entirely NaN.
        Returns the resulting DataFrame (original row index is kept).
        Raises KeyError if a column listed in ``drop_ptc`` is not in the file.
        '''
        ptc_ints = pd.read_csv(self.target)
        ptc_ints = ptc_ints.drop(columns=list(self.drop_ptc))
        return ptc_ints.dropna(axis=0, how='all')

    def get_mass_filename(self):
        '''
        Build the output file name from the particle-intensity file name,
        e.g. 'S15_Poisson_particle.csv' -> 'S15_mass_final.csv'.
        A name without the '_Poisson_particle.csv' ending only loses its
        extension instead of being cut at a fixed character offset.
        '''
        if self.target.endswith(self._PARTICLE_SUFFIX):
            label = self.target[:-len(self._PARTICLE_SUFFIX)]
        else:
            label = os.path.splitext(self.target)[0]
        return label + self._MASS_SUFFIX

    def get_particle_mass(self):
        '''
        Compute the particle masses: the cleaned particle intensities divided
        by the unit-intensity values. Returns the resulting DataFrame.
        NOTE(review): the division is positional, so the unit-intensity file is
        presumably a single row with one value per remaining particle column,
        in the same order - confirm against the data files.
        '''
        return self.drop_particle() / self.read_base().values

    def get_mass_sum(self):
        '''
        Append a 'total_mass' column holding the NaN-ignoring sum of each row
        of the particle-mass table. Returns the resulting DataFrame.
        '''
        mass_df = self.get_particle_mass()
        # One vectorised reduction; also well defined when no rows remain.
        total_mass = np.nansum(mass_df.to_numpy(dtype=float), axis=1)
        mass_df.insert(mass_df.shape[1], 'total_mass', total_mass)
        return mass_df

    def get_short_ele_name(self):
        '''
        Return the shortened element names of the cleaned particle columns,
        e.g. '[23Na]+ (cts)' -> '23Na'.
        '''
        return self._short_names(self.drop_particle().columns)

    def normalize_mass(self):
        '''
        Divide every element mass by the particle's total mass and append the
        fractions (under shortened column names) to the right of the mass
        table. Returns the combined DataFrame with a fresh 0..n-1 index.
        '''
        # Compute the mass table once; it is the source of the masses, the
        # totals and the column names.
        mass_df = self.get_mass_sum()
        ptc_mass = mass_df.iloc[:, :-1]
        total_mass = mass_df.iloc[:, -1]

        ptc_percent = ptc_mass.div(total_mass, axis=0)
        ptc_percent.columns = self._short_names(ptc_mass.columns)

        normed_ptc = pd.concat([mass_df, ptc_percent], axis=1)
        # Rows dropped as all-NaN leave gaps in the index; renumber so the
        # per-row summary columns built later line up.
        return normed_ptc.reset_index(drop=True)

    def _summarize_row(self, row):
        '''
        Summarise one row of mass fractions.
        Returns (components, number_of_components, top_k) where
        components maps every element whose fraction exceeds ``percent_thr``
        to that fraction, and top_k maps the ``top_k`` highest-fraction
        elements to their fractions; both are ordered by descending fraction.
        '''
        ranked = row.sort_values(ascending=False)   # NaN fractions sort last

        above_thr = ranked[ranked > self.percent_thr]
        # Plain Python floats keep the dictionaries readable once written to CSV.
        components = {name: float(value) for name, value in above_thr.items()}

        limit = max(int(self.top_k), 0)
        top_d = {name: float(value) for name, value in ranked.iloc[:limit].items()}
        return components, len(above_thr), top_d

    def select_elements(self):
        '''
        Run the full pipeline for this sample and save it as a CSV:
        record the elements whose fraction exceeds ``percent_thr`` (as a
        dictionary plus their count) and the ``top_k`` highest-fraction
        elements (as a dictionary), append these three columns to the
        normalised mass table and write it to ``get_mass_filename()``.
        '''
        normed_df = self.normalize_mass()
        col_len = normed_df.shape[1]
        # Layout is [n mass columns, total_mass, n fraction columns];
        # the fractions are the trailing half.
        percent = normed_df.iloc[:, (col_len + 1) // 2:]

        main_ele_dict = []
        main_ele_len = []
        topk_dict = []
        # Explicit iteration: the per-row work exists to fill these lists,
        # which is not what DataFrame.apply is meant for.
        for _, row in percent.iterrows():
            components, n_components, top_d = self._summarize_row(row)
            main_ele_dict.append(components)
            main_ele_len.append(n_components)
            topk_dict.append(top_d)

        vital_ele = pd.DataFrame({
            'components': pd.Series(main_ele_dict, dtype=object),
            'number_of_components': pd.Series(main_ele_len, dtype='int64'),
            'top_k': pd.Series(topk_dict, dtype=object),
        })
        final_df = pd.concat([normed_df, vital_ele], axis=1)

        file_name = self.get_mass_filename()
        final_df.to_csv(file_name, index=False)
        print("%s have finished." % file_name)
        
        
# file_name = self.get_mass_filename()

### 执行

In [13]:
# Day 23: process every sample of this day against the day's unit intensities.
top_k = 10           # number of highest-fraction elements recorded per particle
percent_thr = 0.1    # fraction an element must exceed to be listed as a component
drop = ['[42Ca]+ (cts)', '[44Ca]+ (cts)', '[56Fe]+ (cts)']   # columns discarded before processing
base = '23_base.csv'
file_list = [
    'S15_Poisson_particle.csv',
    'S16_Poisson_particle.csv',
    'S25_Poisson_particle.csv',
    'S27_Poisson_particle.csv',
    'S35_Poisson_particle.csv',
]

# One output CSV is written per sample file.
for file in file_list:
    P = PreProcess(
        unit_intensity=base,
        ptc_intensity=file,
        drop_ptc=drop,
        percent_thr=percent_thr,
        top_k=top_k,
    )
    P.select_elements()

S15_mass_final.csv have finished.
S16_mass_final.csv have finished.
S25_mass_final.csv have finished.
S27_mass_final.csv have finished.
S35_mass_final.csv have finished.


In [14]:
# Day 24: process every sample of this day against the day's unit intensities.
top_k = 10           # number of highest-fraction elements recorded per particle
percent_thr = 0.1    # fraction an element must exceed to be listed as a component
drop = ['[42Ca]+ (cts)', '[44Ca]+ (cts)', '[56Fe]+ (cts)']   # columns discarded before processing
base = '24_base.csv'
file_list = [
    'S43_Poisson_particle.csv',
    'S44_Poisson_particle.csv',
    'S45_Poisson_particle.csv',
    'S46_Poisson_particle.csv',
    'S47_Poisson_particle.csv',
    'S48_Poisson_particle.csv',
    'S52_Poisson_particle.csv',
    'S57_Poisson_particle.csv',
    'S58_Poisson_particle.csv',
]

# One output CSV is written per sample file.
for file in file_list:
    P = PreProcess(
        unit_intensity=base,
        ptc_intensity=file,
        drop_ptc=drop,
        percent_thr=percent_thr,
        top_k=top_k,
    )
    P.select_elements()

S43_mass_final.csv have finished.
S44_mass_final.csv have finished.
S45_mass_final.csv have finished.
S46_mass_final.csv have finished.
S47_mass_final.csv have finished.
S48_mass_final.csv have finished.
S52_mass_final.csv have finished.
S57_mass_final.csv have finished.
S58_mass_final.csv have finished.


In [15]:
# Day 25: process every sample of this day against the day's unit intensities.
top_k = 10           # number of highest-fraction elements recorded per particle
percent_thr = 0.1    # fraction an element must exceed to be listed as a component
drop = ['[42Ca]+ (cts)', '[44Ca]+ (cts)', '[56Fe]+ (cts)']   # columns discarded before processing
base = '25_base.csv'
file_list = [
    'S73_Poisson_particle.csv',
    'S75_Poisson_particle.csv',
    'S79_Poisson_particle.csv',
    'S83_Poisson_particle.csv',
    'S84_Poisson_particle.csv',
]

# One output CSV is written per sample file.
for file in file_list:
    P = PreProcess(
        unit_intensity=base,
        ptc_intensity=file,
        drop_ptc=drop,
        percent_thr=percent_thr,
        top_k=top_k,
    )
    P.select_elements()

S73_mass_final.csv have finished.
S75_mass_final.csv have finished.
S79_mass_final.csv have finished.
S83_mass_final.csv have finished.
S84_mass_final.csv have finished.


In [16]:
# Day 26: process the single sample of this day against the day's unit intensities.
top_k = 10           # number of highest-fraction elements recorded per particle
percent_thr = 0.1    # fraction an element must exceed to be listed as a component
drop = ['[42Ca]+ (cts)', '[44Ca]+ (cts)', '[56Fe]+ (cts)']   # columns discarded before processing
base = '26_base.csv'
file_list = [
    'S115_Poisson_particle.csv',
]

# One output CSV is written per sample file.
for file in file_list:
    P = PreProcess(
        unit_intensity=base,
        ptc_intensity=file,
        drop_ptc=drop,
        percent_thr=percent_thr,
        top_k=top_k,
    )
    P.select_elements()

S115_mass_final.csv have finished.


In [17]:
# Day 27: process every sample of this day against the day's unit intensities.
top_k = 10           # number of highest-fraction elements recorded per particle
percent_thr = 0.1    # fraction an element must exceed to be listed as a component
drop = ['[42Ca]+ (cts)', '[44Ca]+ (cts)', '[56Fe]+ (cts)']   # columns discarded before processing
base = '27_base.csv'
file_list = [
    'S143_Poisson_particle.csv',
    'S145_Poisson_particle.csv',
    'S156_Poisson_particle.csv',
    'S157_Poisson_particle.csv',
]

# One output CSV is written per sample file.
for file in file_list:
    P = PreProcess(
        unit_intensity=base,
        ptc_intensity=file,
        drop_ptc=drop,
        percent_thr=percent_thr,
        top_k=top_k,
    )
    P.select_elements()

S143_mass_final.csv have finished.
S145_mass_final.csv have finished.
S156_mass_final.csv have finished.
S157_mass_final.csv have finished.
