In [5]:
import pandas as pd
import numpy as np
import os
import warnings

warnings.filterwarnings('ignore')

In [6]:
class DataLoader:
    """Load a raw intensity CSV, clean it, and compute per-column summary statistics."""

    def __init__(self, ori_file_name):
        '''
        Read the raw data and keep only the intensity columns.

        ori_file_name: file name (or path) of the raw data CSV.
        '''
        self.ori_file_name = ori_file_name
        # Drop the first two columns (index and timestamp, per the original author's note).
        self.ori_df = pd.read_csv(ori_file_name).iloc[:, 2:]
        self.col_name = self.ori_df.columns.tolist()   # full particle (column) names

    def _output_name(self, prefix):
        '''
        Build an output file name from `prefix` and the base name of the raw file.

        Only the base name is used so that a raw file given with a directory part
        (e.g. "data/5X.csv") still yields a valid name ("cleaned_5X.csv") in the
        current working directory instead of the invalid "cleaned_data/5X.csv".
        '''
        return prefix + os.path.basename(self.ori_file_name)

    def get_ptc_name(self):
        '''
        Return the particle names with the surrounding symbols stripped, e.g. 46Ti.

        The first character and the last eight characters of every column name are removed.
        '''
        return [name[1:-8] for name in self.col_name]

    def get_cleaned_data(self):
        '''
        Clean the data: every value that is not greater than 0 becomes NaN.

        The cleaned frame is saved as "cleaned_<raw file name>" and returned.
        '''
        cleaned_df = self.ori_df.copy()
        cleaned_df[cleaned_df <= 0] = np.nan   # non-positive readings are treated as missing
        file_name = self._output_name('cleaned_')
        cleaned_df.to_csv(file_name, index=None)
        print('%s have been saved.' % file_name)
        return cleaned_df

    def get_basic_metirct(self, cleaned_df):
        '''
        Build, save and return the summary-statistics frame.

        Rows (used as the index, named "metric"), in this order:
        ptc_name, min_ints, max_ints, count, sum_ints, avg_ints, std_ints.
        Columns are the full particle names.

        cleaned_df: the cleaned frame returned by get_cleaned_data.
        '''
        def as_row(values):
            # One-row frame aligned with the particle columns.
            return pd.DataFrame(np.array(values).reshape(1, -1), columns=self.col_name)

        short_name = as_row(self.get_ptc_name())                   # stripped particle names
        count = cleaned_df[cleaned_df > 0].count().to_frame().T    # number of positive readings
        min_ints = cleaned_df.min().to_frame().T                   # minimum intensity
        max_ints = cleaned_df.max().to_frame().T                   # maximum intensity
        sum_ints = as_row(np.nansum(cleaned_df, axis=0))           # total intensity, NaN ignored
        avg_ints = as_row(np.nanmean(cleaned_df, axis=0))          # mean intensity, NaN ignored
        std_ints = as_row(np.nanstd(cleaned_df, axis=0))           # population std, NaN ignored

        basic_metric = pd.concat(
            [short_name, min_ints, max_ints, count, sum_ints, avg_ints, std_ints], axis=0)
        basic_metric.insert(
            0, 'metric',
            value=['ptc_name', 'min_ints', 'max_ints', 'count', 'sum_ints', 'avg_ints', 'std_ints'])
        basic_metric.set_index(['metric'], inplace=True)   # the metric column becomes the index

        file_name = self._output_name('basic_metric_')
        basic_metric.to_csv(file_name)
        print('%s have been saved.' % file_name)
        return basic_metric
    

In [7]:
class IterMethod:
    """Iteratively split the cleaned data at a mean + 3*std threshold until nothing more is removed."""

    def __init__(self, data_df, metric_df):
        '''
        data_df: the cleaned data frame
        metric_df: the summary-statistics frame of the cleaned data
        '''
        self.data_df = data_df
        self.metric_df = metric_df
        self.col_name = self.data_df.columns.tolist()

        self.cur_df = self.data_df  # frame processed by the next iteration

        self.iter_cnt = 0  # number of iterations done so far
        self.all_thr = list()  # threshold obtained in every iteration

    def get_avg(self):
        '''
        Return the per-column mean intensity of the current frame (NaN ignored) as an ndarray.
        '''
        return np.nanmean(self.cur_df, axis=0)

    def get_std(self):
        '''
        Return the per-column intensity standard deviation of the current frame (NaN ignored) as an ndarray.
        '''
        return np.nanstd(self.cur_df, axis=0)

    def get_thr(self):
        '''
        Compute the threshold of the current frame and record it.

        The threshold is the mean over all columns of (3 * std + avg).
        Side effects: appends to self.all_thr and increments self.iter_cnt.
        '''
        thr = np.nanmean(3 * self.get_std() + self.get_avg())
        self.all_thr.append(thr)
        self.iter_cnt += 1
        return thr

    def _iter_file(self, sub_dir, cnt):
        '''
        Make sure <cwd>/<sub_dir> exists and return the path <cwd>/<sub_dir>/<cnt>.csv.
        '''
        dir_path = os.path.join(os.getcwd(), sub_dir)
        # exist_ok avoids the check-then-create race of a separate os.path.exists test.
        os.makedirs(dir_path, exist_ok=True)
        return os.path.join(dir_path, str(cnt) + ".csv")

    def gt_file(self, cnt):
        '''
        Create the "gt" folder if needed and return the csv path for iteration `cnt`, e.g. gt/1.csv
        '''
        return self._iter_file("gt", cnt)

    def lt_file(self, cnt):
        '''
        Create the "lt" folder if needed and return the csv path for iteration `cnt`, e.g. lt/1.csv
        '''
        return self._iter_file("lt", cnt)

    def update_df(self, thr, cnt):
        '''
        Split the current frame at `thr` and save both parts: gt/<cnt>.csv holds the
        values >= thr, lt/<cnt>.csv the values < thr (other cells are NaN).
        Returns the below-threshold frame.
        '''
        gt_df = self.cur_df[self.cur_df >= thr]
        gt_df.to_csv(self.gt_file(cnt), index=None)
        lt_df = self.cur_df[self.cur_df < thr]
        lt_df.to_csv(self.lt_file(cnt), index=None)
        print("The %s th iteration have been finished." % cnt)
        return lt_df

    def iterator(self):
        '''
        Run the iteration: threshold, split, and continue with the below-threshold
        part until an iteration leaves the frame unchanged.
        '''
        while True:
            before = self.cur_df
            thr = self.get_thr()  # self.iter_cnt has already been incremented here
            after = self.update_df(thr, self.iter_cnt)
            self.cur_df = after
            if before.equals(after):
                break

    def get_final_result(self, resolve_data):
        '''
        Input: csv path of the final dissolved-state result, i.e. lt/<last iter_cnt>.csv
        Result: the final particle-state data, saved as gt/<last iter_cnt + 1>.csv
        '''
        resolve_df = pd.read_csv(resolve_data)
        # pd.isnull(resolve_df) is True where the dissolved frame is empty; using it as a
        # mask on the cleaned data keeps exactly the values that were removed as particles.
        particle_df = self.data_df[pd.isnull(resolve_df)]
        particle_df.to_csv(self.gt_file(self.iter_cnt + 1), index=None)
        

In [8]:
class PoissonMethod:
    """Separate particle-state from dissolved-state readings with a per-column Poisson threshold."""

    # Largest k (exclusive) tried when accumulating the Poisson probabilities.
    _MAX_K = 100

    def __init__(self, data_df, metric_df, credible):
        '''
        data_df: the cleaned data frame
        metric_df: the summary-statistics frame of the cleaned data
        credible: confidence level as a fraction, e.g. 0.997
        '''
        self.data_df = data_df
        self.metric_df = metric_df
        self.col_name = self.data_df.columns.tolist()
        self.m = self.metric_df.iloc[5]  # row 5 = un-normalised mean intensity; scaled into lambda
        self.credible = credible

    def normal_lambda(self):
        '''
        Scale every mean intensity into a lambda usable by the Poisson computation.

        Scale per mean value: (0, 1] -> 50, (1, 2] -> 25, (2, 3] -> 15, (3, 35] -> 1,
        anything else -> 0.33. lambda is round(mean * scale).

        Returns (and saves as "poisson_normalize_lambda.csv") a frame whose rows are
        avg_ints, lambda and scale, one column per particle.
        '''
        lamb_li = []  # normalised lambda of every particle
        scale_li = []  # scale factor of every particle
        for val in self.m:
            if 0 < val <= 1:
                scale = 50.0
            elif 1 < val <= 2:
                scale = 25.0
            elif 2 < val <= 3:
                scale = 15.0
            elif 3 < val <= 35:
                scale = 1.0
            else:
                scale = 0.33
            lamb_li.append(round(val * scale))
            scale_li.append(scale)

        lamb_li = np.array(lamb_li).reshape(1, -1)
        scale_li = np.array(scale_li).reshape(1, -1)
        res_arr = np.concatenate((lamb_li, scale_li), axis=0)
        res_df = pd.DataFrame(res_arr, columns=self.col_name)
        res_df = pd.concat([self.m.to_frame().T, res_df])
        res_df.insert(0, 'metric', value=['avg_ints', 'lambda', 'scale'])
        res_df.set_index(['metric'], inplace=True)  # the metric column becomes the index
        file_name = "poisson_normalize_lambda.csv"
        res_df.to_csv(file_name)
        return res_df

    def poisson(self, k, lamb):
        '''
        Poisson probability of exactly k events: lamb**k / k! * exp(-lamb).

        k: non-negative integer
        lamb: the normalised lambda
        The caller accumulates these probabilities to find the threshold.
        '''
        k_factorial = 1  # k!
        # range must include k itself; stopping at k - 1 would yield (k - 1)! instead.
        for i in range(2, k + 1):
            k_factorial *= i
        lamb = float(lamb)
        return np.power(lamb, k) / k_factorial * np.exp(-lamb)

    def get_ints_thr(self):
        '''
        Compute, save ("intensity_threshold.csv") and return the one-row frame of
        intensity thresholds, one column per particle.

        For every particle the Poisson probabilities are accumulated from k = 0 and
        the threshold is k / scale for the first k whose cumulative probability
        reaches self.credible. If that is not reached for k < 100 the threshold
        stays 0.0.
        '''
        lambda_df = self.normal_lambda()  # computed (and written to disk) only once
        lamb = lambda_df.iloc[1].values.astype('int')
        scale = lambda_df.iloc[2].values

        ints_val = []
        for lamb_i, scale_i in zip(lamb, scale):
            thr = 0.0
            prob = 0.0
            # Start at k = 0: the cumulative distribution includes P(X = 0).
            for k in range(self._MAX_K):
                prob += self.poisson(k, lamb_i)
                if prob >= self.credible:
                    thr = k / scale_i
                    break
            ints_val.append(thr)

        ints_val = pd.DataFrame(np.array(ints_val).reshape(1, -1), columns=self.col_name)
        file_name = "intensity_threshold.csv"
        ints_val.to_csv(file_name, index=None)
        return ints_val

    def classifier(self):
        '''
        Split the data by the per-particle intensity thresholds and save both parts:
        values >= threshold go to "Poisson_particle.csv", all remaining values go to
        "Poisson_resolve.csv" (cells not belonging to a part are NaN).
        '''
        ints_thr_li = self.get_ints_thr().values[0]

        # Mask column by column, then concatenate once instead of growing a frame in the loop.
        particle_cols = []
        for idx, thr in enumerate(ints_thr_li):
            single_ptc_df = self.data_df.iloc[:, idx].to_frame()
            particle_cols.append(single_ptc_df[single_ptc_df >= thr])
        particle = pd.concat(particle_cols, axis=1) if particle_cols else pd.DataFrame()

        # Everything that is not a particle reading is dissolved-state data.
        resolve = self.data_df[pd.isnull(particle)]
        particle.to_csv("Poisson_particle.csv", index=None)
        print("Particle have been saved.")
        resolve.to_csv("Poisson_resolve.csv", index=None)
        print("Resolve have been saved.")
        

In [9]:
class PostProcess:
    """Background subtraction, column selection and concentration/TE computation on classified data."""

    def __init__(self, particle_csv, resolve_csv):
        '''
        particle_csv: csv file name of the particle-state data
        resolve_csv: csv file name of the dissolved-state data
        '''
        self.ptc_df = pd.read_csv(particle_csv)
        self.resl_df = pd.read_csv(resolve_csv)
        self.df_len = len(self.resl_df)
        self.col_name = self.ptc_df.columns

    def get_background(self):
        '''
        Return the background frame: the per-column mean (NaN ignored) of the
        dissolved-state data, repeated once per dissolved-state row and labelled
        with the particle frame's columns. Nothing is written to disk here.
        '''
        BG = pd.DataFrame([np.nanmean(self.resl_df, axis=0)] * self.df_len, columns=self.col_name)
        return BG

    def substract_background(self):
        '''
        Subtract the background from the particle-state data and save the result
        as "substract_bg_particle.csv".
        '''
        BG = self.get_background()
        file_name = 'substract_bg_particle.csv'
        substract_bg_particle = self.ptc_df - BG
        substract_bg_particle.to_csv(file_name, index=None)
        print("%s have been saved." % file_name)

    def select_columns(self, final_particle_csv, target_particle):
        '''
        Pick the columns of one element out of the background-subtracted particle
        data and save them as "<target_particle>_in_<final_particle_csv>".

        final_particle_csv: csv file name of the background-subtracted particle data
        target_particle: element to select, e.g. 'Au'
        '''
        ptc_df = pd.read_csv(final_particle_csv)
        # The element name is taken as characters [-10:-8] of each header, so only
        # two-character element symbols can match.
        select_col_li = [
            full_name for full_name in ptc_df.columns.tolist()
            if full_name[-10:-8] == target_particle
        ]

        selected_ptc_df = pd.DataFrame(ptc_df, columns=select_col_li)
        file_name = target_particle + '_in_' + final_particle_csv
        selected_ptc_df.to_csv(file_name, index=None)
        print("%s particle have been selected." % target_particle)

    def get_particle_number_concentration(self, selected_particle_csv, TE, speed, CPS):
        '''
        OLD particle-number-concentration method.

        Computes 1000 / (2.5 * TE * speed * CPS) * (sum of intensities) per column
        and saves it as "<first two characters of the file name>_particle_number_concentration.csv".

        selected_particle_csv: csv file name of the selected, background-subtracted element
        TE: computation parameter, supplied by the user
        speed: flow rate, supplied by the user
        CPS: unit CPS of the target particle, supplied by the user
        '''
        ele_name = selected_particle_csv[0:2]
        selected_ptc_df = pd.read_csv(selected_particle_csv)
        ints_sum = pd.DataFrame(np.nansum(selected_ptc_df, axis=0).reshape(1, -1), columns=selected_ptc_df.columns)
        coef = 1000 / (2.5 * TE * speed * CPS)  # factor applied to the intensity sums
        ptc_num_concentration = coef * ints_sum
        file_name = ele_name + "_particle_number_concentration.csv"
        ptc_num_concentration.to_csv(file_name, index=None)
        print("The particle number concentration of %s have been computed." % ele_name)

    def get_TE(self, selected_particle_csv):
        '''
        Compute TE from the background-subtracted target-element data of the Std
        file and save it as "TE.csv" (one row, one column per selected column).

        TE = (number of non-NaN readings) / (2.5 * 0.02 * 1e6) per column.

        selected_particle_csv: csv file name of the selected, background-subtracted element
        '''
        std_df = pd.read_csv(selected_particle_csv)  # frame of the std file
        # to_frame().T gives one row with the original column labels for any number
        # of columns; DataFrame(series, columns=...) only worked for exactly one column.
        TE = (std_df.count() / (2.5 * 0.02 * 1e6)).to_frame().T
        TE.to_csv("TE.csv", index=None)
        print("TE have been computed.")

    def get_particle_number_con_new(self, selected_particle_csv, TE, speed):
        '''
        NEW particle-number-concentration method.

        Computes (number of non-NaN readings) / (2.5 * TE * speed) per column and
        saves it as "<first two characters of the file name>_particle_number_concentration.csv".

        selected_particle_csv: csv file name of the selected, background-subtracted element
        TE: computation parameter, supplied by the user
        speed: flow rate, supplied by the user
        '''
        ele_name = selected_particle_csv[0:2]
        selected_ptc_df = pd.read_csv(selected_particle_csv)
        ptc_cnt = selected_ptc_df.count()
        coef = 1 / (2.5 * TE * speed)  # factor applied to the particle counts
        # One row, one column per selected column (works for any number of columns).
        ptc_num_con = (coef * ptc_cnt).to_frame().T
        file_name = ele_name + "_particle_number_concentration.csv"
        ptc_num_con.to_csv(file_name, index=None)
        print("The particle number concentration of %s have been computed." % ele_name)


In [12]:
# Full pipeline of the Poisson method
def main():
    """
    Run cleaning, Poisson classification and post-processing for one raw file.

    Reads the module-level settings origin_csv, credible, TE_flag, TE and speed.
    When TE_flag is true the run ends by computing TE; otherwise it ends by
    computing the Au particle number concentration from TE and speed.
    """
    # Step 1: load the raw data, clean it and compute the summary statistics.
    data_loader = DataLoader(origin_csv)
    cleaned_data = data_loader.get_cleaned_data()
    metric_data = data_loader.get_basic_metirct(cleaned_data)

    # Step 2: Poisson classification. classifier() computes and saves the lambda
    # table and the intensity thresholds itself, so they are not requested separately.
    poissonmethod = PoissonMethod(cleaned_data, metric_data, credible)
    poissonmethod.classifier()

    # Step 3: post-processing on the files written by the classifier.
    p_process = PostProcess('Poisson_particle.csv', 'Poisson_resolve.csv')
    p_process.substract_background()                               # subtract the background
    p_process.select_columns('substract_bg_particle.csv', 'Au')    # keep the Au columns

    selected_csv = 'Au_in_substract_bg_particle.csv'
    if TE_flag:
        p_process.get_TE(selected_csv)                                 # compute TE
    else:
        p_process.get_particle_number_con_new(selected_csv, TE, speed)  # Au particle number concentration


In [13]:
file_li = ['64nmAu-TE.csv', '5X.csv', '10X.csv', '100X .csv', '1000X.csv', '50nm.csv', '50nm-P.csv']    # list of raw data files; NOTE(review): '100X .csv' has a space before the extension, unlike the list in the later cell — confirm against the file on disk

In [133]:
# Hyperparameters (read as module-level names by main())
origin_csv =  file_li[1]    # raw data csv file
credible = 1e-7             # confidence level of the Poisson distribution; NOTE(review): PoissonMethod's docstring gives 0.997 as an example — confirm 1e-7 is intended

TE_flag = False             # whether to compute TE

speed = 0.02                # flow rate
TE = 0.67544                # TE



if __name__ == '__main__':
    main()

cleaned_5X.csv have been saved.
basic_metric_5X.csv have been saved.
Particle have been saved.
Resolve have been saved.
substract_bg_particle.csv have been saved.
Au particle have been selected.
The particle number concentration of Au have been computed.


In [48]:
# Pipeline of the traditional (iterative) method
file_li = ['64nmAu-TE.csv', '5X.csv', '10X.csv', '100X.csv', '1000X.csv', '50nm.csv', '50nm-P.csv']    # list of raw data files

In [61]:
# Hyperparameters
origin_csv =  file_li[6]    # raw data csv file
print(origin_csv)

TE_flag = False             # whether to compute TE

speed = 0.02                # flow rate
TE = 0.36726                # TE

50nm-P.csv


In [62]:
# Run the pipeline
# Step 1: data cleaning
data_loader = DataLoader(origin_csv)                        # instantiate
cleaned_data = data_loader.get_cleaned_data()               # get the cleaned data
metric_data = data_loader.get_basic_metirct(cleaned_data)   # get the summary statistics

# Step 2: iterative method
itermethod = IterMethod(cleaned_data, metric_data)             # instantiate
itermethod.iterator()                                          # run the iteration

cleaned_50nm-P.csv have been saved.
basic_metric_50nm-P.csv have been saved.
The 1 th iteration have been finished.
The 2 th iteration have been finished.
The 3 th iteration have been finished.
The 4 th iteration have been finished.
The 5 th iteration have been finished.
The 6 th iteration have been finished.
The 7 th iteration have been finished.
The 8 th iteration have been finished.
The 9 th iteration have been finished.
The 10 th iteration have been finished.
The 11 th iteration have been finished.
The 12 th iteration have been finished.
The 13 th iteration have been finished.


In [63]:
itermethod.get_final_result('lt/13.csv')                       # derive the final particle-state data from the last dissolved-state file (13 iterations were run above)

In [64]:
# Step 3: post-processing
# NOTE(review): IterMethod writes its results under gt/ and lt/; the bare names below
# presumably refer to copies of gt/14.csv and lt/13.csv in the current directory — confirm.
p_process = PostProcess('14.csv', '13.csv')                                               # instantiate
p_process.substract_background()                                                          # subtract the background from the particle-state data
p_process.select_columns('substract_bg_particle.csv', 'Au')                               # select Au from the background-subtracted particle data
# p_process.get_TE('Au_in_substract_bg_particle.csv')                                       # compute TE
p_process.get_particle_number_con_new('Au_in_substract_bg_particle.csv', TE, speed)       # compute the Au particle number concentration

substract_bg_particle.csv have been saved.
Au particle have been selected.
The particle number concentration of Au have been computed.
