In [4]:
import pandas as pd
import numpy as np
import os
import warnings

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth, fpmax

warnings.filterwarnings('ignore')

# 类的定义

In [8]:
class DataLoader:
    """Load a raw intensity CSV, clean it and compute per-channel statistics.

    The two leading CSV columns are discarded (the original notebook describes
    them as the index and timestamp columns); every remaining column is treated
    as one particle/isotope intensity channel whose header looks like
    ``[46Ti]+ (cts)``.
    """

    def __init__(self, ori_file_name):
        """Read the raw data file.

        ori_file_name: path of the raw CSV. Files derived from it are written
            into the same directory, with a prefix added to the file name.
        """
        self.ori_file_name = ori_file_name
        # Keep only the intensity channels (everything after the first two columns).
        self.ori_df = pd.read_csv(ori_file_name).iloc[:, 2:]
        self.col_name = self.ori_df.columns.tolist()  # full column headers

    def _derived_path(self, prefix):
        """Return the input path with ``prefix`` prepended to its file name.

        Prefixing the whole path (``'cleaned_' + 'data/x.csv'``) would point
        into a different, usually non-existent, directory. For a bare file
        name the result is identical to plain string concatenation.
        """
        folder, base_name = os.path.split(self.ori_file_name)
        return os.path.join(folder, prefix + base_name)

    def get_ptc_name(self):
        """Return the short particle names, e.g. ``46Ti``.

        The short name is the header without its first character and without
        its last eight characters (``[46Ti]+ (cts)`` -> ``46Ti``).
        """
        return [full_name[1:-8] for full_name in self.col_name]

    def get_cleaned_data(self):
        """Replace every non-positive value with NaN and save the result.

        The cleaned frame is written to ``cleaned_<input file name>`` and also
        returned. The frame loaded in ``__init__`` is left untouched.
        """
        # ``where`` keeps values that satisfy the condition and puts NaN elsewhere.
        cleaned_df = self.ori_df.where(self.ori_df > 0)
        file_name = self._derived_path('cleaned_')
        cleaned_df.to_csv(file_name, index=False)
        print('%s have been saved.' % file_name)
        return cleaned_df

    def get_basic_metirct(self, cleaned_df):
        """Compute and save one row of statistics per metric for every channel.

        cleaned_df: frame returned by ``get_cleaned_data``.

        Returns a frame indexed by ``metric`` whose rows are, in this order:
        ``ptc_name`` (short name), ``min_ints``, ``max_ints``, ``count``
        (number of positive readings), ``sum_ints``, ``avg_ints`` and
        ``std_ints`` (population standard deviation, NaN-aware). The row order
        matters: ``PoissonMethod`` reads the mean by position (row 5).
        The frame is also written to ``basic_metric_<input file name>``.
        """

        def as_row(values):
            # One-row frame labelled with the channel headers.
            return pd.DataFrame(np.asarray(values).reshape(1, -1), columns=self.col_name)

        stats = [
            ('ptc_name', as_row(self.get_ptc_name())),
            ('min_ints', cleaned_df.min().to_frame().T),
            ('max_ints', cleaned_df.max().to_frame().T),
            ('count', cleaned_df[cleaned_df > 0].count().to_frame().T),
            ('sum_ints', as_row(np.nansum(cleaned_df, axis=0))),
            ('avg_ints', as_row(np.nanmean(cleaned_df, axis=0))),
            ('std_ints', as_row(np.nanstd(cleaned_df, axis=0))),
        ]
        basic_metric = pd.concat([frame for _, frame in stats], axis=0)
        basic_metric.index = pd.Index([label for label, _ in stats], name='metric')

        file_name = self._derived_path('basic_metric_')
        basic_metric.to_csv(file_name)
        print('%s have been saved.' % file_name)
        return basic_metric

In [9]:
class PoissonMethod:
    """Poisson-based split of intensities into particle and dissolved signals.

    For every channel the mean intensity is rescaled to an integer lambda, the
    Poisson cumulative probability is accumulated until it reaches the
    requested confidence, and the count at which that happens (scaled back to
    intensity units) becomes the threshold: readings at or above it are
    "particle", the rest are "resolve" (dissolved).
    """

    # (upper bound of the mean intensity, scale factor). A mean ``m`` with
    # previous_bound < m <= bound is multiplied by the factor to obtain lambda.
    _SCALE_TABLE = (
        (1, 80.0), (2, 40.0), (3, 30.0), (5, 16.0), (10, 8.0), (20, 4.0),
        (40, 2.0), (60, 1.5), (80, 1.0), (100, 0.8), (200, 0.4), (300, 0.3),
        (400, 0.2), (500, 0.16), (800, 0.1),
    )
    # Factor used when the mean is <= 0, > 800 or NaN.
    _DEFAULT_SCALE = 0.04

    def __init__(self, data_df, metric_df, credible):
        """
        data_df: cleaned intensity frame (one column per channel).
        metric_df: statistics frame for ``data_df``; row 5 (by position) must
            hold the mean intensity of every channel.
        credible: confidence level as a fraction, e.g. 0.997.
        """
        self.data_df = data_df
        self.metric_df = metric_df
        self.col_name = self.data_df.columns.tolist()
        self.m = self.metric_df.iloc[5]  # un-normalised mean intensity per channel
        self.credible = credible

    def _scale_for(self, mean_value):
        """Return the scale factor for one mean intensity (see ``_SCALE_TABLE``)."""
        if mean_value > 0:  # False for NaN, which therefore gets the default
            for upper_bound, scale in self._SCALE_TABLE:
                if mean_value <= upper_bound:
                    return scale
        return self._DEFAULT_SCALE

    def normal_lambda(self):
        """Rescale the mean intensities into integer Poisson lambdas.

        Returns (and writes to ``poisson_normalize_lambda.csv``) a frame
        indexed by ``metric`` with the rows ``avg_ints``, ``lambda`` and
        ``scale``, one column per channel.
        """
        lamb_li = []   # integer lambda per channel
        scale_li = []  # scale factor per channel
        for val in self.m:
            scale = self._scale_for(val)
            scaled = val * scale
            # A channel without any positive reading has a NaN mean, and
            # round(NaN) raises; treat such a channel as lambda = 0.
            lamb_li.append(0 if pd.isnull(scaled) else round(scaled))
            scale_li.append(scale)

        res_df = pd.DataFrame([lamb_li, scale_li], columns=self.col_name)
        res_df = pd.concat([self.m.to_frame().T, res_df])
        res_df.index = pd.Index(['avg_ints', 'lambda', 'scale'], name='metric')
        file_name = "poisson_normalize_lambda.csv"
        res_df.to_csv(file_name)
        return res_df

    def poisson(self, k, lamb):
        """Return the Poisson probability ``P(X = k) = lamb**k * exp(-lamb) / k!``.

        k: non-negative integer count.
        lamb: the (normalised) lambda.

        The value is evaluated in log space so that large ``k`` / ``lamb`` do
        not overflow. The previous implementation multiplied over
        ``range(1, k)`` and therefore divided by (k-1)! instead of k!.
        """
        lamb = float(lamb)
        if lamb <= 0:
            # Degenerate distribution: all probability mass sits on k = 0.
            return 1.0 if k == 0 else 0.0
        log_k_factorial = np.log(np.arange(1, k + 1)).sum()  # empty sum -> 0 for k = 0
        return float(np.exp(k * np.log(lamb) - lamb - log_k_factorial))

    def _threshold_count(self, lamb):
        """Return the smallest k with ``P(X <= k) >= self.credible``.

        The search covers k = 0 .. lamb + 10*sqrt(lamb) + 100, far into the
        upper tail. If the confidence is still not reached (only possible for
        a confidence of essentially 1), 0 is returned, matching the historical
        "no threshold found" value.
        """
        k_max = int(lamb + 10 * np.sqrt(lamb)) + 100
        prob = 0.0
        for k in range(k_max + 1):  # the k = 0 term belongs to the cumulative sum
            prob += self.poisson(k, lamb)
            if prob >= self.credible:
                return k
        return 0

    def get_ints_thr(self):
        """Compute the intensity threshold of every channel.

        Returns (and writes to ``intensity_threshold.csv``) a one-row frame
        with one column per channel; each value is the threshold count divided
        by the channel's scale factor.
        """
        params = self.normal_lambda()  # computed (and saved) once
        lamb = params.iloc[1].values.astype('int')
        scale = params.iloc[2].values
        ints_val = [
            self._threshold_count(lamb[i]) / scale[i]
            for i in range(len(self.col_name))
        ]
        ints_val = pd.DataFrame(np.array(ints_val).reshape(1, -1), columns=self.col_name)
        file_name = "intensity_threshold.csv"
        ints_val.to_csv(file_name, index=False)
        return ints_val

    def classifier(self):
        """Split the data into particle and dissolved readings and save both.

        Readings ``>=`` the channel threshold are written to
        ``Poisson_particle.csv``; every cell that is empty in the particle
        frame keeps its original value in ``Poisson_resolve.csv``. Both files
        have the same shape as ``data_df``, with NaN where a reading belongs
        to the other class.
        """
        ints_thr_li = self.get_ints_thr().values[0]

        # Compare every column against its own threshold in one vectorised step.
        is_particle = self.data_df.ge(ints_thr_li, axis='columns')
        particle = self.data_df.where(is_particle)
        resolve = self.data_df.where(particle.isnull())

        particle.to_csv("Poisson_particle.csv", index=False)
        print("Particle have been saved.")
        resolve.to_csv("Poisson_resolve.csv", index=False)
        print("Resolve have been saved.")

In [10]:
class PostProcess:
    """Background subtraction and particle-number-concentration helpers."""

    def __init__(self, particle_csv, resolve_csv):
        """
        particle_csv: CSV with the particle-state readings.
        resolve_csv: CSV with the dissolved-state readings.
        """
        self.ptc_df = pd.read_csv(particle_csv)
        self.resl_df = pd.read_csv(resolve_csv)
        self.df_len = len(self.resl_df)
        self.col_name = self.ptc_df.columns

    def get_background(self):
        """Return the per-channel background as a frame.

        The background of a channel is the NaN-aware mean of its dissolved
        readings. The mean row is repeated once per row of the dissolved frame
        so that it can be subtracted element-wise. Nothing is written to disk.
        """
        channel_means = np.nanmean(self.resl_df, axis=0)
        BG = pd.DataFrame([channel_means] * self.df_len, columns=self.col_name)
        return BG

    def substract_background(self):
        """Subtract the background from the particle readings and save the result.

        The result is written to ``substract_bg_particle.csv``.
        """
        BG = self.get_background()
        file_name = 'substract_bg_particle.csv'
        substract_bg_particle = self.ptc_df - BG
        substract_bg_particle.to_csv(file_name, index=False)
        print("%s have been saved." % file_name)

    def select_columns(self, final_particle_csv, target_particle):
        """Keep only the requested channels and save them.

        final_particle_csv: CSV of background-subtracted particle readings.
        target_particle: short names to keep, e.g. ``['27Al', '197Au']``.

        The selected columns are written to ``selected_particles.csv`` in the
        order given by ``target_particle``; names that match no column are
        silently ignored.
        """
        ptc_df = pd.read_csv(final_particle_csv)
        full_names = ptc_df.columns.tolist()
        # Short name = header without its first char and last 8 chars.
        short_names = [name[1:-8] for name in full_names]

        select_col_li = [
            full
            for wanted in target_particle
            for full, short in zip(full_names, short_names)
            if short == wanted
        ]

        selected_ptc_df = pd.DataFrame(ptc_df, columns=select_col_li)
        file_name = 'selected_particles.csv'
        selected_ptc_df.to_csv(file_name, index=False)
        print("Subtracted background target particle have been selected.")

    def get_particle_number_concentration(self, selected_particle_csv, TE, speed, CPS):
        """Legacy particle-number-concentration calculation.

        selected_particle_csv: CSV of background-subtracted target channels.
            Its first two characters are used as the label in the output file
            name and in the log message.
        TE: calculation parameter supplied by the caller.
        speed: flow speed supplied by the caller.
        CPS: unit CPS of the target particle supplied by the caller.

        Every channel's NaN-aware intensity sum is multiplied by
        ``1000 / (2.5 * TE * speed * CPS)`` and the one-row result is written
        to ``<label>_particle_number_concentration.csv``.
        """
        ele_name = selected_particle_csv[0:2]
        selected_ptc_df = pd.read_csv(selected_particle_csv)
        ints_sum = pd.DataFrame(np.nansum(selected_ptc_df, axis=0).reshape(1, -1),
                                columns=selected_ptc_df.columns)
        coef = 1000 / (2.5 * TE * speed * CPS)
        ptc_num_concentration = coef * ints_sum
        file_name = ele_name + "_particle_number_concentration.csv"
        ptc_num_concentration.to_csv(file_name, index=False)
        print("The particle number concentration of %s have been computed." % ele_name)

    def get_TE(self, selected_particle_csv):
        """Compute TE for every channel of a standard file and save it.

        selected_particle_csv: CSV of background-subtracted target channels
            measured on the standard sample.

        TE is the number of non-empty readings of a channel divided by
        ``2.5 * 0.02 * 1e6``; the one-row result (one column per channel) is
        written to ``TE.csv``.
        NOTE(review): 0.02 presumably is the flow speed used elsewhere in the
        notebook and 1e6 a reference concentration - confirm before reuse.
        """
        std_df = pd.read_csv(selected_particle_csv)
        per_channel = std_df.count() / (2.5 * 0.02 * 1e6)
        # ``count()`` is a Series indexed by column name. Turning it into a
        # one-row frame keeps one TE column per channel; passing the Series to
        # ``pd.DataFrame(..., columns=...)`` only worked for a single column.
        TE = per_channel.to_frame().T
        TE.to_csv("TE.csv", index=False)
        print("TE have been computed.")

    def get_particle_number_con_new(self, selected_particle_csv, TE, speed):
        """Current particle-number-concentration calculation.

        selected_particle_csv: CSV of background-subtracted target channels.
        TE: calculation parameter supplied by the caller.
        speed: flow speed supplied by the caller.

        The number of non-empty readings per channel is multiplied by
        ``1 / (2.5 * TE * speed)`` and the one-row result is written to
        ``concentration.csv``.
        """
        selected_ptc_df = pd.read_csv(selected_particle_csv)
        ptc_cnt = selected_ptc_df.count()
        coef = 1 / (2.5 * TE * speed)
        ptc_num_con = (coef * ptc_cnt).to_frame().T
        file_name = "concentration.csv"
        ptc_num_con.to_csv(file_name, index=False)
        print("Particle number concentration have been computed.")

In [20]:
class PreProcess:
    """Data-set preparation: drop channels, convert to mass, list main elements."""

    def __init__(self, unit_intensity, ptc_intensity, drop_ptc, percent_thr, top_k):
        """
        unit_intensity: CSV holding one row of unit intensities, one column
            per channel that is kept.
        ptc_intensity: CSV with the particle-state intensities. The name must
            end with ``_particle.csv`` (13 characters are stripped to build
            the output name).
        drop_ptc: full column headers to discard, e.g. ``['[56Fe]+ (cts)']``.
        percent_thr: mass-fraction threshold above which an element counts as
            a main component, e.g. 0.1.
        top_k: how many of the largest fractions to record per particle
            (non-negative integer), e.g. 10.
        """
        self.base = unit_intensity
        self.target = ptc_intensity
        self.drop_ptc = drop_ptc
        self.percent_thr = percent_thr
        self.top_k = top_k

    @staticmethod
    def _short_name(full_name):
        """Strip the decoration from a header: ``[27Al]+ (cts)`` -> ``27Al``."""
        return full_name[1:-8]

    def read_base(self):
        """Read the unit-intensity CSV and return it as a float frame."""
        return pd.read_csv(self.base).astype("float")

    def drop_particle(self):
        """Read the particle intensities without the unwanted channels.

        Rows that contain only NaN after the columns were dropped are removed.
        """
        ptc_ints = pd.read_csv(self.target)
        ptc_ints = ptc_ints.drop(columns=list(self.drop_ptc))
        return ptc_ints.dropna(axis=0, how='all')

    def get_mass_filename(self):
        """Return the output file name, e.g. ``S15_mass_final.csv``.

        The sample label is the input file name without its last 13
        characters (``_particle.csv``).
        """
        label = self.target[0:-13]
        return label + '_mass_final.csv'

    def get_particle_mass(self):
        """Return the per-channel mass: intensity divided by unit intensity.

        The division is positional - the columns of the unit-intensity file
        must be in the same order as the channels that are kept.
        """
        return self.drop_particle() / self.read_base().values

    def get_mass_sum(self):
        """Return the mass frame with an extra ``total_mass`` column.

        ``total_mass`` is the NaN-aware sum of all channel masses of a row.
        """
        mass_df = self.get_particle_mass()
        total_mass = mass_df.sum(axis=1, skipna=True)
        mass_df.insert(mass_df.shape[1], 'total_mass', total_mass.values)
        return mass_df

    def get_short_ele_name(self):
        """Return the short names (e.g. ``27Al``) of the channels that are kept."""
        return [self._short_name(full) for full in self.drop_particle().columns]

    def normalize_mass(self):
        """Append every channel's mass fraction to the mass frame.

        Returns a frame with the channel masses, ``total_mass`` and one
        fraction column per channel (named by short element name); the index
        is reset to 0..n-1.
        """
        mass_df = self.get_mass_sum()  # computed once; it reads both CSV files
        ptc_mass = mass_df.iloc[:, 0:-1]
        total_mass = mass_df.iloc[:, -1]

        ptc_percent = ptc_mass.div(total_mass, axis=0)
        ptc_percent.columns = [self._short_name(full) for full in ptc_mass.columns]
        normed_ptc = pd.concat([mass_df, ptc_percent], axis=1)
        return normed_ptc.reset_index(drop=True)

    def select_elements(self):
        """Record the main and the top-k elements of every particle and save.

        For every row three columns are added to the normalised mass frame:

        - ``components``: dict ``{element: fraction}`` of the elements whose
          fraction exceeds ``percent_thr``, largest first;
        - ``number_of_components``: size of that dict;
        - ``top_k``: dict of the ``top_k`` largest fractions, largest first.

        The result is written to the name given by ``get_mass_filename``.
        """
        normed_df = self.normalize_mass()
        col_len = normed_df.shape[1]
        # Layout is n masses + total_mass + n fractions, so the fractions are
        # the trailing n columns.
        percent = normed_df.iloc[:, (col_len + 1) // 2:]

        components = []
        number_of_components = []
        top_k = []
        for _, row in percent.iterrows():
            ranked = row.sort_values(ascending=False)  # NaN fractions sort last

            main = ranked[ranked > self.percent_thr]
            components.append({name: float(value) for name, value in main.items()})
            number_of_components.append(len(main))

            # Only the first ``top_k`` entries. The previous loop compared the
            # row length (not the loop index) with ``top_k`` and therefore
            # stored every element.
            leading = ranked.iloc[:self.top_k]
            top_k.append({name: float(value) for name, value in leading.items()})

        vital_ele = pd.DataFrame({
            'components': components,
            'number_of_components': number_of_components,
            'top_k': top_k,
        })
        file_name = self.get_mass_filename()
        final_df = pd.concat([normed_df, vital_ele], axis=1)
        final_df.to_csv(file_name, index=False)
        print("%s have finished." % file_name)

In [42]:
class AprioriProcess:
    """Element co-occurrence mining: list elements, run Apriori, compress rules."""

    def __init__(self, csv_path, min_support, min_confidence, min_lift):
        """
        csv_path: mass CSV produced by ``PreProcess.select_elements``. The
            name must end with ``_mass_final.csv`` (15 characters are stripped
            to build the output name).
        min_support: support threshold for the frequent item sets.
        min_confidence: confidence threshold for the association rules.
        min_lift: rules are kept only when their lift is strictly greater.
        """
        self.path = csv_path
        self.support = min_support
        self.confidence = min_confidence
        self.lift = min_lift
        self.ele_cnt = 0  # number of element columns, set by fillna_percent

    def fillna_percent(self):
        """Return the mass-fraction columns of the CSV with NaN replaced by 0.

        The file is expected to hold n mass columns, ``total_mass``, n
        fraction columns and three summary columns (2n + 4 in total); the n
        fraction columns are returned and n is stored in ``self.ele_cnt``.
        """
        df = pd.read_csv(self.path).fillna(0)
        self.ele_cnt = int((df.shape[1] - 4) / 2)
        return df.iloc[:, self.ele_cnt + 1: 2 * self.ele_cnt + 1]

    def get_exist_eles(self, percent_thr=0.0):
        """List, for every particle, the elements it contains.

        percent_thr: an element counts as present when its fraction is
            strictly greater than this value.

        Returns a list with one list of element names per row.
        """
        df = self.fillna_percent()
        return [
            [name for name, fraction in row.items() if fraction > percent_thr]
            for _, row in df.iterrows()
        ]

    def Apriori(self):
        """Run the complete mlxtend Apriori pipeline.

        Returns a frame with the columns ``antecedents``, ``consequents`` and
        ``support`` for every rule whose confidence reaches
        ``self.confidence`` and whose lift exceeds ``self.lift``.
        """
        exist_eles = self.get_exist_eles()

        # One-hot encode the transactions into the layout mlxtend expects.
        te = TransactionEncoder()
        te_ary = te.fit(exist_eles).transform(exist_eles)
        df = pd.DataFrame(te_ary, columns=te.columns_)

        frequent_itemsets = apriori(df, min_support=self.support, use_colnames=True)

        rules = association_rules(frequent_itemsets, metric='confidence',
                                  min_threshold=self.confidence)
        # Select by column name so the result does not depend on how many
        # metric columns the installed mlxtend version emits.
        return rules.loc[rules['lift'] > self.lift,
                         ['antecedents', 'consequents', 'support']]

    def Apriori_del_repeat(self):
        """Drop consecutive rules built from the same item set and save.

        Two rules are duplicates when the union of antecedents and
        consequents is identical; of a run of consecutive duplicates only the
        first rule is kept. The result is written to
        ``<label>_support_<support>_unique.csv``. When no rule is available
        a file containing only the header is written.
        """
        association_rule_df = self.Apriori()

        kept_rows = []         # positions of the rules that survive
        previous_items = None  # item set of the previous rule
        for pos in range(association_rule_df.shape[0]):
            items = set()
            for itemset in association_rule_df.iloc[pos, 0:2]:
                items.update(itemset)
            if pos == 0 or items != previous_items:
                kept_rows.append(pos)
                previous_items = items

        # One positional selection replaces the row-by-row
        # ``DataFrame.append``, which no longer exists in pandas 2.x.
        new_df = association_rule_df.iloc[kept_rows].reset_index(drop=True)
        filename = self.path[0:-15] + '_support_' + str(self.support)[:4] + '_unique.csv'
        new_df.to_csv(filename, index=False)
        print("%.3f finished." % self.support)

# 一、泊松分类&减背景执行流程

In [13]:
# 泊松法分类阶段执行函数
def main():
    """Run the Poisson classification stage end to end.

    Reads the module-level settings ``origin_csv`` (raw data file) and
    ``credible`` (Poisson confidence level); both must be defined before the
    function is called.

    Steps: clean the raw data and compute its statistics, classify every
    reading as particle or dissolved, then subtract the dissolved background
    from the particle readings. Every step saves its result as a CSV file.
    """
    # 1. Load, clean and summarise the raw data.
    data_loader = DataLoader(origin_csv)
    cleaned_data = data_loader.get_cleaned_data()
    metric_data = data_loader.get_basic_metirct(cleaned_data)

    # 2. Poisson classification. classifier() computes (and saves) the lambda
    #    table and the intensity thresholds itself, so calling normal_lambda()
    #    and get_ints_thr() beforehand only repeated the same work.
    poissonmethod = PoissonMethod(cleaned_data, metric_data, credible)
    poissonmethod.classifier()

    # 3. Background subtraction on the files written by classifier().
    p_process = PostProcess('Poisson_particle.csv', 'Poisson_resolve.csv')
    p_process.substract_background()

In [14]:
# Raw data files available to the Poisson classification stage; the
# hyper-parameter cell picks one of them by position.
file_li = ['meihui_S84.csv', 'turang_S15.csv',  'weiqi_L58.csv']

In [17]:
# Hyper-parameters of the Poisson classification stage.
# main() reads `origin_csv` and `credible` as module-level globals, so this
# cell has to run after the cell defining `file_li` and before main() is called.
origin_csv =  file_li[2]    # raw data CSV to process

credible =0.997             # confidence level of the Poisson distribution
speed = 0.02                # flow speed; not read by main()
print(origin_csv)

if __name__ == '__main__':
    main()

weiqi_L58.csv
cleaned_weiqi_L58.csv have been saved.
basic_metric_weiqi_L58.csv have been saved.
Particle have been saved.
Resolve have been saved.
substract_bg_particle.csv have been saved.


# 二、质量处理&主要元素统计&出现元素统计执行流程

In [22]:
# Mass-processing stage for a single sample.
# `file` must end with '_particle.csv': PreProcess.get_mass_filename strips
# the last 13 characters to build the output name.
file = 'weiqi_L58_particle.csv'

# Unit-intensity CSV handed to PreProcess as `unit_intensity`.
base = 'base.csv'

# Full column headers that PreProcess drops before any mass is computed.
drop = ['[9Be]+ (cts)', '[23Na]+ (cts)', '[24Mg]+ (cts)', '[39K]+ (cts)', '[42Ca]+ (cts)', '[44Ca]+ (cts)', '[46Ti]+ (cts)',
        '[48Ti]+ (cts)', '[49Ti]+ (cts)', '[50Ti]+ (cts)', '[55Mn]+ (cts)', '[54Fe]+ (cts)', '[56Fe]+ (cts)', '[57Fe]+ (cts)', 
        '[82Se]+ (cts)', '[86Sr]+ (cts)', '[87Sr]+ (cts)', '[138Ba]+ (cts)', '[197Au]+ (cts)', '[205Tl]+ (cts)']

# Mass-fraction threshold above which an element is listed as a component.
percent_thr = 0.1

# Number of largest mass fractions requested for the `top_k` column.
top_k = 10

P = PreProcess(base, file, drop, percent_thr, top_k)
P.select_elements()

weiqi_L58_mass_final.csv have finished.


In [24]:
# Mass-processing stage for the remaining samples, which use a different
# unit-intensity file and a different list of dropped channels.
# NOTE: this cell rebinds `base`, `drop`, `percent_thr`, `top_k`, `file` and
# `P` from the previous cell, so the two cells are order-dependent.
file_list = [ 'meihui_S84_particle.csv', 'turang_S15_particle.csv']

base = 'base_start_Al.csv'

drop = ['[42Ca]+ (cts)','[44Ca]+ (cts)', '[46Ti]+ (cts)', '[48Ti]+ (cts)', '[49Ti]+ (cts)', 
        '[50Ti]+ (cts)', '[54Fe]+ (cts)', '[56Fe]+ (cts)', '[57Fe]+ (cts)', '[197Au]+ (cts)']

percent_thr = 0.1

top_k = 10

# One output file '<label>_mass_final.csv' is written per input file.
for file in file_list:
    P = PreProcess(base, file, drop, percent_thr, top_k)
    P.select_elements()

meihui_S84_mass_final.csv have finished.
turang_S15_mass_final.csv have finished.


# 三、统计出现的元素&Apriori&压缩执行流程

In [44]:
# Mass files written by the mass-processing stage.
files = ['weiqi_L58_mass_final.csv', 'meihui_S84_mass_final.csv', 'turang_S15_mass_final.csv']

# Arguments: csv_path, min_support=0.1, min_confidence=0.3, min_lift=1.2.
ap = AprioriProcess(files[0], 0.1, 0.3, 1.2)
ap.Apriori_del_repeat()

0.100 finished.


In [43]:
# Same run with min_support lowered to 0.05. Depends on `files` from the
# previous cell; the recorded execution counts (In [44] above, In [43] here)
# show the two cells were last executed in the opposite order.
ap = AprioriProcess(files[0], 0.05, 0.3, 1.2)
ap.Apriori_del_repeat()

0.050 finished.
