In [2]:
# 该文件放在生成文件对应的目录下运行
import ast
import os
import pickle
import shutil
import warnings

import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth, fpmax
from mlxtend.preprocessing import TransactionEncoder

warnings.filterwarnings('ignore')

# 类的定义

In [3]:
class MassAndExist:
    """Mass-related processing of particle intensity data.

    Drops unwanted element columns, converts intensities to masses, normalises
    each particle's masses to fractions, records the dominant elements per
    particle, and lists which elements are present in every particle.
    """

    def __init__(self, unit_intensity, drop_ptc, percent_thr, top_k):
        '''
        unit_intensity: file name of the unit-intensity csv. It is copied from
                        the parent directory into the current directory.
        drop_ptc: list of column names to discard, e.g. ['[56Fe]+ (cts)'].
                  Intended for unwanted isotopes and for elements missing from
                  the unit-intensity file (chosen manually by the user).
        percent_thr: mass-fraction threshold used by select_elements, e.g. 0.1.
        top_k: how many of the largest fractions to record per particle, e.g. 10.
        '''
        # Fix: the original copied the module-level global `base` rather than the
        # constructor argument, which raised NameError outside the notebook.
        shutil.copyfile('../' + unit_intensity, './' + unit_intensity)
        self.base = unit_intensity
        self.target = 'particle_classified_final.csv'    # background-subtracted particle data
        self.drop_ptc = drop_ptc
        self.percent_thr = percent_thr
        self.top_k = top_k

    @staticmethod
    def _short_name(column):
        """Strip the first character and the last 8 characters of a column name
        (e.g. '[56Fe]+ (cts)' -> '56Fe')."""
        return column[1:-8]

    def read_base(self):
        '''
        Read the unit-intensity csv (self.base) and cast every value to float.
        Returns the resulting DataFrame.
        '''
        return pd.read_csv(self.base).astype("float")

    def drop_particle(self):
        '''
        Read the particle intensity csv (self.target), remove the columns listed
        in self.drop_ptc, then remove rows that are entirely NaN.
        Returns the resulting DataFrame. Raises KeyError if a listed column is
        not present in the file.
        '''
        ptc_ints = pd.read_csv(self.target)
        ptc_ints = ptc_ints.drop(columns=list(self.drop_ptc))
        return ptc_ints.dropna(axis=0, how='all')

    def get_particle_mass(self):
        '''
        Divide the particle intensities by the unit intensities and return the
        resulting DataFrame. Raises ValueError when the two files do not have
        the same number of columns.
        '''
        intensity = self.drop_particle()
        unit = self.read_base().values
        if unit.shape[1] != intensity.shape[1]:
            raise ValueError(
                "Unit-intensity file has %d columns but the particle data has %d "
                "after dropping; adjust drop_ptc." % (unit.shape[1], intensity.shape[1])
            )
        return intensity / unit

    def get_mass_sum(self):
        '''
        Return the mass DataFrame with an extra trailing column 'total_mass'
        holding the NaN-ignoring sum of each row.
        '''
        mass_df = self.get_particle_mass()
        # DataFrame.sum skips NaN by default, matching np.nansum per row.
        total_mass = mass_df.sum(axis=1).values
        mass_df.insert(mass_df.shape[1], 'total_mass', total_mass)
        return mass_df

    def get_short_ele_name(self):
        '''
        Return the shortened element name of every kept column, in column order.
        '''
        return [self._short_name(item) for item in self.drop_particle().columns]

    def normalize_mass(self):
        '''
        Divide every element mass by the particle's total mass and append the
        fractions (under shortened column names) to the mass DataFrame.
        Returns the combined DataFrame with a fresh 0..n-1 index.
        '''
        # Compute the masses once; the original ran the whole pipeline twice.
        mass_df = self.get_mass_sum()
        ptc_mass = mass_df.iloc[:, 0:-1]
        total_mass = mass_df.iloc[:, -1]

        ptc_percent = ptc_mass.div(total_mass, axis=0)
        ptc_percent.columns = [self._short_name(item) for item in ptc_mass.columns]
        normed_ptc = pd.concat([mass_df, ptc_percent], axis=1)
        return normed_ptc.reset_index(drop=True)

    def select_elements(self):
        '''
        For every particle record:
          * 'components': dict of elements whose fraction exceeds
            self.percent_thr, ordered by descending fraction;
          * 'number_of_components': size of that dict;
          * 'top_k': dict of the self.top_k largest non-NaN fractions.
        The three columns are appended to the normalised DataFrame and the
        result is written to 'mass_final.csv'.
        '''
        main_ele_dict = []
        main_ele_len = []
        topk_dict = []

        normed_df = self.normalize_mass()
        col_len = normed_df.shape[1]
        # Layout is n mass columns, 'total_mass', then n fraction columns.
        percent = normed_df.iloc[:, (col_len + 1) // 2:]

        for _, row in percent.iterrows():
            ranked = row.sort_values(ascending=False)   # NaN values sort last

            # to_dict() yields plain Python floats, so the csv text does not
            # depend on numpy's scalar repr.
            above_thr = ranked[ranked > self.percent_thr]
            main_ele_dict.append(above_thr.to_dict())
            main_ele_len.append(len(above_thr))

            # head() caps at the number of available elements, so a top_k larger
            # than the element count no longer raises IndexError.
            topk_dict.append(ranked.dropna().head(self.top_k).to_dict())

        vital_ele = pd.DataFrame({
            'components': pd.Series(main_ele_dict, dtype=object),
            'number_of_components': main_ele_len,
            'top_k': pd.Series(topk_dict, dtype=object),
        })
        file_name = 'mass_final.csv'
        final_df = pd.concat([normed_df, vital_ele], axis=1)
        final_df.to_csv(file_name, index=False)
        print("%s have finished." % file_name)

    def get_exist_eles(self, percent_thr=0.0):
        '''
        List, for every particle, the shortened names of the elements whose
        intensity is not NaN, and pickle the list of lists to 'exist_eles.pk'.

        percent_thr: accepted for backward compatibility but currently unused;
                     presence is decided purely by "value is not NaN".
        '''
        df = self.drop_particle()
        names = [self._short_name(item) for item in df.columns]
        present = df.notna().to_numpy()
        exist_eles = [
            [name for name, flag in zip(names, flags) if flag]
            for flags in present
        ]

        file_name = 'exist_eles.pk'
        with open(file_name, 'wb') as file:
            pickle.dump(exist_eles, file)
        print('exist_eles.pk have been finished.')
        

In [4]:
class AprioriProcess:
    """Mine association rules from the per-particle element lists, collapse
    consecutive rules that cover the same element set, and save the merged
    sets ordered by descending size."""

    def __init__(self, max_len, min_support, min_confidence, min_lift=1.2):
        '''
        max_len: maximum number of items in a frequent itemset;
        min_support: support threshold passed to apriori;
        min_confidence: confidence threshold for association_rules;
        min_lift: rules must have lift strictly above this value (default 1.2).
        '''
        self.max_len = max_len
        self.support = min_support
        self.confidence = min_confidence
        self.lift = min_lift
        self.ele_cnt = 0

    def Apriori(self):
        '''
        Run the mlxtend pipeline on 'exist_eles.pk': one-hot encode the
        transactions, mine frequent itemsets, derive rules by confidence and
        keep those whose lift exceeds self.lift.
        Returns a DataFrame with columns antecedents, consequents, support.
        '''
        # NOTE: pickle.load can execute arbitrary code; only load files that
        # this pipeline produced itself.
        with open('exist_eles.pk', 'rb') as file:
            exist_li = pickle.load(file)

        # Convert the list of element lists into a boolean matrix.
        te = TransactionEncoder()
        te_ary = te.fit(exist_li).transform(exist_li)
        df = pd.DataFrame(te_ary, columns=te.columns_)

        frequent_itemsets = apriori(df, min_support=self.support, max_len=self.max_len, use_colnames=True)

        rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=self.confidence)
        # Select by column name: positional slicing breaks as soon as the
        # library adds or reorders metric columns.
        return rules.loc[rules['lift'] > self.lift, ['antecedents', 'consequents', 'support']]

    def apriori_del_repeat(self):
        '''
        Drop every rule whose combined element set (antecedents | consequents)
        equals that of the rule immediately before it, and return the remaining
        rules with a fresh index. An empty rule table is returned unchanged
        instead of raising.
        '''
        association_rule_df = self.Apriori()
        if association_rule_df.empty:
            return association_rule_df.reset_index(drop=True)

        unions = [
            frozenset(ante) | frozenset(cons)
            for ante, cons in zip(association_rule_df['antecedents'],
                                  association_rule_df['consequents'])
        ]
        # The first rule is always kept; later rules are kept only when their
        # element set differs from the preceding rule's. A single boolean
        # selection replaces DataFrame.append, which pandas 2.0 removed.
        keep = [True] + [unions[i] != unions[i - 1] for i in range(1, len(unions))]
        return association_rule_df.loc[keep].reset_index(drop=True)

    def Apriori_final(self):
        '''
        Merge each rule's antecedents and consequents into one set, record the
        set size, sort by size in descending order and write the table
        (frequent_item, set_length, support) to a csv whose name encodes the
        support, confidence and max_len settings.
        '''
        association_compressed = self.apriori_del_repeat()
        new_set_li = [
            set(ante) | set(cons)
            for ante, cons in zip(association_compressed['antecedents'],
                                  association_compressed['consequents'])
        ]
        association_final = pd.DataFrame({
            'frequent_item': pd.Series(new_set_li, dtype=object),
            'set_length': [len(item) for item in new_set_li],
            'support': association_compressed['support'].to_numpy(),
        })
        # A stable sort keeps the mined order among sets of equal size.
        association_final = association_final.sort_values(
            by='set_length', ascending=False, kind='stable')
        filename = 'support_'+str(self.support)[:4]+'_confidence_'+str(self.confidence)[:3]+'_maxlen_'+str(self.max_len)+'_uniq.csv'
        association_final.to_csv(filename, index=False)
        print("Final apriori of support %.2f, confidence %.2f, maxlen %d, finished." % (self.support, self.confidence, self.max_len))
        

### 查询部分

* 0b11  # 二进制表示
* Python 中的 int 为变长类型：数值每多 30 个二进制位（2 ** 30），占用的内存增加 4 个字节
* [python int占多少字节](https://zxi.mytechroad.com/blog/desgin/python%E4%B8%AD%E7%9A%84%E6%95%B4%E5%9E%8B%E5%8D%A0%E5%A4%9A%E5%B0%91%E4%B8%AA%E5%AD%97%E8%8A%82%EF%BC%9F/ )

```python
import sys
a = 0   # 24
a = 1   # 28
a = 1<<29   # 28
a = 1<<30   # 32
a = 1<<35   # 32
sys.getsizeof(a)
```

> 索引方式：
> 1. 对每个颗粒中出现的元素设为1，否则为0，按照该方式对所有颗粒编码。判断元素是否出现用TransactionEncoder生成的bool矩阵。
> 2. 将TransactionEncoder生成的bool矩阵头的顺序作为元素先后顺序，按此顺序对频繁项进行01编码。
> 3. 按照01编码进行查找并返回每种组合包含的所有颗粒。由于频繁项已按长度降序排序，每个颗粒只匹配其所含的最长频繁项，不会重复匹配（利用差集实现不重复匹配）。

In [5]:
class FingerPrint:
    """Build element fingerprints: find the particles matching each frequent
    itemset and label them with that itemset."""

    def __init__(self, frequent_csv):
        '''
        frequent_csv: file name of the frequent-itemset csv (must contain a
                      'frequent_item' column).
        'exist_eles.pk', 'mass_final.csv' and frequent_csv are all read from
        the current directory.
        '''
        # NOTE: pickle.load can execute arbitrary code; only load files that
        # this pipeline produced itself.
        with open('exist_eles.pk', 'rb') as file:
            self.exist_li = pickle.load(file)    # list of element lists, one per particle
        self.mass_df = pd.read_csv('mass_final.csv')   # masses, fractions and dominant-element columns
        self.frequent_csv = frequent_csv
        self.frequent_df = pd.read_csv(frequent_csv)['frequent_item']    # itemsets only, as text
        self.fi_cnt = len(self.frequent_df)
        self.ele_order = None    # element order of the encoded matrix; set by bit_encoder
        self.idx_set = set(self.mass_df.index.values)   # row labels eligible for matching

    def bit_encoder(self):
        '''
        Encode every particle as an integer bitmask: one bit per element in the
        column order produced by TransactionEncoder, first column in the most
        significant position. Stores that order in self.ele_order and returns
        the masks as a numpy array.
        '''
        te = TransactionEncoder()
        te_ary = te.fit(self.exist_li).transform(self.exist_li)
        self.ele_order = pd.Index(te.columns_)
        n_ele = len(self.ele_order)

        # int(..., 2) parses the bit string directly; no eval needed.
        codes = [
            int(''.join('1' if flag else '0' for flag in row), 2) if n_ele else 0
            for row in te_ary
        ]
        # Masks wider than 63 bits do not fit int64; fall back to Python ints.
        code_dtype = np.int64 if n_ele <= 63 else object
        return np.array(codes, dtype=code_dtype)

    def fi_bit_encoder(self):
        '''
        Encode every frequent itemset with the same bit layout as bit_encoder
        and build its label (the sorted element names, comma separated).
        Returns (list of bitmasks, list of labels). Raises KeyError if an
        itemset names an element that never occurs in the particle data.
        '''
        if self.ele_order is None:
            # Element order is produced by bit_encoder; compute it on demand.
            self.bit_encoder()

        ele_cnt = len(self.ele_order)
        ele_idx_dic = {ele: idx for idx, ele in enumerate(self.ele_order)}

        bit_li = []
        label_li = []
        for raw in self.frequent_df:
            # literal_eval only accepts Python literals, so text from the csv
            # cannot run arbitrary code the way eval could.
            items = ast.literal_eval(raw)
            label_li.append(str(sorted(items))[1:-1])

            fi_emb = 0
            for ele in items:
                fi_emb |= 1 << (ele_cnt - 1 - ele_idx_dic[ele])
            bit_li.append(fi_emb)
        return bit_li, label_li

    def query(self):
        '''
        For each frequent itemset, in file order, select the not-yet-assigned
        particles whose bitmask contains all of the itemset's bits, tag them
        with the itemset's label, and write the concatenated result to
        'FingerPirnt_<frequent_csv>'. Each particle is assigned at most once.
        self.idx_set is left untouched, so the method can be called again.
        '''
        all_ptc_emb = self.bit_encoder()
        fi_bit_emb, fi_label = self.fi_bit_encoder()

        remaining = {int(idx) for idx in self.idx_set}
        parts = []
        for code, label in zip(fi_bit_emb, fi_label):
            # flatnonzero always returns a 1-d array; squeeze(argwhere(...))
            # collapsed to 0-d on a single match and could not be iterated.
            hits = np.flatnonzero((all_ptc_emb & code) == code)
            row_idx = sorted(set(hits.tolist()) & remaining)
            if not row_idx:
                continue
            remaining.difference_update(row_idx)
            # assign() returns a new frame, avoiding a write onto a slice.
            parts.append(self.mass_df.iloc[row_idx].assign(label=label))

        if parts:
            res_df = pd.concat(parts).reset_index(drop=True)
        else:
            res_df = self.mass_df.iloc[0:0].assign(label=pd.Series(dtype=object))

        # The file-name prefix spelling is kept as-is: later cells read this exact name.
        file_name = 'FingerPirnt_' + self.frequent_csv
        res_df.to_csv(file_name, index=False)
        print('%s have been finished.' % file_name)
        

# 完整执行流程

In [14]:
def main():
    """Run the pipeline up to and including association-rule mining.

    The mass stage is skipped when 'exist_eles.pk' already exists in the
    current directory. All settings are read from module-level names
    (base, drop, percent_thr, top_k, max_len, min_support, min_confidence).
    """
    # Step 1: mass calculations and per-particle element extraction
    if os.path.isfile('exist_eles.pk'):
        print("Start from Apriori.")
    else:
        mass_stage = MassAndExist(base, drop, percent_thr, top_k)
        mass_stage.select_elements()
        mass_stage.get_exist_eles()

    # Step 2: mine association rules with Apriori
    rule_stage = AprioriProcess(max_len, min_support, min_confidence)
    rule_stage.Apriori_final()

# 超参数

* 'meihui_S84_particle.csv': 'base_start_Al.csv'
* drop = ['[42Ca]+ (cts)','[44Ca]+ (cts)', '[46Ti]+ (cts)', '[48Ti]+ (cts)', '[49Ti]+ (cts)', '[50Ti]+ (cts)', '[54Fe]+ (cts)', '[56Fe]+ (cts)', '[57Fe]+ (cts)', '[197Au]+ (cts)']  
---
* 'turang_S15_particle.csv': 'base_start_Al.csv'
* drop = ['[42Ca]+ (cts)','[44Ca]+ (cts)', '[46Ti]+ (cts)', '[48Ti]+ (cts)', '[49Ti]+ (cts)', '[50Ti]+ (cts)', '[54Fe]+ (cts)', '[56Fe]+ (cts)', '[57Fe]+ (cts)', '[197Au]+ (cts)']

---

* 'weiqi_L58_particle.csv': 'base.csv'
* drop = ['[9Be]+ (cts)', '[23Na]+ (cts)', '[24Mg]+ (cts)', '[39K]+ (cts)', '[42Ca]+ (cts)', '[44Ca]+ (cts)', '[46Ti]+ (cts)', '[48Ti]+ (cts)', '[49Ti]+ (cts)', '[50Ti]+ (cts)', '[55Mn]+ (cts)', '[54Fe]+ (cts)', '[56Fe]+ (cts)', '[57Fe]+ (cts)', '[82Se]+ (cts)', '[86Sr]+ (cts)', '[87Sr]+ (cts)', '[138Ba]+ (cts)', '[197Au]+ (cts)', '[205Tl]+ (cts)']

In [17]:
base = 'base.csv'             # unit-intensity file used for the mass calculation (kept in the parent directory; copied automatically)
# Columns to discard during the mass calculation and the element-presence extraction
drop = ['[9Be]+ (cts)', '[23Na]+ (cts)', '[24Mg]+ (cts)', '[39K]+ (cts)', '[42Ca]+ (cts)', '[44Ca]+ (cts)', '[46Ti]+ (cts)',
        '[48Ti]+ (cts)', '[49Ti]+ (cts)', '[50Ti]+ (cts)', '[55Mn]+ (cts)', '[54Fe]+ (cts)', '[56Fe]+ (cts)', '[57Fe]+ (cts)', 
        '[82Se]+ (cts)', '[86Sr]+ (cts)', '[87Sr]+ (cts)', '[138Ba]+ (cts)', '[197Au]+ (cts)', '[205Tl]+ (cts)']
percent_thr = 0.1   # mass-fraction threshold
top_k = 10          # K for the top-K elements recorded per particle


max_len = 4         # maximum number of items in a frequent itemset
# min_support = 0.1
min_support = 0.05  # support threshold for apriori
min_confidence = 0.3    # confidence threshold for association_rules


if __name__ == '__main__':
    main()

mass_final.csv have finished.
exist_eles.pk have been finished.
Final apriori of support 0.05, confidence 0.30, maxlen 4, finished.


In [19]:
# Classify the particles according to the frequent itemsets
frequent_item_csv = 'support_0.05_confidence_0.3_maxlen_4_uniq.csv'    

fp = FingerPrint(frequent_item_csv)
fp.query()

FingerPirnt_support_0.05_confidence_0.3_maxlen_4_uniq.csv have been finished.


#### 反向统计

In [22]:
# Reload the labelled particles written by FingerPrint.query()
t = pd.read_csv('FingerPirnt_support_0.05_confidence_0.3_maxlen_4_uniq.csv')
# Shape of the per-label count table: (number of distinct labels, number of remaining columns)
t.groupby('label').agg('count').shape

(856, 74)