In [1]:
# 该文件放在生成文件对应的目录下运行
import pandas as pd
import numpy as np
import os
import re
import shutil
import warnings
import pickle as pkl

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth, fpmax
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 


warnings.filterwarnings('ignore')

### 查询部分

* 0b11 # 二进制表示
* int 类型：数值每多占用 30 个二进制位（即每跨过一个 2 ** 30 的量级），占用的内存增加 4 个字节
* [python int占多少字节](https://zxi.mytechroad.com/blog/desgin/python%E4%B8%AD%E7%9A%84%E6%95%B4%E5%9E%8B%E5%8D%A0%E5%A4%9A%E5%B0%91%E4%B8%AA%E5%AD%97%E8%8A%82%EF%BC%9F/)

```python
    import sys
    a = 0   # 24
    a = 1   # 28
    a = 1<<29   # 28
    a = 1<<30   # 32
    a = 1<<35   # 32
    sys.getsizeof(a)
```

> 索引方式：
>
> 1. 对每个颗粒中出现的同位素设为1，否则为0，按照该方式对所有颗粒编码。
> 2. 将bool矩阵columns的顺序（按同位素质量升序排列）作为同位素先后顺序，按此顺序对频繁项进行01编码。
> 3. 按照01编码进行查找并返回每种组合包含的所有颗粒。因为频繁项已经按长度降序排序，所以每个颗粒只匹配包含它的最长频繁项，不重复匹配。（利用差集实现不重复匹配）

# 类的定义

### 0. 对所有物质中要处理的同位素的并集建立字典。

In [2]:
class IsotopesDict:
    """Assign one bit to every isotope that occurs in any substance's isotope list.

    The dictionary maps an isotope label (e.g. '24Mg') to ``1 << rank``, where
    ``rank`` is the label's position after the union of all lists has been
    sorted by ascending mass number. It is pickled to
    'existing_isotopes_dict.pk' in the current working directory, which should
    be the directory that contains the per-substance folders.
    """

    def __init__(self, isotopes_li):
        '''
        isotopes_li: list of lists; one list of isotope labels per substance.
        '''
        self.isotopes_li = isotopes_li
        self.num = len(isotopes_li)
        self.iso_dic = dict()

    @staticmethod
    def _mass_then_label(label):
        '''
        Sort key: (leading mass number, label).

        The label is used as a tie-breaker so that isotopes sharing a mass
        number always receive the same bit, independent of set iteration order.
        Raises ValueError when the label does not start with digits.
        '''
        digits = re.match(r'\d+', label)
        if digits is None:
            raise ValueError(f'Isotope label {label!r} does not start with a mass number.')
        return int(digits.group()), label

    def get_union(self):
        '''
        Return the union of all isotope lists, sorted by ascending mass number
        (ties broken alphabetically by label).
        '''
        exist_iso = set()
        for iso_list in self.isotopes_li[:self.num]:
            exist_iso.update(iso_list)
        return sorted(exist_iso, key=self._mass_then_label)

    def build_iso_dict(self):
        '''
        Build the isotope -> bit-flag dictionary and pickle it to
        'existing_isotopes_dict.pk' in the current working directory.
        '''
        file_name = 'existing_isotopes_dict.pk'
        # Rebuild from scratch so a second call never keeps stale entries.
        self.iso_dic = {iso: 1 << rank for rank, iso in enumerate(self.get_union())}
        with open(file_name, 'wb') as f:
            pkl.dump(self.iso_dic, f)
        print("Existing isotopes' dict of is built.")

In [3]:
# 建立字典单独执行
# All three substances currently use the same isotope panel, so the panel is
# written once and expanded into three independent lists.
isotopes_li = [
    (
        '24Mg 27Al 47Ti 51V 52Cr 54Fe 55Mn 59Co 60Ni 63Cu 66Zn 75As '
        '87Sr 89Y 98Mo 107Ag 111Cd 112Sn 121Sb 138Ba 139La 140Ce 141Pr 146Nd '
        '147Sm 153Eu 157Gd 159Tb 163Dy 165Ho 166Er 169Tm 172Yb 175Lu 205Tl 208Pb'
    ).split()
    for _ in range(3)
]

# Build and pickle the shared isotope -> bit dictionary.
isd = IsotopesDict(isotopes_li)
isd.build_iso_dict()

Existing isotopes' dict of is built.


### 1. 预处理：建立同位素的二进制对应关系，得到每个颗粒的二进制表示，删去全空行

In [2]:
class DataPreparation:
    """Pre-process the classified particle data of one substance.

    Typical call order: ``get_dir_data`` -> ``update_origin_df`` -> ``get_bin_emb``.
    ``get_dir_data`` changes the process working directory to the substance
    folder; the later steps read and write files relative to that folder.
    """

    def __init__(self, targ_isotopes, substance, iter_flag=True):
        '''
        targ_isotopes: list of isotope labels to keep and process (may differ per substance).
        substance: substance name, e.g. 'weiqi'; used as the folder-name prefix.
        iter_flag: True selects the '<substance>_iteration' folder,
                   False selects '<substance>_poisson'.
        '''
        self.targ_isotopes = targ_isotopes                 # isotope labels to keep
        self.data_csv = 'particle_classified_final.csv'    # input file inside the substance folder
        self.df = None                                     # particle DataFrame, set by get_dir_data
        self.substance = substance                         # substance name
        self.iso_dic = dict()                              # isotope -> bit-flag dictionary
        self.iter_flag = iter_flag                         # selects the folder suffix

    def get_iso_dic(self):
        '''
        Load the isotope bit-flag dictionary from the parent directory into self.iso_dic.
        '''
        iso_dic_pk = '../existing_isotopes_dict.pk'
        # pickle.load can execute arbitrary code: only load files produced by this notebook.
        with open(iso_dic_pk, 'rb') as f:
            self.iso_dic = pkl.load(f)

    def get_dir_data(self):
        '''
        Read the substance's particle CSV and switch into the substance folder.

        Raises FileNotFoundError (a subclass of Exception) when the folder is missing.
        '''
        suffix = 'iteration' if self.iter_flag else 'poisson'
        dir_name = '_'.join([self.substance, suffix])

        if not os.path.isdir(dir_name):
            raise FileNotFoundError(f'The dir {dir_name} not exists!')

        # Read before changing directory: if the CSV is missing or malformed,
        # the working directory is left untouched and the cell can be re-run.
        self.df = pd.read_csv(os.path.join(dir_name, self.data_csv))
        os.chdir(dir_name)
        print(f'Current dir: {dir_name}.')

    def update_origin_df(self):
        '''
        Rename the columns, keep only the target isotopes and drop all-empty rows.

        Every header is trimmed to ``header[1:-8]`` (first character and last
        eight characters removed).
        NOTE(review): presumably the raw headers look like '[24Mg]+ (cts)' - confirm.
        '''
        renamed = self.df.rename(columns=lambda header: header[1:-8])
        # .copy() detaches the result from the raw frame so later column
        # assignment never triggers SettingWithCopyWarning.
        self.df = renamed[self.targ_isotopes].dropna(axis=0, how='all').copy()

    def get_bin_emb(self):
        '''
        Add an 'embedding' column (sum of the bit flags of all non-null isotopes
        in the row) and save the frame to 'isotopes_embedding.csv'.
        '''
        self.get_iso_dic()
        file_name = 'isotopes_embedding.csv'

        # Exclude a previously computed embedding so the method can be re-run.
        iso_cols = [col for col in self.df.columns if col != 'embedding']
        present = self.df[iso_cols].notna().to_numpy()

        # Python ints keep the flags exact for any number of isotopes; the
        # dictionary is only consulted for isotopes that are actually present.
        embeddings = [
            sum(self.iso_dic[iso] for iso, flag in zip(iso_cols, row) if flag)
            for row in present
        ]
        self.df = self.df.assign(embedding=embeddings)
        self.df.to_csv(file_name, index=False)
        print("The data of remained isotopes and binary embedding of %s is saved." % self.substance)

### 2. 得到频繁项：得到每种物质的频繁项，并对频繁项也进行二进制编码表示

In [3]:
class AprioriProcess:
    """Mine association rules, collapse rules over the same item set, and save
    the resulting frequent items sorted by descending set length.

    Must be instantiated while the working directory is the substance folder.
    """

    def __init__(self, substance, max_len, min_support, min_confidence, min_lift=1.2):
        '''
        substance: substance name; must equal the part of the current folder
                   name before the first '_'.
        max_len: maximum number of isotopes in a frequent item set.
        min_support: support threshold passed to apriori.
        min_confidence: confidence threshold passed to association_rules.
        min_lift: rules are kept only when lift > min_lift (default 1.2).

        Raises Exception when the working directory does not match `substance`.
        '''
        if substance != os.path.basename(os.getcwd()).split('_')[0]:
            raise Exception(f'The current dir {os.getcwd()} is wrong!')

        self.max_len = max_len
        self.support = min_support
        self.confidence = min_confidence
        self.lift = min_lift
        self.df = None               # isotope + embedding DataFrame
        self.iso_dic = dict()        # isotope -> bit-flag dictionary

    def get_df_isodic(self):
        '''
        Load 'isotopes_embedding.csv' into self.df and the isotope dictionary
        (from the parent directory) into self.iso_dic.
        '''
        data_csv = 'isotopes_embedding.csv'
        self.df = pd.read_csv(data_csv)
        iso_dic_pk = '../existing_isotopes_dict.pk'
        # pickle.load can execute arbitrary code: only load files produced by this notebook.
        with open(iso_dic_pk, 'rb') as f:
            self.iso_dic = pkl.load(f)

    def Apriori(self):
        '''
        Run mlxtend apriori + association_rules.

        Returns a DataFrame with the columns 'antecedents', 'consequents' and
        'support' for every rule that passes the confidence and lift thresholds.
        '''
        rule_cols = ['antecedents', 'consequents', 'support']

        # Presence/absence matrix; the last column (embedding) is not an isotope.
        bool_df = self.df.iloc[:, :-1].notnull()

        frequent_itemsets = apriori(bool_df, min_support=self.support,
                                    max_len=self.max_len, use_colnames=True)
        if frequent_itemsets.empty:
            # association_rules rejects an empty input; report "no rules" instead.
            return pd.DataFrame(columns=rule_cols)

        rules = association_rules(frequent_itemsets, metric='confidence',
                                  min_threshold=self.confidence)
        rules = rules[rules['lift'] > self.lift]
        # Select by name: positional slicing breaks when mlxtend adds metric columns.
        return rules[rule_cols]

    def apriori_del_repeat(self):
        '''
        Collapse consecutive rules that cover the same item set
        (antecedents | consequents), keeping the first rule of each run.
        Returns the reduced DataFrame with a fresh index.
        '''
        rules = self.Apriori()
        merged = [
            frozenset(ante) | frozenset(cons)
            for ante, cons in zip(rules.iloc[:, 0], rules.iloc[:, 1])
        ]
        # A row is kept when its item set differs from the previous row's.
        keep = np.array(
            [pos == 0 or merged[pos] != merged[pos - 1] for pos in range(len(merged))],
            dtype=bool,
        )
        return rules.loc[keep].reset_index(drop=True)

    def Apriori_final(self):
        '''
        Merge antecedents and consequents into one set per rule, attach its
        bit-flag embedding and length, sort by descending length and save as
        'sup_<support>_conf_<confidence>_maxlen_<max_len>.csv'.
        '''
        compressed = self.apriori_del_repeat()

        item_sets = [
            set(frozenset(ante) | frozenset(cons))
            for ante, cons in zip(compressed.iloc[:, 0], compressed.iloc[:, 1])
        ]
        association_final = pd.DataFrame({
            'frequent_item': item_sets,
            'binary_embedding': [sum(self.iso_dic[iso] for iso in items) for items in item_sets],
            'set_length': [len(items) for items in item_sets],
            'support': compressed.iloc[:, -1].to_numpy(),
        })
        association_final = association_final.sort_values(by='set_length', ascending=False)

        filename = ''.join(['sup_', str(self.support)[:4], '_conf_', str(self.confidence)[:3], '_maxlen_', str(self.max_len), '.csv'])
        association_final.to_csv(filename, index=False)
        print(f"The total count of item is {association_final.shape[0]}.")
        print("Final apriori of support %.2f, confidence %.2f, maxlen %d, finished." % (self.support, self.confidence, self.max_len))
        

### 3. 得到物质的特有成分：通过TFIDF得到每种物质的特有成分，并对其进行二进制编码；再将每种特有成分在原数据中对应的颗粒搜索出来。

In [4]:
class UniqueComponents:
    """Treat each substance's frequent-item embeddings as one document and use
    TF-IDF to pick the components most characteristic of that substance.
    """

    def __init__(self, substance_li, fi_csv_lie):
        '''
        substance_li: list of substance folder names.
        fi_csv_lie: list of frequent-item CSV file names, one per substance and
                    in the same order as substance_li. (The parameter keeps its
                    historical spelling so existing keyword callers keep working.)

        Raises ValueError (a subclass of Exception) when the lengths differ.
        '''
        self.substance_li = substance_li
        # The original read a module-level `fi_csv_li` here instead of the argument.
        self.fi_csv_li = fi_csv_lie
        self.num = len(self.substance_li) if len(self.substance_li) == len(self.fi_csv_li) else -1

        if self.num < 0:
            raise ValueError('The lengths of substance list and fi_csv list do not match!')

    def get_fi_df(self, substance, fi_csv):
        '''
        Read '<substance>/<fi_csv>' and return it as a DataFrame.
        '''
        fi_path = '/'.join([substance, fi_csv])
        return pd.read_csv(fi_path)

    def get_iso_dic(self):
        '''
        Load and return the isotope bit-flag dictionary from the working directory.
        '''
        iso_dic_pk = 'existing_isotopes_dict.pk'
        # pickle.load can execute arbitrary code: only load files produced by this notebook.
        with open(iso_dic_pk, 'rb') as f:
            iso_dic = pkl.load(f)
        return iso_dic

    def decode_embedding(self, embedding, iso_dic=None):
        '''
        Return the isotope labels whose bit is set in `embedding`, in the
        iteration order of the dictionary.

        iso_dic: optional pre-loaded dictionary; loaded from disk when omitted.
        '''
        if iso_dic is None:
            iso_dic = self.get_iso_dic()
        return [iso for iso, flag in iso_dic.items() if flag == embedding & flag]

    def merge_fi_embedding(self):
        '''
        Return one string per substance: its frequent-item embeddings joined by spaces.
        '''
        all_fi_emb = list()
        for substance, fi_csv in zip(self.substance_li[:self.num], self.fi_csv_li):
            fi_df = self.get_fi_df(substance, fi_csv)
            all_fi_emb.append(' '.join(map(str, fi_df['binary_embedding'])))
        return all_fi_emb

    def compute_tfidf(self):
        '''
        Run TF-IDF over the per-substance embedding strings.

        Returns (weights, keywords): weights[i][j] is the TF-IDF value of
        keyword j (an embedding as a string) for substance i.
        '''
        all_fi_emb = self.merge_fi_embedding()

        # The default token_pattern only keeps tokens of two or more characters,
        # which silently drops single-digit embeddings such as '3' or '6'.
        vectorizer = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
        X = vectorizer.fit_transform(all_fi_emb)

        # get_feature_names() was removed from recent scikit-learn releases.
        if hasattr(vectorizer, 'get_feature_names_out'):
            keywords = list(vectorizer.get_feature_names_out())
        else:
            keywords = vectorizer.get_feature_names()

        tfidf = TfidfTransformer().fit_transform(X)
        weights = tfidf.toarray()

        return weights, keywords

    def get_unique_components(self):
        '''
        For every substance, select the keywords with the highest TF-IDF value,
        decode them to isotope lists and save
        '<substance>/unique_components.csv' with the columns
        isotopes, embedding, iso_num (sorted by descending iso_num).
        '''
        weights, keywords = self.compute_tfidf()
        iso_dic = self.get_iso_dic()    # loaded once and shared by every decode

        for i in range(len(weights)):
            substance = self.substance_li[i]
            weight = weights[i]
            max_val = max(weight)

            emb_res = list()
            iso_res = list()
            # An all-zero row means the substance has no scored component;
            # selecting on max_val == 0 would mark every keyword as unique.
            if max_val > 0:
                for j in range(len(weight)):
                    if weight[j] == max_val:
                        embedding = int(keywords[j])
                        emb_res.append(embedding)
                        iso_res.append(self.decode_embedding(embedding, iso_dic))

            file_name = 'unique_components.csv'
            path = '/'.join([substance, file_name])
            # Built from a dict: np.array on ragged isotope lists fails on current numpy.
            uniq_fi_df = pd.DataFrame({'isotopes': iso_res, 'embedding': emb_res})
            uniq_fi_df['iso_num'] = [len(iso) for iso in uniq_fi_df['isotopes']]
            uniq_fi_df = uniq_fi_df.sort_values(by='iso_num', ascending=False)
            uniq_fi_df.to_csv(path, index=False)
            print('%s\'s unique components have been extracted.' % substance)

### 4. 根据特有成分从原始数据中查找对应颗粒，并对颗粒进行质量计算等处理。

In [5]:
class Query_Features:
    """Find the particles that contain each characteristic component and derive
    mass-based features for them.

    Typical call order: ``query_particles`` -> ``mass_process``.
    """

    def __init__(self, substance, mass_factor, TE):
        '''
        substance: substance folder name.
        mass_factor: path of a CSV with one column per isotope; isotope
                     intensities are divided by these values.
        TE: value the particle count is divided by in compute_number
            (transport efficiency, per the original notes).
        '''
        self.substance = substance
        self.mass_factor = mass_factor
        self.TE = TE
        self.iso_n = 0         # number of isotope columns, set by get_data_df
        self.new_df = None     # matched particles plus derived features

    def get_data_df(self):
        '''
        Read '<substance>/isotopes_embedding.csv' and record the number of
        isotope columns (all columns except the trailing embedding).
        '''
        data_csv = 'isotopes_embedding.csv'
        data_path = '/'.join([self.substance, data_csv])
        data_df = pd.read_csv(data_path)
        self.iso_n = data_df.shape[1] - 1
        return data_df

    def get_unique_components_df(self):
        '''
        Read the component table used for the query.

        If the substance folder contains a file whose name starts with 'sup_',
        the alphabetically first such file is used; otherwise
        'unique_components.csv' is read.
        NOTE(review): this prefers the full frequent-item file over the TF-IDF
        result whenever both exist - confirm this is intended.
        '''
        uniq_comp_csv = 'unique_components.csv'
        # sorted() makes the choice deterministic; os.listdir order is arbitrary.
        for entry in sorted(os.listdir(self.substance)):
            found = re.match(r'sup_[\S]*', entry)
            if found is not None:
                uniq_comp_csv = found.group()
                break
        uniq_comp_path = '/'.join([self.substance, uniq_comp_csv])
        return pd.read_csv(uniq_comp_path)

    def compute_number(self, ptc_cnt):
        '''
        Return ptc_cnt / TE for a component matched by ptc_cnt particles.
        '''
        return ptc_cnt / self.TE

    def query_particles(self):
        '''
        Assign every particle to the first component (in table order) whose
        bits are all set in the particle's embedding; a particle is never
        assigned twice. The matched rows, extended with the columns uniq_iso,
        uniq_iso_emb, uniq_iso_n and number, are stored in self.new_df with a
        fresh 0..n-1 index.
        '''
        data_df = self.get_data_df()
        uc_df = self.get_unique_components_df()
        unmatched = set(range(len(data_df)))
        data_emb = data_df['embedding'].to_numpy()

        matched_frames = []
        for i in range(len(uc_df)):
            uc_iso = uc_df.iloc[i, 0]
            uc_emb = uc_df.iloc[i, 1]
            uc_iso_n = uc_df.iloc[i, 2]

            # flatnonzero always returns a 1-D array; squeeze(argwhere(...))
            # collapsed to a 0-d array when exactly one particle matched.
            hits = np.flatnonzero((data_emb & uc_emb) == uc_emb)
            rows = sorted(set(hits.tolist()) & unmatched)
            if not rows:
                continue
            unmatched.difference_update(rows)

            # assign() works on a new frame, so no SettingWithCopyWarning.
            matched_frames.append(
                data_df.iloc[rows].assign(
                    uniq_iso=uc_iso,
                    uniq_iso_emb=uc_emb,
                    uniq_iso_n=uc_iso_n,
                    number=self.compute_number(len(rows)),
                )
            )

        if matched_frames:
            result = pd.concat(matched_frames)
        else:
            # Keep the expected column layout even when nothing matched.
            result = data_df.iloc[0:0].assign(
                uniq_iso=None, uniq_iso_emb=None, uniq_iso_n=None, number=None
            )
        self.new_df = result.reset_index(drop=True)

    def computer_iso_mass(self):
        '''
        Divide every isotope intensity by its mass factor and append the
        resulting columns (same labels as the isotope columns) to self.new_df.
        '''
        iso_df = self.new_df.iloc[:, :self.iso_n]
        mass_factor = pd.read_csv(self.mass_factor)[iso_df.columns].values
        # NOTE(review): relies on the factor file having a single data row so
        # the division broadcasts across particles - confirm.
        mass_df = iso_df / mass_factor
        self.new_df = pd.concat([self.new_df, mass_df], axis=1)

    def compute_total_mass(self):
        '''
        Add 'total_mass': the NaN-ignoring sum of the per-isotope masses of each particle.
        '''
        # iso_n isotope columns + embedding + the 4 query columns precede the masses.
        mass_df = self.new_df.iloc[:, self.iso_n + 5:]
        self.new_df['total_mass'] = mass_df.apply(np.nansum, axis=1).values

    def compute_mass_weights(self):
        '''
        Append each isotope's share of the particle's total mass
        (mass / total_mass) to self.new_df.
        '''
        mass_df = self.new_df.iloc[:, self.iso_n + 5:-1]
        total_mass = self.new_df.iloc[:, -1]     # 'total_mass' is the last column here
        mass_weights_df = mass_df.apply(lambda col: col / total_mass, axis=0)
        self.new_df = pd.concat([self.new_df, mass_weights_df], axis=1)

    def mass_process(self):
        '''
        Run the three mass steps and save every column after the raw isotope
        intensities to '<substance>/features.csv'.
        '''
        self.computer_iso_mass()
        self.compute_total_mass()
        self.compute_mass_weights()
        features_df = self.new_df.iloc[:, self.iso_n:]
        file_name = 'features.csv'
        file_path = '/'.join([self.substance, file_name])
        features_df.to_csv(file_path, index=False)

In [6]:
# 从颗粒态数据直接生成频繁项的执行脚本（1，2阶段）
def main(max_len=4, min_support=0.01, min_confidence=0.3):
    """Run stages 1 and 2 (pre-processing and frequent-item mining) for one substance.

    Reads the module-level names `targ_iso` (isotope labels) and `substance`
    (folder-name prefix), which the calling cell must define.

    max_len, min_support, min_confidence: thresholds forwarded to AprioriProcess;
    the defaults are the values previously hard-coded here.
    """
    start_dir = os.getcwd()
    try:
        # Stage 1: pre-processing (changes into the substance folder).
        dp = DataPreparation(targ_iso, substance)   # iter_flag
        dp.get_dir_data()
        dp.update_origin_df()
        dp.get_bin_emb()

        # Stage 2: frequent items.
        ap = AprioriProcess(substance, max_len, min_support, min_confidence)
        ap.get_df_isodic()
        ap.Apriori_final()
    finally:
        # Always return to the starting directory, so a failure in one
        # substance does not break the next loop iteration or a re-run.
        os.chdir(start_dir)

    print(f'cwd: {os.getcwd()}.')
    print("-" * 30)

In [7]:
# Substance folder prefixes and the isotope panel shared by all of them.
substance_li = '12meihui jiangchen shangturang weiqi xiaturang'.split()
targ_iso = (
    '24Mg 27Al 47Ti 51V 52Cr 54Fe 55Mn 59Co 60Ni 63Cu 66Zn 75As 87Sr 89Y '
    '98Mo 107Ag 111Cd 112Sn 121Sb 138Ba 139La 140Ce 141Pr 146Nd 147Sm 153Eu '
    '157Gd 159Tb 163Dy 165Ho 166Er 169Tm 172Yb 175Lu 205Tl 208Pb'
).split()

# main() reads the module-level names `substance` and `targ_iso`,
# so the loop variable has to keep exactly this name.
for substance in substance_li:
    main()

Current dir: 12meihui_iteration.
The data of remained isotopes and binary embedding of 12meihui is saved.
The total count of item is 549.
Final apriori of support 0.01, confidence 0.30, maxlen 4, finished.
cwd: C:\Users\Yates.W\Desktop\第二阶段修改.
------------------------------
Current dir: jiangchen_iteration.
The data of remained isotopes and binary embedding of jiangchen is saved.
The total count of item is 732.
Final apriori of support 0.01, confidence 0.30, maxlen 4, finished.
cwd: C:\Users\Yates.W\Desktop\第二阶段修改.
------------------------------
Current dir: shangturang_iteration.
The data of remained isotopes and binary embedding of shangturang is saved.
The total count of item is 277.
Final apriori of support 0.01, confidence 0.30, maxlen 4, finished.
cwd: C:\Users\Yates.W\Desktop\第二阶段修改.
------------------------------
Current dir: weiqi_iteration.
The data of remained isotopes and binary embedding of weiqi is saved.
The total count of item is 290.
Final apriori of support 0.01, conf

In [8]:
# ！！！若需要对纯物质计算tfidf来筛选频繁项，则直接执行该部分，否则不执行（3阶段）

# Stage 3 inputs: one folder and one frequent-item file per substance.
substance_li = [
    f'{prefix}_iteration'
    for prefix in ('12meihui', 'jiangchen', 'shangturang', 'weiqi', 'xiaturang')
]
fi_csv_li = ['sup_0.01_conf_0.3_maxlen_4.csv'] * 5

uc = UniqueComponents(substance_li, fi_csv_li)
uc.get_unique_components()


# Show the TF-IDF result: for every substance, each distinct weight and how
# many components share it, highest weight first.
weights, _ = uc.compute_tfidf()
print('shape:', weights.shape, '\n')
for row_no, label in enumerate(['12meihui', 'jiangchen', 'shangturang', 'weiqi', 'xiaturang']):
    tally = sorted(Counter(weights[row_no]).items(), key=lambda pair: pair[0], reverse=True)
    print(f'{label}:', tally, '\n')

12meihui_iteration's unique components have been extracted.
jiangchen_iteration's unique components have been extracted.
shangturang_iteration's unique components have been extracted.
weiqi_iteration's unique components have been extracted.
xiaturang_iteration's unique components have been extracted.
shape: (5, 911) 

12meihui: [(0.057080564027823716, 134), (0.04605223964919108, 127), (0.0382275189778647, 88), (0.03215819410210578, 67), (0.02719919459923207, 130), (0.0, 365)] 

jiangchen: [(0.04690918642503205, 250), (0.03784603624346002, 143), (0.03141562885442068, 128), (0.026427817382686097, 77), (0.02235247867284866, 130), (0.0, 183)] 

shangturang: [(0.1038569652776853, 10), (0.08379114564940701, 9), (0.06955422005292193, 56), (0.0585112026333073, 70), (0.04948840042464366, 130), (0.0, 636)] 

weiqi: [(0.09341401412561312, 24), (0.07536583841408256, 24), (0.06256045395845657, 78), (0.05262782611335125, 33), (0.04451227824692601, 130), (0.0, 622)] 

xiaturang: [(0.09460879854276961

In [9]:
%%time
# 频繁项查找对应颗粒，以及质量计算。直接执行即可（4阶段）

# Stage 4 inputs: substance folders, mass-factor file and the TE value.
substance_li = [
    f'{prefix}_iteration'
    for prefix in ('12meihui', 'jiangchen', 'shangturang', 'weiqi', 'xiaturang')
]
mass_factor = 'mass_factor.csv'
TE = 0.2

# Query the matching particles and write features.csv for each substance.
for substance in substance_li:
    qf = Query_Features(substance, mass_factor, TE)
    qf.query_particles()
    qf.mass_process()
    print(f'{substance} finished.')

12meihui_iteration finished.
jiangchen_iteration finished.
shangturang_iteration finished.
weiqi_iteration finished.
xiaturang_iteration finished.
Wall time: 9.73 s
