In [1]:
#import sampling
import os
import pandas as pd
import logging
import sklearn
from time import time
import random
from simpletransformers.ner import NERModel,NERArgs

In [3]:
# Shared configuration read by the `sampling` class. `read_data` is defined in the
# helper cell (In [2]) printed further down in this notebook, so that cell has to be
# executed before this one.
(
    config_class_features,
    config_classinfo,
    config_numeric_fields,
    config_dynamic_units,
    pair_params,
) = (
    read_data(config_file)
    for config_file in (
        "config/config-class-features.json",
        "config/config-classinfo.json",
        "config/config-numeric-fields.json",
        "config/config-dynamic-units.json",
        "config/pair-params",
    )
)

In [4]:
def get_rs_list_of_all_category():
    """
    Build one `sampling` object for every component file found directly inside
    the formatData/ folder of the current working directory.

    Sub-directories (e.g. the config/ folder that read_data() loads from) are skipped.
    Files are visited in sorted name order: os.listdir() returns entries in arbitrary
    order, which would otherwise let the order of the returned list differ between runs.

    returns a list of sampling objects, each created with p=0.2 and most_relevant_p=0.6
    """
    data_dir = os.path.join(os.getcwd(), 'formatData')
    rs_list = []
    for file in sorted(os.listdir(data_dir)):
        # only regular files are component files; directories are ignored
        if os.path.isfile(os.path.join(data_dir, file)):
            rs_list.append(sampling(file, 0.2, 0.6))
    print('num_of_all_categories',len(rs_list))
    return rs_list

def get_input_from_sampling(input_size: int):
    """
    input_size: number of sampling rounds; every round draws one sample from each category
                (so input_size == 1 means one sample per category)
    returns a flat list of [sentence_id, value, label] rows, the input for simpletransformer;
            all rows that come from the same sample share one sentence_id
    """
    samplers = get_rs_list_of_all_category()

    rows = []
    sentence_id = 0
    for _ in range(input_size):
        for sampler in samplers:
            sample = sampler.random_sampling()
            # one row per attribute of the sample: the value is the word, the key is its label
            rows.extend([sentence_id, value, label] for label, value in sample.items())
            sentence_id += 1
    print('样本数量：',sentence_id)
    return rows
# get_input_from_sampling(1)

In [5]:
def _rows_to_frame(rows):
    """Turn [sentence_id, word, label] rows into the DataFrame layout passed to NERModel."""
    frame = pd.DataFrame(rows, columns=['sentence_id', 'words', 'labels'])
    # sampled values can be numbers; the words column is always handed over as text
    frame['words'] = frame['words'].astype(str)
    return frame


def baseline_train(train_size: int = 260, eval_size: int = 60, output_dir: str = "D:/outputs/"):
    """
    Train and evaluate the DistilBERT token-classification baseline on freshly sampled data.

    train_size: number of sampling rounds (one sample per category each) for the training set
    eval_size: number of sampling rounds for the evaluation set
    output_dir: root folder for model outputs; training artefacts go to <output_dir>/train_output
                and evaluation artefacts to <output_dir>/eval_output
    returns (model, result, model_outputs, preds_list): the trained NERModel followed by the
            three values returned by model.eval_model()
    """
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)

    # Both sets are drawn independently from the same samplers
    train_df = _rows_to_frame(get_input_from_sampling(train_size))
    eval_df = _rows_to_frame(get_input_from_sampling(eval_size))

    # The label set must cover every label seen in either split
    label_list = list(pd.concat([train_df['labels'],eval_df['labels']],axis=0).drop_duplicates())
    print(label_list)
    print(len(label_list))

    # Create a NERModel
    model_args = NERArgs()
    model_args.output_dir = output_dir
    model_args.classification_report = True
    model_args.labels_list = label_list
    model_args.use_early_stopping = True
    model_args.early_stopping_consider_epochs = True
    model_args.early_stopping_delta = 0.01
    # The NER evaluation result only contains eval_loss / precision / recall / f1_score
    # (see the logged result dict), so the previous value "mcc" could never be looked up.
    # NOTE(review): early stopping presumably only takes effect when
    # evaluate_during_training is enabled, which this function does not set - confirm.
    model_args.early_stopping_metric = "f1_score"
    model_args.early_stopping_metric_minimize = False
    model_args.early_stopping_patience = 3
    model_args.do_lower_case = True
    model_args.manual_seed = 2020
    model_args.max_seq_length = 24
    model_args.num_train_epochs = 1
    model_args.overwrite_output_dir = True
    model_args.save_steps = 1000
    model_args.tensorboard_dir = "tensorboard_dir"
    model_args.use_cached_eval_features = True

    model = NERModel(model_type='distilbert', model_name='distilbert-base-uncased', args=model_args,use_cuda=False)

    # Train the model and report the wall-clock training time
    start = time()
    model.train_model(
        train_df,
        output_dir=os.path.join(output_dir, "train_output"),
        show_running_loss=True,
        eval_data=eval_df,
    )
    end = time()
    print(f"Pytorch CPU: {end - start}")

    # Evaluate the model
    result, model_outputs, preds_list = model.eval_model(
        eval_df,
        output_dir=os.path.join(output_dir, "eval_output"),
        verbose=True,
    )

    return model, result, model_outputs, preds_list

# Run the baseline once; the trained `model` is reused by the prediction and save cells below.
model, result, model_outputs, preds_list = baseline_train()

num_of_all_categories 37
样本数量： 9620
num_of_all_categories 37
样本数量： 2220
['class', 'category', 'AverageInputPower', 'OutputImpedance', 'Response', 'InputImpedance', 'VSWR', 'Width', 'PeakReflowTemperatureCel', 'SupplyVoltageMinVsup', 'PowerSupplies', 'Capacitance', 'Polarity', 'PositiveTolerance', 'OperatingTemperatureMin', 'OperatingTemperatureMax', 'RippleCurrent', 'Diameter', 'SupplyVoltageNomVsup', 'BandwidthNom', 'SurfaceMount', 'MoistureSensitivityLevel', 'IhsCategory', 'NumberofAnalogInChannels', 'AnalogInputVoltageMax', 'ConversionTimeMax', 'PackageBodyMaterial', 'JESD609Code', 'TerminalPosition', 'NegativeSupplyVoltageNom', 'SupplyVoltageNom', 'Technology', 'NoiseMax', 'ScreeningLevel', 'NegSupplyVoltageNom', 'TerminalPitch', 'BatteryFeed', 'ControlMode', 'OutputFrequencyMax', 'NegSupplyVoltageNomVsup', 'SeatedHeightMax', 'OutputCharacteristics', 'CountDirection', 'NumberofBits', 'SupplyVoltageMaxVsup', 'PackageCode', 'PackageEquivalenceCode', 'DielectricMaterial', 'Temperature

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

HBox(children=(FloatProgress(value=0.0, max=9620.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 1', max=1203.0, style=ProgressStyle(de…







INFO:simpletransformers.ner.ner_model: Training of distilbert model complete. Saved to D:/outputs/train_output.


Pytorch CPU: 2841.072289943695


INFO:simpletransformers.ner.ner_model: Converting to features started.


HBox(children=(FloatProgress(value=0.0, max=2220.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=278.0, style=ProgressStyle(descr…




  _warn_prf(average, modifier, msg_start, len(result))
INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.8367662346191543, 'precision': 0.8103668798384382, 'recall': 0.8103668798384382, 'f1_score': 0.8103668798384382}


In [6]:
# Display the evaluation metrics returned by baseline_train()
result

{'eval_loss': 0.8367662346191543,
 'precision': 0.8103668798384382,
 'recall': 0.8103668798384382,
 'f1_score': 0.8103668798384382}

In [8]:
# pd.read_table('D:/outputs/eval_output/eval_results.txt',sep='       ')

In [9]:
def show_prediction_result(model):
    """
    Predict the labels of one random sample and show them next to the ground truth.

    model: the trained model (must provide predict(sentences, split_on_space=False))
    returns a DataFrame with the columns ['input', 'output_pred', 'output_truth'], one row
            per attribute value of the sample; output_pred is NaN for values the model
            returned no prediction for
    """
    rs_list = get_rs_list_of_all_category()
    sample = random.choice(rs_list).random_sampling()

    # The model receives every value as text, so the ground truth is stored as text too
    # (the original `truth_result['input'] = truth_result['input']` was a no-op).
    truth_result = pd.DataFrame({
        'output_truth': list(sample.keys()),
        'input': [str(value) for value in sample.values()],
    })
    print(truth_result)

    input_sentence = truth_result['input'].tolist()
    predictions, _raw_outputs = model.predict([input_sentence],split_on_space= False )

    # predictions[0] holds one {word: label} dict per input word, in input order.
    # Keep them as rows instead of merging them into a single dict, so that two
    # attributes with the same value keep their own predicted label.
    pred_rows = [(word, label) for token in predictions[0] for word, label in token.items()]
    pred_result = pd.DataFrame(pred_rows, columns=['input', 'output_pred'])
    print(pred_result)

    # Align by position; ground-truth rows without a prediction get NaN
    show_result = truth_result.join(pred_result['output_pred'])
    return show_result[['input', 'output_pred', 'output_truth']]
# Show one random sample: predicted labels next to the ground-truth labels
show_prediction_result(model)

INFO:simpletransformers.ner.ner_model: Converting to features started.


num_of_all_categories 37
               output_truth              input
0                     class  Consumer Circuits
1                  category  Audio Control ICs
2         ChannelSeparation               65.0
3               PackageCode                SOP
4        HarmonicDistortion               7/4%
5                      Gain               10.5
6  MoistureSensitivityLevel                  3
7          TerminalPosition               DUAL
8          SupplyCurrentMax             19.175


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Prediction', max=1.0, style=ProgressStyle(descrip…


               input         output_pred
0  Consumer Circuits               class
1  Audio Control ICs            category
2               65.0        BandwidthNom
3                SOP         PackageCode
4               7/4%  HarmonicDistortion
5               10.5                Gain
6                  3   NumberofTerminals
7               DUAL    TerminalPosition
8             19.175              Length


Unnamed: 0,input,output_pred,output_truth
0,Consumer Circuits,class,class
1,Audio Control ICs,category,category
2,65.0,BandwidthNom,ChannelSeparation
3,SOP,PackageCode,PackageCode
4,7/4%,HarmonicDistortion,HarmonicDistortion
5,10.5,Gain,Gain
6,3,NumberofTerminals,MoistureSensitivityLevel
7,DUAL,TerminalPosition,TerminalPosition
8,19.175,Length,SupplyCurrentMax


In [10]:
# Save the trained model to disk (no quantization step is applied to it in this notebook)
model.save_model("D:/outputs/model_folder", model=model.model)

In [None]:
# Load the model saved in the previous cell by passing its folder as model_name.
# The original cell re-created the untrained 'distilbert-base-uncased' checkpoint and
# referenced `model_args`, a variable that only exists inside baseline_train().
# NOTE(review): this relies on save_model() having written the model args (including
# labels_list) next to the weights - confirm against the simpletransformers docs.
model = NERModel(model_type='distilbert', model_name="D:/outputs/model_folder", use_cuda=False)

In [None]:
# Default parameters (reference list of the remaining NERArgs options).
# They are set on a fresh NERArgs object: the `model_args` used for training is a local
# variable of baseline_train() and is not visible at notebook scope, so the original
# cell raised NameError.
model_args = NERArgs()
model_args.adam_epsilon = 1e-8
model_args.best_model_dir = "outputs/best_model"
model_args.cache_dir = "cache_dir/"
model_args.dynamic_quantize = False
model_args.encoding = None
model_args.eval_batch_size = 8
model_args.evaluate_during_training = False
model_args.evaluate_during_training_silent = True
model_args.evaluate_during_training_steps = 2000
model_args.evaluate_during_training_verbose = False
model_args.evaluate_each_epoch = True
model_args.fp16 = True
model_args.gradient_accumulation_steps = 1
model_args.learning_rate = 4e-5
model_args.local_rank = -1
model_args.logging_steps = 50
model_args.max_grad_norm = 1.0
model_args.model_name = None
model_args.model_type = None
model_args.multiprocessing_chunksize = 500
model_args.n_gpu = 1
model_args.no_cache = False
model_args.no_save = False

model_args.quantized_model = False
model_args.reprocess_input_data = True
model_args.save_best_model = True
model_args.save_eval_checkpoints = True
model_args.save_model_every_epoch = True
model_args.save_optimizer_and_scheduler = True
model_args.silent = False
model_args.thread_count = None
model_args.train_batch_size = 8
model_args.train_custom_parameters_only = False
model_args.use_multiprocessing = True
model_args.wandb_project = None
model_args.warmup_ratio = 0.06
model_args.warmup_steps = 0
model_args.weight_decay = 0
model_args.skip_special_tokens = True

In [2]:
#!/usr/bin/env python
# coding: utf-8
import re
import os
import json
import random
from collections import defaultdict
from typing import Tuple, Dict, Sequence, List, Set


def read_data(filename: str):
    """
    Load one file from the formatData/ directory (relative to the working directory).

    filename: path below formatData/; either a "*.json" file or exactly "config/pair-params"
    returns the decoded JSON content for a .json file; for "config/pair-params" a dict that
            maps the first feature name of every pair to the second one, with " ", "-", "("
            and ")" removed from both names
    raises ValueError if filename is neither of the two supported inputs, or if a
           pair-params entry contains no "/" separator
    """
    path = "formatData/" + filename

    if filename.endswith(".json"):
        with open(path, 'r', encoding='utf-8') as file:
            return json.load(file)

    if filename != "config/pair-params":
        raise ValueError("The input is not valid.")

    # Entries are separated by blank lines; each entry looks like "<name> / <name>"
    with open(path, 'r', encoding='utf-8') as file:
        entries = file.read().split("\n\n")

    output = {}
    for entry in entries:
        if not entry.strip():
            # blank chunk, e.g. produced by trailing newlines at the end of the file
            continue
        cleaned = entry.replace(" ", "").replace("-", "").replace("(", "").replace(")", "")
        # strip() drops stray newlines that would otherwise end up inside a feature name
        names = [name.strip() for name in cleaned.split("/")]
        if len(names) < 2:
            raise ValueError(f"Malformed pair-params entry (no '/' found): {entry!r}")
        output[names[0]] = names[1]
    return output


def preprocess_config_class_features(config_class_features: dict) -> dict:
    """
    config_class_features: the class features of all components (in "config/config-class-features.json")
    returns the same component -> label -> feature-name structure with " ", "-", "(" and ")"
            removed from every feature name
    """
    # "Rated (DC) Voltage (URdc)" in config-class-features appears as "RatedDCVoltageURdc" in a component file
    drop_chars = str.maketrans("", "", " -()")
    cleaned = defaultdict(defaultdict)
    for component, labels in config_class_features.items():
        for label, feature_names in labels.items():
            cleaned[component][label] = [name.translate(drop_chars) for name in feature_names]
    return cleaned


def preprocess_config_numeric_fields(config_numeric_fields: list) -> list:
    """
    config_numeric_fields: the numeric fields of all components (in "config/config-numeric-fields.json")
    returns a new list of the field names with " ", "-", "(" and ")" removed
    """
    # "Rated (DC) Voltage (URdc)" in config-numeric-fields appears as "RatedDCVoltageURdc" in a component file
    drop_chars = str.maketrans("", "", " -()")
    return [field.translate(drop_chars) for field in config_numeric_fields]


def extract_feature_value(filename: str) -> dict:
    """
    filename: the name of a component file (e.g. "Active_Filters.json")
    returns a mapping from each feature name found in the file to the list of all values
            recorded for it, in file order
    """
    collected = defaultdict(list)
    for record in read_data(filename):
        for name, value in record.items():
            collected[name].append(value)
    return collected


def connect(sample: dict) -> str:
    """
    sample: the output of sampling.random_sampling()
    returns the values of the sample joined into one string; one delimiter is chosen for the
            whole string, and each value has a 10% chance of being followed by a freshly
            drawn delimiter instead
    """
    deliminter_list = ['#', ',', '/', ';', ':', '-', '_']
    default_deliminter = random.sample(deliminter_list, 1)[0]

    pieces = []
    for value in sample.values():
        if random.uniform(0, 1) > 0.9: # 10% chance to use another deliminter
            chosen = random.sample(deliminter_list, 1)[0]
        else:
            chosen = default_deliminter
        pieces.append(str(value) + chosen)

    # every piece ends with a delimiter; remove the one left at the very end
    trailing_deliminter = "(" + "|".join(deliminter_list) + ")$"
    return re.sub(trailing_deliminter, "", "".join(pieces))


class sampling():
    """
    Random sample generator for one component file.

    Each call to random_sampling() returns a dict with the component's class and category
    plus a random selection of its features, with values drawn from the values recorded in
    the component file. Relies on the notebook-level configuration objects
    config_class_features, config_classinfo, config_numeric_fields, config_dynamic_units
    and pair_params.
    """

    def __init__(
        self,
        filename: str,
        p: float,
        most_relevant_p: float
    ):
        """
        filename: the name of a component file (e.g. "Active_Filters.json")
        p: the portion of features needs sampling
        most_relevant_p: the portion of most relevant features in a sample
        raises ValueError if the component is not listed under any class in config_classinfo
        """
        self.component_name = os.path.splitext(filename)[0].replace("_"," ") # obtain the name of the component (e.g. "Active Filters")
        self.config_class_features = preprocess_config_class_features(config_class_features)
        self.config_numeric_fields = preprocess_config_numeric_fields(config_numeric_fields)
        self.config_dynamic_units = config_dynamic_units
        self.config_dynamic_units_keys = list(config_dynamic_units.keys())
        self.pair_params = pair_params
        self.pair_params_keys = list(pair_params.keys())
        self.pair_params_values = list(pair_params.values())

        self.p = p
        self.most_relevant_p = most_relevant_p

        self.features_values = extract_feature_value(filename)
        self.features_list = list(self.features_values.keys())

        # obtain the class of the component (e.g. "Filters")
        # component_name "Array Network Resistors" appears as "Array/Network Resistors" in config_classinfo
        component_class = None
        for key in config_classinfo:
            if self.component_name in [item.replace("/"," ") for item in config_classinfo[key]]:
                component_class = key
        if component_class is None:
            # fail here with a clear message instead of an AttributeError further down
            raise ValueError(
                f"Component {self.component_name!r} (file {filename!r}) is not listed in config_classinfo."
            )
        self.component_class = component_class

        self.component_config_class_features = list(config_class_features[self.component_class].keys())

        # all of the features in config-class-features are not found in component files
        # e.g. "Resistance Law" is not found in Array_Network_Resistors.json
        self.most_relevant_features_list = [i for i in list(self.config_class_features[self.component_class]["Most Relevant"]) if i in self.features_list]


    def exist_in_pair(self, sample_features: list) -> list:
        """
        sample_features: features need sampling
        returns a new list of sample_features in which every feature that belongs to a pair
                in pair_params is, with 90% probability, followed by its pair partner
        """
        # value -> key view of pair_params, built once per call instead of once per feature
        reverse_pair_params = {value: key for key, value in self.pair_params.items()}

        output = []
        for feature in sample_features:
            output.append(feature)
            if feature in self.pair_params_keys:
                if random.uniform(0, 1) < 0.9:
                    output.append(self.pair_params[feature])
            if feature in self.pair_params_values:
                if random.uniform(0, 1) < 0.9:
                    output.append(reverse_pair_params[feature])
        return output


    def sample_value(self, sample_features: list):
        """
        Draw a value for every feature and store it in self.sample[feature].

        sample_features: features need sampling

        Non-numeric features get one recorded value chosen at random. Numeric features get
        the mean of two randomly chosen recorded values, followed by a unit; the unit is
        either rewritten through config_dynamic_units or, with 30% probability, the number
        is replaced by a random fraction. A numeric feature whose sampled value contains no
        digits is stored unchanged.
        """
        for feature in sample_features:
            if feature not in self.config_numeric_fields: # check if the feature is numeric
                self.sample[feature] = random.sample(self.features_values[feature], 1)[0]
                continue

            n = 2
            total_numeric = 0
            raw_value = None
            value = None
            numeric = None
            for i in range(n):
                raw_value = random.sample(self.features_values[feature], 1)[0]
                value = str(raw_value)
                match = re.search(r"-*\d+\.?\d*", value) # extract the numeric part of the value
                if match is None:
                    numeric = None
                    break
                numeric = match.group(0)
                total_numeric += float(numeric)

            if numeric is None:
                # no digits to average (e.g. a textual placeholder): keep the value as recorded
                self.sample[feature] = raw_value
                continue

            # The unit is taken from the last sampled value. Remove the matched number itself:
            # str.strip(numeric) would treat `numeric` as a set of characters and also eat
            # digits that belong to the unit (e.g. "2.5mm2" -> "mm").
            unit = value.replace(numeric, "", 1)
            total_numeric = total_numeric / n

            # amplify the diversity of unit (e.g. 1000Ω -> 1kΩ)
            if unit in self.config_dynamic_units_keys:
                new_unit = random.sample(list(self.config_dynamic_units[unit].keys()), 1)[0]
                mutator = float(self.config_dynamic_units[unit][new_unit])
                if mutator < 1:
                    # test if total_numeric is greater than 1/mutator (e.g. normally we do not write 8000mW)
                    while total_numeric >= 1 / mutator:
                        total_numeric = total_numeric * mutator
                if mutator > 1:
                    # test if total_numeric is greater than the mutator (e.g. normally we do not write 8000kW)
                    while total_numeric >= mutator:
                        total_numeric = total_numeric / mutator
                self.sample[feature] = str(round(total_numeric, 3)) + new_unit
            else:
                # amplify the diversity of the value (e.g. 0.25W -> 1/4W)
                if random.uniform(0, 1) < 0.7:
                    self.sample[feature] = str(round(total_numeric, 3)) + unit
                else:
                    self.sample[feature] = str(random.randint(1, 10)) + "/" + str(random.randint(1, 10)) + unit


    def random_sampling(self) -> dict:
        """
        returns a sample: a dict that starts with the keys 'class' and 'category' followed by
                the sampled features and their values
        """
        # Candidate lists keep the order of the underlying lists. Building them from set
        # differences made their order depend on string hashing, which can change between
        # interpreter runs and so defeats seeding of `random`.
        most_relevant_features_list = self.most_relevant_features_list.copy()
        most_relevant_lookup = set(most_relevant_features_list)
        features_list = [
            feature for feature in dict.fromkeys(self.features_list)
            if feature not in most_relevant_lookup
        ]

        sample_size = int(len(features_list) * self.p)
        most_relevant_features_size = int(sample_size * self.most_relevant_p)

        if sample_size < 1: # if sample_size is too small, set the two sizes to 1
            sample_size = 1
            most_relevant_features_size = 1

        self.sample = {}
        if "necessary" in self.component_config_class_features: # check if the component has a necessary feature
            sample_features = self.config_class_features[self.component_class]["necessary"]
            extended_sample_features = self.exist_in_pair(sample_features)
            self.sample_value(extended_sample_features)
            # necessary features (and their pair partners) are not drawn again as most relevant ones
            already_sampled = set(extended_sample_features)
            most_relevant_features_list = [
                feature for feature in dict.fromkeys(most_relevant_features_list)
                if feature not in already_sampled
            ]
            sample_size -= len(sample_features)
            most_relevant_features_size -= len(sample_features)

        if most_relevant_features_size > 0:
            if len(most_relevant_features_list) >= most_relevant_features_size:
                sample_features = random.sample(most_relevant_features_list, most_relevant_features_size)
            else:
                # fewer candidates than requested: take all of them (in random order)
                sample_features = random.sample(most_relevant_features_list, len(most_relevant_features_list))
            extended_sample_features = self.exist_in_pair(sample_features)
            self.sample_value(extended_sample_features)
            sample_size -= most_relevant_features_size

        if sample_size > 0:
            sample_features = random.sample(features_list, sample_size)
            extended_sample_features = self.exist_in_pair(sample_features)
            self.sample_value(extended_sample_features)

        output = {'class':self.component_class,'category':self.component_name}
        output.update(self.sample)
        return output