In [1]:
from transformers import pipeline
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, AutoTokenizer, AutoModel
from transformers import DataCollatorForTokenClassification
import datasets
import evaluate

import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

  from .autonotebook import tqdm as notebook_tqdm


### Load data

In [2]:
def get_tags(tags_str):
    """Split a comma-separated tag string into a list of cleaned tags.

    Args:
        tags_str: Comma-separated tags (e.g. ``'make:ford, year:1963'``).
            Anything that is not a non-empty string (for example a missing
            value such as NaN or None) yields an empty list.

    Returns:
        The stripped tags, in their original order, with ``vehicle-type``
        rewritten to ``vehicletype`` and every tag containing one of the
        excluded markers removed.
    """
    # Missing values are not strings; len() on them would raise.
    if not isinstance(tags_str, str) or not tags_str:
        return []

    # A tag is dropped when any of these markers occurs anywhere inside it.
    excluded_markers = (
        'attribute:',
        'attributevalue:',
        'autoparts & accessories:',
        'class:',
        'domain:',
        'finance:',
        'vin:',
        'non-autos:',
    )

    tags_str = tags_str.replace('vehicle-type', 'vehicletype')
    tags = [t.strip() for t in tags_str.split(',')]
    return [t for t in tags if not any(marker in t for marker in excluded_markers)]


In [3]:
def word_pos_to_index(text, words):
    """Map the character offset where each word starts in ``text`` to its index in ``words``.

    Args:
        text: The full query string.
        words: The words of ``text`` in order of appearance.

    Returns:
        A dict ``{start_offset: word_index}``. Words that cannot be found at
        or after the end of the previous match are left out.
    """
    pos_to_index = {}
    start = 0
    for i, w in enumerate(words):
        pos = text.find(w, start)
        if pos >= 0:
            pos_to_index[pos] = i
            # Continue right after this occurrence. Advancing only by len(w)
            # ignores the separators, so the cursor would lag behind and a
            # later word could match inside an earlier one.
            start = pos + len(w)
    return pos_to_index

In [4]:
def update_labels(labels, labels_len, label, pattern, text, text_len, pos_to_index):
    """Write B-/I- labels into ``labels`` for every occurrence of ``pattern`` in ``text``.

    An occurrence is labelled only when it starts at an offset present in
    ``pos_to_index`` and all of its words fit inside ``labels``.

    Args:
        labels: Per-word label list, modified in place.
        labels_len: Number of entries in ``labels``.
        label: Base label name; written as ``B-<label>`` / ``I-<label>``.
        pattern: Text to search for (may span several words).
        text: The full query string.
        text_len: Length of ``text``.
        pos_to_index: Mapping from character offset to word index.

    Returns:
        None. ``labels`` is updated in place.
    """
    pattern_words_len = len(pattern.split())
    # An empty pattern matches at every offset without advancing the cursor,
    # which would loop forever; a whitespace-only one names no word at all.
    if pattern_words_len == 0:
        return

    pattern_len = len(pattern)

    start = 0
    while start < text_len:
        pos = text.find(pattern, start)
        if pos < 0:
            break

        start = pos + pattern_len

        index = pos_to_index.get(pos, -1)
        if (index >= 0) and (index + pattern_words_len - 1 < labels_len):
            labels[index] = f'B-{label}'
            for i in range(index + 1, index + pattern_words_len):
                labels[i] = f'I-{label}'

In [5]:
def get_labels(text, words, tags):
    """Build the per-word label list for one query.

    Args:
        text: The full query string.
        words: The words of ``text``; one label is produced per word.
        tags: Tags of the form ``'<label>:<value>'``. Tags that do not split
            into exactly one non-empty label and one non-empty value are
            skipped.

    Returns:
        A list with one entry per word: ``'0'`` for unlabelled words,
        otherwise ``'B-<label>'`` / ``'I-<label>'`` as written by
        ``update_labels``.
    """
    labels_len = len(words)
    labels = ['0'] * labels_len

    if not tags:
        return labels

    pos_to_index = word_pos_to_index(text, words)
    text_len = len(text)
    for tag in tags:
        # Strip both sides so that 'make: toyota' still matches the word
        # 'toyota'; an unstripped value starts with a space and can never
        # begin at a word offset.
        parts = [part.strip() for part in tag.split(':')]
        parts = [part for part in parts if part]
        if len(parts) != 2:
            continue

        label, pattern = parts
        update_labels(labels, labels_len, label, pattern, text, text_len, pos_to_index)

    return labels

In [6]:
def label_base(x):
    """Return the label name without its leading ``B-`` or ``I-`` prefix.

    Only a prefix at the very start is removed; the same characters occurring
    later in the name are kept. Labels without a prefix (such as ``'0'``) are
    returned unchanged.
    """
    for prefix in ('B-', 'I-'):
        if x.startswith(prefix):
            return x[len(prefix):]
    return x

In [7]:
def label_order(x):
    """Sort key for labels: move the B-/I- marker from the front to the end.

    ``'B-make'`` becomes ``'makeB'`` and ``'I-make'`` becomes ``'makeI'``, so
    labels sharing a base name sort next to each other. Labels without a
    marker are returned unchanged.
    """
    for marker, suffix in (('B-', 'B'), ('I-', 'I')):
        if marker in x:
            x = x.replace(marker, '') + suffix
    return x

In [8]:
# Directory that holds the tagged-query TSV read below and receives the test file
# written at the end of the notebook.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
path = 'E:/work/auto/bingans/quality'

In [9]:
# Tab-separated tagging results; the first row holds the column names.
# Later cells use the 'Query', 'Result.Tags' and 'Result.Intent' columns.
file = f'{path}/tag_results_large.tsv'
results = pd.read_csv(file, sep='\t', header=0)

In [10]:
results.shape

(6773, 3)

In [11]:
results['Result.Intent'].value_counts()

Result.Intent
Research                                 1149
Buy                                      1028
Non-Autos                                 943
Navigational                              900
Buy, Research                             564
Research, Buy                             524
Finance                                   438
Autoparts & Accessories, Ownership        390
Autoparts & Accessories                   258
Ownership                                 238
Sell                                       74
Autoparts & Accessories, Research          40
Research, Autoparts & Accessories          26
Research, Ownership                        26
Research, Finance                          25
Buy, Navigational                          17
Ownership, Research                        17
Autoparts & Accessories, Buy               16
Ownership, Autoparts & Accessories         16
Rent                                       14
Buy, Autoparts & Accessories               13
Research, Navigation

In [12]:
cols = ['Query', 'Result.Tags']
# Research-only queries: intent mentions Research but neither Buy nor
# Autoparts & Accessories. The explicit copy makes `research` an independent
# frame, so the columns added in the next cell are not written to a slice of
# `results` (which triggers SettingWithCopyWarning).
research = results[
    results['Result.Intent'].str.contains('Research')
    & ~results['Result.Intent'].str.contains('Autoparts & Accessories')
    & ~results['Result.Intent'].str.contains('Buy')
][cols].copy()

In [80]:
research.shape

(1231, 2)

In [81]:
# Derive the normalised query, parsed tags, word list and per-word labels.
# Each callable sees the columns created by the entries before it.
research = research.assign(
    NormQuery=lambda frame: frame['Query'].str.lower(),
    TagsStr=lambda frame: frame['Result.Tags'].str.lower(),
    Tags=lambda frame: frame['TagsStr'].apply(get_tags),
    Words=lambda frame: frame['NormQuery'].str.split(),
    Labels=lambda frame: frame.apply(
        lambda row: get_labels(row.NormQuery, row.Words, row.Tags), axis=1
    ),
)

In [82]:
# Distinct base label names (B-/I- prefix removed) over all research queries.
unique_labels_research = set().union(
    *(map(label_base, label_arr) for label_arr in research['Labels'])
)
sorted_labels_research = list(unique_labels_research)
sorted_labels_research.sort()

In [83]:
unique_labels_research

{'0',
 'aggregator',
 'bodystyle',
 'color',
 'compare',
 'dealer',
 'dealername',
 'drivetrain',
 'engine',
 'intentindicator',
 'location',
 'make',
 'mileage',
 'model',
 'modifier',
 'segment',
 'stateofvehicle',
 'transmission',
 'trim',
 'vehicletype',
 'year'}

In [84]:
len(unique_labels_research)

21

In [85]:
# Labels seen in the research subset but not in `unique_labels`.
# NOTE(review): `unique_labels` is only assigned in a cell further down, so this
# fails on a fresh Restart & Run All; move this cell below that definition.
unique_labels_research - unique_labels

{'compare', 'mileage'}

In [86]:
# Labels in `unique_labels` that never occur in the research subset.
# NOTE(review): `unique_labels` is only assigned in a cell further down, so this
# fails on a fresh Restart & Run All; move this cell below that definition.
unique_labels - unique_labels_research

{'price'}

In [13]:
# cols = ['Query', 'Result.Tags', 'Result.Intent', 'Result.Explanation', 'Result.PrimaryIntent']
cols = ['Query', 'Result.Tags']
# Queries whose intent mentions Buy or Research, excluding anything tagged
# Autoparts & Accessories. The explicit copy makes `buy_research` an
# independent frame, so the columns added later are not written to a slice of
# `results` (which triggers SettingWithCopyWarning).
buy_research = results[
    (
        results['Result.Intent'].str.contains('Buy')
        | results['Result.Intent'].str.contains('Research')
    )
    & ~results['Result.Intent'].str.contains('Autoparts & Accessories')
][cols].copy()

In [14]:
buy_research.shape

(3390, 2)

In [89]:
# Derive the normalised query, parsed tags, word list and per-word labels.
# Each callable sees the columns created by the entries before it.
buy_research = buy_research.assign(
    NormQuery=lambda frame: frame['Query'].str.lower(),
    TagsStr=lambda frame: frame['Result.Tags'].str.lower(),
    Tags=lambda frame: frame['TagsStr'].apply(get_tags),
    Words=lambda frame: frame['NormQuery'].str.split(),
    Labels=lambda frame: frame.apply(
        lambda row: get_labels(row.NormQuery, row.Words, row.Tags), axis=1
    ),
)

In [43]:
# Manual label override for the row with index 40.
# NOTE(review): `buy` is not defined anywhere in the visible notebook (presumably
# left over from an earlier experiment), so this cell raises NameError on a
# fresh run. Retarget it to the intended frame or delete it.
buy.at[40, 'Labels'] = ['B-make', 'B-model', '0', 'B-location']

In [90]:
# Distinct base label names (B-/I- prefix removed) over all buy/research queries,
# plus an alphabetically ordered list of them.
unique_labels = set()
unique_labels.update(*(map(label_base, label_arr) for label_arr in buy_research['Labels']))
sorted_labels = list(unique_labels)
sorted_labels.sort()

In [91]:
# Expand every base label except the outside label '0' into its B-/I- pair.
# Built from `unique_labels` rather than from `sorted_labels` itself, so
# re-running this cell gives the same list instead of 'B-B-...' entries.
sorted_labels = [
    mod_label
    for l in sorted(unique_labels)
    if l != '0'
    for mod_label in ('B-' + l, 'I-' + l)
]

In [92]:
# Put the outside label '0' at id 0. The membership check keeps the cell
# idempotent: re-running it does not insert a second '0' and shift every id.
if '0' not in sorted_labels:
    sorted_labels.insert(0, '0')

In [93]:
len(sorted_labels)

43

In [94]:
# Two-way lookup between integer class ids and label names.
id2label = dict(enumerate(sorted_labels))
label2id = {name: idx for idx, name in id2label.items()}

In [95]:
# Translate each query's label names into their integer class ids.
buy_research['LabelIds'] = buy_research['Labels'].map(
    lambda names: [label2id[name] for name in names]
)

In [96]:
id2label

{0: '0',
 1: 'B-aggregator',
 2: 'I-aggregator',
 3: 'B-bodystyle',
 4: 'I-bodystyle',
 5: 'B-color',
 6: 'I-color',
 7: 'B-compare',
 8: 'I-compare',
 9: 'B-dealer',
 10: 'I-dealer',
 11: 'B-dealername',
 12: 'I-dealername',
 13: 'B-drivetrain',
 14: 'I-drivetrain',
 15: 'B-engine',
 16: 'I-engine',
 17: 'B-intentindicator',
 18: 'I-intentindicator',
 19: 'B-location',
 20: 'I-location',
 21: 'B-make',
 22: 'I-make',
 23: 'B-mileage',
 24: 'I-mileage',
 25: 'B-model',
 26: 'I-model',
 27: 'B-modifier',
 28: 'I-modifier',
 29: 'B-price',
 30: 'I-price',
 31: 'B-segment',
 32: 'I-segment',
 33: 'B-stateofvehicle',
 34: 'I-stateofvehicle',
 35: 'B-transmission',
 36: 'I-transmission',
 37: 'B-trim',
 38: 'I-trim',
 39: 'B-vehicletype',
 40: 'I-vehicletype',
 41: 'B-year',
 42: 'I-year'}

In [97]:
buy_research[:2]

Unnamed: 0,Query,Result.Tags,NormQuery,TagsStr,Tags,Words,Labels,LabelIds
6,13' boston whaler for sale,"Vehicle-Type:Boats, Make:Boston Whaler, IntentIndicator:for sale",13' boston whaler for sale,"vehicle-type:boats, make:boston whaler, intentindicator:for sale","[vehicletype:boats, make:boston whaler, intentindicator:for sale]","[13', boston, whaler, for, sale]","[0, B-make, I-make, B-intentindicator, I-intentindicator]","[0, 21, 22, 17, 18]"
7,1963 falcon for sale,"Year:1963, Make:Falcon, IntentIndicator:for sale",1963 falcon for sale,"year:1963, make:falcon, intentindicator:for sale","[year:1963, make:falcon, intentindicator:for sale]","[1963, falcon, for, sale]","[B-year, B-make, B-intentindicator, I-intentindicator]","[41, 21, 17, 18]"


### Create dataset

In [98]:
# Build a Hugging Face dataset from the word lists and their label ids.
# The DataFrame index comes along as the `__index_level_0__` column (see the
# sample below); the tokenization step removes it later.
data = datasets.Dataset.from_pandas(buy_research[['Words', 'LabelIds']])

In [99]:
data[2]

{'Words': ['1966', 'chevrolet', 'malibu', 'specs'],
 'LabelIds': [41, 21, 25, 0],
 '__index_level_0__': 8}

In [100]:
# Hold out 7% of the queries for evaluation. The fixed seed makes the split,
# and therefore the reported metrics, reproducible across kernel restarts.
split_ds = data.train_test_split(test_size=0.07, seed=42)

In [101]:
split_ds

DatasetDict({
    train: Dataset({
        features: ['Words', 'LabelIds', '__index_level_0__'],
        num_rows: 3152
    })
    test: Dataset({
        features: ['Words', 'LabelIds', '__index_level_0__'],
        num_rows: 238
    })
})

### Tokenizer

In [102]:
def labels_to_labelnames(labels):
    """Translate label ids to label names; the ignore id -100 becomes None."""
    names = []
    for label_id in labels:
        names.append(sorted_labels[label_id] if label_id != -100 else None)
    return names

In [103]:
def align_labels_with_subwords(wordids, labels):
    """Expand word-level label ids to sub-token level.

    Relies on the id layout used in this notebook: 0 is the outside label,
    every odd id is a ``B-`` label and the following even id is the matching
    ``I-`` label.

    Args:
        wordids: For each sub-token, the index of the word it belongs to, or
            None for special tokens.
        labels: One label id per word.

    Returns:
        One label id per sub-token: -100 for special tokens, the word's label
        for its first sub-token, and for continuation sub-tokens the word's
        label with ``B-`` turned into ``I-``.
    """
    updated_labels = []
    previous_word = None
    for w in wordids:
        if w is None:
            updated_labels.append(-100)
        else:
            l = labels[w]
            # Only continuation pieces of the SAME word turn B- into I-.
            # Comparing label ids instead would also merge two adjacent words
            # that each start their own entity of the same type.
            if w == previous_word and l % 2 == 1:
                l += 1
            updated_labels.append(l)
        previous_word = w

    return updated_labels

In [104]:
def tokenize_and_align_labels(ds_slice, tokenizer):
    """Tokenize a batch of pre-split queries and attach sub-token level labels.

    Args:
        ds_slice: Batch with a 'Words' column (word lists) and a 'LabelIds'
            column (one label id per word).
        tokenizer: Tokenizer called with ``is_split_into_words=True``; its
            result must offer ``tokens(i)`` and ``word_ids(i)``.

    Returns:
        The tokenizer output, extended with 'subtokens', 'word_ids' and
        'labels' entries (one list per example).
    """
    encoded = tokenizer(ds_slice['Words'], truncation=True, is_split_into_words=True)
    rows = range(len(ds_slice['Words']))

    encoded['subtokens'] = [encoded.tokens(row) for row in rows]
    encoded['word_ids'] = [encoded.word_ids(row) for row in rows]
    encoded['labels'] = [
        align_labels_with_subwords(encoded.word_ids(row), word_label_ids)
        for row, word_label_ids in enumerate(ds_slice['LabelIds'])
    ]

    return encoded

In [105]:
# Pretrained checkpoint used for both the tokenizer and the token-classification model.
# NOTE(review): this is a cased checkpoint while the training words come from the
# lower-cased NormQuery column; confirm that this is intended.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [106]:
# Tokenize both splits in batches and drop the leftover pandas index column.
mapped = split_ds.map(
    tokenize_and_align_labels,
    batched=True,
    fn_kwargs={'tokenizer': tokenizer},
    remove_columns=['__index_level_0__'],
)

Map:   0%|          | 0/3152 [00:00<?, ? examples/s]

Map:   0%|          | 0/238 [00:00<?, ? examples/s]

In [107]:
mapped['train'][1]

{'Words': ['bmw', 'xm', '2023'],
 'LabelIds': [21, 25, 41],
 'input_ids': [101, 171, 1306, 2246, 193, 1306, 17881, 1495, 102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1],
 'subtokens': ['[CLS]', 'b', '##m', '##w', 'x', '##m', '202', '##3', '[SEP]'],
 'word_ids': [None, 0, 0, 0, 1, 1, 2, 2, None],
 'labels': [-100, 21, 22, 22, 25, 26, 41, 42, -100]}

In [108]:
labels_to_labelnames(mapped['train'][1]['labels'])

[None,
 'B-make',
 'I-make',
 'I-make',
 'B-model',
 'I-model',
 'B-year',
 'I-year',
 None]

In [109]:
# Keep only the model inputs and labels. `remove_columns` drops the helper
# columns directly instead of running an identity `map` over every example.
ds = mapped.remove_columns(['Words', 'LabelIds', 'subtokens', 'word_ids'])

Map:   0%|          | 0/3152 [00:00<?, ? examples/s]

Map:   0%|          | 0/238 [00:00<?, ? examples/s]

In [110]:
ds['train'][2]

{'input_ids': [101, 180, 1465, 3960, 25112, 102],
 'token_type_ids': [0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1],
 'labels': [-100, 21, 22, 25, 0, -100]}

### Data collation and padding

In [111]:
# Collator that pads the examples of a batch to a common length; in the sample
# batch below, padded label positions hold -100.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [112]:
# Smoke test: collate the first two training examples into one padded batch.
batch = data_collator([ds["train"][i] for i in range(2)])

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [113]:
batch

{'input_ids': tensor([[  101,  1107, 10359,   189, 10160,  1162, 21552,   102,     0],
         [  101,   171,  1306,  2246,   193,  1306, 17881,  1495,   102]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'labels': tensor([[-100,   21,   22,    3,    4,    4,   39, -100, -100],
         [-100,   21,   22,   22,   25,   26,   41,   42, -100]])}

### Metrics

In [114]:
# Sequence-labelling metric used by compute_metrics below.
metric = evaluate.load("seqeval")

In [115]:
def compute_metrics(eval_preds):
    """Compute overall seqeval precision, recall, F1 and accuracy for one evaluation pass.

    Args:
        eval_preds: Pair ``(logits, labels)``. ``labels`` uses -100 for
            positions that must be ignored.

    Returns:
        Dict with the keys 'precision', 'recall', 'f1' and 'accuracy'.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    def tag_name(label_id):
        # This notebook spells the outside label as the digit '0'. seqeval
        # only treats the letter 'O' as "outside"; any other tag is parsed as
        # part of an entity chunk, which would distort the scores.
        name = sorted_labels[label_id]
        return 'O' if name == '0' else name

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[tag_name(l) for l in label if l != -100] for label in labels]
    true_predictions = [
        [tag_name(p) for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

### Model

In [116]:
# Load the pretrained checkpoint with a token-classification head. The label
# mappings are stored in the model config (see `model.config` below).
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [117]:
model.config

BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "0",
    "1": "B-aggregator",
    "2": "I-aggregator",
    "3": "B-bodystyle",
    "4": "I-bodystyle",
    "5": "B-color",
    "6": "I-color",
    "7": "B-compare",
    "8": "I-compare",
    "9": "B-dealer",
    "10": "I-dealer",
    "11": "B-dealername",
    "12": "I-dealername",
    "13": "B-drivetrain",
    "14": "I-drivetrain",
    "15": "B-engine",
    "16": "I-engine",
    "17": "B-intentindicator",
    "18": "I-intentindicator",
    "19": "B-location",
    "20": "I-location",
    "21": "B-make",
    "22": "I-make",
    "23": "B-mileage",
    "24": "I-mileage",
    "25": "B-model",
    "26": "I-model",
    "27": "B-modifier",
    "28": "I-modifier",
    "29": "B-price",
    "30": "

### Trainer

In [118]:
# Training configuration: evaluate and checkpoint every 100 steps, keep at most
# two checkpoints on disk and reload the best one once training ends.
args = TrainingArguments(
    output_dir="AutosNer_buy_research_07_08_2023",
    # Evaluation and saving must follow the same schedule for
    # load_best_model_at_end to work.
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=10,
    # No external experiment tracker.
    report_to="none",
)

In [None]:
# Fine-tune on the training split; the held-out split is scored with
# compute_metrics at every evaluation step.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

### Using model

In [None]:
# Wrap the in-memory model in a token-classification pipeline and try it on a
# sample query.
token_classifier = pipeline(
    "token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="max"
)
token_classifier("toyota rav4 under 35000 near renton")

[{'entity_group': 'make',
  'score': 0.9305529,
  'word': 'toyota',
  'start': 0,
  'end': 6},
 {'entity_group': 'model',
  'score': 0.7887881,
  'word': 'rav4',
  'start': 7,
  'end': 11},
 {'entity_group': '0',
  'score': 0.34494418,
  'word': 'under',
  'start': 12,
  'end': 17},
 {'entity_group': 'location',
  'score': 0.22965065,
  'word': '35000',
  'start': 18,
  'end': 23},
 {'entity_group': 'location',
  'score': 0.52201116,
  'word': 'near renton',
  'start': 24,
  'end': 35}]

### Load model from local dir

In [13]:
def get_ner_tags(classification_list):
    """Keep only the pipeline results whose entity group is not the outside label '0'."""
    kept = []
    for entity in classification_list:
        if entity['entity_group'] != '0':
            kept.append(entity)
    return kept

In [19]:
# Local checkpoint directory of a previously trained model.
# Every backslash is escaped: the original `\A` was an invalid escape sequence,
# which Python warns about. The resulting string is unchanged.
model_path = "D:\\nlp\\nn\\tokenclassification\\AutosNerModel_07_07_2023/checkpoint-700"

In [None]:
# Tokenizer files stored alongside the local checkpoint.
local_tokenizer = AutoTokenizer.from_pretrained(model_path)

In [20]:
# Token-classification model restored from the local checkpoint.
local_model = AutoModelForTokenClassification.from_pretrained(model_path)

In [21]:
local_model.num_parameters()

107749671

In [12]:
local_model.config

BertConfig {
  "_name_or_path": "./AutosNerModel_07_07_2023/checkpoint-700",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "0",
    "1": "B-aggregator",
    "2": "I-aggregator",
    "3": "B-bodystyle",
    "4": "I-bodystyle",
    "5": "B-color",
    "6": "I-color",
    "7": "B-dealer",
    "8": "I-dealer",
    "9": "B-dealername",
    "10": "I-dealername",
    "11": "B-drivetrain",
    "12": "I-drivetrain",
    "13": "B-engine",
    "14": "I-engine",
    "15": "B-intentindicator",
    "16": "I-intentindicator",
    "17": "B-location",
    "18": "I-location",
    "19": "B-make",
    "20": "I-make",
    "21": "B-model",
    "22": "I-model",
    "23": "B-modifier",
    "24": "I-modifier",
    "25": "B-price",
    "26": "I-price",
    "27": "B-segment",
    "28": "I-segment",

In [11]:
# Pipeline built straight from the checkpoint directory, tried on a sample query.
# NOTE(review): this loads model and tokenizer from disk again instead of
# reusing `local_model` / `local_tokenizer` created above.
local_token_classifier = pipeline(
    "token-classification", model=model_path, tokenizer=model_path, aggregation_strategy="max"
)
local_token_classifier("toyota rav4 with awd under $35000 near renton")

[{'entity_group': 'make',
  'score': 0.9983127,
  'word': 'toyota',
  'start': 0,
  'end': 6},
 {'entity_group': 'model',
  'score': 0.99685186,
  'word': 'rav4',
  'start': 7,
  'end': 11},
 {'entity_group': '0',
  'score': 0.9472336,
  'word': 'with',
  'start': 12,
  'end': 16},
 {'entity_group': 'drivetrain',
  'score': 0.6339309,
  'word': 'awd',
  'start': 17,
  'end': 20},
 {'entity_group': 'price',
  'score': 0.8370262,
  'word': 'under $ 35000',
  'start': 21,
  'end': 33},
 {'entity_group': '0',
  'score': 0.5044755,
  'word': 'near',
  'start': 34,
  'end': 38},
 {'entity_group': 'location',
  'score': 0.85374326,
  'word': 'renton',
  'start': 39,
  'end': 45}]

In [15]:
str(get_ner_tags(local_token_classifier("toyota rav4 with awd under $35000 near renton")))

"[{'entity_group': 'make', 'score': 0.9983127, 'word': 'toyota', 'start': 0, 'end': 6}, {'entity_group': 'model', 'score': 0.99685186, 'word': 'rav4', 'start': 7, 'end': 11}, {'entity_group': 'drivetrain', 'score': 0.6339309, 'word': 'awd', 'start': 17, 'end': 20}, {'entity_group': 'price', 'score': 0.8370262, 'word': 'under $ 35000', 'start': 21, 'end': 33}, {'entity_group': 'location', 'score': 0.85374326, 'word': 'renton', 'start': 39, 'end': 45}]"

In [5]:
# Hand-picked sample queries for a qualitative check of the local model.
queries = [
    'rav4 under 20k near toledo',
    'honda crv limited under 25000 with less than 30000 miles',
    'Suv under 40K close to Oakland',
    'minivan less than 45000 near Oakland',
    'cheap used cars under 10K',
    'best suv with price under 35k in Bellevue',
    'sports cars under 25k',
    'luxury mid size sedan in grey color with awd between 20k and 35k',
    '2022 nissan civic with less than 10K miles',
    'red mazda cx9 newer than 2019',
    'cr-v vs mazda cx9',
    'klein honda',
    'honda of seattle'
]

In [6]:
# Classify the lower-cased query: the training text in this notebook is
# lower-cased and the tokenizer is case-sensitive, so 'Suv' or '40K' would be
# tokenized differently from anything seen during training.
# NOTE(review): assumes the local checkpoint was trained on lower-cased
# queries as well; confirm.
[f'{s} {local_token_classifier(s.lower())}' for s in queries]

["rav4 under 20k near toledo [{'entity_group': 'model', 'score': 0.91556233, 'word': 'rav4', 'start': 0, 'end': 4}, {'entity_group': 'price', 'score': 0.89678526, 'word': 'under 20k', 'start': 5, 'end': 14}, {'entity_group': 'location', 'score': 0.9253358, 'word': 'near toledo', 'start': 15, 'end': 26}]",
 "honda crv limited under 25000 with less than 30000 miles [{'entity_group': 'make', 'score': 0.9982223, 'word': 'honda', 'start': 0, 'end': 5}, {'entity_group': 'model', 'score': 0.99104816, 'word': 'crv', 'start': 6, 'end': 9}, {'entity_group': 'trim', 'score': 0.41648585, 'word': 'limited', 'start': 10, 'end': 17}, {'entity_group': 'price', 'score': 0.9390913, 'word': 'under 25000', 'start': 18, 'end': 29}, {'entity_group': '0', 'score': 0.8885052, 'word': 'with less than', 'start': 30, 'end': 44}, {'entity_group': 'price', 'score': 0.73682314, 'word': '30000', 'start': 45, 'end': 50}, {'entity_group': '0', 'score': 0.8931481, 'word': 'miles', 'start': 51, 'end': 56}]",
 "Suv under

### Create test file for polaris

In [15]:
# Draw 200 queries for the external test file. The fixed random_state keeps
# the exported sample identical across notebook runs.
test = buy_research.sample(n=200, random_state=42)

In [18]:
# Write only the raw queries, one per line, without header or index column.
test['Query'].to_csv(f'{path}/autos_ner_test.tsv', sep='\t', index=False, header=False)