In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

import csv


In [5]:
file_path = '/kaggle/input/challenge/Train_Tagged_Titles.tsv'

# Read the tagged training titles. Every column is kept as a string, quotes are
# treated as ordinary characters (QUOTE_NONE) and only truly empty cells become NaN.
trained_data = pd.read_csv(file_path, sep="\t", dtype=str, keep_default_na=False, na_values=[""], quoting=csv.QUOTE_NONE)
trained_data = trained_data.replace('No Tag', '0')

# Empty cells (NaN) inherit the closest non-empty value above them.
# NOTE(review): presumably an empty Tag marks a token that continues the previous
# token's tag -- confirm against the dataset description.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
trained_data = trained_data.ffill()

# One comma-separated label string per record, repeated on each of its token rows.
trained_data['word_labels'] = trained_data.groupby('Record Number')['Tag'].transform(','.join)

# Collapse the per-token rows into one (sentence, word_labels) row per distinct pair.
data = (
    trained_data[["Title", "word_labels"]]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'Title': 'sentence'})
)
# `remove_special_characters_from_sentence` is not defined in this cell; it must
# already exist in the kernel when this cell runs.
data['sentence'] = data['sentence'].apply(remove_special_characters_from_sentence)

# Tags are numbered from 1 in order of first appearance; id 0 is reserved for '0'.
# If '0' is itself among the unique tags (it is whenever the file contains
# 'No Tag'), its positional id stays in id2label as a second alias of '0' while
# label2id['0'] is overwritten with 0. This layout is kept on purpose: changing it
# would shift the ids that already-trained models were fitted with.
unique_tags = trained_data.Tag.unique()
label2id = {tag: position + 1 for position, tag in enumerate(unique_tags)}
id2label = {position + 1: tag for position, tag in enumerate(unique_tags)}

label2id['0'] = 0
id2label[0] = '0'


In [13]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForTokenClassification


# The checkpoints below have since been deleted to comply with the E-Bay rules.
def _load_checkpoint(checkpoint_name):
    """Load the tokenizer, then the token-classification model, of one checkpoint."""
    return (
        BertTokenizer.from_pretrained(checkpoint_name),
        BertForTokenClassification.from_pretrained(checkpoint_name),
    )


tokenizer_gelectra, model_gelectra = _load_checkpoint('codern/downstream-gelectra-large')

tokenizer_gbert, model_gbert = _load_checkpoint('codern/downstream-gbert-large')

tokenizer_guncased, model_guncased = _load_checkpoint('codern/bert-regular-base-uncased')

Downloading (…)lve/main/config.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

In [14]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [36]:
def extractor(sentence, tokenizer, model):
    """Predict one label per leading word piece of ``sentence``.

    The sentence is cleaned with ``remove_special_characters_from_sentence``,
    tokenized (padded/truncated to 128 tokens), run through ``model`` and the
    arg-max class of every token is translated with the global ``id2label``.

    Args:
        sentence: Raw title text.
        tokenizer: Tokenizer matching ``model``; called on the cleaned sentence
            and used to map ids back to token strings.
        model: Token-classification model exposing ``num_labels`` and returning
            the logits as the first element of its output.

    Returns:
        list: Labels of all tokens that are neither ``##`` continuation pieces
        nor ``[CLS]``/``[SEP]``/``[PAD]``, in token order. The length is not
        guaranteed to equal ``len(sentence.split())``; callers must check.
    """
    sentence = remove_special_characters_from_sentence(sentence)
    inputs = tokenizer(sentence, padding='max_length', truncation=True, max_length=128, return_tensors="pt")

    # Put the inputs on the device that holds the model's weights, so the call
    # works whether or not the model was moved to the global `device` beforehand.
    model_device = next(model.parameters()).device
    ids = inputs["input_ids"].to(model_device)
    mask = inputs["attention_mask"].to(model_device)

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(ids, mask)
    logits = outputs[0]

    # Flatten to (tokens, num_labels) and keep the best class id per token.
    active_logits = logits.view(-1, model.num_labels)
    flattened_predictions = torch.argmax(active_logits, dim=1)

    tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
    token_predictions = [id2label[label_id] for label_id in flattened_predictions.tolist()]

    # Keep the label of each word's first piece; drop continuations and special tokens.
    skipped_tokens = ('[CLS]', '[SEP]', '[PAD]')
    word_level_predictions = [
        label
        for token, label in zip(tokens, token_predictions)
        if not token.startswith("##") and token not in skipped_tokens
    ]
    return word_level_predictions

# Sanity check: do the gbert and the uncased model label this sample title identically?
sentence = 'New Balance Sneakers Damen Freizeitschuhe Turnschuhe Gr . DE 42.5 Kun ... # 19f7d45'
labels_gbert = extractor(sentence, tokenizer_gbert, model_gbert)
labels_guncased = extractor(sentence, tokenizer_guncased, model_guncased)
print(labels_gbert == labels_guncased)

True


In [54]:
# Location of the untagged listing titles.
file_path = '/kaggle/input/challenge/Listing_Titles.tsv'

# Parse the tab-separated file: all columns as strings, quote characters taken
# literally, and only empty cells interpreted as missing values.
listing_data = pd.read_csv(
    file_path,
    sep="\t",
    quoting=csv.QUOTE_NONE,
    dtype=str,
    na_values=[""],
    keep_default_na=False,
)

In [55]:
# Keep the rows whose index labels are 5000..29999 (both ends inclusive with .loc).
# On the freshly loaded frame, whose index is the default 0..N-1 range, this is the
# same selection as the positional slice [5000:30000]; selecting by label also makes
# the cell safe to re-run, because the labels survive the slice whereas positions
# would be counted again from the already-shortened frame.
listing_data = listing_data.loc[5000:29999]


In [56]:
# Preview the first five rows of the selected listings.
listing_data.head(n=5)

Unnamed: 0,Record Number,Title
5000,5001,NIKE FREE RUN 3 SHIELD 5.0 SNEAKERS LAUFSCHUHE...
5001,5002,DAMEN SCHUHE 153351 SNEAKER WEISS 38 NEU
5002,5003,Converse Sneakers Damen Gr . DE 36 Leder grau ...
5003,5004,Adidas Freizeitschuh Gr UK 9
5004,5005,K Swiss Schuhe schwarz Leder größe 41 low snea...




In [62]:
import pandas as pd
from tqdm import tqdm
from collections import Counter


result_rows = []

# `listing_data`, `extractor` and the three tokenizer/model pairs must already be
# defined by earlier cells.
for _, row in tqdm(listing_data.iterrows(), total=len(listing_data), desc="Processing"):
    record_number = row['Record Number']
    title = row['Title']

    # Empty titles were read as NaN (na_values=[""]); there is nothing to tag.
    if not isinstance(title, str):
        continue

    characteristics_guncased = extractor(title, tokenizer_guncased, model_guncased)
    characteristics_gelectra = extractor(title, tokenizer_gelectra, model_gelectra)
    characteristics_gbert = extractor(title, tokenizer_gbert, model_gbert)

    # Majority vote per position. On a three-way tie the first-seen label wins,
    # i.e. the uncased model's prediction.
    characteristics = [
        Counter(votes).most_common(1)[0][0]
        for votes in zip(characteristics_guncased, characteristics_gelectra, characteristics_gbert)
    ]

    # NOTE(review): the words come from the raw title while `extractor` labels the
    # cleaned title -- confirm that both always split into the same words.
    words = title.split()

    # Report a word/label count mismatch once per title, before any word is skipped.
    if len(characteristics) != len(words):
        print(record_number, words)

    # zip stops at the shorter sequence, so a short prediction list can no longer
    # raise IndexError; surplus words simply receive no aspect.
    for value, aspect_name in zip(words, characteristics):
        if aspect_name == '0' or aspect_name == 0 or aspect_name == 'No Tag':
            continue

        previous = result_rows[-1] if result_rows else None

        # Extend the previous row when it belongs to the same record and aspect.
        # NOTE(review): untagged words in between do not break the run, so two
        # separated spans with the same aspect are merged -- confirm this is intended.
        if previous is not None and record_number == previous['Record Number'] and aspect_name == previous['Aspect Name']:
            previous['Aspect Value'] += ' ' + value
        else:
            # Start a new aspect row.
            result_rows.append({'Record Number': record_number, 'Aspect Name': aspect_name, 'Aspect Value': value})

# Create the DataFrame from the accumulated rows
result_df = pd.DataFrame(result_rows)

result_df.head()


Processing: 100%|██████████| 25000/25000 [1:53:22<00:00,  3.68it/s]  


Unnamed: 0,Record Number,Aspect Name,Aspect Value
0,5001,Marke,NIKE
1,5001,Produktlinie,FREE
2,5001,Modell,RUN 3 SHIELD 5.0
3,5001,Stil,SNEAKERS
4,5001,Produktart,LAUFSCHUHE


In [63]:
# Destination of the ensemble predictions.
output_file_path = '/kaggle/working/evalensemble.tsv'

# Write the predictions tab-delimited, leaving out the DataFrame index.
result_df.to_csv(output_file_path, index=False, sep='\t')

print("finish")

finish
