In [1]:
!pip install datasets



In [3]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.model_selection import train_test_split
from datasets import Dataset


In [4]:
df = pd.read_csv('/kaggle/input/new-data/Data_with_description_partial.csv')
df.head()

Unnamed: 0,image_link,group_id,entity_name,entity_value,description
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram,PROPS' NATURE INGREDIENT MENAGER MULTI-USAGE T...
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup,recalcitrantes sur toutes les surfaces (moquet...
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram,COMPOSITION Serving Size:1 Tablet (0.709 g)Eac...
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram,
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram,Horbaach HIGHSTRENGTH PSYLLIOM HUSK 1400 PLANT...


In [5]:
df['entity_name'].unique()

array(['item_weight', 'item_volume', 'voltage', 'wattage',
       'maximum_weight_recommendation', 'height', 'depth', 'width'],
      dtype=object)

In [6]:
df['image_link'][4]

'https://m.media-amazon.com/images/I/617Tl40LOXL.jpg'

In [7]:
len(df)

130000

In [8]:
df['group_id'].value_counts()

group_id
459516    9363
281678    6102
308856    5405
752266    4824
731432    4711
          ... 
521308       1
952470       1
297246       1
679049       1
178031       1
Name: count, Length: 247, dtype: int64

In [9]:
df1 = df.drop('image_link', axis=1)
df1.head()

Unnamed: 0,group_id,entity_name,entity_value,description
0,748919,item_weight,500.0 gram,PROPS' NATURE INGREDIENT MENAGER MULTI-USAGE T...
1,916768,item_volume,1.0 cup,recalcitrantes sur toutes les surfaces (moquet...
2,459516,item_weight,0.709 gram,COMPOSITION Serving Size:1 Tablet (0.709 g)Eac...
3,459516,item_weight,0.709 gram,
4,731432,item_weight,1400 milligram,Horbaach HIGHSTRENGTH PSYLLIOM HUSK 1400 PLANT...


In [10]:
def prepare_input(row):
    return f"group: {row['group_id']} entity_name: {row['entity_name']} description: {row['description']}"

df1['input'] = df1.apply(prepare_input, axis=1)
df1['output'] = df1['entity_value']

In [11]:
df1.head()

Unnamed: 0,group_id,entity_name,entity_value,description,input,output
0,748919,item_weight,500.0 gram,PROPS' NATURE INGREDIENT MENAGER MULTI-USAGE T...,group: 748919 entity_name: item_weight descrip...,500.0 gram
1,916768,item_volume,1.0 cup,recalcitrantes sur toutes les surfaces (moquet...,group: 916768 entity_name: item_volume descrip...,1.0 cup
2,459516,item_weight,0.709 gram,COMPOSITION Serving Size:1 Tablet (0.709 g)Eac...,group: 459516 entity_name: item_weight descrip...,0.709 gram
3,459516,item_weight,0.709 gram,,group: 459516 entity_name: item_weight descrip...,0.709 gram
4,731432,item_weight,1400 milligram,Horbaach HIGHSTRENGTH PSYLLIOM HUSK 1400 PLANT...,group: 731432 entity_name: item_weight descrip...,1400 milligram


In [12]:
train_df, val_df = train_test_split(df1, test_size=0.2, random_state=42)

In [13]:
model_name = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [14]:
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)


In [15]:
train_dataset

Dataset({
    features: ['group_id', 'entity_name', 'entity_value', 'description', 'input', 'output', '__index_level_0__'],
    num_rows: 104000
})

In [16]:
def tokenize_function(examples):
    inputs = tokenizer(examples["input"], padding="max_length", truncation=True, max_length=512)
    outputs = tokenizer(examples["output"], padding="max_length", truncation=True, max_length=128)
    return {
        "input_ids": inputs.input_ids,
        "attention_mask": inputs.attention_mask,
        "labels": outputs.input_ids
    }

In [17]:
tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=train_dataset.column_names)
tokenized_val = val_dataset.map(tokenize_function, batched=True, remove_columns=val_dataset.column_names)


Map:   0%|          | 0/104000 [00:00<?, ? examples/s]

Map:   0%|          | 0/26000 [00:00<?, ? examples/s]

In [18]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [19]:
import numpy as np

def custom_f1_score(predictions, references):
    true_positives = 0
    false_positives = 0
    false_negatives = 0

    for pred, ref in zip(predictions, references):
        if pred != "" and ref != "":
            if pred == ref:
                true_positives += 1
            else:
                false_positives += 1
        elif pred != "" and ref == "":
            false_positives += 1
        elif pred == "" and ref != "":
            false_negatives += 1
        # Note: We don't need to count true negatives for F1 score

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return {"precision": precision, "recall": recall, "f1": f1}

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Compute custom F1 score
    result = custom_f1_score(decoded_preds, decoded_labels)
    
    return result

In [20]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",  # Added to match evaluation_strategy
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    fp16=True,
    gradient_accumulation_steps=4,
    save_total_limit=2,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    predict_with_generate=True# Specify which metric to use for selecting the best model
)



In [21]:
from transformers import DataCollatorForSeq2Seq

In [22]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

In [23]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [24]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.027,0.023934,0.489462,1.0,0.657233
2,0.0237,0.02232,0.527769,1.0,0.690902
3,0.0242,0.021852,0.536115,1.0,0.698014


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=9750, training_loss=0.05733240163020598, metrics={'train_runtime': 38751.2665, 'train_samples_per_second': 8.051, 'train_steps_per_second': 0.252, 'total_flos': 1.8999486185472e+17, 'train_loss': 0.05733240163020598, 'epoch': 3.0})

In [26]:
trainer.save_model("/kaggle/working//fine_tuned_model_3epoch")


In [1]:
input_text = "group: 101697 entity_name: width description: Happy Birthday! 4n 5\" 6\" uL 8\" 9\" 10\" 11\" 12\""
inputs = tokenizer(input_text, return_tensors="pt")
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
inputs = {k: v.to(device) for k, v in inputs.items()}
outputs = model.generate(**inputs)
predicted_entity_value = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Predicted entity value: {predicted_entity_value}")

NameError: name 'tokenizer' is not defined

In [33]:
def predict(input_text):
    inputs = tokenizer(input_text, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    outputs = model.generate(**inputs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage
test_input = "group: 101697 entity_name: depth description: 23.3cm/9.17in 27.2cm/10.7in"
prediction = predict(test_input)
print(f"Input: {test_input}")
print(f"Predicted entity value: {prediction}")

Input: group: 101697 entity_name: depth description: 23.3cm/9.17in 27.2cm/10.7in
Predicted entity value: 23.3 centimetre


In [None]:
eval_results = trainer.evaluate()


In [None]:
eval_results

{'eval_loss': 0.03720603138208389,
 'eval_runtime': 66.0455,
 'eval_samples_per_second': 33.416,
 'eval_steps_per_second': 8.358,
 'epoch': 2.99728014505893}

In [4]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import pandas as pd

# Directory where all your model and tokenizer files are saved
model_directory = '/kaggle/input/model/transformers/default/1'

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_directory)

# Load the model
model = AutoModelForSeq2SeqLM.from_pretrained(model_directory)

In [None]:
import pandas as pd

# Load the CSV file
test_df = pd.read_csv('/kaggle/input/testing-dataset/merged_output.csv')

# Assuming the CSV has 'group_id', 'entity_name', and 'description' columns
def predict_from_csv(row):
    input_text = f"group: {row['group_id']} entity_name: {row['entity_name']} description: {row['description']}"
    inputs = tokenizer(input_text, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    outputs = model.generate(**inputs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Apply the prediction function to each row in the DataFrame
test_df['predicted_entity_value'] = test_df.apply(predict_from_csv, axis=1)

# Save predictions to a new CSV file
test_df.to_csv('/kaggle/working/output/test_out.csv', index=False)

# Optionally, print out some predictions
print(test_df[['group_id', 'entity_name', 'description', 'predicted_entity_value']].head())




In [32]:
print("over")

over
