## Libraries

In [1]:
pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=80be6abbf435655a1a31641035daebaa1a484115d81e3929e86b1aa96f22c0c7
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Note: you may need to restart the kernel to use updated packages.


In [18]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from transformers import TrainerCallback
from transformers import GPT2Tokenizer
from rouge_score import rouge_scorer
import pandas as pd
import json
import os

## Dataset

In [3]:
folder_path = '/kaggle/input/amazon-products-dataset/'
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
csv_files

['Gaming Consoles.csv',
 'Car Electronics.csv',
 'Janitorial and Sanitation Supplies.csv',
 'All Electronics.csv',
 'All Books.csv',
 'Make-up.csv',
 'Travel Accessories.csv',
 'Indian Language Books.csv',
 'Car and Bike Care.csv',
 'Sunglasses.csv',
 'Bags and Luggage.csv',
 'Yoga.csv',
 'Sportswear.csv',
 'Fiction Books.csv',
 'Exam Central.csv',
 'Home Storage.csv',
 'Toys Gifting Store.csv',
 'All English.csv',
 'Amazon-Products.csv',
 'Air Conditioners.csv',
 'Shoes.csv',
 'Casual Shoes.csv',
 'Baby Products.csv',
 'Sports Collectibles.csv',
 'Wallets.csv',
 'Musical Instruments and Professional Audio.csv',
 'Gold and Diamond Jewellery.csv',
 'Nursing and Feeding.csv',
 'Home Furnishing.csv',
 'School Textbooks.csv',
 'All Hindi.csv',
 'Baby Bath Skin and Grooming.csv',
 'Coffee Tea and Beverages.csv',
 'Headphones.csv',
 'Furniture.csv',
 'Shirts.csv',
 'Subscribe and Save.csv',
 'Fitness Accessories.csv',
 'Formal Shoes.csv',
 'Cycling.csv',
 'Western Wear.csv',
 'Bedroom Linen.

In [4]:
dfs = []

for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    dfs.append(df)

df = pd.concat(dfs, ignore_index=True)

df.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0.1,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price,Unnamed: 0
0,Electronic Spices 2.75 Inch 4ω (Ohm) 400w Max ...,car & motorbike,Car Electronics,https://m.media-amazon.com/images/I/61G5k3T0ff...,https://www.amazon.in/Electronic-Spices-Power-...,,,₹129,₹199,
1,"ZQWINT Bluetooth Car Adapter, Mini USB Bluetoo...",car & motorbike,Car Electronics,https://m.media-amazon.com/images/I/51NLOKwNsL...,https://www.amazon.in/ZQWINT-Bluetooth-Transmi...,,,₹219,₹999,
2,PROTECTRON 6.35X32mm GLASS FUSE/INVERTER FUSE ...,car & motorbike,Car Electronics,https://m.media-amazon.com/images/I/71St2ruv+N...,https://www.amazon.in/PROTECTRON-6-35X32mm-GLA...,3.8,32.0,,₹105,
3,Cave Maruti Suzuki Male-Female Stereo Coupler ...,car & motorbike,Car Electronics,https://m.media-amazon.com/images/I/31npb8UF2y...,https://www.amazon.in/Maruti-Suzuki-Stereo-Cou...,4.1,7.0,₹582,₹873,
4,COVERBLACK Rubber Back Cover for Infinix X6815...,car & motorbike,Car Electronics,https://m.media-amazon.com/images/I/611zXbVxbN...,https://www.amazon.in/COVERBLACK-Infinix-X6815...,,,₹148,₹799,


In [5]:
df.shape

(1103170, 10)

In [6]:
df.columns

Index(['name', 'main_category', 'sub_category', 'image', 'link', 'ratings',
       'no_of_ratings', 'discount_price', 'actual_price', 'Unnamed: 0'],
      dtype='object')

In [7]:
df['main_category'].unique()

array(['car & motorbike', 'industrial supplies', 'tv, audio & cameras',
       'beauty & health', 'bags & luggage', 'accessories',
       'sports & fitness', 'stores', 'home & kitchen',
       'toys & baby products', 'appliances', 'grocery & gourmet foods',
       'pet supplies', "kids' fashion", "women's shoes", "men's shoes",
       "women's clothing", "men's clothing", 'music',
       'home, kitchen, pets'], dtype=object)

In [8]:
df['sub_category'].unique()

array(['Car Electronics', 'Janitorial & Sanitation Supplies',
       'All Electronics', 'Make-up', 'Travel Accessories',
       'Car & Bike Care', 'Sunglasses', 'Bags & Luggage', 'Yoga',
       'Sportswear', 'Home Storage', 'Toys Gifting Store',
       'Air Conditioners', 'All Appliances',
       'All Car & Motorbike Products', 'All Exercise & Fitness',
       'All Grocery & Gourmet Foods', 'All Home & Kitchen',
       'All Pet Supplies', 'All Sports, Fitness & Outdoors',
       'Amazon Fashion', 'Baby Bath, Skin & Grooming', 'Baby Fashion',
       'Baby Products', 'Backpacks', 'Badminton', 'Ballerinas',
       'Beauty & Grooming', 'Bedroom Linen', 'Camera Accessories',
       'Cameras', 'Camping & Hiking', 'Car Accessories', 'Car Parts',
       'Cardio Equipment', 'Casual Shoes', 'Clothing',
       'Coffee, Tea & Beverages', 'Cricket', 'Cycling', 'Diapers',
       'Diet & Nutrition', 'Dog supplies', 'Ethnic Wear',
       'Fashion & Silver Jewellery', 'Fashion Sales & Deals',
       'F

In [9]:
df.isnull().sum()

name                   0
main_category          0
sub_category           0
image                  0
link                   0
ratings           351588
no_of_ratings     351588
discount_price    122326
actual_price       35626
Unnamed: 0        551585
dtype: int64

In [10]:
# remove ₹ char 
def clean_price(value):
    if isinstance(value, str):
        # Remove currency symbols and commas
        return value.replace('₹', '').replace(',', '')
    return value

df['discount_price'] = df['discount_price'].apply(clean_price)
df['actual_price'] = df['actual_price'].apply(clean_price)

# convert data type to numerical
df['ratings'] = pd.to_numeric(df['ratings'], errors='coerce') 
df['no_of_ratings'] = pd.to_numeric(df['no_of_ratings'], errors='coerce')
df['discount_price'] = pd.to_numeric(df['discount_price'], errors='coerce')
df['actual_price'] = pd.to_numeric(df['actual_price'], errors='coerce')

# fill missing numerical values with the median
df['ratings'] = df['ratings'].fillna(df['ratings'].median())
df['no_of_ratings'] = df['no_of_ratings'].fillna(df['no_of_ratings'].median())
df['discount_price'] = df['discount_price'].fillna(df['discount_price'].median())
df['actual_price'] = df['actual_price'].fillna(df['actual_price'].median())

# drop the unnecessary column 'Unnamed: 0'
df = df.drop(columns=['Unnamed: 0'])

df.isnull().sum()

name              0
main_category     0
sub_category      0
image             0
link              0
ratings           0
no_of_ratings     0
discount_price    0
actual_price      0
dtype: int64

In [11]:
df.shape

(1103170, 9)

In [12]:
df['input_text'] =  f"Product name: {df['name']}, Category: {df['main_category']}, Sub-category: {df['sub_category']}, Rating: {df['ratings'].fillna('0').astype(str)}, Price: {df['actual_price'].fillna('0').astype(str)}"

df['target_text'] = "This is a great product for " + df['main_category'] + " users."

df[['input_text', 'target_text']].head()

Unnamed: 0,input_text,target_text
0,Product name: 0 Electronic Spices 2.7...,This is a great product for car & motorbike us...
1,Product name: 0 Electronic Spices 2.7...,This is a great product for car & motorbike us...
2,Product name: 0 Electronic Spices 2.7...,This is a great product for car & motorbike us...
3,Product name: 0 Electronic Spices 2.7...,This is a great product for car & motorbike us...
4,Product name: 0 Electronic Spices 2.7...,This is a great product for car & motorbike us...


## Tokenization

In [13]:
# load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [15]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

def tokenize_sample(row, idx):
    if (idx + 1) % 1000 == 0:
        print(f"sample {idx + 1}")
    return tokenizer(row['input_text'], padding=True, truncation=True, max_length=512)

train_dataset = df[['input_text', 'target_text']].apply(lambda x: tokenize_sample(x, x.name), axis=1).tolist()

sample 1000
sample 2000
sample 3000
sample 4000
sample 5000
sample 6000
sample 7000
sample 8000
sample 9000
sample 10000
sample 11000
sample 12000
sample 13000
sample 14000
sample 15000
sample 16000
sample 17000
sample 18000
sample 19000
sample 20000
sample 21000
sample 22000
sample 23000
sample 24000
sample 25000
sample 26000
sample 27000
sample 28000
sample 29000
sample 30000
sample 31000
sample 32000
sample 33000
sample 34000
sample 35000
sample 36000
sample 37000
sample 38000
sample 39000
sample 40000
sample 41000
sample 42000
sample 43000
sample 44000
sample 45000
sample 46000
sample 47000
sample 48000
sample 49000
sample 50000
sample 51000
sample 52000
sample 53000
sample 54000
sample 55000
sample 56000
sample 57000
sample 58000
sample 59000
sample 60000
sample 61000
sample 62000
sample 63000
sample 64000
sample 65000
sample 66000
sample 67000
sample 68000
sample 69000
sample 70000
sample 71000
sample 72000
sample 73000
sample 74000
sample 75000
sample 76000
sample 77000
sample 7

In [29]:
# convert each BatchEncoding to a dictionary using .data or dict()
tokenized_dicts = [dict(sample) for sample in train_dataset]

In [32]:
# save the list of dictionaries as JSON
with open("tokenized_dicts.json", "w") as f:
    json.dump(tokenized_dicts, f)

In [None]:
train_dataset

In [17]:
tokenizer.save_pretrained('./kaggle/working/tokenizer')

('./kaggle/working/tokenizer/tokenizer_config.json',
 './kaggle/working/tokenizer/special_tokens_map.json',
 './kaggle/working/tokenizer/vocab.json',
 './kaggle/working/tokenizer/merges.txt',
 './kaggle/working/tokenizer/added_tokens.json')

In [33]:
import zipfile

file_path = "/kaggle/working/tokenized_dicts.json"
zip_path = "/kaggle/working/tokenized_dicts.zip"

# Create a zip file containing only train_dataset.json
with zipfile.ZipFile(zip_path, 'w') as zipf:
    zipf.write(file_path, arcname="tokenized_dicts.json")

## setting up the model

In [34]:
# load the pre-trained model (GPT-2)
model = GPT2LMHeadModel.from_pretrained('gpt2')

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [35]:
training_args = TrainingArguments(
    output_dir='./kaggle/working/results',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    save_steps=10_000,
)

In [36]:
class LoggingCallback(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % 10 == 0:
            print(f"Step {state.global_step} / {state.max_steps}")

    def on_epoch_end(self, args, state, control, **kwargs):
        print(f"Epoch {state.epoch:.0f} ended. Total steps: {state.global_step}")

    def on_epoch_start(self, args, state, control, **kwargs):
        print(f"Epoch {state.epoch:.0f} start")

# trainer initialization
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,
    callbacks=[LoggingCallback()]
)

In [37]:
# start training
trainer.train()



<IPython.core.display.Javascript object>

KeyboardInterrupt: 

## Generate product description

In [None]:
# generate description based on the product info
def generate_description(row):
    # Generate the description
    output = model.generate(inputs['input_ids'], max_length=100, num_beams=5, no_repeat_ngram_size=2)

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    
    return generated_text

df['generated_description'] = df.apply(generate_description, axis=1)

df[['name', 'generated_description']].head()

## Evaluation

In [None]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score("Actual product description here.", generated_text)

scores