In [5]:
import transformers
import pandas as pd
import numpy as np
import torch
import datasets
import matplotlib.pyplot as plt
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

In [6]:
# Read the product-description corpus into a DataFrame.
csv_path = "/kaggle/input/businessgpt7/new_for_gpt_2_7.csv"
df = pd.read_csv(csv_path)

# Count missing values per column (shown as the cell output).
df.isnull().sum()

text    0
dtype: int64

In [7]:
# Preview the first five rows of the corpus.
df.head(5)

Unnamed: 0,text
0,wayona nylon braided usb to lightning fast cha...
1,ambrane unbreakable 60w 3a fast charging 1.5m ...
2,boat deuce usb 300 2 in 1 typec micro usb stre...
3,portronics konnect l 1.2m fast charging 3a 8 p...
4,ptron solero tb301 3a typec data and fast char...


In [8]:
# Show the first record in full; one label-based lookup instead of chained indexing.
df.loc[0, "text"]

'wayona nylon braided usb to lightning fast charging and data sync cable compatible for iphone 13, 12,11, x, 8, 7, 6, 5, ipad air, pro, mini 3 ft pack of 1, grey ->\n discounted price 399 actual price 1,099 discount percentage 64 rating 4.2 rating count 24,269 about product high compatibility compatible with iphone 12, 11, xxsmaxxr ,iphone 88 plus,iphone 77 plus,iphone 6s6s plus,iphone 66 plus,iphone 55s5cse,ipad pro,ipad air 12,ipad mini 123,ipod nano7,ipod touch and more apple devices.fast chargedata sync it can charge and sync simultaneously at a rapid speed, compatible with any charging adaptor, multiport charging station or power bank.durability durable nylon braided design with premium aluminum housing and toughened nylon fiber wound tightly around the cord lending it superior durability and adding a bit to its flexibility.high security level it is designed to fully protect your device from damaging excessive current.copper core thickmultilayer shielding, antiinterference, protec

In [9]:
# The tokenizer and the language model are loaded from the same pretrained checkpoint.
base_checkpoint = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(base_checkpoint)
model = GPT2LMHeadModel.from_pretrained(base_checkpoint)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [10]:
# Dump the corpus to a plain-text file: every value of the "text" column is
# written followed by a newline, so the file can be fed to a file-based dataset.
output_file = '/kaggle/working/business.txt'

# An explicit encoding keeps the file identical across platforms; without it
# open() falls back to the locale's default encoding.
with open(output_file, 'w', encoding='utf-8') as f:
    # Iterate the column directly; iterrows() builds a Series per row just to
    # read a single field.
    f.writelines(text + "\n" for text in df["text"])


In [11]:
# Tokenise the corpus file and cut it into blocks of 256 tokens.
# TextDataset keeps a cache of the tokenised features beside the input file and
# reuses it by default. The corpus file is rewritten on every run, so force a
# rebuild to avoid silently training on a stale cache.
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=output_file,
    block_size=256,
    overwrite_cache=True,
)




In [12]:
# Batch builder for training; mlm=False turns masked-LM mode off, which is the
# setting used for causal models such as GPT-2.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [13]:
# Hyper-parameters and output locations for the fine-tuning run.
training_args = TrainingArguments(
    # --- where artefacts go ---
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=100,               # report the training loss every 100 steps
    # --- schedule ---
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # --- optimisation ---
    learning_rate=5e-5,
    warmup_steps=500,                # learning-rate warm-up steps
    weight_decay=0.01,
)


In [14]:
# Wire the model, the tokenised corpus, the collator and the run settings together.
trainer = Trainer(
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
    args=training_args,
)


In [15]:
import os

# Set the WANDB_DISABLED flag before training starts; this is presumably meant
# to stop the run from reporting to Weights & Biases.
os.environ.update({"WANDB_DISABLED": "true"})

In [16]:
# Run the fine-tuning loop; the returned summary is displayed as the cell output.
train_output = trainer.train()
train_output



Step,Training Loss
100,4.9469
200,4.3614
300,4.1064
400,3.9409
500,3.7969
600,3.6913
700,3.5762
800,3.4641
900,3.3866
1000,3.3326


TrainOutput(global_step=1380, training_loss=3.6898699442545575, metrics={'train_runtime': 846.4673, 'train_samples_per_second': 25.908, 'train_steps_per_second': 1.63, 'total_flos': 2865067130880000.0, 'train_loss': 3.6898699442545575, 'epoch': 10.0})

In [17]:
# Persist the fine-tuned checkpoint. The tokenizer is saved into the same
# directory so the checkpoint is self-contained and can be reloaded without
# fetching the tokenizer separately.
save_dir = "./gpt2_256_full/"
tokenizer.save_pretrained(save_dir)
# Saved last: the tokenizer call's return value would otherwise be echoed as
# the cell output.
model.save_pretrained(save_dir)

In [5]:
# AutoModelForCausalLM replaces the deprecated AutoModelWithLMHead.
from transformers import pipeline, GPT2Tokenizer, AutoModelForCausalLM

# Reload the base GPT-2 tokenizer and the fine-tuned weights from the uploaded dataset.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('/kaggle/input/gpt2business/kaggle/working/gpt2_256_full')

business_generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

prompt = "acer led tv"
# Pass the pad token explicitly so generation does not depend on the implicit
# "Setting `pad_token_id` to `eos_token_id`" fallback (and its warning).
generated_text = business_generator(
    prompt,
    max_length=100,
    pad_token_id=tokenizer.eos_token_id,
)



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [6]:
# The pipeline returns a list of candidates; print the text of the first one.
first_candidate = generated_text[0]
print(first_candidate['generated_text'])

acer led tv series and smart tv led smart tv set top box discounted price 899 actual price 1,499 discount percentage 62 rating 4.2 rating count 5,945 about product aa certified smart led smart tv this smart led tv with aa certified optical fiber for best picture quality and sound fidelity. it has a 6k lcd display. it features ip68 waterproof and dustproof body plus no chicaner.easy to install the smart led tv with aa certification and plug in


In [10]:
from transformers import pipeline

# Pin the QA checkpoint and revision instead of relying on the library default,
# so results stay reproducible if the default changes. These are the model and
# revision the pipeline reported when it was created without arguments.
nlp = pipeline(
    "question-answering",
    model="distilbert-base-cased-distilled-squad",
    revision="626af31",
)

# Context is a pasted copy of a product description generated by the fine-tuned model.
context = r"""
acer led tv series and smart tv led smart tv set top box discounted price 899 actual price 1,499 discount percentage 62 rating 4.2 rating count 5,945 about product aa certified smart led smart tv this smart led tv with aa certified optical fiber for best picture quality and sound fidelity. it has a 6k lcd display. it features ip68 waterproof and dustproof body plus no chicaner.easy to install the smart led tv with aa certification and plug in
"""
print(nlp(question="what is the actual price of this tv?", context=context, max_answer_len=100))

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


{'score': 0.3242349624633789, 'start': 92, 'end': 97, 'answer': '1,499'}
