In [1]:
!pip install transformers==4.28.0



In [2]:
!pip install --upgrade accelerate

Collecting accelerate
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
     ------------------------------------- 227.6/227.6 kB 14.5 MB/s eta 0:00:00
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.20.1
    Uninstalling accelerate-0.20.1:
      Successfully uninstalled accelerate-0.20.1
Successfully installed accelerate-0.20.3


In [3]:
import pandas as pd
import numpy as np
import re

In [4]:
def cleaning(s):
    s = str(s)
    s = re.sub('\s\W',' ',s)
    s = re.sub('\W,\s',' ',s)
    s = re.sub("\d+", "", s)
    s = re.sub('\s+',' ',s)
    s = re.sub('[!@#$_]', '', s)
    s = s.replace("co","")
    s = s.replace("https","")
    s = s.replace("[\w*"," ")
    return s

In [5]:
df = pd.read_csv("Data/final_overall.csv") 
df = df.dropna()
df0 = df.loc[df['label']==0] #delivery
df1 = df.loc[df['label']==1] #size
df2 = df.loc[df['label']==2] #color
df3 = df.loc[df['label']==3] #quality

In [6]:
text_data = open('Articles_delivery.txt', 'w')
for idx, item in df0.iterrows():
  article = cleaning(item.sentence)
  text_data.write(article)
text_data.close()

In [7]:
text_data = open('Articles_size.txt', 'w')
for idx, item in df1.iterrows():
  article = cleaning(item.sentence)
  text_data.write(article)
text_data.close()

In [8]:
text_data = open('Articles_color.txt', 'w')
for idx, item in df2.iterrows():
  article = cleaning(item.sentence)
  text_data.write(article)
text_data.close()

In [9]:
text_data = open('Articles_quality.txt', 'w')
for idx, item in df3.iterrows():
  article = cleaning(item.sentence)
  text_data.write(article)
text_data.close()

In [10]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [11]:
def load_dataset(file_path, tokenizer, block_size = 128):
    dataset = TextDataset(
        tokenizer = tokenizer,
        file_path = file_path,
        block_size = block_size,
    )
    return dataset


def load_data_collator(tokenizer, mlm = False):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, 
        mlm=mlm,
    )
    return data_collator


def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
  tokenizer = GPT2Tokenizer.from_pretrained(model_name)
  train_dataset = load_dataset(train_file_path, tokenizer)
  data_collator = load_data_collator(tokenizer)

  tokenizer.save_pretrained(output_dir)
      
  model = GPT2LMHeadModel.from_pretrained(model_name)

  model.save_pretrained(output_dir)

  training_args = TrainingArguments(
          output_dir=output_dir,
          overwrite_output_dir=overwrite_output_dir,
          per_device_train_batch_size=per_device_train_batch_size,
          num_train_epochs=num_train_epochs,
      )

  trainer = Trainer(
          model=model,
          args=training_args,
          data_collator=data_collator,
          train_dataset=train_dataset,
  )
      
  trainer.train()
  trainer.save_model()

In [13]:
# you need to set parameters 
train_file_path = "Articles_delivery.txt"
model_name = 'gpt2'
output_dir = '/fine_tuned_gpt2_fashion_delivery'
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 1
save_steps = 500

# It takes about 30 minutes to train in colab.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

[34m[1mwandb[0m: Currently logged in as: [33mcharlie39653965[0m ([33mtextmining[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


In [14]:
# you need to set parameters 
train_file_path = "Articles_size.txt"
model_name = 'gpt2'
output_dir = '/fine_tuned_gpt2_fashion_size'
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 1
save_steps = 500

# It takes about 30 minutes to train in colab.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

Step,Training Loss
500,3.5742
1000,3.397


In [15]:
# you need to set parameters 
train_file_path = "Articles_color.txt"
model_name = 'gpt2'
output_dir = '/fine_tuned_gpt2_fashion_color'
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 1
save_steps = 500

# It takes about 30 minutes to train in colab.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)



Step,Training Loss


In [16]:
# you need to set parameters 
train_file_path = "Articles_quality.txt"
model_name = 'gpt2'
output_dir = '/fine_tuned_gpt2_fashion_quality'
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 1
save_steps = 500

# It takes about 30 minutes to train in colab.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

Step,Training Loss
500,3.8303
1000,3.6938


In [34]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer
import re

In [49]:
def load_model(model_path):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    return model


def load_tokenizer(tokenizer_path):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    return tokenizer


def generate_text(sequence, max_length, path):
    model_path = path
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    return str(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

def extract_sentences(text):
    sentences = re.findall(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    return sentences


In [59]:
print("배송, 사이즈, 색상, 퀄리티 중 선택하시오 : ", end='')
path = input() #fine_tuned_gpt2_fahsion_delivery
sequence = input() # oil price
max_len = 50 #int(input()) # 20
if path == "배송":
    path = "/fine_tuned_gpt2_fashion_delivery"
elif path == "사이즈":
    path = "/fine_tuned_gpt2_fashion_size"
elif path == "색상":
    path = "/fine_tuned_gpt2_fashion_color"
else:
    path = "/fine_tuned_gpt2_fashion_quality"
res = generate_text(sequence, max_len, path)
print(res)
print(res.split(sep = '.')[1]+".")

배송, 사이즈, 색상, 퀄리티 중 선택하시오 : 배송
 
    - you got to choose. The price was excellent, I ordered a bottle which I had with me when it came out for sale, but I bought the bottle in the event this arrived.The package was super large, the delivery
 The price was excellent, I ordered a bottle which I had with me when it came out for sale, but I bought the bottle in the event this arrived.


In [None]:
df_dt = 

In [None]:
sequence = "Good"
max_len = 50
path = "/fine_tuned_gpt2_fashion_delivery"
temp = []
for i in range(300):
    res = generate_text(sequence, max_len, path)
    res = res.split(sep = '.')[1]+"."
    print(res)
    temp.append(res)


In [25]:
df.loc[df['label']==0]

Unnamed: 0,column_num,sentence,label,overall
148,95,It arrived WEEKS later and has absolutely no s...,0,1
226,146,We can see on the label it came directly from ...,0,1
227,146,The items we ordered from America all got here...,0,1
281,185,i tried to return it but it will cost me twice...,0,1
506,340,Only positive was that it arrived well before ...,0,1
...,...,...,...,...
174913,109595,Super fast delivery.,0,5
175034,109701,They did arrive in the time frame that they to...,0,5
175051,109716,Product arrived very fast.,0,5
175261,109880,Fast delivery A+++++,0,5
