# **Preprocessing and Preparing the Data in a Single Format**

In [None]:
import pandas as pd
import pyarrow.parquet as pq
import json

In [None]:
# parquet file to dataFrame
def par2Df(parquet_file):
    """Load a parquet file into a DataFrame without its metadata columns.

    Args:
        parquet_file: Source handed to ``pq.read_table`` (e.g. a file path).

    Returns:
        pandas.DataFrame: The table's contents minus the ``Biography``,
        ``Emotion`` and ``Name`` columns.

    Raises:
        KeyError: If one of the three dropped columns is not present
            (``DataFrame.drop`` is called with its default error handling).
    """
    unused_columns = ["Biography", "Emotion", "Name"]
    frame = pq.read_table(parquet_file).to_pandas()
    return frame.drop(columns=unused_columns)

In [None]:
# JSON Lines file to DataFrame

def json2Df(json_file):
    """Turn a JSON Lines dialogue file into a Query/Response DataFrame.

    Every non-blank line of ``json_file`` is parsed as a JSON object and its
    ``"dialogue_text"`` value is processed as follows:

    1. the instruction text ``"Summarize the dialogue"`` is removed;
    2. the remainder is stripped and split into one utterance per line;
    3. the ``speaker:`` label (everything up to and including the first
       colon) is dropped from each utterance, the text after it is kept
       verbatim, including any further colons and the leading space;
    4. a trailing utterance that has no reply is discarded;
    5. utterances are paired up: even positions become queries, odd
       positions become the matching responses.

    Args:
        json_file: Path to the ``.jsonl`` file.

    Returns:
        pandas.DataFrame: Columns ``Query`` and ``Response``, one row per
        pair of consecutive utterances within a dialogue.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"dialogue_text"`` key.
    """
    queries = []
    responses = []

    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(json_file, "r", encoding="utf-8") as file:
        for raw_line in file:
            # Tolerate blank lines (e.g. an extra newline at the end of the file).
            if not raw_line.strip():
                continue

            dialogue = json.loads(raw_line)["dialogue_text"]
            dialogue = dialogue.replace("Summarize the dialogue", "")

            # Split at the FIRST colon only: cutting at the last colon (or
            # skipping lines by colon count) would truncate utterances that
            # contain a colon themselves, or leave the speaker name in place.
            utterances = [
                utterance.partition(":")[2] if ":" in utterance else utterance
                for utterance in dialogue.strip().split("\n")
            ]

            # An odd number of utterances means the last one was never answered.
            if len(utterances) % 2 != 0:
                utterances.pop()

            queries.extend(utterances[0::2])
            responses.extend(utterances[1::2])

    return pd.DataFrame({'Query': queries, 'Response': responses})

In [None]:
# Root folder of the raw datasets on Google Drive. Every input path below is
# derived from it, so relocating the data only requires editing this line.
RAW_DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/graduation_project/preprocessing&training/data/raw_data"

# Parquet files of the npc-dialogue dataset (test and train).
par_files = [
    f"{RAW_DATA_DIR}/npc-dialogue/test-00000-of-00001-0408c6b1dfcc3c77.parquet",
    f"{RAW_DATA_DIR}/npc-dialogue/train-00000-of-00001-4eeea4877d4ce970.parquet",
]

# JSON Lines files of the light-batch-summarize-dialogue dataset.
jsonl_files = [
    f"{RAW_DATA_DIR}/light-batch-summarize-dialogue/{split}.jsonl"
    for split in ("test", "train", "valid")
]

In [None]:
# Build one combined frame from every listed source file (parquet frames
# first, then JSON Lines frames, each in list order). Iterating over the lists
# instead of indexing fixed positions means a path added to either list is
# picked up automatically rather than silently ignored.
# NOTE(review): pd.concat aligns on column names; presumably the parquet frames
# already use the `Query`/`Response` columns that json2Df produces - confirm.
parquet_frames = [par2Df(path) for path in par_files]
jsonl_frames = [json2Df(path) for path in jsonl_files]

dataFrame = pd.concat(parquet_frames + jsonl_frames, ignore_index=True)
dataFrame.info()

In [None]:
# Three dataset sizes derived from the combined frame: the complete frame plus
# fixed-seed random samples of 100 and 1000 rows.
largeDF = dataFrame
smallDF = dataFrame.sample(n=100, random_state=50)
mediumDF = dataFrame.sample(n=1000, random_state=50)

# Write each dataset to its CSV file, leaving out the index column.
for frame, csv_path in (
    (smallDF, "/content/drive/MyDrive/Colab Notebooks/graduation_project/preprocessing&training/data/processed_data/small_dataset.csv"),
    (mediumDF, "/content/drive/MyDrive/Colab Notebooks/graduation_project/preprocessing&training/data/processed_data/medium_dataset.csv"),
    (largeDF, "/content/drive/MyDrive/Colab Notebooks/graduation_project/preprocessing&training/data/processed_data/large_dataset.csv"),
):
    frame.to_csv(csv_path, index=False)


# **Training & Save Models**

### **Creating Training Model**

In [None]:
# use transformers and accelerate
!pip install transformers
!pip install accelerate

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

In [None]:
def load_and_prepare_data(file_path):
    """Read a Query/Response CSV and format every row as one training sample.

    Each sample has the layout ``"Query: <query>\\nAnswer: <response>"`` with
    both texts lower-cased.

    Args:
        file_path: Path to a CSV file with ``Query`` and ``Response`` columns.

    Returns:
        list[str]: One formatted sample per CSV row, in file order.

    Raises:
        KeyError: If the ``Query`` or ``Response`` column is missing.
    """
    # Read every cell verbatim as text. With pandas' default NA handling,
    # dialogue such as "NA", "null" or an empty field is parsed as NaN and
    # would end up in the training text as the literal word "nan".
    df = pd.read_csv(file_path, dtype=str, keep_default_na=False)
    return [
        "Query: " + query.lower() + "\nAnswer: " + response.lower()
        for query, response in zip(df["Query"], df["Response"])
    ]

def tokenize_data(tokenizer, texts, block_size=128):
    """Tokenize each text to a fixed-length sequence of token ids.

    Args:
        tokenizer: Callable invoked once per text with ``truncation=True``,
            ``max_length=block_size``, ``padding='max_length'`` and
            ``return_tensors="pt"``; its result must expose ``'input_ids'``.
        texts: Iterable of strings to tokenize.
        block_size: Length every text is truncated or padded to.

    Returns:
        list: One ``input_ids`` tensor per text, without the leading batch axis.
    """
    tokenized_data = []
    for text in texts:
        encoding = tokenizer(
            text,
            truncation=True,
            max_length=block_size,
            padding='max_length',
            return_tensors="pt",
        )
        # Remove only the leading batch axis (assumed to be of size 1 for a
        # single text). A bare squeeze() would also collapse the sequence axis
        # when block_size == 1 and yield a 0-d tensor.
        tokenized_data.append(encoding['input_ids'].squeeze(0))
    return tokenized_data

def create_dataloader(tokenized_data, batch_size):
    """Stack the token-id tensors and wrap them in a shuffling DataLoader.

    Args:
        tokenized_data: Sequence of equally shaped tensors.
        batch_size: Number of sequences per batch.

    Returns:
        DataLoader: Shuffling loader whose ``dataset`` is the stacked tensor.
    """
    return DataLoader(torch.stack(tokenized_data), batch_size=batch_size, shuffle=True)

def train(train_file_path, model_name, output_dir, overwrite_output_dir, per_device_train_batch_size, num_train_epochs, save_steps):
    """Fine-tune a GPT-2 checkpoint on a Query/Response CSV and save the result.

    Args:
        train_file_path: CSV file read by ``load_and_prepare_data``.
        model_name: Name or path of the GPT-2 checkpoint to start from.
        output_dir: Directory receiving checkpoints, the final model and the
            tokenizer.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Checkpoint interval; also used as the logging interval.

    Returns:
        None. The trained model and the tokenizer are written to ``output_dir``.
    """
    combined_texts = load_and_prepare_data(train_file_path)

    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    # Reuse EOS as the padding token so that padding='max_length' in
    # tokenize_data has a token to pad with.
    # NOTE(review): since pad == EOS, the LM collator presumably masks every
    # EOS label together with the padding, and no EOS is appended to the
    # samples - the model may never learn where an answer ends. Confirm.
    tokenizer.pad_token = tokenizer.eos_token
    tokenized_data = tokenize_data(tokenizer, combined_texts)

    # Trainer does its own batching, so it is given the stacked tensor directly
    # instead of a DataLoader that would only be unwrapped again.
    train_dataset = torch.stack(tokenized_data)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,
        logging_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()

    # Persist model and tokenizer only after training has finished. Writing the
    # untrained base weights to output_dir up front would leave a loadable but
    # unmodified GPT-2 behind whenever a run is interrupted.
    trainer.save_model()
    tokenizer.save_pretrained(output_dir)

### **Train Test Model**

In [None]:
# Run configuration for the test model: base GPT-2, small dataset, 10 epochs.
model_name = "gpt2"  # base checkpoint to fine-tune
train_file_path = "/content/drive/MyDrive/Colab Notebooks/graduation_project/preprocessing&training/data/processed_data/small_dataset.csv"  # training CSV
output_dir = "/content/drive/MyDrive/Colab Notebooks/graduation_project/preprocessing&training/models/testModel"  # where the run is saved
num_train_epochs = 10
per_device_train_batch_size = 8
save_steps = 100  # train() also uses this as the logging interval
overwrite_output_dir = False

In [None]:
# Start the test-model run with the settings defined in the preceding cell.
run_config = dict(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
)
train(**run_config)

### **Small Data High Epoch Model**

In [None]:
# Run configuration for the small model: base GPT-2, small dataset, 50 epochs.
model_name = "gpt2"  # base checkpoint to fine-tune
train_file_path = "/content/drive/MyDrive/Colab Notebooks/graduation_project/preprocessing&training/data/processed_data/small_dataset.csv"  # training CSV
output_dir = "/content/drive/MyDrive/Colab Notebooks/graduation_project/preprocessing&training/models/smallModel"  # where the run is saved
num_train_epochs = 50
per_device_train_batch_size = 8
save_steps = 100  # train() also uses this as the logging interval
overwrite_output_dir = False

In [None]:
# Start the small-model run with the settings defined in the preceding cell.
run_config = dict(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
)
train(**run_config)

### **Medium Data Medium Epoch Model**

In [None]:
# Run configuration for the medium model: base GPT-2, medium dataset, 10 epochs.
model_name = "gpt2"  # base checkpoint to fine-tune
train_file_path = "/content/drive/MyDrive/Colab Notebooks/graduation_project/preprocessing&training/data/processed_data/medium_dataset.csv"  # training CSV
output_dir = "/content/drive/MyDrive/Colab Notebooks/graduation_project/preprocessing&training/models/mediumModel"  # where the run is saved
num_train_epochs = 10
per_device_train_batch_size = 8
save_steps = 100  # train() also uses this as the logging interval
overwrite_output_dir = False

In [None]:
# Start the medium-model run with the settings defined in the preceding cell.
run_config = dict(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
)
train(**run_config)

### **Large Data Low Epoch Model**

In [None]:
# Run configuration for the large model: base GPT-2, full dataset, 5 epochs.
model_name = "gpt2"  # base checkpoint to fine-tune
train_file_path = "/content/drive/MyDrive/Colab Notebooks/graduation_project/preprocessing&training/data/processed_data/large_dataset.csv"  # training CSV
output_dir = "/content/drive/MyDrive/Colab Notebooks/graduation_project/preprocessing&training/models/largeModel"  # where the run is saved
num_train_epochs = 5
per_device_train_batch_size = 8
save_steps = 500  # train() also uses this as the logging interval
overwrite_output_dir = False

In [None]:
# Start the large-model run with the settings defined in the preceding cell.
run_config = dict(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
)
train(**run_config)

# **Inference for Testing Model**

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Directory of the fine-tuned checkpoint to try out (the medium model).
model_name = "/content/drive/MyDrive/Colab Notebooks/graduation_project/preprocessing&training/models/mediumModel"

# Weights and tokenizer are both loaded from that same directory.
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)


In [None]:
# Read one query and build the prompt in the same layout as the training
# samples ("Query: <lower-cased text>\nAnswer: <response>"), so the model
# continues from a prefix it has actually seen.
user_input = input()
query = f"Query: {user_input.lower()}\nAnswer:"

# Calling the tokenizer returns the attention mask together with the ids.
encoded = tokenizer(query, return_tensors="pt", truncation=True)
input_ids = encoded["input_ids"]

# Fall back to EOS if the loaded tokenizer has no padding token configured.
pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id

output = model.generate(
    input_ids,
    attention_mask=encoded["attention_mask"],
    max_new_tokens=50,  # budget in generated tokens, not prompt characters
    do_sample=True,  # temperature only has an effect when sampling
    temperature=0.7,
    num_return_sequences=1,
    pad_token_id=pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# Decode only the tokens generated after the prompt, rather than removing the
# prompt by string replacement (decoded text need not match it exactly).
response = tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True)
# Keep the first answer only, in case the model goes on to write another pair.
response = response.split("\nQuery:", 1)[0].strip()
print(response)
