<a href="https://colab.research.google.com/github/Yazeedx0/Test-AI-Code/blob/main/Testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install datasets
!pip install torch

In [None]:
from datasets import load_dataset
import pandas as pd
df = pd.read_csv('/content/Test.csv')


dataset = load_dataset('csv', data_files={'train': '/content/Test.csv'})

print(dataset['train'][0])
df['text'] = df['Question'] + " " + df['Answer']

df.head()



In [30]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments

tokenizer = AutoTokenizer.from_pretrained("riotu-lab/ArabianGPT-01B", use_fast=True)
model = AutoModelForCausalLM.from_pretrained("riotu-lab/ArabianGPT-01B")

encodings = tokenizer(list(df['text']), truncation=True, padding=True, max_length=512, return_tensors='pt')

if 'token_type_ids' in encodings:
    encodings.pop('token_type_ids')

class CustomDataset(Dataset):
    def __init__(self, encodings):
        self.input_ids = encodings['input_ids']
        self.attention_mask = encodings['attention_mask']

    def __len__(self):
        return self.input_ids.size(0)

    def __getitem__(self, idx):
        item = {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.input_ids[idx].clone()
        }
        return item

dataset = CustomDataset(encodings)

training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="no",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

trainer.train()


Step,Training Loss
500,0.6729
1000,0.0988
1500,0.0697
2000,0.0654
2500,0.0517
3000,0.0484


TrainOutput(global_step=3054, training_loss=0.1657236754386817, metrics={'train_runtime': 2126.6261, 'train_samples_per_second': 5.743, 'train_steps_per_second': 1.436, 'total_flos': 3191159586816000.0, 'train_loss': 0.1657236754386817, 'epoch': 3.0})

In [31]:
model.save_pretrained("./arabian_gpt_trained")
tokenizer.save_pretrained("./arabian_gpt_trained")


('./arabian_gpt_trained/tokenizer_config.json',
 './arabian_gpt_trained/special_tokens_map.json',
 './arabian_gpt_trained/tokenizer.json')

In [61]:
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast, pipeline

model = GPT2LMHeadModel.from_pretrained("/content/drive/MyDrive/arabian_gpt_trained")
tokenizer = PreTrainedTokenizerFast.from_pretrained("/content/drive/MyDrive/arabian_gpt_trained")

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)

text = 'كم عدد الساعات المعتمدة التي يجب أن يدرسها الطالب لتصنيفه في مستوى السنة الثانية؟	'

generated_text = pipe(
    text,
    max_new_tokens=50,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.2,
    num_return_sequences=1,
    do_sample=True
)

print(generated_text)


[{'generated_text': 'كم عدد الساعات المعتمدة التي يجب أن يدرسها الطالب لتصنيفه في مستوى السنة الثانية؟\t 60 ساعة معتمدة.'}]


# Second Training


In [None]:
from datasets import load_dataset
import pandas as pd
df = pd.read_csv('/content/dataSEt_rows (1).csv')


dataset = load_dataset('csv', data_files={'train': '/content/dataSEt_rows (1).csv'})

print(dataset['train'][0])
df['text'] = df['Question'] + " " + df['Answer']

df.head()



In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch

model = GPT2LMHeadModel.from_pretrained("/content/drive/MyDrive/arabian_gpt_trained")
tokenizer = PreTrainedTokenizerFast.from_pretrained("/content/drive/MyDrive/arabian_gpt_trained")

tokenizer.pad_token = tokenizer.eos_token

encodings = tokenizer(list(df['text']), truncation=True, padding=True, max_length=512, return_tensors='pt')

if 'token_type_ids' in encodings:
    encodings.pop('token_type_ids')

class CustomDataset(Dataset):
    def __init__(self, encodings):
        self.input_ids = encodings['input_ids']
        self.attention_mask = encodings['attention_mask']

    def __len__(self):
        return self.input_ids.size(0)

    def __getitem__(self, idx):
        item = {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.input_ids[idx].clone()
        }
        return item

dataset = CustomDataset(encodings)

training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="no",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

trainer.train()
