<a href="https://colab.research.google.com/github/aorogat/AskNowNQS/blob/master/llama3NewModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.23.0-py3

In [None]:
import os
import multiprocessing
import glob
from google.colab import drive
from datasets import load_dataset, Features, Value, Sequence
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer

# Turn off the tokenizers library's own parallelism to avoid conflicts with
# multithreaded code.
os.environ.update(TOKENIZERS_PARALLELISM="false")

# Use the 'spawn' start method for new processes; force=True replaces any
# start method that was already chosen in this kernel.
multiprocessing.set_start_method(method="spawn", force=True)

# Expose Google Drive under /content/drive so the data files can be read.
drive.mount("/content/drive")

# Explicit dataset schema, written as (column name, dtype, is_list) rows.
# The rows are listed in the order the columns appear in the schema; a row
# with is_list=True becomes a Sequence of that dtype instead of a scalar.
schema_spec = [
    ("seed_withPrefix", "string", False),
    ("seedType_withPrefix", "string", False),
    ("questionString", "string", False),
    ("questionStringTagged", "string", False),
    ("query", "string", False),
    ("graphString", "string", False),
    ("noOfTriples", "int32", False),
    ("answers", "string", True),
    ("answerCardinality", "int32", False),
    ("noOfTokens", "int32", False),
    ("keywords", "int32", False),
    ("questionType", "string", False),
    ("shapeType", "string", False),
    ("qustionComplexity", "float32", False),  # key spelling kept as in the data
]

features = Features({
    column: Sequence(Value(dtype)) if is_list else Value(dtype)
    for column, dtype, is_list in schema_spec
})

# Discover the JSON files on Drive. The paths are sorted so that the
# train/test split below is the same on every run (glob itself returns
# paths in arbitrary order).
all_files = sorted(glob.glob("/content/drive/My Drive/SmallFiles/*.json"))

# Ensure we have found some files
if not all_files:
    raise ValueError("No train or test files found. Please check the file paths.")

# Fraction of the files held out for evaluation; adjust as needed.
TEST_FRACTION = 0.1

# Train and test must not share files, otherwise the evaluation is run on
# data the model was trained on. Hold out the tail of the sorted file list.
if len(all_files) > 1:
    n_test = max(1, int(len(all_files) * TEST_FRACTION))
    train_files = all_files[:-n_test]
    test_files = all_files[-n_test:]
else:
    # A single file cannot be split at file level; keep the notebook running
    # but make the overlap visible.
    print("Warning: only one data file found; train and test use the same file.")
    train_files = list(all_files)
    test_files = list(all_files)

# Print counts rather than the full path lists to keep the output readable.
print("Train files found:", len(train_files))
print("Test files found:", len(test_files))

# Map each split name to the files that belong to it
data_files = {
    "train": train_files,
    "test": test_files,
}

# Load the dataset with explicitly defined features
dataset = load_dataset("json", data_files=data_files, features=features)

# Name of the pre-trained checkpoint to fine-tune. The value below is a
# placeholder: replace it with the actual model name before running.
model_name = "huggingface/llama-3"

# Load the tokenizer and the causal language model for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

# Tokenize the dataset

# Fixed length every question is padded/truncated to. Without an explicit
# value, padding="max_length" depends on the tokenizer's own maximum length,
# which may be very large or not set at all. Adjust to fit your questions.
MAX_SEQ_LENGTH = 128

# Padding requires a pad token; some tokenizers (Llama-style ones in
# particular) do not define one, so fall back to the end-of-sequence token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize a batch of questions and attach causal-LM training labels.

    Args:
        examples: A batch from ``dataset.map(..., batched=True)``; only the
            ``"questionString"`` column is read.

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry.
        Labels are a copy of ``input_ids`` in which padded positions are set
        to -100 so they are ignored by the loss.
    """
    encoded = tokenizer(
        examples["questionString"],
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
    )
    # Trainer needs a "labels" column to obtain a loss from a causal LM.
    # The attention mask (not the token id) marks padding, because the pad
    # token may be the same id as a real end-of-sequence token.
    encoded["labels"] = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded


tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Hyperparameters and output locations for the run, collected in one place
# so they are easy to read and tweak before being handed to TrainingArguments.
training_config = {
    "output_dir": "./results",        # where checkpoints are written
    "evaluation_strategy": "epoch",   # evaluate at the end of every epoch
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
    "logging_dir": "./logs",
}

training_args = TrainingArguments(**training_config)

# Pick the tokenized splits the Trainer will train and evaluate on.
train_split = tokenized_datasets["train"]
eval_split = tokenized_datasets["test"]

# Wire the model, the training arguments and both splits into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

# Run fine-tuning.
trainer.train()

# Evaluate on the test split and show the resulting metrics.
eval_results = trainer.evaluate()
print(eval_results)

# Write the fine-tuned model, then the tokenizer, to the same directory.
save_dir = "./fine-tuned-llama3"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Train files found: ['/content/drive/My Drive/SmallFiles/DBPedia_TrainingQuestions_1290.json', '/content/drive/My Drive/SmallFiles/DBPedia_TrainingQuestions_1291.json', '/content/drive/My Drive/SmallFiles/DBPedia_TrainingQuestions_1289.json', '/content/drive/My Drive/SmallFiles/DBPedia_TrainingQuestions_109.json', '/content/drive/My Drive/SmallFiles/DBPedia_TrainingQuestions_1288.json', '/content/drive/My Drive/SmallFiles/DBPedia_TrainingQuestions_1285.json', '/content/drive/My Drive/SmallFiles/DBPedia_TrainingQuestions_1287.json', '/content/drive/My Drive/SmallFiles/DBPedia_TrainingQuestions_1081.json', '/content/drive/My Drive/SmallFiles/DBPedia_TrainingQuestions_1.json', '/content/drive/My Drive/SmallFiles/DBPedia_TrainingQuestions_10.json', '/content/drive/My Drive/SmallFiles/DBPedia_TrainingQuestions_12.json', '/content/drive/My Drive/SmallFiles/DBPedia_T

Resolving data files:   0%|          | 0/569 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/569 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/569 [00:00<?, ?files/s]

Downloading data:   0%|          | 0/569 [00:00<?, ?files/s]