<a href="https://colab.research.google.com/github/avikumart/LLM-GenAI-Transformers-Notebooks/blob/main/DeepLearningFiles/hgmodelpipe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer, Gemma3ForConditionalGeneration, AutoModelForCausalLM
import torch

In [None]:
from datasets import Dataset, DatasetDict

In [None]:
# Mount Google Drive at /content/drive; later cells read the CSV from it
# and copy the saved model back to it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load all_medtext.csv from the mounted Drive folder into a DataFrame.
# Later cells read its "Input" and "output" columns.
data = pd.read_csv('/content/drive/MyDrive/NLP-project-files/final_medtext_data/all_medtext.csv')

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Column dtypes and non-null counts, to see where values are missing.
data.info()

In [None]:
# Drop every row that has a null value in any column.
data = data.dropna()

In [None]:
# Draw a reproducible subset of at most 5000 rows for fine-tuning.
# The seed matches the one used for the train/test split, so a fresh
# Restart & Run All yields the same training data; the min() guard avoids
# a ValueError when the cleaned frame has fewer than 5000 rows.
sampleddata = data.sample(n=min(5000, len(data)), random_state=42)

In [None]:
# Summary (dtypes, non-null counts) of the sampled subset.
sampleddata.info()

In [None]:
# Split the sampled frame 80/20 (seeded) and wrap both parts as a
# Hugging Face DatasetDict with "train" and "test" splits.
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(sampleddata, random_state=42, test_size=0.2)
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

In [None]:
# Display the DatasetDict holding the train and test splits.
dataset

In [None]:
# Preprocess question/answer pairs for causal language modeling.
def preprocess_function(examples):
    """Tokenize a batch of question/answer pairs for causal-LM fine-tuning.

    Each pair is rendered as a single "Question: ... Answer: ..." string,
    tokenized, and padded/truncated to a fixed length of 1024 tokens.

    Args:
        examples: Batch mapping with an "Input" column (questions) and an
            "output" column (answers), each a list of strings.

    Returns:
        The tokenizer output with an added "labels" entry: a copy of
        "input_ids" in which every padding position is replaced by -100.
    """
    # Simple prompt template: question and answer concatenated into one text.
    text = [
        f"Question: {q.strip()}\n Answer: {a.strip()}"
        for q, a in zip(examples["Input"], examples["output"])
    ]

    # Fixed-length tokenization so the examples can be batched without a
    # padding collator.
    model_inputs = tokenizer(
        text,
        max_length=1024,
        truncation=True,
        padding="max_length",
    )

    # The model shifts the labels by one position internally, so the labels
    # start as a copy of input_ids. Padding positions are set to -100 (the
    # index the loss ignores); otherwise most of each 1024-token row would
    # train the model to predict padding. The attention mask is used rather
    # than the token id because the pad token is the same as the EOS token.
    model_inputs["labels"] = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
    ]

    return model_inputs

In [None]:
# Authenticate with the Hugging Face Hub without writing the token into the
# notebook: prompt for it once and expose it through the HF_TOKEN environment
# variable, which the Hugging Face libraries read when contacting the Hub.
import os
from getpass import getpass

if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face access token: ")

In [None]:
# Check whether a CUDA GPU is available to this runtime.
torch.cuda.is_available()

In [None]:
# Load the tokenizer and causal-LM weights for meta-llama/Llama-3.2-1B.
MODEL_NAME = "meta-llama/Llama-3.2-1B"
# Use the GPU when one is present; fall back to CPU instead of crashing.
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)
# preprocess_function pads with padding="max_length", which needs a pad
# token; reuse the EOS token for that purpose.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Tokenize both splits in batches and drop every original column, so only the
# tokenizer outputs and labels remain. Taking the column names from the
# dataset itself avoids hard-coding the pandas index column
# ("__index_level_0__"), which is only present when the frame's index was
# carried over by Dataset.from_pandas.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
tokenized_datasets

In [None]:
# Hyperparameters for fine-tuning; evaluation runs once per epoch and
# checkpoints/logs are written under ./results.
hyperparameters = dict(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)
training_args = TrainingArguments(**hyperparameters)

# Wire the model to the tokenized train split (training) and test split
# (evaluation).
train_split = tokenized_datasets["train"]
eval_split = tokenized_datasets["test"]
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

In [None]:
# Run the fine-tuning loop with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# save the results folder to drive
from google.colab import files
!cp -r /content/my_hf_healthcarechat_model /content/drive/MyDrive/NLP-project-files/results1118

In [None]:
# Copy only the weights file from the local model folder to Drive.
# NOTE(review): the source file exists only after the model has been saved to
# /content/my_hf_healthcarechat_model; the recursive copy in the previous cell
# already includes it, so this step looks redundant - confirm before keeping.
!cp /content/my_hf_healthcarechat_model/model.safetensors /content/drive/MyDrive/NLP-project-files/results1118

In [None]:
# Make sure a Hugging Face token is available before pushing to the Hub,
# without writing it into the notebook: prompt once and expose it through the
# HF_TOKEN environment variable (skipped when it is already set).
import os
from getpass import getpass

if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face access token: ")

In [None]:
# Save the model and tokenizer to a local directory.
# `model` is the causal LM loaded with AutoModelForCausalLM and handed to the Trainer;
# `tokenizer` is the tokenizer loaded from the same checkpoint.
save_directory = "./my_hf_healthcarechat_model"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

In [None]:
# Hub repository id that the push_to_hub calls in the next cell upload to.
repo_name = "avikumart/Medical_chat_model"

In [None]:
# Upload the model and then the tokenizer to the Hub repository named by repo_name.
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)