Connect to Google Drive:

In [None]:
# Mount Google Drive at /content/drive (google.colab is only available inside Colab).
from google.colab import drive
drive.mount('/content/drive')
# IPython magic: switch the working directory to the project folder on Drive so
# that relative paths such as "Datasets/new_dataset.csv" resolve against it.
%cd '/content/drive/My Drive/Colab Notebooks/MP'

Import required modules:

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import pandas as pd
import torch

Set seed for PyTorch random number generation:

In [None]:
# Seed PyTorch's random number generator with a fixed value (0).
torch.manual_seed(0)

Open the original dataset and print its columns:

In [None]:
# Read the tab-separated, UTF-16 encoded dataset into a DataFrame.
df = pd.read_csv(
    "Datasets/new_dataset.csv",
    sep="\t",
    encoding="utf-16",
)

# List the available column names.
print(df.columns)

Load the model:

In [None]:
# Hugging Face Hub identifier of the checkpoint to load.
model_name = "microsoft/Phi-3-mini-4k-instruct"

# Load the causal language model and move its weights to the GPU.
# - torch_dtype="auto": let Transformers choose the dtype from the checkpoint
#   instead of forcing a fixed precision.
# - trust_remote_code=True: allows custom modeling code shipped in the model
#   repository to be executed; only enable this for repositories you trust.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    trust_remote_code=True,
).to("cuda")

# The tokenizer is loaded without a `device` argument: it is not a tokenizer
# loading option and does not place anything on the GPU. The text-generation
# pipeline moves the encoded inputs to the model's device by itself.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Prepare the input arguments for the LLM:

In [None]:
# System instruction that asks the model to map a free-form veracity label onto
# a fixed set of labels and to answer with the label only.
# NOTE(review): the prompt contains typos ("standart", "this choices"); it is
# kept verbatim here because editing the prompt may change the model's answers.
system_prompt = """You are an assistant that helps fact-checking claims.
Given a claim and the label telling its veracity, you need to normalize the label to standart format,
between this choices: True, Mostly False, False, Mixture. Answer only with the label."""

# Positional index of the dataset row used for this single test run.
a = 100
claim, label = (
    df[column].iloc[a]
    for column in ("claimReviewed", "reviewRating.alternateName")
)

# User turn: the claim together with its original (non-normalized) label.
claim_review = f"Claim: {claim}\nLabel: {label}"

# Chat-style input: the system instruction followed by the user message.
messages = [
    dict(role="system", content=system_prompt),
    dict(role="user", content=claim_review),
]

# Generation settings: cap the answer at 100 new tokens, return only the newly
# generated text (not the prompt), and disable sampling.
generation_args = dict(
    max_new_tokens=100,
    return_full_text=False,
    do_sample=False,
)

# Text-generation pipeline built from the already loaded model and tokenizer.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

Print the results:

In [None]:
# Show the model input (claim and original label) before generating.
print(claim_review)

# Run the pipeline on the chat messages and print the text generated for the
# first (and only) input.
output = pipe(messages, **generation_args)
print(output[0]["generated_text"])