In [None]:
from unsloth import FastLanguageModel
import torch
# --- Model loading configuration ---
# Maximum sequence length passed to the loader (any value may be chosen).
max_seq_length = 2048
# None lets the loader auto-detect; float16 suits Tesla T4 / V100, bfloat16 suits Ampere+.
dtype = None
# Load the weights 4-bit quantized to reduce memory usage; can be set to False.
load_in_4bit = True

# Checkpoint used in this notebook.
# (Alternative kept for reference: "unsloth/llama-3-8b-bnb-4bit".)
_load_kwargs = dict(
    model_name="a-hamdi/NGILlama3-merged",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**_load_kwargs)

==((====))==  Unsloth: Fast Llama patching release 2024.6
   \\   /|    GPU: NVIDIA RTX A4000. Max memory: 15.635 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu118. CUDA = 8.6. CUDA Toolkit = 11.8.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1+cu118. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Unsloth 2024.6 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
# Prompt template with two positional slots: the article text first, then the
# label. For inference the label slot is filled with an empty string so that
# the model generates it. The literal must stay byte-identical everywhere it
# is used, so it is defined once here.
alpaca_prompt = """Below is an article that describes a news. Write a response that appropriately completes the request.

### Article:
{}


### Label:
{}"""

# End-of-sequence token string of the loaded tokenizer; it is appended to every
# formatted example so that generation knows where to stop.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Render a batch of rows into prompt strings.

    ``examples`` is a batch mapping that provides the ``"Article"`` and
    ``"label"`` columns as parallel sequences. Each article/label pair is
    substituted into ``alpaca_prompt`` and terminated with ``EOS_TOKEN``.

    Returns a dict with a single ``"text"`` key holding the rendered strings,
    in the same order as the input rows.
    """
    pairs = zip(examples["Article"], examples["label"])
    # EOS_TOKEN must be appended, otherwise the generation will go on forever.
    rendered = [alpaca_prompt.format(body, tag) + EOS_TOKEN for body, tag in pairs]
    return {"text": rendered}

from datasets import load_dataset

# Source CSV; it must provide the "Article" and "label" columns that
# formatting_prompts_func reads.
url = 'TheFullDataset.csv'
# Number of trailing rows used as the validation split.
VAL_SIZE = 50000

dataset = load_dataset("csv", data_files={"train": url}, split="train")
# Adds a "text" column holding the fully rendered prompt for every row.
dataset = dataset.map(formatting_prompts_func, batched=True)
length_dataset = len(dataset)
# Clamp the start index at 0: for a dataset shorter than VAL_SIZE the
# unclamped start would be negative and the range would contain negative
# indices instead of "every row".
val_start = max(0, length_dataset - VAL_SIZE)
val_dataset = dataset.select(indices=range(val_start, length_dataset))



In [None]:
from transformers import TextStreamer

# Enable Unsloth's native faster inference mode before generating.
FastLanguageModel.for_inference(model)

# One demo prompt built from the shared alpaca_prompt template: the article
# slot is filled, the label slot is left blank for the model to generate.
demo_article = "LASIK warning controversy sparks debate on telivision on how dangerous it is, and they concluded it is lethal."
demo_prompt = alpaca_prompt.format(demo_article, "")
inputs = tokenizer([demo_prompt], return_tensors="pt").to("cuda")

# Stream the decoded tokens to stdout while they are being generated.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>Below is an article that describes a news. Write a response that appropriately completes the request.

### Article:


### Label:
Fake<|end_of_text|>


In [None]:
import io
import contextlib
def get_prediction(article, max_new_tokens=64):
    """Generate the model's label text for a single article.

    Args:
        article: Article text placed in the article slot of ``alpaca_prompt``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        Only the newly generated text (the prompt is excluded), stripped of
        surrounding whitespace and lower-cased.
    """
    # The template has exactly two slots; the label slot stays empty so that
    # the model fills it in.
    prompt = alpaca_prompt.format(article, "")
    # Suppress anything written to stdout during tokenization / generation.
    with contextlib.redirect_stdout(io.StringIO()):
        inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id,
        )
    # The generated sequence starts with the prompt tokens. Drop them before
    # decoding; otherwise the returned "prediction" would contain the article
    # itself and any substring check on it (e.g. for "fake") would also match
    # words from the article.
    prompt_length = inputs["input_ids"].shape[1]
    generated_text = tokenizer.batch_decode(
        outputs[:, prompt_length:], skip_special_tokens=True
    )[0]
    # The model's output is expected to be either "Fake" or "Reliable".
    return generated_text.strip().lower()

In [None]:
output = get_prediction("warning controversy sparks debate on telivision on how dangerous it is, and they concluded it is lethal.")
# Display only the label: take whatever follows the last "### label:" marker
# (the prediction is lower-cased). A fixed-width slice such as [-8:] is only
# right for labels that are exactly eight characters long. If the prediction
# contains no marker, the whole string is shown.
output.rsplit("### label:", 1)[-1].strip()

'reliable'

In [None]:
from tqdm import tqdm
import re
# Accumulators for the evaluation: ground-truth class ids and predicted class ids.
true_labels, predictions = [], []
def preprocess_text(article):
    """Strip label information from a rendered prompt/article string.

    Everything from the last ``### Label:`` marker to the end of the string is
    removed, then every standalone occurrence of "reliable" or "fake"
    (case-insensitive) is deleted, and surrounding whitespace is trimmed.

    Args:
        article: Text that may end with a ``### Label:`` section.

    Returns:
        The cleaned text.
    """
    # Cut at the LAST marker and drop everything after it. A per-line pattern
    # such as r'### Label:.*' removes only the header line, leaving the label
    # (written on the following line) and any trailing end-of-sequence text.
    head, marker, _ = article.rpartition("### Label:")
    if marker:
        article = head

    # Remove words like "reliable" or "fake" (case insensitive)
    article = re.sub(r'\b(?:reliable|fake)\b', '', article, flags=re.IGNORECASE)

    # Strip leading and trailing whitespace
    return article.strip()
def _label_to_class_id(label_text):
    """Map lower-cased label text to a class id: 0 if it mentions "fake", else 1."""
    return 0 if "fake" in label_text else 1


# Reset the accumulators so re-running this cell never appends to old results.
true_labels = []
predictions = []
# Iterate over the evaluation dataset
for example in tqdm(val_dataset, desc="validating the model"):
    # Use the raw article column. example["text"] is already rendered through
    # alpaca_prompt (including the label and the end-of-sequence token), so
    # passing it to get_prediction would nest the template inside itself and
    # the model would see a prompt shaped differently from the "text" format.
    raw_prediction = get_prediction(example["Article"])
    # Look only at what follows the last "### label:" marker, so the class is
    # decided by the generated label and not by words that happen to occur in
    # the prompt or the article. If the prediction contains no marker, the
    # whole string is used.
    predicted_label = raw_prediction.rsplit("### label:", 1)[-1]

    true_labels.append(_label_to_class_id(example["label"].lower()))
    predictions.append(_label_to_class_id(predicted_label))



validating the model:   0%|          | 0/50000 [00:00<?, ?it/s]

validating the model: 100%|██████████| 50000/50000 [4:50:11<00:00,  2.87it/s]  


In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Confusion matrix with rows = true class, columns = predicted class,
# in the order 0 ("fake") then 1 (everything else).
cm = confusion_matrix(true_labels, predictions, labels=[0, 1])

# Overall accuracy.
acc = accuracy_score(true_labels, predictions)

# Precision, recall and F1, all computed with the same arguments.
# NOTE(review): with scikit-learn's default binary averaging these appear to
# be reported for class 1 — confirm this is the class of interest.
precision, recall, f1 = (
    metric(true_labels, predictions, labels=[0, 1])
    for metric in (precision_score, recall_score, f1_score)
)

print("Confusion Matrix:")
print(cm)
print("\n".join(
    f"{title}: {score:.4f}"
    for title, score in (
        ("Accuracy", acc),
        ("Precision", precision),
        ("Recall", recall),
        ("F1 Score", f1),
    )
))


Confusion Matrix:
[[26136     2]
 [   32 23830]]
Accuracy: 0.9993
Precision: 0.9999
Recall: 0.9987
F1 Score: 0.9993


In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# NOTE(review): this cell recomputes and prints the same metrics as the
# preceding metrics cell; consider keeping only one of the two.

# Confusion matrix over the class ids 0 and 1.
cm = confusion_matrix(true_labels, predictions, labels=[0, 1])

# Scalar scores, keyed by the title used when printing them.
scores = {
    "Accuracy": accuracy_score(true_labels, predictions),
    "Precision": precision_score(true_labels, predictions, labels=[0, 1]),
    "Recall": recall_score(true_labels, predictions, labels=[0, 1]),
    "F1 Score": f1_score(true_labels, predictions, labels=[0, 1]),
}
acc, precision, recall, f1 = scores.values()

print("Confusion Matrix:")
print(cm)
print("\n".join(f"{title}: {score:.4f}" for title, score in scores.items()))


Confusion Matrix:
[[26136     2]
 [   32 23830]]
Accuracy: 0.9993
Precision: 0.9999
Recall: 0.9987
F1 Score: 0.9993


In [None]:
import pandas as pd
# Persist the per-example class ids (one row per validation example) as CSV,
# without the DataFrame index column.
prediction_columns = {'True Label': true_labels, 'Prediction': predictions}
df = pd.DataFrame(prediction_columns)
df.to_csv('predictions.csv', index=False)