In [1]:
import torch
# Ask PyTorch whether CUDA is usable in this runtime before any GPU work starts.
torch.cuda.is_available()


True

In [2]:
import pandas as pd

# Load the classified-studies export, then show its dimensions and first rows.
CSV_PATH = "cannabis_studies_complete_2025_classified.csv"
df = pd.read_csv(CSV_PATH)
print(df.shape)
df.head()


(12292, 11)


Unnamed: 0,id,study_title,study_link,resultIA_no_fine_tunning,resultIA_fine_tunning,study_type,study_year,cannabinoids,organ_systems,study_conditions,pdf_final_url
0,2025036,"Pharmacokinetics of cannabidiol, (-)-trans-Δ9-...",https://doi.org/10.3389/fvets.2025.1556975,Inconclusive,Inconclusive,Animal Study - Veterinary Clinical Trial,2025,CBD; CBG; CBN; Cannabichromene (CBC); Cannabid...,Cardiovascular System; Digestive System; Urina...,Anxiety; Arthritis; Epilepsy; Pain; Skin Disea...,https://www.frontiersin.org/journals/veterinar...
1,2017174,Essential oil of lavender in anxiety disorders...,https://mhc.cpnp.org/doi/full/10.9740/mhc.2017...,Positive,Positive,Review/Other,2017,,Mental/Emotional System; Nervous System,Anxiety; Depression; Pain; Stress; Wound Healing,https://europepmc.org/backend/ptpmcrender.fcgi...
2,2022648,Serum Cannabinoid 24 h and 1 Week Steady State...,https://www.frontiersin.org/articles/10.3389/f...,Positive,Positive,Animal Study - Veterinary Clinical Trial,2022,CBD; CBG; CBN; Cannabidiol; Cannabidiolic Acid...,Cardiovascular System,Anxiety; Arthritis; Arthritis - Osteoarthritis...,https://www.frontiersin.org/journals/veterinar...
3,2020499,Use of Cannabidiol for the Treatment of Anxiet...,https://pubmed.ncbi.nlm.nih.gov/32923656/,Positive,Positive,Review/Other,2020,Cannabidiol,Mental/Emotional System; Nervous System,Anxiety,https://europepmc.org/backend/ptpmcrender.fcgi...
4,2049937,Cannabidiol (CBD) and cognitive function in ol...,https://doi.org/10.3389/fpsyt.2025.1646151,Meta-analysis/Review,Meta-analysis/Review,Review/Other,2025,CBD; Cannabidiol; Delta-9; THC,Nervous System,Anxiety; Oxidative Stress; Quality of Life; St...,https://www.frontiersin.org/journals/psychiatr...


In [3]:
# Keep only the model input (study title) and the target (fine-tuned AI verdict),
# renamed to the generic "text" / "label" names the later cells rely on.
# Selecting and renaming in one chained expression builds a new frame instead of
# calling rename(inplace=True) on a column slice of the loaded frame, which is
# the pattern pandas flags with SettingWithCopyWarning.
df = df[["study_title", "resultIA_fine_tunning"]].rename(
    columns={"study_title": "text", "resultIA_fine_tunning": "label"}
)

df.head()


Unnamed: 0,text,label
0,"Pharmacokinetics of cannabidiol, (-)-trans-Δ9-...",Inconclusive
1,Essential oil of lavender in anxiety disorders...,Positive
2,Serum Cannabinoid 24 h and 1 Week Steady State...,Positive
3,Use of Cannabidiol for the Treatment of Anxiet...,Positive
4,Cannabidiol (CBD) and cognitive function in ol...,Meta-analysis/Review


In [4]:
# Drop rows with a missing title or label. Rebinding `df` (instead of
# inplace=True) avoids mutating a column slice in place, and resetting the index
# keeps it a clean 0..n-1 range so no gaps are left behind if rows are dropped.
df = df.dropna().reset_index(drop=True)
print(df["label"].value_counts())


label
Positive                            5404
Needs_AI                            4572
Inconclusive                         961
Meta-analysis/Review                 537
Meta-analysis/Review (heuristic)     480
Negative                             176
Negative (heuristic)                  82
Positive (heuristic)                  80
Name: count, dtype: int64


In [5]:
# Sanity check: per-column count of missing values still present.
df.isna().sum()


Unnamed: 0,0
text,0
label,0


In [6]:
def normalize_label(label):
    """Map a raw label string onto its base class.

    A label that contains "Positive", "Negative" or "Meta-analysis/Review"
    (checked in that order) is replaced by that bare class name, which folds
    variants such as "Positive (heuristic)" into "Positive". Any other label
    (e.g. "Inconclusive", "Needs_AI") is returned unchanged.
    """
    for base_class in ("Positive", "Negative", "Meta-analysis/Review"):
        if base_class in label:
            return base_class
    # No known base class inside the label: keep it as-is.
    return label

# Fold the "(heuristic)" variants into their base classes, then re-check the
# class balance.
# NOTE(review): "Needs_AI" looks like a "not yet classified" placeholder rather
# than a study outcome — confirm whether those rows should be excluded before
# they are used as a training class.
df["label"] = df["label"].map(normalize_label)

print(df["label"].value_counts())


label
Positive                5484
Needs_AI                4572
Meta-analysis/Review    1017
Inconclusive             961
Negative                 258
Name: count, dtype: int64


In [7]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the class names, then replace the string labels with their
# integer ids. `le` is kept around so ids can be mapped back to names later.
le = LabelEncoder()
le.fit(df["label"])
df["label"] = le.transform(df["label"])

print(le.classes_)


['Inconclusive' 'Meta-analysis/Review' 'Needs_AI' 'Negative' 'Positive']


In [8]:
# Convert the pandas frame to a Hugging Face Dataset so it can be fed to Trainer.
# preserve_index=False keeps the pandas index out of the dataset: a non-default
# index (for example after rows were dropped) would otherwise be stored as an
# extra "__index_level_0__" column next to "text" and "label".

from datasets import Dataset

dataset = Dataset.from_pandas(df, preserve_index=False)
dataset


Dataset({
    features: ['text', 'label'],
    num_rows: 12292
})

In [9]:
# Train / test split: hold out 20% of the rows, with a fixed seed so the split
# is repeatable.

dataset = dataset.train_test_split(test_size=0.2, seed=42)

train_dataset, test_dataset = dataset["train"], dataset["test"]


In [10]:
# Load tokenizer
# `model_name` is reused when the classification model is loaded, so the
# tokenizer and the model come from the same checkpoint name.

from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [11]:
# Tokenization function

def tokenize_fn(batch):
    """Tokenize the "text" field of a batch with the notebook-level tokenizer.

    Sequences are truncated to at most 128 tokens and padded up to that same
    length, so every encoded example has an identical size.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(batch["text"], **encode_options)


In [12]:
# Preprocess datasets
def preprocess(ds):
    """Tokenize one split and expose it as torch tensors.

    The raw "text" column is dropped once it has been tokenized, "label" is
    renamed to "labels", and the output format is restricted to the
    input_ids / attention_mask / labels columns as torch tensors.
    """
    # Dropping the raw text after tokenization saves memory.
    encoded = ds.map(tokenize_fn, batched=True, remove_columns=["text"])
    encoded = encoded.rename_column("label", "labels")
    tensor_columns = ["input_ids", "attention_mask", "labels"]
    encoded.set_format(type="torch", columns=tensor_columns)
    return encoded

train_dataset, test_dataset = preprocess(train_dataset), preprocess(test_dataset)


Map:   0%|          | 0/9833 [00:00<?, ? examples/s]

Map:   0%|          | 0/2459 [00:00<?, ? examples/s]

In [13]:
# Load model
# Number of labels = number of unique classes

from transformers import AutoModelForSequenceClassification

num_labels = len(le.classes_)

# Store the class names in the model config. Without these mappings the saved
# checkpoint only knows generic "LABEL_<n>" names, and the id -> class-name
# mapping would live solely in the in-memory LabelEncoder of this session.
id2label = {idx: str(name) for idx, name in enumerate(le.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# TrainingArguments

from transformers import TrainingArguments

# Settings for the quick single-epoch run; external experiment reporting is
# switched off via report_to="none".
first_run_config = {
    "output_dir": "./bert-medical",
    "num_train_epochs": 1,
    "per_device_train_batch_size": 8,
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    "logging_dir": "./logs",
    "report_to": "none",
}
training_args = TrainingArguments(**first_run_config)


In [15]:
# Trainer + Train
# No compute_metrics callback is supplied to this Trainer.
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Run the training loop configured by `training_args`.
trainer.train()


Step,Training Loss
500,0.9775
1000,0.8592


TrainOutput(global_step=1230, training_loss=0.8914819732914127, metrics={'train_runtime': 142.3856, 'train_samples_per_second': 69.059, 'train_steps_per_second': 8.639, 'total_flos': 325655404619520.0, 'train_loss': 0.8914819732914127, 'epoch': 1.0})

In [16]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    """Turn an evaluation result into accuracy and weighted-F1 scores.

    `eval_pred` is unpacked as a (logits, labels) pair. The predicted class of
    each row is the index of its largest logit.

    Returns a dict with the keys "accuracy" and "f1".
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=1)
    accuracy = accuracy_score(labels, predicted_ids)
    weighted_f1 = f1_score(labels, predicted_ids, average="weighted")
    return {"accuracy": accuracy, "f1": weighted_f1}


In [17]:
# Wrap the already-trained model in a Trainer that has compute_metrics attached,
# then score it on the held-out split.
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)

trainer.evaluate()


{'eval_loss': 0.8012705445289612,
 'eval_model_preparation_time': 0.0027,
 'eval_accuracy': 0.7043513623424156,
 'eval_f1': 0.667009163799132,
 'eval_runtime': 9.8145,
 'eval_samples_per_second': 250.547,
 'eval_steps_per_second': 31.382}

In [19]:
## Train properly (not just 1 epoch)

# The first run was a quick test run; this configuration trains for 3 epochs.

full_run_config = dict(
    output_dir="./bert-medical",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    report_to="none",
)
training_args = TrainingArguments(**full_run_config)


In [22]:
from transformers import Trainer

# NOTE(review): `model` is the same object the earlier 1-epoch run already
# fine-tuned, so this Trainer continues from those weights rather than from the
# pretrained checkpoint — confirm that is intended.
trainer = Trainer(
    model=model,
    args=training_args,        # NEW training_args
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)


In [23]:
# Run training with the 3-epoch `training_args`.
trainer.train()


Step,Training Loss
500,0.6927
1000,0.7242
1500,0.6707
2000,0.6148
2500,0.5863
3000,0.4612
3500,0.4342


TrainOutput(global_step=3690, training_loss=0.5906594583981728, metrics={'train_runtime': 473.4551, 'train_samples_per_second': 62.306, 'train_steps_per_second': 7.794, 'total_flos': 976966213858560.0, 'train_loss': 0.5906594583981728, 'epoch': 3.0})

In [24]:
# Score the model on the held-out test split (loss, accuracy, weighted F1).
trainer.evaluate()


{'eval_loss': 0.9339239597320557,
 'eval_accuracy': 0.705978039853599,
 'eval_f1': 0.695053784862404,
 'eval_runtime': 9.943,
 'eval_samples_per_second': 247.311,
 'eval_steps_per_second': 30.977,
 'epoch': 3.0}

In [25]:
# Write the fine-tuned model and its tokenizer side by side into one directory.
export_dir = "bert-medical-text-classifier"
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)


('bert-medical-text-classifier/tokenizer_config.json',
 'bert-medical-text-classifier/special_tokens_map.json',
 'bert-medical-text-classifier/vocab.txt',
 'bert-medical-text-classifier/added_tokens.json',
 'bert-medical-text-classifier/tokenizer.json')

In [28]:
# Single-sentence inference with the fine-tuned model.
# Tokenization, device placement and the forward pass are kept in one cell, in
# that order: the tokenized batch has to exist before it can be moved to the
# model's device, so moving `inputs` in an earlier cell fails with NameError on
# a fresh kernel.
text = "Cannabidiol shows positive effects in anxiety treatment"

# Tokenize
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

# Move inputs to same device as model
device = model.device
inputs = {k: v.to(device) for k, v in inputs.items()}

# Inference: eval mode and no gradient tracking for a pure forward pass
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Highest-scoring class id, mapped back to its name through the label encoder
pred = outputs.logits.argmax(dim=1).item()
print("Prediction:", le.classes_[pred])


Prediction: Needs_AI
