In [1]:
%pip install transformers datasets torch accelerate --quiet


Note: you may need to restart the kernel to use updated packages.




In [2]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
train_df = pd.read_csv('../data/train_clean.csv')
val_df = pd.read_csv('../data/val_clean.csv')

print(len(train_df), len(val_df))
train_df.head()


35751 4469


Unnamed: 0,text,label
0,Bernie Sanders candidacy is attractive to many...,0
1,RIYADH (Reuters) - Saudi King Salman received ...,1
2,"Right before eighth grade, Trump s father sent...",0
3,Someone we haven t heard a lot from during the...,0
4,"BRIDGEWATER, N.J. (Reuters) - President Donald...",1


In [4]:
train_dataset = Dataset.from_pandas(train_small[['text', 'label']])
val_dataset = Dataset.from_pandas(val_small[['text', 'label']])

train_dataset[0]


NameError: name 'train_small' is not defined

In [5]:
import pandas as pd
from datasets import Dataset

# 1) load cleaned CSVs
train_df = pd.read_csv('../data/train_clean.csv')
val_df = pd.read_csv('../data/val_clean.csv')

print("Columns in train_df:", train_df.columns.tolist())
print("Rows:", len(train_df), len(val_df))

# 2) make sure columns are named exactly 'text' and 'label'
#    (in case something is slightly different)
cols = {c: c.strip().lower() for c in train_df.columns}
train_df = train_df.rename(columns=cols)
val_df = val_df.rename(columns=cols)

if 'text' not in train_df.columns and 'title' in train_df.columns:
    train_df = train_df.rename(columns={'title': 'text'})
    val_df = val_df.rename(columns={'title': 'text'})

assert 'text' in train_df.columns, "No 'text' column found in train_df"
assert 'label' in train_df.columns, "No 'label' column found in train_df"

# 3) optional: take smaller subset for testing (change numbers if you want)
train_small = train_df.sample(n=min(5000, len(train_df)), random_state=42)
val_small = val_df.sample(n=min(1000, len(val_df)), random_state=42)

print("Using subset sizes:", len(train_small), len(val_small))

# 4) build HF Datasets
train_dataset = Dataset.from_pandas(train_small[['text', 'label']].reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_small[['text', 'label']].reset_index(drop=True))

train_dataset[0], val_dataset[0]


Columns in train_df: ['text', 'label']
Rows: 35751 4469
Using subset sizes: 5000 1000


({'text': 'Tune in to the Alternate Current Radio Network (ACR) for another LIVE broadcast of The Boiler Room tonight 6:00 PM PST | 8:00 PM CST | 9:00 PM EST for this special broadcast. Join us for uncensored, uninterruptible talk radio, custom-made for bar fly philosophers, misguided moralists, masochists, street corner evangelists, media-maniacs, savants, political animals and otherwise lovable rascals.Join ACR hosts Hesher and Spore along with Funk Soul, Stewart Howe (ACR/21WIRE contributor) and Andy Nowicki, author of Conspiracy, Compliance, Control & Defiance, for the hundred and second episode of BOILER ROOM. Turn it up, tune in and hang with the ACR Brain-Trust for this weeks boil downs and analysis on the London terror events, proof that the sad failure of identity politics that can and does lead young people to an early death, the voracity of Dr. Phil s alleged expos on systemic pedophile rings in the circles of powerful elites and the usual gnashing of the teeth of the politi

In [6]:
model_name = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [7]:
def tokenize_batch(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=256
    )


In [8]:
tokenized_train = train_dataset.map(tokenize_batch, batched=True)
tokenized_val = val_dataset.map(tokenize_batch, batched=True)

# Tell HF which column is the label and drop raw text
tokenized_train = tokenized_train.remove_columns(["text"])
tokenized_val = tokenized_val.remove_columns(["text"])

tokenized_train = tokenized_train.rename_column("label", "labels")
tokenized_val = tokenized_val.rename_column("label", "labels")

tokenized_train.set_format("torch")
tokenized_val.set_format("torch")

tokenized_train[0]


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]


TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

In [9]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

# 1) Load cleaned data
train_df = pd.read_csv('../data/train_clean.csv')
val_df = pd.read_csv('../data/val_clean.csv')

# 2) Standardize column names
train_df.columns = [c.strip().lower() for c in train_df.columns]
val_df.columns = [c.strip().lower() for c in val_df.columns]

if 'title' in train_df.columns and 'text' not in train_df.columns:
    train_df = train_df.rename(columns={'title': 'text'})
    val_df = val_df.rename(columns={'title': 'text'})

assert 'text' in train_df.columns, train_df.columns
assert 'label' in train_df.columns, train_df.columns

# 3) Take a smaller subset for now (you can increase later)
train_small = train_df.sample(n=min(5000, len(train_df)), random_state=42)
val_small = val_df.sample(n=min(1000, len(val_df)), random_state=42)

# 4) Make sure text is clean strings (no NaN/None)
for df_ in (train_small, val_small):
    df_['text'] = df_['text'].fillna('').astype(str)
    df_['text'] = df_['text'].str.strip()
    df_.drop(df_[df_['text'] == ''].index, inplace=True)

print("Using sizes:", len(train_small), len(val_small))

# 5) Create HF datasets WITHOUT index column
train_dataset = Dataset.from_dict({
    "text": train_small["text"].tolist(),
    "label": train_small["label"].tolist()
})
val_dataset = Dataset.from_dict({
    "text": val_small["text"].tolist(),
    "label": val_small["label"].tolist()
})

print("HF example:", train_dataset[0])

# 6) Tokenizer
model_name = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_batch(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=256
    )

# 7) Tokenize
tokenized_train = train_dataset.map(tokenize_batch, batched=True)
tokenized_val = val_dataset.map(tokenize_batch, batched=True)

# 8) Remove raw text, rename labels, set torch format
tokenized_train = tokenized_train.remove_columns(["text"])
tokenized_val = tokenized_val.remove_columns(["text"])

tokenized_train = tokenized_train.rename_column("label", "labels")
tokenized_val = tokenized_val.rename_column("label", "labels")

tokenized_train.set_format("torch")
tokenized_val.set_format("torch")

tokenized_train[0]


Using sizes: 4929 977
HF example: {'text': 'Tune in to the Alternate Current Radio Network (ACR) for another LIVE broadcast of The Boiler Room tonight 6:00 PM PST | 8:00 PM CST | 9:00 PM EST for this special broadcast. Join us for uncensored, uninterruptible talk radio, custom-made for bar fly philosophers, misguided moralists, masochists, street corner evangelists, media-maniacs, savants, political animals and otherwise lovable rascals.Join ACR hosts Hesher and Spore along with Funk Soul, Stewart Howe (ACR/21WIRE contributor) and Andy Nowicki, author of Conspiracy, Compliance, Control & Defiance, for the hundred and second episode of BOILER ROOM. Turn it up, tune in and hang with the ACR Brain-Trust for this weeks boil downs and analysis on the London terror events, proof that the sad failure of identity politics that can and does lead young people to an early death, the voracity of Dr. Phil s alleged expos on systemic pedophile rings in the circles of powerful elites and the usual gn

Map: 100%|██████████| 4929/4929 [00:07<00:00, 663.70 examples/s] 
Map: 100%|██████████| 977/977 [00:02<00:00, 475.26 examples/s]


{'labels': tensor(0),
 'input_ids': tensor([    0,   565,  4438,    11,     7,     5, 43510,  9149,  4611,  3658,
            36,  2562,   500,    43,    13,   277, 14737,  2308,     9,    20,
          3542, 10329,  8499,  3422,   231,    35,   612,  2784, 13388,  1721,
           290,    35,   612,  2784, 24425,  1721,   361,    35,   612,  2784,
         12936,    13,    42,   780,  2308,     4, 10287,   201,    13, 16511,
          1290,  3995,     6,   542,  8007, 14709,  4748,  1067,  3188,     6,
          6777,    12,  7078,    13,  2003,  3598, 44267,     6, 27422,  7654,
          1952,     6, 11705,  4306,  1952,     6,  2014,  2797, 30651,  1952,
             6,   433,    12,   397,  9504,    29,     6, 14065,  3277,     6,
           559,  3122,     8,  3680,   784, 30289,   910,  8631,  1536,     4,
         41417,  7224,   500,  4452, 32899,  1843,     8,  2064,  1688,   552,
            19, 28864, 16455,     6,  6192, 22654,    36,  2562,   500,    73,
          2146, 2

In [10]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


In [12]:
training_args = TrainingArguments(
    output_dir="../backend/app/roberta_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=100,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [13]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="../backend/app/roberta_model",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,

    # evaluation & saving
    evaluation_strategy="epoch",   # evaluate at end of each epoch
    save_strategy="epoch",         # save checkpoint each epoch

    # logging (older versions don't have logging_strategy)
    logging_dir="../backend/app/roberta_logs",
    logging_steps=100,

    load_best_model_at_end=True,
    metric_for_best_model="f1",
)


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [14]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="../backend/app/roberta_model",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,

    # evaluation & saving
    evaluation_strategy="epoch",   # evaluate at end of each epoch
    save_strategy="epoch",         # save checkpoint each epoch

    # logging (older versions don't have logging_strategy)
    logging_dir="../backend/app/roberta_logs",
    logging_steps=100,

    load_best_model_at_end=True,
    metric_for_best_model="f1",
)


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [15]:
training_args = TrainingArguments(...)


TypeError: expected str, bytes or os.PathLike object, not ellipsis

In [16]:
import transformers
print("transformers version:", transformers.__version__)

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="../backend/app/roberta_model",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)


transformers version: 4.57.3


In [17]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [19]:
train_result = trainer.train()
metrics = trainer.evaluate()
metrics


Step,Training Loss
500,0.0045




{'eval_loss': 4.899103441857733e-05,
 'eval_accuracy': 1.0,
 'eval_precision': 1.0,
 'eval_recall': 1.0,
 'eval_f1': 1.0,
 'eval_runtime': 106.7734,
 'eval_samples_per_second': 9.15,
 'eval_steps_per_second': 1.152,
 'epoch': 1.0}

In [20]:
# inside your notebook (where trainer exists)
trainer.save_model("../roberta_final")     # saves final model weights + config
tokenizer.save_pretrained("../roberta_final")


('../roberta_final\\tokenizer_config.json',
 '../roberta_final\\special_tokens_map.json',
 '../roberta_final\\vocab.json',
 '../roberta_final\\merges.txt',
 '../roberta_final\\added_tokens.json',
 '../roberta_final\\tokenizer.json')

In [21]:
# move the model folder OUT of the repo to a safe place (e.g., C:\Models)
Move-Item -Path ".\backend\app\roberta_model" -Destination "C:\Models\LucidVerify\roberta_model"
# OR if you saved as ../roberta_final:
Move-Item -Path ".\roberta_final" -Destination "C:\Models\LucidVerify\roberta_final"


SyntaxError: invalid syntax (940044424.py, line 2)