# LIAR Dataset preprocessing
# This notebook is meant to run on Google Colab

# Creating the data directories:







In [None]:
# Create the raw and processed data folders that the later cells read from and
# write to; -p creates missing parents and does not fail if they already exist.
!mkdir -p /content/data/raw /content/data/processed

In [None]:
import pandas as pd

import csv  # stdlib; only needed for the QUOTE_NONE constant below

# The LIAR files are plain tab-separated text with no quoting convention, and
# statements contain literal double quotes. With pandas' default quoting, a
# field that starts with `"` is read up to the next closing quote, which can
# swallow tabs/newlines and silently merge several rows into one.
# QUOTE_NONE makes every physical line exactly one row.

# Uncomment this instead if you are running locally:
# train = pd.read_csv("data/raw/liar_train.tsv", sep="\t", header=None, quoting=csv.QUOTE_NONE)
train = pd.read_csv(
    "/content/data/raw/train.tsv",
    sep="\t",
    header=None,
    quoting=csv.QUOTE_NONE,
)

# The raw file has no header row, so name the 14 columns explicitly.
train.columns = [
    "id",
    "label", "statement", "subject", "speaker", "speaker_job", "state_info",
    "party", "barely_true_counts", "false_counts", "half_true_counts",
    "mostly_true_counts", "pants_on_fire_counts", "context",
]

# Quick sanity check: first rows and how many statements each label has.
print(train.head(5))
print("\nLabel distribution:\n", train["label"].value_counts())


           id        label                                          statement  \
0   2635.json        false  Says the Annies List political group supports ...   
1  10540.json    half-true  When did the decline of coal start? It started...   
2    324.json  mostly-true  Hillary Clinton agrees with John McCain "by vo...   
3   1123.json        false  Health care reform legislation is likely to ma...   
4   9028.json    half-true  The economic turnaround started at the end of ...   

                              subject         speaker           speaker_job  \
0                            abortion    dwayne-bohac  State representative   
1  energy,history,job-accomplishments  scott-surovell        State delegate   
2                      foreign-policy    barack-obama             President   
3                         health-care    blog-posting                   NaN   
4                        economy,jobs   charlie-crist                   NaN   

  state_info       party  barely_true_

# Prepare the data:

In [None]:
import pandas as pd
import os

# Colab locations: raw downloads are read from `before_data`,
# cleaned files are written to `after_data`.
before_data = "/content/data/raw"
after_data = "/content/data/processed"

# Label string -> integer class id. The ids follow the position of each label
# in this tuple, i.e. 0 for "pants-fire" up to 5 for "true".
label_map = {
    name: class_id
    for class_id, name in enumerate(
        (
            "pants-fire",
            "false",
            "barely-true",
            "half-true",
            "mostly-true",
            "true",
        )
    )
}

# Names for the 14 columns of the header-less raw TSV files, in file order.
COLS = [
    "id",
    "label",
    "statement",
    "subject",
    "speaker",
    "speaker_job",
    "state_info",
    "party",
    "barely_true_counts",
    "false_counts",
    "half_true_counts",
    "mostly_true_counts",
    "pants_on_fire_counts",
    "context",
]

# Load TSV file (Colab version)
def load_tsv(name_file):
    """Read one raw split from ``before_data`` and name its columns.

    Args:
        name_file: File name of the split (e.g. ``"train.tsv"``), resolved
            relative to ``before_data``.

    Returns:
        A DataFrame whose columns are named after ``COLS``.

    Raises:
        ValueError: If the file does not have exactly ``len(COLS)`` columns
            (raised by the column assignment).
    """
    # Function-scope import keeps this definition self-contained; the cell's
    # import block does not bring in `csv`.
    import csv

    path = os.path.join(before_data, name_file)

    # The raw files are plain tab-separated text and statements contain literal
    # double quotes. With default quoting, a field starting with `"` is read up
    # to the next closing quote, which can swallow tabs/newlines and merge
    # several rows into one. QUOTE_NONE treats `"` as an ordinary character so
    # every physical line becomes exactly one row.
    df = pd.read_csv(path, sep="\t", header=None, quoting=csv.QUOTE_NONE)
    df.columns = COLS
    return df

# Clean dataframe
def clean(df, mapping=None):
    """Reduce a raw split to cleaned ``label`` / ``statement`` columns.

    Steps, in order:
      1. keep only the ``label`` and ``statement`` columns,
      2. drop rows where either is missing,
      3. strip surrounding whitespace from the statement,
      4. drop duplicate statements (first occurrence wins),
      5. convert label strings to integer class ids and drop rows whose
         label is not in the mapping,
      6. keep only statements longer than 10 characters.

    The input frame is never modified.

    Args:
        df: DataFrame containing at least ``label`` and ``statement``.
        mapping: Optional ``{label string: int}`` dict. Defaults to the
            module-level ``label_map``.

    Returns:
        A new DataFrame with an integer ``label`` column and a stripped
        ``statement`` column; the original row index is preserved.
    """
    if mapping is None:
        mapping = label_map

    out = df[["label", "statement"]].dropna()

    # `assign` builds a new frame each time, so nothing here ever writes into
    # a slice of the caller's data (no SettingWithCopyWarning).
    # Strip BEFORE de-duplicating: otherwise "text" and " text " both survive
    # and become identical rows once stripped.
    out = out.assign(statement=out["statement"].str.strip())
    out = out.drop_duplicates(subset=["statement"])

    # Labels missing from the mapping come back as NaN, which would turn the
    # column into floats and break `int(label)` downstream; drop those rows
    # and make the column a real integer dtype.
    out = out.assign(label=out["label"].map(mapping)).dropna(subset=["label"])
    out = out.assign(label=out["label"].astype(int))

    # keep only statements longer than 10 chars
    return out[out["statement"].str.len() > 10]

def main():
    """Clean every raw split and write it to ``after_data``.

    For each of train/test/valid: load the raw file with ``load_tsv``, run it
    through ``clean``, save the result as ``<split>_clean.tsv`` and print a
    short preview plus the label counts.

    Note: ``to_csv`` is called without ``sep``, so the saved files are
    comma-separated even though their names end in ``.tsv``.
    """
    # Make sure the output folder exists before the first write.
    os.makedirs(after_data, exist_ok=True)

    for tsv_name in ("train.tsv", "test.tsv", "valid.tsv"):
        print(f"Cleaning... {tsv_name}")

        cleaned = clean(load_tsv(tsv_name))

        target = os.path.join(after_data, tsv_name.replace(".tsv", "_clean.tsv"))
        cleaned.to_csv(target, index=False)

        print(f"✔ Cleaned {tsv_name}")
        print(cleaned.head())
        print(cleaned["label"].value_counts())
        print("-" * 40)


# Run the cleaning when this cell/script is executed directly.
if __name__ == "__main__":
    main()


Cleaning... train.tsv
✔ Cleaned train.tsv
   label                                          statement
0      1  Says the Annies List political group supports ...
1      3  When did the decline of coal start? It started...
2      4  Hillary Clinton agrees with John McCain "by vo...
3      1  Health care reform legislation is likely to ma...
4      3  The economic turnaround started at the end of ...
label
3    2109
1    1987
4    1961
5    1674
2    1653
0     839
Name: count, dtype: int64
----------------------------------------
Cleaning... test.tsv
✔ Cleaned test.tsv
   label                                          statement
0      5  Building a wall on the U.S.-Mexico border will...
1      1  Wisconsin is on pace to double the number of l...
2      1  Says John McCain has done nothing to help the ...
3      3  Suzanne Bonamici supports a plan that will cut...
4      0  When asked by a reporter whether hes at the ce...
label
3    265
1    249
4    241
2    212
5    208
0     92
Name:

# Training smoke test (a couple of optimisation steps on a tiny subset)


In [None]:
# AdamW is part of PyTorch (imported below from torch.optim); there is no
# installable "AdamW" package, so `pip install AdamW` can only fail.
# The one third-party library the next cell needs besides torch/pandas is
# transformers.
!pip install -q transformers


[31mERROR: Could not find a version that satisfies the requirement AdamW (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for AdamW[0m[31m
[0m

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from torch.optim import AdamW

# Read the cleaned train and validation splits produced by the preprocessing
# step. They are read with read_csv's default separator.
train_df, valid_df = map(
    pd.read_csv,
    (
        "/content/data/processed/train_clean.tsv",
        "/content/data/processed/valid_clean.tsv",
    ),
)
print(f"Train: {train_df.shape}, Valid: {valid_df.shape}")

#PyTorch Dataset
class LIARDataset(Dataset):
    """Map-style dataset that tokenizes one statement per item.

    Args:
        df: DataFrame with a ``statement`` column (text) and a ``label``
            column (values convertible with ``int``).
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding="max_length",
            max_length=..., return_tensors="pt")`` that returns a mapping with
            ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_len: Length every sequence is padded/truncated to.
    """

    def __init__(self, df, tokenizer, max_len=128):
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Plain Python lists: indexing is by position, independent of df's index.
        self.texts = df["statement"].tolist()
        self.labels = df["label"].tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``labels`` tensors for row ``idx``."""
        statement = self.texts[idx]
        target = int(self.labels[idx])

        encoded = self.tokenizer(
            statement,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )

        # The tokenizer output carries a leading dimension of size 1 for the
        # single text; squeeze(0) drops it.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(target, dtype=torch.long)
        return sample

# --- Tokenizer -------------------------------------------------------------
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# --- Data: only the first 32 training rows, in shuffled batches of 4 --------
train_dataset = LIARDataset(train_df.head(32), tokenizer)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# --- Model: DistilBERT with a 6-way classification head ---------------------
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=6,
)

# --- Device and optimizer ---------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

# --- Smoke test: run two optimisation steps, then stop ----------------------
model.train()
for step, batch in enumerate(train_loader):
    if step >= 2:  # two steps are enough to show the pipeline runs end to end
        break

    # Move the three tensors of this batch onto the model's device.
    input_ids, attention_mask, labels = (
        batch[field].to(device)
        for field in ("input_ids", "attention_mask", "labels")
    )

    # Passing `labels` makes the model return the loss alongside its outputs.
    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=labels,
    )
    loss = outputs.loss

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()  # clear gradients so they do not accumulate into the next step

    print(f"Step {step} → Loss: {loss.item():.4f}")

print("\n=== Smoke Test Passed Successfully ===")


Train: (10223, 2), Valid: (1284, 2)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step 0 → Loss: 1.6940
Step 1 → Loss: 1.8307

=== Smoke Test Passed Successfully ===
