In [169]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


zsh:1: command not found: nvidia-smi


In [170]:
import sys
print(sys.executable)
print(sys.version)


/opt/homebrew/anaconda3/envs/nlp_env/bin/python
3.10.19 (main, Oct 21 2025, 16:37:10) [Clang 20.1.8 ]


In [171]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
import torch
import os
import gc
import random

import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import f1_score

# Import Data

In [172]:
# file_id = "1XOafk3wcP2RcTu1MHXoR_IJBIZseqTn8"
# url = f"https://drive.google.com/uc?id={file_id}"

# train_model_df = pd.read_csv(url, sep="\t")
# train_model_df = train_model_df.loc[:, ~train_model_df.columns.str.contains('^Unnamed')]

# Split Train and Validation

In [173]:
from random import shuffle


NER_COLS = ["ORG", "GPE", "NORP", "DATE", "CARDINAL", "PRODUCT", "ORDINAL", "LOC", "LAW"]
BASE_COLS = ["keyword", "country"]
LABEL_COL = "PCL_category"
TRAIN_TEXT_MIN_LEN = 3
IMPORTANCE = [3,2,1,2,5]
CATEGORICAL_COLS = ["keyword", "country"] 

def clean_df(df: pd.DataFrame):
    sentence_len = df["text"].str.len()
    Q1 = sentence_len.quantile(0.25)
    Q3 = sentence_len.quantile(0.75)
    IQR = Q3 - Q1
    train_text_max_len = 3 + 1.5 * IQR
    outliers = df[(sentence_len < TRAIN_TEXT_MIN_LEN) | (sentence_len>train_text_max_len)]
    outlier_percentage = len(outliers)/len(df) * 100
    if outlier_percentage <= 5:
        df = df.drop(outliers.index)
    
    return df
    
def x_y_split(df: pd.DataFrame):
    y_categorical = df["PCL_category"]
    y_binary = df["is_PCL"]
    X = df.drop(columns=["PCL_category", "is_PCL"])
    return X, y_binary, y_categorical


def data_read_split(data_path: str):
    df = pd.read_csv(
        data_path,
        sep="\t",
        skiprows=9,
        engine="python",
        index_col=0,
        header=None,
        names = ["article_id", "keyword", "country", "text", "PCL_category"]
    )
    df["is_PCL"] = df["PCL_category"] >= 2
    
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)
    
    return df_train, df_test    

def data_preprocess(data_path: str): 
    df_train, df_test = data_read_split(data_path)
    df_train = clean_df(df_train)
    X_train, y_train_b, y_train_c = x_y_split(df_train)
    X_test, y_test_b, y_test_c = x_y_split(df_test)
    return X_train, X_test, y_train_b, y_test_b, y_train_c, y_test_c

def t5_filtering(df: pd.DataFrame):
    return df[["text", "is_PCL"]]


In [174]:
data_path = "dontpatronizeme_pcl.tsv"
df_train, df_test = data_read_split(data_path)
df_train = t5_filtering(df_train)
df_test = t5_filtering(df_test)

# Functions

In [175]:
def get_device():
    # if torch.cuda.is_available():
    #     return torch.device("cuda")
    # elif torch.backends.mps.is_available():
    #     return torch.device("mps")
    # else:
    #     return torch.device("cpu")
    return torch.device("cpu")

def plot_f1(train_f1_list, val_f1_list):

    fig, ax = plt.subplots()
    ax.plot(np.arange(len(train_f1_list)), train_f1_list, label="train")
    ax.plot(np.arange(len(val_f1_list)), val_f1_list, label="val")
    ax.legend()
    plt.show()

def augment_text(row, deletion_prob=0.0, swap_prob=0.7, pos=3):
    # Tokenize the text
    tokens = row['text'].split()

    n_pos = 1*pos

    for i in range(len(tokens)-n_pos):
        if random.random() < swap_prob:
            #swap_i = random.randint(0, len(tokens)-1)
            tokens[i], tokens[i + n_pos] = tokens[i + n_pos], tokens[i]

    tokens = [token for token in tokens if random.random() > deletion_prob]

    # Reconstruct the augmented text
    augmented_text = ' '.join(tokens)
    return augmented_text

def train_model(train_df, val_df, custom_args, cols=['text', 'target_flag'], epochs=3, is_save=True, is_swap=False, start_swap_epoch=0):

    save_path = f'./models/{str(custom_args["learning_rate"]):.4}_{str(custom_args["weight_decay"]):.4}'

    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # Create a ClassificationModel with custom hyperparameters
    model = ClassificationModel(
        "distilbert",
        "distilbert-base-uncased",
        num_labels=2,
        args=custom_args,
        use_cuda=False
    )

    train_f1_list = []
    val_f1_list = []

    best_f1 = -1

    for i in range(epochs):

        _train_df = train_df.copy()
        # if is_swap and i >= start_swap_epoch:
        #     # _train_df["text"] =  _train_df.apply(augment_text, axis=1)
        #     _train_df["text"] = _train_df.apply(lambda row: augment_text(row, pos=i+1), axis=1)

        model.train_model(_train_df[cols], eval_df=val_df[cols])

        train_result, train_model_outputs, train_wrong_predictions = model.eval_model(_train_df[cols])
        val_result, val_model_outputs, val_wrong_predictions = model.eval_model(val_df[cols])

        train_preds = np.argmax(train_model_outputs, axis=1)
        train_labels = _train_df['is_PCL'].values
        train_f1 = f1_score(train_labels, train_preds)

        val_preds = np.argmax(val_model_outputs, axis=1)
        val_labels = val_df['is_PCL'].values
        val_f1 = f1_score(val_labels, val_preds)

        train_f1_list.append(train_f1)
        val_f1_list.append(val_f1)

        print(f"Epoch {i} train: {train_f1}, val: {val_f1}")

        if val_f1 > best_f1:
            best_f1 = val_f1
            if is_save:
                model.model.save_pretrained(save_path)
                model.tokenizer.save_pretrained(save_path)
                model.config.save_pretrained(f'{save_path}/')
                if not os.path.isfile(os.path.join(save_path, 'config.json')):
                    raise Exception("Model not saved correctly. 'config.json' not found.")
        else:
            print(f"Early stop at: {i}")
            break

    best_model = ClassificationModel(
        "distilbert",
        save_path if is_save else None,
        use_cuda=False
        # num_labels=2,  # Ensure this matches the original model's configuration
    )

    return best_model, train_f1_list[-1], val_f1_list[-1], train_f1_list, val_f1_list

# Paraphrase by T5

In [176]:
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# model_name = "t5-small"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


In [177]:
# import torch
# from tqdm import tqdm

# target_one_text = df_train[df_train["is_PCL"] == 1.0]["text"].tolist()

# device = get_device()

# paraphrase_list = []

# for i, sentence in tqdm(enumerate(target_one_text)):

#     text =  "paraphrase: " + sentence + " </s>"

#     encoding = tokenizer.encode_plus(text, pad_to_max_length=True, return_tensors="pt")
#     input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

#     outputs = model.generate(
#         input_ids=input_ids, attention_mask=attention_masks,
#         max_length=256,
#         do_sample=True,
#         top_k=240,
#         top_p=0.99,
#         early_stopping=True,
#         num_return_sequences=1
#     )

#     for output in outputs:
#         line = tokenizer.decode(output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
#         paraphrase_list.append(line)

#     torch.mps.empty_cache()
#     gc.collect()

In [178]:
# import os

# # safer path inside project folder
# save_path = os.path.join(os.getcwd(), "models")  

# if not os.path.exists(save_path):
#     os.makedirs(save_path)

# save_path = os.path.expanduser("~/models")
# os.makedirs(save_path, exist_ok=True)



In [179]:
# paraphrase_dict = {
#     "text": paraphrase_list,
#     "is_PCL": 1.0,
# }

# paraphrase_df_1 = pd.DataFrame(paraphrase_dict)
# train_all_df = pd.concat([paraphrase_df_1, df_train])

# Train Model

In [180]:
batch_size = 32

best_params = {
    "learning_rate": 3e-5,
    "train_batch_size": batch_size,
    "eval_batch_size": batch_size,
    "weight_decay": 0.01,
    "optimizer": "AdamW",
    "num_train_epochs": 1,
    "dropout_rate": 0.1,
    "overwrite_output_dir": True,
}

cols = ['text', 'is_PCL']


torch.cuda.empty_cache()
gc.collect()

model, train_f1, val_f1, train_f1_list, val_f1_list = train_model(
    df_train, #train_all_df,
    df_test,
    best_params,
    cols=cols,
    epochs=5,
    is_save=True,
    is_swap=True,
    start_swap_epoch=1
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifi

Epoch 0 train: 0.23093681917211328, val: 0.1511111111111111


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch 1 train: 0.6661550268610897, val: 0.4437299035369775


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch 2 train: 0.8497913769123783, val: 0.45592705167173253


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch 3 train: 0.9346534653465347, val: 0.45317220543806647
Early stop at: 3


In [181]:
# Unpack predictions from the tuple
y_pred, raw_outputs = model.predict(df_test["text"].tolist())

# Ensure y_true is a numpy array
y_true = df_test["is_PCL"].values

# Compute F1
f1 = f1_score(y_true, y_pred)
print("F1:", f1)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

F1: 0.45592705167173253



