In [33]:
import pandas as pd
import re
import html
import spacy
import nltk
from nltk.corpus import stopwords, words
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
import numpy as np
from scipy.special import softmax
from catboost import CatBoostClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer
from simpletransformers.classification import ClassificationModel
from sklearn.metrics import f1_score, classification_report
import torch
from safetensors.torch import load_file
import os
from transformers import AutoTokenizer
import joblib



In [34]:
# Sets CUDA_LAUNCH_BLOCKING=1 for this process; CUDA documents this switch as forcing
# synchronous kernel launches, which is normally a debugging aid.
# NOTE(review): the RoBERTa model further down is loaded with use_cuda=False — confirm this is still needed.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Configurations

In [35]:
# Entity labels that FeatureExtractor counts per document (one numeric feature per label).
NER_COLS = ["ORG", "GPE", "NORP", "DATE", "CARDINAL", "PRODUCT", "ORDINAL", "LOC", "LAW"]
# Metadata columns stored on FeatureExtractor as `base_cols`.
BASE_COLS = ["keyword", "country"]
# Name of the multi-class label column.
# NOTE(review): not referenced by any other code visible in this notebook.
LABEL_COL = "PCL_category"
# Lower bound on text length (in characters) used by clean_df.
TRAIN_TEXT_MIN_LEN = 3
# Per-class multipliers used by construct_class_weights; position i applies to PCL_category == i.
IMPORTANCE = [3,2,1,2,5]
# NOTE(review): same value as BASE_COLS; the pipeline cell defines its own lowercase `categorical_cols`.
CATEGORICAL_COLS = ["keyword", "country"] 
# Seed for the training-set shuffle in data_preprocess.
RANDOM_SEED = 42

# Tab-separated dataset files read by data_preprocess.
TRAIN_DATA_PATH = "data/PCL_train_dataset.tsv"
VAL_DATA_PATH = "data/PCL_val_dataset.tsv"
TEST_DATA_PATH = "data/PCL_test_dataset.tsv"

# Directory holding the fine-tuned Hugging Face model files.
HF_MODEL = "models/hf_model"

# Data Preprocessing

In [36]:
from sklearn.model_selection import train_test_split

TRAIN_TEXT_MIN_LEN = 3

def clean_df(df: pd.DataFrame, max_outlier_pct: float = 5.0):
    """Drop rows whose text length is an outlier, if outliers are rare enough.

    A row is an outlier when its ``text`` is shorter than ``TRAIN_TEXT_MIN_LEN``
    characters or longer than the upper Tukey fence ``Q3 + 1.5 * IQR`` of the
    text-length distribution.

    Args:
        df: Frame with a string ``text`` column.
        max_outlier_pct: Outliers are removed only when they make up at most
            this percentage of the rows; otherwise ``df`` is returned unchanged.

    Returns:
        ``df`` without the outlier rows, or ``df`` itself when nothing is removed.
    """
    # An empty frame has no outliers and would otherwise divide by zero below.
    if len(df) == 0:
        return df

    sentence_len = df["text"].str.len()
    q1 = sentence_len.quantile(0.25)
    q3 = sentence_len.quantile(0.75)
    iqr = q3 - q1
    # Upper fence is anchored at Q3 (the previous code used the literal 3).
    train_text_max_len = q3 + 1.5 * iqr

    is_outlier = (sentence_len < TRAIN_TEXT_MIN_LEN) | (sentence_len > train_text_max_len)
    outlier_percentage = is_outlier.sum() / len(df) * 100
    if outlier_percentage <= max_outlier_pct:
        # Boolean mask instead of drop(index) so duplicate index labels cannot remove extra rows.
        df = df[~is_outlier]

    return df
    
def x_y_split(df: pd.DataFrame):
    """Separate the feature columns from the two target columns.

    Returns:
        Tuple ``(X, y_binary, y_categorical)`` where ``X`` is ``df`` without the
        ``PCL_category`` and ``labels`` columns, ``y_binary`` is ``df["labels"]``
        and ``y_categorical`` is ``df["PCL_category"]``.
    """
    target_cols = ["PCL_category", "labels"]
    features = df.drop(columns=target_cols)
    return features, df["labels"], df["PCL_category"]

def data_preprocess(data_path: str, is_train: bool = False, is_test: bool = False):
    """Load one tab-separated dataset file into a DataFrame.

    The first 9 lines of the file are skipped and the file is read without a
    header row; the first column (``article_id``) becomes the index.

    Args:
        data_path: Path of the TSV file.
        is_train: When true, rows are shuffled with ``RANDOM_SEED`` and the
            index is reset.
        is_test: When true, the file is read without a ``PCL_category`` column
            and no ``labels`` column is derived.

    Returns:
        DataFrame with ``keyword``, ``country`` and ``text`` columns, plus
        ``PCL_category`` and the binary ``labels`` column
        (``PCL_category >= 2``) for non-test data.
    """
    columns = ["article_id", "keyword", "country", "text"]
    if not is_test:
        columns = columns + ["PCL_category"]

    df = pd.read_csv(
        data_path,
        sep="\t",
        skiprows=9,
        engine="python",
        index_col=0,
        header=None,
        names=columns,
    )

    if is_train:
        # NOTE: clean_df is currently not applied to the training data.
        df = df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

    if not is_test:
        df["labels"] = (df["PCL_category"] >= 2).astype("int64")

    # Fill missing text BEFORE casting: astype(str) first would turn NaN into
    # the literal string "nan" and make the fillna a no-op.
    df["text"] = df["text"].fillna("").astype(str)

    return df


In [37]:
# Load the three splits. Train and validation are split into features, binary labels (`*_b`)
# and multi-class PCL_category labels (`*_c`); the test file has no labels, so it stays a single frame.
# Only the training split is shuffled (is_train=True).
X_train, y_train_b, y_train_c, = x_y_split(data_preprocess(TRAIN_DATA_PATH, is_train = True))
X_val, y_val_b, y_val_c, = x_y_split(data_preprocess(VAL_DATA_PATH))
X_test = data_preprocess(TEST_DATA_PATH, is_test=True)

# Feature Extraction for Pipeline

In [38]:

# NOTE(review): re-declares LABEL_COL with the same value already set in the configuration cell.
LABEL_COL = "PCL_category"
 
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Turn raw rows into a flat feature table for the downstream preprocessing step.

    For every input row the transformer emits ``cleaned_text``, two punctuation
    ratios, one entity count per label in ``NER_COLS``, ``verb_noun_ratio`` and
    the base columns (``BASE_COLS``) copied through unchanged.

    Parameters
    ----------
    nlp : object
        Language pipeline exposing ``pipe(texts, batch_size=...)`` that yields
        documents with ``text``, ``ents`` and tokens carrying ``pos_``.
    stop_words : container of str
        Lower-case words removed from ``cleaned_text``.
    vocab : container of str
        Known words; only used by ``_count_misspellings``.
    """

    def __init__(self, nlp, stop_words, vocab):
        self.nlp = nlp
        self.ner_cols = NER_COLS
        self.base_cols = BASE_COLS
        self.stop_words = stop_words
        self.vocab = vocab

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, returns ``self``."""
        return self

    def transform(self, X):
        """Build the feature frame for ``X``.

        Args:
            X: DataFrame with a ``text`` column and every column in ``base_cols``.

        Returns:
            DataFrame with a fresh 0..n-1 index and the columns
            ``cleaned_text``, ``pct_exclam``, ``pct_question``, the NER count
            columns, ``verb_noun_ratio`` and the base columns, in that order.
        """
        texts = X["text"].astype(str).tolist()
        # Pull the pass-through columns out once; a per-row ``X.iloc[i][col]``
        # would build a whole row Series for every document.
        base_values = [X[col].tolist() for col in self.base_cols]

        results = []
        docs = self.nlp.pipe(texts, batch_size=32)
        for doc, *row_base in zip(docs, *base_values):
            row_features = {
                # Constructed features
                **self._clean_text(doc),
                **self._punctuation_features(doc),
                **self._ner_counts(doc),
                **self._verb_noun_ratio(doc),
            }
            # Base columns
            row_features.update(zip(self.base_cols, row_base))
            results.append(row_features)

        # Explicit columns keep the schema stable even when X has no rows.
        columns = [
            "cleaned_text",
            "pct_exclam",
            "pct_question",
            *self.ner_cols,
            "verb_noun_ratio",
            *self.base_cols,
        ]
        return pd.DataFrame(results, columns=columns)

    def _clean_text(self, doc):
        """Return ``{"cleaned_text": ...}``: unescaped, punctuation-free, lower-cased, stop words removed."""
        text = html.unescape(doc.text)
        # Collapse every whitespace run (including newlines and tabs) to one space.
        text = re.sub(r"\s+", " ", text.replace("\n", " ").replace("\t", " ")).strip()
        # Drop everything that is neither a word character nor whitespace.
        text = re.sub(r"[^\w\s]", "", text)
        tokens = [w for w in text.lower().split() if w not in self.stop_words]
        return {"cleaned_text": " ".join(tokens)}

    def _count_misspellings(self, doc):
        """Return ``{"misspelled_ratio": ...}``; currently not used by ``transform``.

        The numerator counts alphabetic, non-stop tokens missing from ``vocab``;
        the denominator is the total token count of ``doc``.
        """
        candidates = (t.text.lower() for t in doc if t.is_alpha and not t.is_stop)
        misspelled = sum(1 for word in candidates if word not in self.vocab)
        return {"misspelled_ratio": misspelled / len(doc) if len(doc) > 0 else 0}

    def _punctuation_features(self, doc):
        """Return "!" and "?" counts divided by the whitespace-token count (at least 1)."""
        text = doc.text
        sentence_len = max(len(text.split()), 1)
        return {
            "pct_exclam": text.count("!") / sentence_len,
            "pct_question": text.count("?") / sentence_len,
        }

    def _ner_counts(self, doc):
        """Count entities per label in ``ner_cols``; other labels are ignored."""
        counts = dict.fromkeys(self.ner_cols, 0)
        for ent in doc.ents:
            if ent.label_ in counts:
                counts[ent.label_] += 1
        return counts

    def _verb_noun_ratio(self, doc):
        """Return VERB count divided by NOUN count, or 0 when there are no nouns."""
        num_verbs = sum(1 for token in doc if token.pos_ == "VERB")
        num_nouns = sum(1 for token in doc if token.pos_ == "NOUN")
        ratio = num_verbs / num_nouns if num_nouns > 0 else 0
        return {"verb_noun_ratio": ratio}

In [39]:
# spaCy pipeline with the parser and lemmatizer components switched off.
nlp = spacy.load("en_core_web_sm", disable=["parser", "lemmatizer"])

# Fetch the NLTK corpora used below (no-op when already present).
nltk.download("stopwords")
nltk.download("words")

stop_words = set(stopwords.words("english"))
english_vocab = {w.lower() for w in words.words()}

feature_extractor = FeatureExtractor(nlp=nlp, stop_words=stop_words, vocab=english_vocab)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/berra.dogan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/berra.dogan/nltk_data...
[nltk_data]   Package words is already up-to-date!


# Pipeline Model Structuring

In [40]:
def construct_class_weights(y, importance=None):
    """Compute one sample weight per label from class imbalance and class importance.

    Each class weight is ``N / (K * count_k) * importance_k`` and the class
    weights are then rescaled to have mean 1 over the classes that occur in ``y``.

    Args:
        y: Integer class labels in ``0 .. K-1`` (array-like or Series).
        importance: Per-class importance multipliers, one per class. Defaults
            to the module-level ``IMPORTANCE``; its length defines ``K``.

    Returns:
        1-D float array with the weight of each element of ``y``.

    Raises:
        ValueError: If ``y`` contains a label with no entry in ``importance``.
    """
    if importance is None:
        importance = IMPORTANCE
    importance_weights = np.asarray(importance, dtype=float)
    n_classes = len(importance_weights)

    labels = np.asarray(y)
    if labels.size == 0:
        return np.array([], dtype=float)

    class_counts = np.bincount(labels, minlength=n_classes)
    if len(class_counts) > n_classes:
        raise ValueError(
            f"labels go up to {len(class_counts) - 1} but only {n_classes} importance weights were given"
        )

    # Classes missing from y have count 0; dividing by them would produce inf
    # and, after normalisation, turn every sample weight into 0/NaN.
    present = class_counts > 0
    final_weights = np.zeros(n_classes, dtype=float)
    imbalance_weights = labels.size / (n_classes * class_counts[present])
    final_weights[present] = imbalance_weights * importance_weights[present]
    final_weights /= final_weights[present].mean()

    return final_weights[labels]

In [41]:
class ClassifierWithBinarization(BaseEstimator, ClassifierMixin):
    """Multi-class CatBoost model whose ``predict`` collapses five classes to a binary label."""

    def __init__(self):
        catboost_params = {
            "loss_function": "MultiClass",
            "eval_metric": "TotalF1",
            "random_seed": 42,
            "verbose": False,
        }
        self.model = CatBoostClassifier(**catboost_params)

    def fit(self, X, y, sample_weight=None):
        """Fit the wrapped CatBoost model and return ``self``."""
        self.model.fit(X, y, sample_weight=sample_weight)
        return self

    def predict_proba(self, X):
        """Return the wrapped model's per-class probabilities unchanged."""
        return self.model.predict_proba(X)

    def predict(self, X):
        """Return 1 where the weighted mass of classes 2-4 exceeds that of classes 0-1, else 0."""
        class_probs = self.predict_proba(X)
        # Classes 2, 3, 4 vote for the positive label with weights 0.5, 1, 1.5.
        pcl_score = 0.5 * class_probs[:, 2] + class_probs[:, 3] + 1.5 * class_probs[:, 4]
        # Classes 1 and 0 vote for the negative label with weights 1 and 1.5.
        non_pcl_score = class_probs[:, 1] + 1.5 * class_probs[:, 0]
        margin = pcl_score - non_pcl_score
        return (margin > 0).astype(int)


In [42]:
# Numeric columns produced by FeatureExtractor (entity counts plus ratio features);
# these are the columns handed to the scaler in the pipeline cell.
numerical_cols =  NER_COLS + ["verb_noun_ratio", "pct_exclam", "pct_question"] 
# Name of the cleaned-text column produced by FeatureExtractor._clean_text.
cleaned_txt_col = "cleaned_text"

In [43]:
from textblob import TextBlob

class SentimentFeatures(BaseEstimator, TransformerMixin):
    """Stateless transformer mapping each text to TextBlob ``[polarity, subjectivity]``."""

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return a float array of shape ``(len(X), 2)``.

        Args:
            X: Iterable of texts; every element is converted with ``str``.
        """
        feats = []
        for text in X:
            # Read the sentiment once and reuse it for both components.
            sentiment = TextBlob(str(text)).sentiment
            feats.append([sentiment.polarity, sentiment.subjectivity])
        # reshape keeps the result 2-D (0, 2) for empty input instead of a 1-D empty array.
        return np.asarray(feats, dtype=float).reshape(-1, 2)


In [44]:
# Columns one-hot encoded below.
categorical_cols = ["keyword", "country"]  


# Column-wise preprocessing of the FeatureExtractor output:
#   categorical -> one-hot (categories unseen during fit are ignored at predict time)
#   tfidf       -> TF-IDF over cleaned_text, 1- to 5-grams, at most 5000 features
#   numeric     -> standard scaling of the entity-count / ratio features
#   sentiment   -> TextBlob polarity and subjectivity of cleaned_text
# NOTE(review): sentiment is computed on cleaned_text, which has already had punctuation
# and stop words removed — confirm that is intended rather than the raw text.
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("tfidf", TfidfVectorizer(max_features=5000, ngram_range=(1, 5), sublinear_tf=True), cleaned_txt_col),
        ("numeric", StandardScaler(), numerical_cols),
        ("sentiment", SentimentFeatures(), cleaned_txt_col)
    ],
    remainder="drop"
)

# Full model: feature extraction -> preprocessing -> keep the 1200 best features by
# ANOVA F-score -> CatBoost classifier with binarised predictions.
pipeline = Pipeline([
    ("features", feature_extractor),
    ("preprocess", preprocessor),
    ("feature_select", SelectKBest(score_func=f_classif, k=1200)),
    ("model", ClassifierWithBinarization())
])

# Trained on the multi-class labels; the per-sample weights are routed to the "model" step only.
pipeline.fit(X_train, y_train_c, model__sample_weight=construct_class_weights(y_train_c))

0,1,2
,steps,"[('features', ...), ('preprocess', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,nlp,<spacy.lang.e...t 0x1763bfdf0>
,stop_words,"{'a', 'about', 'above', 'after', 'again', 'against', ...}"
,vocab,"{'a', 'aa', 'aal', 'aalii', 'aam', 'aani', ...}"

0,1,2
,transformers,"[('categorical', ...), ('tfidf', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,score_func,<function f_c...t 0x14caea560>
,k,1200


In [45]:
# The pipeline's final step returns binary labels from predict(), so the predictions
# are scored against the binary validation labels.
y_pred = pipeline.predict(X_val)

f1 = f1_score(y_val_b, y_pred)
print("F1 score:", f1)

print(classification_report(y_val_b, y_pred))

F1 score: 0.3204225352112676
              precision    recall  f1-score   support

           0       0.94      0.85      0.89      1895
           1       0.24      0.48      0.32       191

    accuracy                           0.81      2086
   macro avg       0.59      0.66      0.61      2086
weighted avg       0.88      0.81      0.84      2086





In [46]:
# Create the directory the model is actually written to (the previous code created an
# unrelated "pipeline" directory, so the dump failed when models/pipeline_model was missing).
os.makedirs("models/pipeline_model", exist_ok=True)
joblib.dump(pipeline, "models/pipeline_model/sklearn_pipeline.joblib")

['models/pipeline_model/sklearn_pipeline.joblib']

# Ensemble Model

In [47]:
class HF_Sklearn_Ensemble:
    """Weighted average of a Hugging Face binary classifier and the sklearn pipeline.

    Args:
        hf_model: Object whose ``predict(texts)`` returns a pair; the second
            element is treated as per-class logits with the positive class in
            column 1.
        sklearn_pipeline: Object whose ``predict_proba(X)`` returns five
            class-probability columns.
        hf_ratio: Weight of the Hugging Face probability in the average; the
            sklearn probability gets ``1 - hf_ratio``.
        threshold: Decision threshold used by ``predict`` when none is passed,
            so a tuned threshold can be saved together with the model.
    """

    def __init__(self, hf_model, sklearn_pipeline, hf_ratio=0.5, threshold=0.5):
        self.hf_model = hf_model
        self.sklearn_pipeline = sklearn_pipeline
        self.hf_ratio = hf_ratio
        self.threshold = threshold

    @staticmethod
    def _binary_prob_from_classes(probs):
        """Collapse five class probabilities into one positive-class score in [0, 1]."""
        positive = 0.5 * probs[:, 2] + probs[:, 3] + 1.5 * probs[:, 4]
        negative = probs[:, 1] + 1.5 * probs[:, 0]
        return positive / (positive + negative)

    def predict_proba(self, X, text_col="text"):
        """Return the blended positive-class probability for every row of ``X``.

        Raises:
            ValueError: If either base model returns a different number of
                rows than ``X`` has.
        """
        # Build the text list once so both models see the same rows.
        texts = X[text_col].astype(str).tolist()
        n = len(texts)

        # HF probabilities
        _, logits = self.hf_model.predict(texts)
        hf_probs = softmax(logits, axis=1)[:, 1]
        if hf_probs.shape[0] != n:
            raise ValueError(f"HF returned {hf_probs.shape[0]} preds but X has {n} rows")

        # Sklearn probabilities, computed from the same X
        probs = self.sklearn_pipeline.predict_proba(X)
        if probs.shape[0] != n:
            raise ValueError(f"Sklearn returned {probs.shape[0]} preds but X has {n} rows")
        sk_probs = self._binary_prob_from_classes(probs)

        # Ensemble average
        return self.hf_ratio * hf_probs + (1 - self.hf_ratio) * sk_probs

    def predict(self, X, text_col="text", threshold=None):
        """Return 0/1 labels: 1 where the blended probability is >= ``threshold``.

        When ``threshold`` is ``None`` the instance's stored threshold is used
        (0.5 for instances created or unpickled without one).
        """
        if threshold is None:
            threshold = getattr(self, "threshold", 0.5)
        avg_probs = self.predict_proba(X, text_col=text_col)
        return (avg_probs >= threshold).astype(int)

In [48]:
# Fix the version of hf_model configs produced with Colab
# NOTE(review): this cell rewrites files inside HF_MODEL on every run.

# Re-save the safetensors weights as pytorch_model.bin in the same directory.
state_dict = load_file(f"{HF_MODEL}/model.safetensors")
torch.save(state_dict, f"{HF_MODEL}/pytorch_model.bin")

# Remove the saved tokenizer.json if it exists (it is treated as corrupted).
bad = f"{HF_MODEL}/tokenizer.json"
if os.path.exists(bad):
    os.remove(bad)
    print("Deleted corrupted:", bad)

# Replace the tokenizer files with the stock "roberta-base" tokenizer.
# NOTE(review): presumably needs network access or a local cache — confirm.
tok = AutoTokenizer.from_pretrained("roberta-base", use_fast=True)
tok.save_pretrained(F"{HF_MODEL}")

print("Tokenizer files regenerated.")

Deleted corrupted: models/hf_model/tokenizer.json


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Tokenizer files regenerated.


In [49]:
# Load the fine-tuned RoBERTa classifier from the HF_MODEL directory; use_cuda=False keeps it on CPU.
hf_model = ClassificationModel(
    model_type="roberta",
    model_name=f"{HF_MODEL}",
    use_cuda=False
)

## Hyperparameter Tuning

In [50]:
def tune_ratio_and_threshold(hf_model, pipeline, X_val, y_val, text_col="text",
                            ratios=np.linspace(0, 1, 101),
                            thresholds=np.linspace(0.05, 0.95, 91)):
    """Grid-search the ensemble mixing ratio and decision threshold by binary F1.

    Both base models are evaluated once on ``X_val``; every ``(ratio, threshold)``
    pair is then scored against ``y_val``. The first pair reaching the highest
    F1 wins.

    Returns:
        Dict with keys ``"hf_ratio"``, ``"threshold"`` and ``"f1"``.

    Raises:
        ValueError: If either base model returns a different number of rows
            than ``X_val`` has.
    """
    val_texts = X_val[text_col].astype(str).tolist()
    expected_rows = len(val_texts)

    # Hugging Face side: positive-class probability from the returned logits.
    _preds, raw_outputs = hf_model.predict(val_texts)
    hf_probs = softmax(raw_outputs, axis=1)[:, 1]
    if hf_probs.shape[0] != expected_rows:
        raise ValueError(f"HF returned {hf_probs.shape[0]} preds but X has {expected_rows} rows")

    # Sklearn side: collapse the five class probabilities into one score.
    probs = pipeline.predict_proba(X_val)
    if probs.shape[0] != expected_rows:
        raise ValueError(f"Sklearn returned {probs.shape[0]} preds but X has {expected_rows} rows")

    positive = 0.5 * probs[:, 2] + probs[:, 3] + 1.5 * probs[:, 4]
    negative = probs[:, 1] + 1.5 * probs[:, 0]
    sk_probs = positive / (positive + negative)

    best_ratio, best_threshold, best_f1 = None, None, -1.0
    for ratio in map(float, ratios):
        blended = ratio * hf_probs + (1.0 - ratio) * sk_probs
        for cutoff in map(float, thresholds):
            score = f1_score(y_val, (blended >= cutoff).astype(int))
            # Strict ">" keeps the earliest grid point on ties.
            if score > best_f1:
                best_ratio, best_threshold, best_f1 = ratio, cutoff, float(score)

    return {"hf_ratio": best_ratio, "threshold": best_threshold, "f1": best_f1}

# Tune the mixing ratio and threshold on the validation split, scored against the binary labels.
best = tune_ratio_and_threshold(hf_model, pipeline, X_val, y_val_b, text_col="text")
print("Best:", best)


  0%|          | 0/2086 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling

Best: {'hf_ratio': 0.41000000000000003, 'threshold': 0.48999999999999994, 'f1': 0.6051282051282051}


Best: {'hf_ratio': 0.4307692307692308, 'threshold': 0.5105263157894737, 'f1': 0.6025974025974026}

In [51]:
# Use best settings
# NOTE(review): the ratio and threshold were tuned on this same validation split,
# so the score below is an optimistic estimate.
ensemble_model = HF_Sklearn_Ensemble(hf_model, pipeline, hf_ratio=best["hf_ratio"])
y_pred_ensemble = ensemble_model.predict(X_val, text_col="text", threshold=best["threshold"])
print("Ensemble F1:", f1_score(y_val_b, y_pred_ensemble))

print(classification_report(y_val_b, y_pred_ensemble))

  0%|          | 0/2086 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling

Ensemble F1: 0.6051282051282051
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1895
           1       0.59      0.62      0.61       191

    accuracy                           0.93      2086
   macro avg       0.78      0.79      0.78      2086
weighted avg       0.93      0.93      0.93      2086





# Save Model and Predictions

In [57]:
# Serialise the whole ensemble object (it holds both hf_model and the sklearn pipeline).
# NOTE(review): the tuned best["threshold"] is not part of the saved object, so anyone
# loading it must pass that threshold to predict() explicitly.
os.makedirs("models/ensemble_model", exist_ok=True)
joblib.dump(ensemble_model, "models/ensemble_model/ensemble_model.joblib")

['models/ensemble_model/ensemble_model.joblib']

In [54]:
def write_binary_predictions(
    ensemble,
    X,
    out_path,
    text_col="text",
    threshold=0.5
):
    """Predict binary labels for ``X`` and write them to ``out_path``, one per line.

    Args:
        ensemble: Object exposing ``predict(X, text_col=..., threshold=...)``.
        X: Input passed through to ``ensemble.predict``.
        out_path: Destination text file; missing parent directories are created.
        text_col: Name of the text column, forwarded to ``predict``.
        threshold: Decision threshold, forwarded to ``predict``.
    """
    preds = ensemble.predict(X, text_col=text_col, threshold=threshold)

    # open() does not create directories, so make sure the target folder exists.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(out_path, "w", encoding="utf-8") as f:
        f.writelines(f"{int(p)}\n" for p in preds)

# Export with the tuned decision threshold so the written labels match the
# ensemble evaluated above (the function default of 0.5 would silently differ).
write_binary_predictions(
    ensemble=ensemble_model,
    X=X_val,
    out_path="output/val.txt",
    threshold=best["threshold"],
)

write_binary_predictions(
    ensemble=ensemble_model,
    X=X_test,
    out_path="output/test.txt",
    threshold=best["threshold"],
)

  0%|          | 0/2086 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling