In [62]:
import pandas as pd
import re
import html
import spacy
import nltk
from nltk.corpus import stopwords, words
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
import numpy as np
from sklearn.utils import shuffle

In [63]:
from catboost import CatBoostClassifier
from sklearn.feature_selection import SelectKBest, f_classif

In [64]:
NER_COLS = ["ORG", "GPE", "NORP", "DATE", "CARDINAL", "PRODUCT", "ORDINAL", "LOC", "LAW"]
BASE_COLS = ["keyword", "country"]
LABEL_COL = "PCL_category"
TRAIN_TEXT_MIN_LEN = 3
IMPORTANCE = [3,2,1,2,5]
CATEGORICAL_COLS = ["keyword", "country"] 

In [65]:

LABEL_COL = "PCL_category"
 
class FeatureExtractor(BaseEstimator, TransformerMixin):
    
    def __init__(self, nlp, stop_words, vocab):
        self.nlp = nlp
        self.ner_cols = NER_COLS
        self.base_cols = BASE_COLS
        self.stop_words = stop_words
        self.vocab = vocab
    
    def fit(self, X, y=None):
        return self
        
    def transform(self, X):
        
        texts = X["text"].astype(str).tolist()
        results = []
        
        for i, doc in enumerate(self.nlp.pipe(texts, batch_size=32)):
 
            row_features = {
                # Constructed features
                **self._clean_text(doc),
                **self._punctuation_features(doc),
                **self._ner_counts(doc),
                **self._verb_noun_ratio(doc),
                # **self._count_misspellings(doc),
                # Base columns
                "keyword": X.iloc[i]["keyword"],
                "country": X.iloc[i]["country"]
            }
            
            
            results.append(row_features)
        
        return pd.DataFrame(results)
    
    def _clean_text(self, doc):
        text = doc.text
        text = html.unescape(text)
        text = re.sub(r"\s+", " ", text.replace("\n", " ").replace("\t", " ")).strip()
        text = re.sub(r"[^\w\s]", "", text)
        words = text.lower().split()
        tokens = [w for w in words if w not in self.stop_words]
        return {"cleaned_text": " ".join(tokens)}
    
    def _count_misspellings(self, doc):
        misspelled = 0
        tokens = [t.text.lower() for t in doc if t.is_alpha and not t.is_stop]
        
        for word in tokens:
            if word not in self.vocab:
                misspelled += 1

        return {"misspelled_ratio": misspelled / len(doc) if len(doc) > 0 else 0}

    def _punctuation_features(self, doc):
        text = doc.text
        sentence_len = max(len(text.split()), 1)
        pct_exclam = text.count("!") / sentence_len
        pct_question = text.count("?") / sentence_len
        return {"pct_exclam": pct_exclam, "pct_question": pct_question}

    def _ner_counts(self, doc):
        counts = dict.fromkeys(self.ner_cols, 0)
        for ent in doc.ents:
            if ent.label_ in counts:
                counts[ent.label_] += 1
        return counts
    
    def _verb_noun_ratio(self, doc):
        num_verbs = sum(1 for token in doc if token.pos_ == "VERB")
        num_nouns = sum(1 for token in doc if token.pos_ == "NOUN")
        
        ratio = num_verbs / num_nouns if num_nouns > 0 else 0
        return {"verb_noun_ratio": ratio}

In [66]:
def data_preprocess(data_path: str):
    df = pd.read_csv(
        data_path,
        sep="\t",
        skiprows=9,
        engine="python",
        index_col=0,
        header=None,
        names = ["article_id", "keyword", "country", "text", "PCL_category"]
    )

    sentence_len = df["text"].str.len()
    Q1 = sentence_len.quantile(0.25)
    Q3 = sentence_len.quantile(0.75)
    IQR = Q3 - Q1
    train_text_max_len = 3 + 1.5 * IQR
    outliers = df[(sentence_len < TRAIN_TEXT_MIN_LEN) | (sentence_len>train_text_max_len)]
    outlier_percentage = len(outliers)/len(df) * 100
    if outlier_percentage <= 5:
        df.drop(outliers.index, inplace = True)

    df = shuffle(df)

    n = len(df)
    train_limit = int(0.8*n)

    y_categorical = df[LABEL_COL]
    y_binary = (y_categorical >= 2).astype(int)
    X = df.drop(columns=[LABEL_COL])

    X_train, X_test = X[:train_limit], X[train_limit:]
    y_train_b, y_test_b = y_binary[:train_limit], y_binary[train_limit:]
    y_train_c, y_test_c = y_categorical[:train_limit], y_categorical[train_limit:]

    return X_train, X_test, y_train_b, y_test_b, y_train_c, y_test_c

In [67]:
def construct_class_weights(y):
    class_counts = np.bincount(y,minlength=5)  # y = your labels
    N = len(y)
    K = 5

    imbalance_weights = N / (K * class_counts)
    importance_weights = np.array(IMPORTANCE)

    final_weights = imbalance_weights * importance_weights
    final_weights /= final_weights.mean()

    return np.array([final_weights[c] for c in y])

In [68]:
class ClassifierWithBinarization(BaseEstimator, ClassifierMixin):
    def __init__(self):
        self.model = CatBoostClassifier(
            # iterations=1000,
            # learning_rate=0.03,
            # depth=6,
            # l2_leaf_reg=2,
            loss_function = "MultiClass",
            eval_metric="TotalF1",
            random_seed=42,
            verbose=False,
         )

    def fit(self, X, y, sample_weight=None):
        self.model.fit(X, y, sample_weight=sample_weight)
        return self

    def predict_proba(self, X):
        return self.model.predict_proba(X)

    def predict(self, X):
        probs = self.predict_proba(X)
        positive = 0.5*probs[:, 2] + probs[:, 3] + 1.5 * probs[:, 4]
        negative = probs[:, 1] + 1.5 * probs[:, 0]

        return (positive - negative > 0).astype(int)


In [69]:
numerical_cols =  NER_COLS + ["verb_noun_ratio", "pct_exclam", "pct_question"] 
cleaned_txt_col = "cleaned_text"

In [70]:
data_path = "dontpatronizeme_pcl.tsv"
X_train, X_test, y_train_b, y_test_b, y_train_c, y_test_c = data_preprocess(data_path)
print(len(X_train), len(X_test))

nlp = spacy.load("en_core_web_sm", disable=["parser", "lemmatizer"])
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))
nltk.download('words')
english_vocab = set(w.lower() for w in words.words())

feature_extractor = FeatureExtractor(nlp, stop_words, english_vocab)

8371 2093


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/berra.dogan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/berra.dogan/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [71]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer

In [72]:
from textblob import TextBlob

class SentimentFeatures(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        feats = []
        for text in X:
            blob = TextBlob(str(text))
            feats.append([blob.sentiment.polarity, blob.sentiment.subjectivity])
        return np.array(feats)


In [73]:
categorical_cols = ["keyword", "country"]  


preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("tfidf", TfidfVectorizer(max_features=5000, ngram_range=(1, 5), sublinear_tf=True), cleaned_txt_col),
        ("numeric", StandardScaler(), numerical_cols),
        ("sentiment", SentimentFeatures(), cleaned_txt_col)
    ],
    remainder="drop"
)

pipeline = Pipeline([
    ("features", feature_extractor),
    ("preprocess", preprocessor),
    ("feature_select", SelectKBest(score_func=f_classif, k=1200)),
    ("model", ClassifierWithBinarization())
])

pipeline.fit(X_train, y_train_c, model__sample_weight=construct_class_weights(y_train_c))

0,1,2
,steps,"[('features', ...), ('preprocess', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,nlp,<spacy.lang.e...t 0x356dbbf70>
,stop_words,"{'a', 'about', 'above', 'after', 'again', 'against', ...}"
,vocab,"{'a', 'aa', 'aal', 'aalii', 'aam', 'aani', ...}"

0,1,2
,transformers,"[('categorical', ...), ('tfidf', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,score_func,<function f_c...t 0x352c756c0>
,k,1200


In [74]:
# # --- Full Hybrid Pipeline for PCL Detection (with class weights) ---

# import numpy as np
# import pandas as pd
# from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.preprocessing import OneHotEncoder, StandardScaler
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sentence_transformers import SentenceTransformer

# class TransformerEmbedding(BaseEstimator, TransformerMixin):
#     def __init__(self, model_name="all-MiniLM-L6-v2"):
#         self.model_name = model_name
#         self.model = SentenceTransformer(model_name)

#     def fit(self, X, y=None):
#         return self

#     def transform(self, X):
#         return self.model.encode(
#             X.tolist(), show_progress_bar=False, convert_to_numpy=True
#         )


# categorical_cols = ["keyword", "country"]

# preprocessor = ColumnTransformer(
#     transformers=[
#         ("categorical", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
#         ("numeric", StandardScaler(), numerical_cols),
#         ("text", TransformerEmbedding(), cleaned_txt_col),
#         ("sentiment", SentimentFeatures(), cleaned_txt_col)
#     ],
#     remainder="drop"
# )


# pipeline = Pipeline([
#     ("features", feature_extractor),  # your existing feature_extractor
#     ("preprocess", preprocessor),
#     ("feature_select", SelectKBest(score_func=f_classif, k=1500)),
#     ("model", ClassifierWithBinarization())
# ])

# pipeline.fit(X_train, y_train_c, model__sample_weight=construct_class_weights(y_train_c))


In [75]:
# from textblob import TextBlob

# categorical_cols = ["keyword", "country"]  

# class SentimentFeatures(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None):
#         return self

#     def transform(self, X):
#         feats = []
#         for text in X:
#             blob = TextBlob(str(text))
#             feats.append([blob.sentiment.polarity, blob.sentiment.subjectivity])
#         return np.array(feats)

# preprocessor = ColumnTransformer(
#     transformers=[
#         ("categorical", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
#         ("tfidf", TfidfVectorizer(max_features=5000, ngram_range=(1, 5), sublinear_tf=True), cleaned_txt_col),
#         ("numeric", StandardScaler(), numerical_cols),
#         # ("sentiment", SentimentFeatures(), cleaned_txt_col),
#     ],
#     remainder="drop"
# )

# pipeline = Pipeline([
#     ("features", feature_extractor),
#     ("preprocess", preprocessor),
#     ("feature_select", SelectKBest(score_func=f_classif, k=1200)),
#     ("model", CatBoostClassifier(
#             # iterations=1000,
#             # learning_rate=0.03,
#             # depth=6,
#             # l2_leaf_reg=2,
#             loss_function = "Logloss",
#             eval_metric="F1",
#             random_seed=42,
#             verbose=False,
#             auto_class_weights="Balanced"
            
#          ))
# ])

# pipeline.fit(X_train, y_train_b)

In [76]:
# pipeline.fit(X_train, y_train_c, model__sample_weight=construct_class_weights(y_train_c))

In [77]:
from sklearn.metrics import f1_score, classification_report
y_pred = pipeline.predict(X_test)

f1 = f1_score(y_test_b, y_pred)
print("F1 score:", f1)

print(classification_report(y_test_b, y_pred))

F1 score: 0.3732876712328767
              precision    recall  f1-score   support

           0       0.95      0.85      0.90      1894
           1       0.28      0.55      0.37       199

    accuracy                           0.83      2093
   macro avg       0.62      0.70      0.64      2093
weighted avg       0.88      0.83      0.85      2093





In [78]:
# import pickle

# # save the iris classification model as a pickle file
# model_pkl_file = "model.pkl"  

# with open(model_pkl_file, 'wb') as file:  
#     pickle.dump(pipeline, file)

In [None]:
# def get_hf_predictions(model, texts):
#     """
#     Get probability predictions from a SimpleTransformers classification model.
#     Args:
#         model: SimpleTransformers ClassificationModel
#         texts: list of strings
#     Returns:
#         probs: np.array of shape (n_samples, n_classes)
#     """
#     preds, logits = model.predict(texts)
#     probs = softmax(logits, axis=1)
#     return probs

# def ensemble_predict(hf_model, sklearn_pipeline, X_test, text_col="text"):
#     """
#     Make ensemble predictions by averaging Hugging Face and sklearn probabilities
#     """
#     # Hugging Face probabilities
#     hf_probs = get_hf_predictions(hf_model, X_test[text_col].tolist())[:, 1]
#     print("hf_probs:", hf_probs.shape, hf_probs)
    
#     # Scikit-learn pipeline probabilities
#     probs = sklearn_pipeline.predict_proba(X_test)
#     positive = 0.5*probs[:, 2] + probs[:, 3] + 1.5 * probs[:, 4]
#     negative = probs[:, 1] + 1.5 * probs[:, 0]
#     sk_probs = positive / (positive + negative)
#     print("sk_probs:", sk_probs.shape, sk_probs)
    
#     # Average probabilities
#     avg_probs = (hf_probs + sk_probs) / 2
    
#     # Convert to binary labels
#     y_pred = (avg_probs >= 0.5).astype(int)
    
#     return y_pred, avg_probs



In [88]:
from scipy.special import softmax

class HF_Sklearn_Ensemble:
    def __init__(self, hf_model, sklearn_pipeline):
        self.hf_model = hf_model
        self.sklearn_pipeline = sklearn_pipeline

    def predict_proba(self, X, text_col="text"):
        """Return averaged probabilities"""
        preds, logits = self.hf_model.predict(X[text_col].tolist())
        hf_probs = softmax(logits, axis=1)[:, 1]
        
        # Scikit-learn pipeline probabilities
        probs = self.sklearn_pipeline.predict_proba(X_test)
        positive = 0.5*probs[:, 2] + probs[:, 3] + 1.5 * probs[:, 4]
        negative = probs[:, 1] + 1.5 * probs[:, 0]
        sk_probs = positive / (positive + negative)
        print("sk_probs:", sk_probs.shape, sk_probs)
        
        # Average probabilities
        avg_probs = (hf_probs + sk_probs) / 2

        return avg_probs

    def predict(self, X, text_col="text"):
        """Return binary labels from averaged probabilities"""
        avg_probs = self.predict_proba(X, text_col=text_col)
        return (avg_probs >= 0.5).astype(int)


In [80]:
from simpletransformers.classification import ClassificationModel

# 1. Initialize the model architecture
hf_model = ClassificationModel(
    "distilbert", 
    "models/3e-0_0.01/", 
    use_cuda=False
)

In [89]:
ensemble_model = HF_Sklearn_Ensemble(hf_model, pipeline)

y_pred_ensemble = ensemble_model.predict(X_test, text_col="text")

# Compute F1
f1 = f1_score(y_test_b, y_pred_ensemble)
print("Ensemble F1:", f1)

  0%|          | 0/2093 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling

sk_probs: (2093,) [0.2237301  0.24124909 0.52113646 ... 0.54962167 0.24067788 0.2901578 ]
Ensemble F1: 0.7814207650273224




In [None]:
# import joblib

# # Save sklearn pipeline and wrapper
# joblib.dump(pipeline, "sklearn_pipeline.pkl")
# joblib.dump(ensemble_model, "ensemble_wrapper.pkl")

# # Hugging Face model already saved via .save_pretrained
# hf_model.model.save_pretrained("hf_model/")
# hf_model.tokenizer.save_pretrained("hf_model/")


('hf_model/tokenizer_config.json',
 'hf_model/special_tokens_map.json',
 'hf_model/vocab.txt',
 'hf_model/added_tokens.json',
 'hf_model/tokenizer.json')

In [94]:
import joblib
import gzip
import shutil
from pathlib import Path
import torch

base_dir = Path("models_created")
original_dir = base_dir / "original"
compressed_dir = base_dir / "compressed"

original_dir.mkdir(parents=True, exist_ok=True)
compressed_dir.mkdir(parents=True, exist_ok=True)

# Original
joblib.dump(pipeline, original_dir / "sklearn_pipeline.pkl")
joblib.dump(ensemble_model, original_dir / "ensemble_wrapper.pkl")
hf_model.model.save_pretrained(original_dir / "hf_model")
hf_model.tokenizer.save_pretrained(original_dir / "hf_model")

# Compressed
with open(original_dir / "sklearn_pipeline.pkl", "rb") as f_in, gzip.open(compressed_dir / "sklearn_pipeline_compressed.pkl.gz", "wb") as f_out:
    shutil.copyfileobj(f_in, f_out)

with open(original_dir / "ensemble_wrapper.pkl", "rb") as f_in, gzip.open(compressed_dir / "ensemble_wrapper_compressed.pkl.gz", "wb") as f_out:
    shutil.copyfileobj(f_in, f_out)

model_compressed = hf_model.model.half() # convert model to float16 for smaller size
compressed_model_dir = compressed_dir / "hf_model"
compressed_model_dir.mkdir(exist_ok=True)
state_dict_path = compressed_model_dir / "pytorch_model.bin"
torch.save(model_compressed.state_dict(), state_dict_path)
with open(state_dict_path, "rb") as f_in, gzip.open(compressed_model_dir / "pytorch_model_compressed.bin.gz", "wb") as f_out:
    shutil.copyfileobj(f_in, f_out)

# Copy tokenizer (usually small)
hf_model.tokenizer.save_pretrained(compressed_model_dir)

print("All models saved under 'models_created/' with original and compressed versions.")


All models saved under 'models_created/' with original and compressed versions.


In [None]:
# y_pred_ensemble, avg_probs = ensemble_predict(hf_model, pipeline, X_test, text_col="text")

# # Compute F1
# f1 = f1_score(y_test_b, y_pred_ensemble)
# print("Ensemble F1:", f1)

  0%|          | 0/2093 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling

hf_probs: (2093,) [0.00333796 0.00470961 0.00833689 ... 0.00440963 0.00510807 0.13188387]
sk_probs: (2093,) [0.2237301  0.24124909 0.52113646 ... 0.54962167 0.24067788 0.2901578 ]
Ensemble F1: 0.7814207650273224


