In [163]:
import pandas as pd
import re
import html
import spacy
import nltk
from nltk.corpus import stopwords, words
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
import numpy as np
from sklearn.utils import shuffle

In [164]:
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.feature_selection import SelectKBest, f_classif

In [165]:
NER_COLS = ["ORG", "GPE", "NORP", "DATE", "CARDINAL", "PRODUCT", "ORDINAL", "LOC", "LAW"]
BASE_COLS = ["keyword", "country"]
LABEL_COL = "PCL_category"
TRAIN_TEXT_MIN_LEN = 3
IMPORTANCE = [3,2,1,2,3]
CATEGORICAL_COLS = ["keyword", "country"] 

In [166]:

LABEL_COL = "PCL_category"
 
class FeatureExtractor(BaseEstimator, TransformerMixin):
    
    def __init__(self, nlp, stop_words, vocab):
        self.nlp = nlp
        self.ner_cols = NER_COLS
        self.base_cols = BASE_COLS
        self.stop_words = stop_words
        self.vocab = vocab
    
    def fit(self, X, y=None):
        return self
        
    def transform(self, X):
        
        texts = X["text"].astype(str).tolist()
        results = []
        
        for i, doc in enumerate(self.nlp.pipe(texts, batch_size=32)):
 
            row_features = {
                # Constructed features
                **self._clean_text(doc),
                **self._punctuation_features(doc),
                **self._ner_counts(doc),
                **self._verb_noun_ratio(doc),
                # **self._count_misspellings(doc),
                # Base columns
                "keyword": X.iloc[i]["keyword"],
                "country": X.iloc[i]["country"]
            }
            
            
            results.append(row_features)
        
        return pd.DataFrame(results)
    
    def _clean_text(self, doc):
        text = doc.text
        text = html.unescape(text)
        text = re.sub(r"\s+", " ", text.replace("\n", " ").replace("\t", " ")).strip()
        text = re.sub(r"[^\w\s]", "", text)
        words = text.lower().split()
        tokens = [w for w in words if w not in self.stop_words]
        return {"cleaned_text": " ".join(tokens)}
    
    def _count_misspellings(self, doc):
        misspelled = 0
        tokens = [t.text.lower() for t in doc if t.is_alpha and not t.is_stop]
        
        for word in tokens:
            if word not in self.vocab:
                misspelled += 1

        return {"misspelled_ratio": misspelled / len(doc) if len(doc) > 0 else 0}

    def _punctuation_features(self, doc):
        text = doc.text
        sentence_len = max(len(text.split()), 1)
        pct_exclam = text.count("!") / sentence_len
        pct_question = text.count("?") / sentence_len
        return {"pct_exclam": pct_exclam, "pct_question": pct_question}

    def _ner_counts(self, doc):
        counts = dict.fromkeys(self.ner_cols, 0)
        for ent in doc.ents:
            if ent.label_ in counts:
                counts[ent.label_] += 1
        return counts
    
    def _verb_noun_ratio(self, doc):
        num_verbs = sum(1 for token in doc if token.pos_ == "VERB")
        num_nouns = sum(1 for token in doc if token.pos_ == "NOUN")
        
        ratio = num_verbs / num_nouns if num_nouns > 0 else 0
        return {"verb_noun_ratio": ratio}

In [167]:
def data_preprocess(data_path: str):
    df = pd.read_csv(
        data_path,
        sep="\t",
        skiprows=9,
        engine="python",
        index_col=0,
        header=None,
        names = ["article_id", "keyword", "country", "text", "PCL_category"]
    )

    sentence_len = df["text"].str.len()
    Q1 = sentence_len.quantile(0.25)
    Q3 = sentence_len.quantile(0.75)
    IQR = Q3 - Q1
    train_text_max_len = 3 + 1.5 * IQR
    outliers = df[(sentence_len < TRAIN_TEXT_MIN_LEN) | (sentence_len>train_text_max_len)]
    outlier_percentage = len(outliers)/len(df) * 100
    if outlier_percentage <= 5:
        df.drop(outliers.index, inplace = True)

    df = shuffle(df)
    y_categorical = df[LABEL_COL]
    y_binary = (y_categorical >= 2).astype(int)
    X = df.drop(columns=[LABEL_COL])

    n = len(X)
    train_limit = int(0.8*n)

    X_train, X_test = X[:train_limit], X[train_limit:]
    y_train_b, y_test_b = y_binary[:train_limit], y_binary[train_limit:]
    y_train_c, y_test_c = y_categorical[:train_limit], y_categorical[train_limit:]

    return X_train, X_test, y_train_b, y_test_b, y_train_c, y_test_c

In [168]:
def construct_class_weights(y):
    class_counts = np.bincount(y,minlength=5)  # y = your labels
    N = len(y)
    K = 5

    imbalance_weights = N / (K * class_counts)
    importance_weights = np.array(IMPORTANCE)

    final_weights = imbalance_weights * importance_weights
    final_weights /= final_weights.mean()

    return np.array([final_weights[c] for c in y])

In [169]:
class ClassifierWithBinarization(BaseEstimator, ClassifierMixin):
    def __init__(self):
        self.model = CatBoostClassifier(
            # iterations=1000,
            # learning_rate=0.03,
            # depth=6,
            # l2_leaf_reg=5,
            loss_function = "MultiClass",
            eval_metric="TotalF1",
            random_seed=42,
            verbose=False,
         )

    def fit(self, X, y, sample_weight=None):
        self.model.fit(X, y, sample_weight=sample_weight)
        return self

    def predict_proba(self, X):
        return self.model.predict_proba(X)

    def predict(self, X):
        probs = self.predict_proba(X)
        positive = 0.5*probs[:, 2] + probs[:, 3] + 1.5 * probs[:, 4]
        negative = probs[:, 1] + 1.5 * probs[:, 0]

        return (positive - negative > 0).astype(int)


In [170]:
numerical_cols =  NER_COLS + ["verb_noun_ratio", "pct_exclam", "pct_question"] 
cleaned_txt_col = "cleaned_text"

In [171]:
data_path = "dontpatronizeme_pcl.tsv"
X_train, X_test, y_train_b, y_test_b, y_train_c, y_test_c = data_preprocess(data_path)
print(len(X_train), len(X_test))

nlp = spacy.load("en_core_web_sm", disable=["parser", "lemmatizer"])
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))
nltk.download('words')
english_vocab = set(w.lower() for w in words.words())

feature_extractor = FeatureExtractor(nlp, stop_words, english_vocab)

8371 2093


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/berra.dogan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/berra.dogan/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [172]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
categorical_cols = ["keyword", "country"]  


preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("tfidf", TfidfVectorizer(max_features=5000, ngram_range=(1, 2), sublinear_tf=True), cleaned_txt_col),
        ("numeric", StandardScaler(), numerical_cols),
    ],
    remainder="drop"
)

pipeline = Pipeline([
    ("features", feature_extractor),
    ("preprocess", preprocessor),
    ("feature_select", SelectKBest(score_func=f_classif, k=800)),
    ("model", ClassifierWithBinarization())
])


In [174]:
pipeline.fit(X_train, y_train_c, model__sample_weight=construct_class_weights(y_train_c))

In [175]:
from sklearn.metrics import f1_score, classification_report
y_pred = pipeline.predict(X_test)

f1 = f1_score(y_test_b, y_pred)
print("F1 score:", f1)

print(classification_report(y_test_b, y_pred))


F1 score: 0.3687150837988827
              precision    recall  f1-score   support

           0       0.94      0.87      0.91      1892
           1       0.29      0.49      0.37       201

    accuracy                           0.84      2093
   macro avg       0.62      0.68      0.64      2093
weighted avg       0.88      0.84      0.86      2093





In [176]:
import pickle

# save the iris classification model as a pickle file
model_pkl_file = "iris_classifier_model.pkl"  

with open(model_pkl_file, 'wb') as file:  
    pickle.dump(pipeline, file)