In [9]:
import os, re, duckdb, joblib, gc, random
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import save_npz, load_npz, vstack, csr_matrix, hstack
import time
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

from sklearn.model_selection import ParameterGrid
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

In [2]:
# Setup

DB_PATH   = r"D:/db/meta.duckdb"
OUT_DIR   = r"D:/dataset/text_features/tfidf_v2"
os.makedirs(OUT_DIR, exist_ok=True)
   
table = "md1718"

# Connection
con = duckdb.connect(DB_PATH)
try:
    con.execute("PRAGMA threads=8;")
except duckdb.InvalidInputException:
    pass

print("Set up ready")

Set up ready


In [6]:
# Count the number of null captions
print(con.sql("""SELECT split, COUNT(*) AS null_caption FROM md1718 WHERE caption_tfidf ='' GROUP BY split"""))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌────────────┬──────────────┐
│   split    │ null_caption │
│  varchar   │    int64     │
├────────────┼──────────────┤
│ test       │        12683 │
│ validation │        11916 │
│ train      │        21571 │
└────────────┴──────────────┘



In [7]:
# Upload TF-IDF cleaned captions, split, post_id and target er_log

df_caps = con.execute(f"""
    SELECT post_id, split, caption_tfidf AS caption_clean, er_bins
    FROM md1718
""").df()

con.close()

print(df_caps.head())

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

                          post_id split  \
0  la.brandon-1936393348858127807  test   
1  la.brandon-1937213763687991478  test   
2  la.brandon-1937218941271111413  test   
3  la.brandon-1938181925006920700  test   
4  la.brandon-1938620907515771752  test   

                                       caption_clean    er_bins  
0  thinking things may slide home plate year end ...  very_high  
1  still looking gift ideas grab streaming stick ...  very_high  
2  often answers looking around corner keepwalkin...  very_high  
3  friends christmas countdown received early pre...  very_high  
4  god grant serenity accept things cannot change...  very_high  


In [8]:
# Split
train = df_caps[df_caps["split"] == "train"].copy()
val   = df_caps[df_caps["split"] == "validation"].copy()
test  = df_caps[df_caps["split"] == "test"].copy()

# Cleaned captions 
X_train_text = train["caption_clean"].tolist()
X_val_text   = val["caption_clean"].tolist()
X_test_text  = test["caption_clean"].tolist()

# Target columns
y_tr = train["er_bins"].to_numpy()
y_va = val["er_bins"].to_numpy()
y_te = test["er_bins"].to_numpy()

# Post ids for future joins
train_ids = train["post_id"].to_numpy()
val_ids   = val["post_id"].to_numpy()
test_ids  = test["post_id"].to_numpy()

In [9]:
# Define the best combination of parameters for the encoding with TF-IDF
stop_words = [
    "hearts", "tone", "love","day","today","time","one","two","get","like",
    "eyes","hands","red", "see", "happy", "life", "hand", "hands", "back", "link","flash",
    "http","liketk","liketkit", "much", "know", "woman", "man", 
    "you","new", "go", "make", "good", "best", "beautiful"
]

cands_vec = [

    dict(ngram_range=(1,1), min_df=15, max_df=0.9, max_features=30000,
         token_pattern=r"(?u)\b[^\W\d_]{2,}\b", stop_words = stop_words),

    dict(ngram_range=(1,1), min_df=15, max_df=0.7, max_features=40000,
         token_pattern=r"(?u)\b[^\W\d_]{2,}\b", stop_words = stop_words),

    dict(ngram_range=(1,1), min_df=20, max_df=0.7, max_features=50000,
         token_pattern=r"(?u)\b[^\W\d_]{2,}\b", stop_words = stop_words),

    # --- Bigrammi ---
    dict(ngram_range=(1,2), min_df=15, max_df=0.9, max_features=30000,
         token_pattern=r"(?u)\b[^\W\d_]{2,}\b", stop_words = stop_words),

    dict(ngram_range=(1,2), min_df=15, max_df=0.7, max_features=40000,
         token_pattern=r"(?u)\b[^\W\d_]{2,}\b", stop_words = stop_words),

    dict(ngram_range=(1,2), min_df=20, max_df=0.7, max_features=50000,
         token_pattern=r"(?u)\b[^\W\d_]{2,}\b", stop_words = stop_words)
]

Cs = [0.01, 0.1, 1.0, 10.0]

best_f1 = -1.0
best_cfg = None
best_C = None

results = [] 

for i, cfg in enumerate(cands_vec):
    print(f"\n=== Config TF-IDF {i+1}/{len(cands_vec)}: {cfg} ===")

    vec = TfidfVectorizer(
        sublinear_tf=True,
        dtype=np.float32,
        **cfg
    )

    # FIT sul train e TRANSFORM su train/val
    Xtr = vec.fit_transform(X_train_text)
    Xva = vec.transform(X_val_text)

    print(f"  -> Shape Xtr: {Xtr.shape}, Xva: {Xva.shape}")

    for C in Cs:
        mdl = LinearSVC(C=C, random_state=0)
        mdl.fit(Xtr, y_tr)

        p_va = mdl.predict(Xva)
        f1 = f1_score(y_va, p_va, average="macro")
        acc = accuracy_score(y_va, p_va)


        results.append((f1, acc, C, cfg))

        print(f"     C={C:<6}  F1_macro={f1:.4f}, accuracy = {acc}")

        if f1 > best_f1:
            best_f1 = f1
            best_cfg = cfg
            best_C = C
            bect_acc = acc

print("\n=== Best configuration (on validation, LinearSVC) ===")
print("Best F1_macro:", round(best_f1, 4))
print("Best C:", best_C)
print("Best TF-IDF cfg:", best_cfg)


=== Config TF-IDF 1/6: {'ngram_range': (1, 1), 'min_df': 15, 'max_df': 0.9, 'max_features': 30000, 'token_pattern': '(?u)\\b[^\\W\\d_]{2,}\\b', 'stop_words': ['hearts', 'tone', 'love', 'day', 'today', 'time', 'one', 'two', 'get', 'like', 'eyes', 'hands', 'red', 'see', 'happy', 'life', 'hand', 'hands', 'back', 'link', 'flash', 'http', 'liketk', 'liketkit', 'much', 'know', 'woman', 'man', 'you', 'new', 'go', 'make', 'good', 'best', 'beautiful']} ===
  -> Shape Xtr: (773497, 30000), Xva: (412325, 30000)
     C=0.01    F1_macro=0.2593, accuracy = 0.2601976596131692
     C=0.1     F1_macro=0.2573, accuracy = 0.25774328503001276
     C=1.0     F1_macro=0.2538, accuracy = 0.25411507912447706
     C=10.0    F1_macro=0.2522, accuracy = 0.2526817437700843

=== Config TF-IDF 2/6: {'ngram_range': (1, 1), 'min_df': 15, 'max_df': 0.7, 'max_features': 40000, 'token_pattern': '(?u)\\b[^\\W\\d_]{2,}\\b', 'stop_words': ['hearts', 'tone', 'love', 'day', 'today', 'time', 'one', 'two', 'get', 'like', 'eye

In [10]:
print(classification_report(y_va, p_va))

              precision    recall  f1-score   support

        high       0.22      0.25      0.23     80291
         low       0.23      0.26      0.24     81827
      medium       0.22      0.21      0.21     81667
   very_high       0.31      0.23      0.26     86194
    very_low       0.30      0.32      0.31     82346

    accuracy                           0.25    412325
   macro avg       0.26      0.25      0.25    412325
weighted avg       0.26      0.25      0.25    412325



In [13]:
# Best TF-IDF cfg: {'ngram_range': (1, 1), 'min_df': 20, 'max_df': 0.7, 'max_features': 50000, 'token_pattern': '(?u)\\b[^\\W\\d_]{2,}\\b',
vectorizer = TfidfVectorizer(
    lowercase=True,
    analyzer="word", # default
    token_pattern = r"(?u)\b[^\W\d_]{2,}\b", # just characters without digits or underscores 
    ngram_range=(1, 1), # unigrams and bigrams
    min_df=20, # discard rare terms (they must appean in at least 10 captions)
    max_df=0.7, # discard too frequent terms (if they appear in 80% of the captions or more)
    max_features=50000, 
    dtype=np.float32,
    stop_words = stop_words
)

X_train = vectorizer.fit_transform(X_train_text)
X_val   = vectorizer.transform(X_val_text)
X_test  = vectorizer.transform(X_test_text)

# Save the train, validation and test text features in sparse format .npz (reducing the space occupied by not writing zeros)
save_npz('D:/dataset/text_features/tfidf_v3/tfidf_train.npz', X_train)
save_npz('D:/dataset/text_features/tfidf_v3/tfidf_val.npz',   X_val)
save_npz('D:/dataset/text_features/tfidf_v3/tfidf_test.npz',  X_test)

# Save post_ids in the same order to allow future joins
np.save('D:/dataset/text_features/tfidf_v3/tfidf_train_post_ids.npy', train_ids)
np.save('D:/dataset/text_features/tfidf_v3/tfidf_val_post_ids.npy',   val_ids)
np.save('D:/dataset/text_features/tfidf_v3/tfidf_test_post_ids.npy',  test_ids)

# Save target values in the same order
np.save('D:/dataset/text_features/tfidf_v3/tfidf_train_y.npy', y_tr)
np.save('D:/dataset/text_features/tfidf_v3/tfidf_val_y.npy',   y_va)
np.save('D:/dataset/text_features/tfidf_v3/tfidf_test_y.npy',  y_te)

# Save the vectorizer
joblib.dump(vectorizer, 'D:/dataset/text_features/tfidf_v3/tfidf_vectorizer.joblib')
print("Done: matrices, mapping post ids and vectorizer saved")

Done: matrices, mapping post ids and vectorizer saved


In [46]:
# CHECKS
# Import the data
OUT_DIR = r"D:/dataset/text_features/tfidf_v3"

Xtr = load_npz(os.path.join(OUT_DIR, "tfidf_train.npz"))
Xva = load_npz(os.path.join(OUT_DIR, "tfidf_val.npz"))
Xte = load_npz(os.path.join(OUT_DIR, "tfidf_test.npz"))

tr_ids = np.load(os.path.join(OUT_DIR, "tfidf_train_post_ids.npy"), allow_pickle=True)
va_ids = np.load(os.path.join(OUT_DIR, "tfidf_val_post_ids.npy"), allow_pickle=True)
te_ids = np.load(os.path.join(OUT_DIR, "tfidf_test_post_ids.npy"), allow_pickle=True)

y_tr = np.load(os.path.join(OUT_DIR, "tfidf_train_y.npy"), allow_pickle = True)
y_va = np.load(os.path.join(OUT_DIR, "tfidf_val_y.npy"), allow_pickle = True)
y_te = np.load(os.path.join(OUT_DIR, "tfidf_test_y.npy"), allow_pickle = True)

# Check the train, validation and test set have the same number of columns as the vocabulary
vectorizer = joblib.load(os.path.join(OUT_DIR, "tfidf_vectorizer.joblib"))

In [15]:
# Check the train, validation and test set have the same number of columns as the vocabulary
V = len(vectorizer.get_feature_names_out())

print("Train:", Xtr.shape, len(tr_ids))
print("Val  :", Xva.shape, len(va_ids))
print("Test :", Xte.shape, len(te_ids))
print("Vocab size:", V)

assert Xtr.shape[1] == Xva.shape[1] == Xte.shape[1] == V, "The sets have different number of features than the vocabulary"
assert Xtr.shape[0] == len(tr_ids) and Xva.shape[0] == len(va_ids) and Xte.shape[0] == len(te_ids), "The number of rows is different than the number of post ids"

Train: (773497, 50000) 773497
Val  : (412325, 50000) 412325
Test : (423604, 50000) 423604
Vocab size: 50000


In [16]:
# Percentage of rows with no caption
def zero_row_pct(X):
    return float((X.getnnz(axis=1) == 0).sum()) / X.shape[0] * 100

print("TF-IDF empty rows % - train:", f"{zero_row_pct(Xtr):.2f}%")
print("                      val :", f"{zero_row_pct(Xva):.2f}%")
print("                      test :", f"{zero_row_pct(Xte):.2f}%")

TF-IDF empty rows % - train: 3.16%
                      val : 3.32%
                      test : 3.43%


In [17]:
# TOP K TERMS in a caption (example)
feat = np.array(vectorizer.get_feature_names_out())

i = np.random.randint(0, Xtr.shape[0])  # un post random del train
row = Xtr[i]
coo = row.tocoo()
order = np.argsort(coo.data)[::-1][:10]
print("Post:", tr_ids[i])
print("Top-10 termini:", list(zip(feat[coo.col[order]], np.round(coo.data[order], 3))))


Post: georgina.gigi-1824168801363440426
Top-10 termini: [('party', 0.719), ('green', 0.368), ('pink', 0.36), ('remember', 0.359), ('well', 0.299)]


In [18]:
feat = vectorizer.get_feature_names_out()
print(len(feat))        # number of features
print(feat[:25])        # first 25
print(feat[50000:50010]) # other features

50000
['aa' 'aaa' 'aaaaaand' 'aaaaand' 'aaaand' 'aaand' 'aaddephotography'
 'aafamily' 'aaliyah' 'aand' 'aaqshah' 'aaqshahbrides' 'aarhus' 'aaron'
 'aavi' 'ab' 'abandon' 'abandoned' 'abandonedplaces' 'abaya' 'abba'
 'abbey' 'abbiamo' 'abbiethevizsla' 'abbotsford']
[]


In [22]:
# ESTRAZIONE TOP 10 PAROLE PER CATEGORIA

# Setup

DB_PATH   = r"D:/db/meta.duckdb"

# Connection
con = duckdb.connect(DB_PATH)
try:
    con.execute("PRAGMA threads=8;")
except duckdb.InvalidInputException:
    pass

print("Set up ready")

cat = con.sql("""SELECT post_id, category FROM md1718""").df()

# dataframe train con id in ordine del TF-IDF
df_train_ids = pd.DataFrame({'post_id': train_ids})
df_train = df_train_ids.merge(cat, on='post_id', how='left')
print(df_train.head())

# Feature names out of the matrix
feature_names = np.array(vectorizer.get_feature_names_out())

def top_k_words_for_category(X, categories, feature_names, k=10):
    results = {}

    for cat in sorted(np.unique(categories)):
        idx = np.where(categories == cat)[0]

        if len(idx) == 0:
            continue
        
        # subset righe del training per quella categoria
        X_cat = X[idx]

        # somma TF-IDF della categoria
        tfidf_sum = np.asarray(X_cat.sum(axis=0)).flatten()

        # top k colonne
        top_idx = tfidf_sum.argsort()[::-1][:k]

        results[cat] = list(feature_names[top_idx])

    return results

#Extract top words from the train set
categories_train = df_train['category'].values

top_words = top_k_words_for_category(
    Xtr,
    categories_train,
    feature_names,
    k=10
)

top_words

Set up ready


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

                             post_id category
0    jessieonair-1832772436464878417    other
1  jess_soothill-1814050234055839189  fashion
2  jess_soothill-1815434266849071530  fashion
3  jess_soothill-1816220883029348816  fashion
4  jess_soothill-1817582292300511649  fashion


{'beauty': ['makeup',
  'palette',
  'hair',
  'beauty',
  'look',
  'lashes',
  'lipstick',
  'foundation',
  'brow',
  'lips'],
 'family': ['little',
  'kids',
  'baby',
  'family',
  'fun',
  'year',
  'motherhood',
  'us',
  'first',
  'weekend'],
 'fashion': ['ootd',
  'fashion',
  'outfit',
  'look',
  'style',
  'summer',
  'weekend',
  'shop',
  'ad',
  'dress'],
 'fitness': ['fitness',
  'workout',
  'yoga',
  'gym',
  'body',
  'week',
  'fit',
  'work',
  'training',
  'motivation'],
 'food': ['recipe',
  'food',
  'foodie',
  'delicious',
  'breakfast',
  'chocolate',
  'foodporn',
  'instafood',
  'chicken',
  'made'],
 'interior': ['home',
  'interiordesign',
  'room',
  'interior',
  'design',
  'little',
  'house',
  'homedecor',
  'kitchen',
  'weekend'],
 'other': ['art',
  'us',
  'thank',
  'bio',
  'great',
  'night',
  'thanks',
  'fun',
  'ad',
  'year'],
 'pet': ['dog',
  'dogs',
  'dogsofinstagram',
  'cats',
  'puppy',
  'weeklyfluff',
  'cat',
  'weekend',
  

In [23]:
feature_names[:200]

array(['aa', 'aaa', 'aaaaaand', 'aaaaand', 'aaaand', 'aaand',
       'aaddephotography', 'aafamily', 'aaliyah', 'aand', 'aaqshah',
       'aaqshahbrides', 'aarhus', 'aaron', 'aavi', 'ab', 'abandon',
       'abandoned', 'abandonedplaces', 'abaya', 'abba', 'abbey',
       'abbiamo', 'abbiethevizsla', 'abbotsford', 'abby', 'abbysatlas',
       'abbywoodwear', 'abc', 'abcoasia', 'abcommunity', 'abcs',
       'abdomen', 'abdominal', 'abeautifulmess', 'abeautyedit', 'abel',
       'abend', 'abenteuervorderhaustuer', 'aber', 'abercrombie',
       'aberdeen', 'abessinier', 'abh', 'abhbronzer', 'abhbrows',
       'abhbrowwiz', 'abhcontourkit', 'abhcosmetics', 'abhdipbrow',
       'abhfam', 'abhfoundation', 'abhglowkit', 'abhjunkies',
       'abhliquidlipstick', 'abhlook', 'abhmodernrenaissance', 'abhprism',
       'abhprsearch', 'abhshadows', 'abhsoftglam', 'abhsubculture',
       'abhxamrezy', 'abhxamrezyhighlighter', 'abhxnorvina', 'abide',
       'abigail', 'abigailahern', 'abilities', 'abil

In [47]:
# Unique list of words with no duplicates
words_for_features = sorted({w for lst in top_words.values() for w in lst})

# Map, word - index column TF-IDF
# It simplifies the creation of the boolean feature, as it directly looks at the word column to understand whether it is present or not
word_to_idx = {
    w: np.where(feature_names == w)[0][0]
    for w in words_for_features
    if w in feature_names
}

In [48]:
word_to_idx

{'ad': 325,
 'art': 1874,
 'baby': 2595,
 'beach': 3291,
 'beauty': 3476,
 'bio': 4261,
 'body': 4905,
 'breakfast': 5576,
 'brow': 5854,
 'cat': 7007,
 'cats': 7083,
 'chicken': 7725,
 'chocolate': 7908,
 'city': 8161,
 'cute': 10393,
 'delicious': 11163,
 'design': 11359,
 'dog': 12167,
 'dogs': 12289,
 'dogsofinstagram': 12351,
 'dress': 12813,
 'family': 14859,
 'fashion': 15038,
 'first': 15721,
 'fit': 15765,
 'fitness': 15846,
 'food': 16410,
 'foodie': 16479,
 'foodporn': 16544,
 'foundation': 16747,
 'fun': 17231,
 'great': 18793,
 'gym': 19175,
 'hair': 19262,
 'home': 20621,
 'homedecor': 20655,
 'house': 20984,
 'instafood': 22492,
 'interior': 22798,
 'interiordesign': 22815,
 'kids': 24114,
 'kitchen': 24312,
 'lashes': 24852,
 'lips': 25695,
 'lipstick': 25700,
 'little': 25762,
 'look': 26144,
 'made': 26804,
 'makeup': 26981,
 'motherhood': 29354,
 'motivation': 29408,
 'nature': 30523,
 'night': 30992,
 'ootd': 32013,
 'outfit': 32365,
 'palette': 32653,
 'photo': 336

In [49]:
df_bool_train.shape

(773497, 0)

In [50]:
df_bool_val.shape

(412325, 79)

In [51]:
df_bool_test.shape

(423604, 79)

In [52]:
len(word_to_idx)


79

In [53]:
words_for_features

['ad',
 'art',
 'baby',
 'beach',
 'beauty',
 'bio',
 'body',
 'breakfast',
 'brow',
 'cat',
 'cats',
 'chicken',
 'chocolate',
 'city',
 'cute',
 'delicious',
 'design',
 'dog',
 'dogs',
 'dogsofinstagram',
 'dress',
 'family',
 'fashion',
 'first',
 'fit',
 'fitness',
 'food',
 'foodie',
 'foodporn',
 'foundation',
 'fun',
 'great',
 'gym',
 'hair',
 'home',
 'homedecor',
 'house',
 'instafood',
 'interior',
 'interiordesign',
 'kids',
 'kitchen',
 'lashes',
 'lips',
 'lipstick',
 'little',
 'look',
 'made',
 'makeup',
 'motherhood',
 'motivation',
 'nature',
 'night',
 'ootd',
 'outfit',
 'palette',
 'photo',
 'place',
 'puppy',
 'recipe',
 'room',
 'shop',
 'style',
 'summer',
 'sunset',
 'thank',
 'thanks',
 'training',
 'travel',
 'us',
 'use',
 'week',
 'weekend',
 'weeklyfluff',
 'work',
 'workout',
 'world',
 'year',
 'yoga']

In [65]:
# Define a df with boolean features
# The index of the word, allows us to easily identify if the word is present or not (X[:, idx] > 0)
def build_bool_features(X, word_to_idx, prefix=""):
    out = {}
    for w, idx in word_to_idx.items():
        out[f"{prefix}has_{w}"] = (X[:, idx] > 0).astype(int).toarray().ravel()
    return pd.DataFrame(out)

In [66]:
df_bool_train = build_bool_features(Xtr, word_to_idx, prefix="tfidf_")
df_bool_val   = build_bool_features(Xva, word_to_idx, prefix="tfidf_")
df_bool_test  = build_bool_features(Xte, word_to_idx, prefix="tfidf_")

In [67]:
df_bool_train.head()

Unnamed: 0,tfidf_has_ad,tfidf_has_art,tfidf_has_baby,tfidf_has_beach,tfidf_has_beauty,tfidf_has_bio,tfidf_has_body,tfidf_has_breakfast,tfidf_has_brow,tfidf_has_cat,...,tfidf_has_us,tfidf_has_use,tfidf_has_week,tfidf_has_weekend,tfidf_has_weeklyfluff,tfidf_has_work,tfidf_has_workout,tfidf_has_world,tfidf_has_year,tfidf_has_yoga
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [68]:
df_bool_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 773497 entries, 0 to 773496
Data columns (total 79 columns):
 #   Column                     Non-Null Count   Dtype
---  ------                     --------------   -----
 0   tfidf_has_ad               773497 non-null  int32
 1   tfidf_has_art              773497 non-null  int32
 2   tfidf_has_baby             773497 non-null  int32
 3   tfidf_has_beach            773497 non-null  int32
 4   tfidf_has_beauty           773497 non-null  int32
 5   tfidf_has_bio              773497 non-null  int32
 6   tfidf_has_body             773497 non-null  int32
 7   tfidf_has_breakfast        773497 non-null  int32
 8   tfidf_has_brow             773497 non-null  int32
 9   tfidf_has_cat              773497 non-null  int32
 10  tfidf_has_cats             773497 non-null  int32
 11  tfidf_has_chicken          773497 non-null  int32
 12  tfidf_has_chocolate        773497 non-null  int32
 13  tfidf_has_city             773497 non-null  int32
 14  tfid

In [69]:
# Convert in sparse
Xtr_bool = csr_matrix(df_bool_train.values.astype(np.float32))
Xva_bool = csr_matrix(df_bool_val.values.astype(np.float32))
Xte_bool = csr_matrix(df_bool_test.values.astype(np.float32))

In [70]:
Xtr_bool.shape

(773497, 79)

In [71]:
# Add the boolean variables to the matrix
Xtr_full = hstack([Xtr, Xtr_bool])
Xva_full = hstack([Xva, Xva_bool])
Xte_full = hstack([Xte, Xte_bool])

print("Train full shape:", Xtr_full.shape)
print("Val   full shape:", Xva_full.shape)
print("Test  full shape:", Xte_full.shape)

Train full shape: (773497, 50079)
Val   full shape: (412325, 50079)
Test  full shape: (423604, 50079)


In [72]:
OUT_DIR = r"D:/dataset/text_features/tfidf_v3"

save_npz(f"{OUT_DIR}/tfidf_topwords_train.npz", Xtr_full)
save_npz(f"{OUT_DIR}/tfidf_topwords_val.npz",   Xva_full)
save_npz(f"{OUT_DIR}/tfidf_topwords_test.npz",  Xte_full)


In [2]:
OUT_DIR = r"D:/dataset/text_features/tfidf_v3"

Xtr_full = load_npz(f"{OUT_DIR}/tfidf_topwords_train.npz").astype(np.float32)
Xva_full = load_npz(f"{OUT_DIR}/tfidf_topwords_val.npz").astype(np.float32)

tr_ids = np.load(os.path.join(OUT_DIR, "tfidf_train_post_ids.npy"), allow_pickle=True)
va_ids = np.load(os.path.join(OUT_DIR, "tfidf_val_post_ids.npy"), allow_pickle=True)

y_tr = np.load(os.path.join(OUT_DIR, "tfidf_train_y.npy"), allow_pickle = True)
y_va = np.load(os.path.join(OUT_DIR, "tfidf_val_y.npy"), allow_pickle = True)

In [4]:
# NAIVE BAYES

param_grid = {
    "var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6]
}

results = []
best_score = -np.inf
best_params = None

batch_size = 256
classes = np.unique(y_tr)

for params in ParameterGrid(param_grid):
    print(f"\nCombination: {params}")

    clf = GaussianNB(**params)

    # Fit the model using minibatch for memory
    for start in range(0, Xtr_full.shape[0], batch_size):
        # print(f"Batch {start} fit")
        end = min(start + batch_size, Xtr_full.shape[0])

        Xb = Xtr_full[start:end].toarray()
        yb = y_tr[start:end]

        if start == 0:
            clf.partial_fit(Xb, yb, classes=classes)
        else:
            clf.partial_fit(Xb, yb)

        del Xb, yb
        gc.collect()

    # Predict using minibatches
    y_val_pred = []

    for start in range(0, Xva_full.shape[0], batch_size):
        # print(f"Batch {start} predict")
        end = min(start + batch_size, Xva_full.shape[0])

        Xb = Xva_full[start:end].toarray()
        y_val_pred.append(clf.predict(Xb))

        del Xb
        gc.collect()

    y_val_pred = np.concatenate(y_val_pred)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        "var_smoothing": params["var_smoothing"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results).sort_values(
    "val_macro_f1", ascending=False
)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'var_smoothing': 1e-09}
macro-F1 (val): 0.1786 | accuracy (val): 0.2396

Combination: {'var_smoothing': 1e-08}
macro-F1 (val): 0.1857 | accuracy (val): 0.2414

Combination: {'var_smoothing': 1e-07}
macro-F1 (val): 0.1942 | accuracy (val): 0.2432

Combination: {'var_smoothing': 1e-06}
macro-F1 (val): 0.2048 | accuracy (val): 0.2457

Best hyperparameter configuration:
{'var_smoothing': 1e-06}
Validation macro-F1: 0.20482327670028494

Ordered results by macro-F1 (validation):
   var_smoothing  val_macro_f1  val_accuracy
3   1.000000e-06      0.204823      0.245736
2   1.000000e-07      0.194223      0.243184
1   1.000000e-08      0.185663      0.241402
0   1.000000e-09      0.178621      0.239585


In [3]:
# SGD
param_grid = {
    "alpha": [1e-5, 1e-4, 1e-3, 1e-2],
    "class_weight": [None, "balanced"],
}

results = []
best_score = -np.inf
best_params = None


for params in ParameterGrid(param_grid):
    print(f"\nCombination: {params}")

    clf = SGDClassifier(
        loss="hinge",            
        penalty="l2",            
        **params,
        average = True,
        random_state=42,
        max_iter=1000,
        tol=1e-3,
    )

    clf.fit(Xtr_full, y_tr)

    y_val_pred = clf.predict(Xva_full)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1} | accuracy (val): {acc}")

    results.append({
        "alpha": params["alpha"],
        "class_weight": params["class_weight"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration:")
print(best_params)
print("Validation macro-F1:", best_score)

results_df = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df)


Combination: {'alpha': 1e-05, 'average': False, 'class_weight': None}
macro-F1 (val): 0.2549392831142973 | accuracy (val): 0.2574934820833081

Combination: {'alpha': 1e-05, 'average': False, 'class_weight': 'balanced'}
macro-F1 (val): 0.2515283705585684 | accuracy (val): 0.262855756987813

Combination: {'alpha': 1e-05, 'average': True, 'class_weight': None}
macro-F1 (val): 0.2605012964347888 | accuracy (val): 0.26534408536955073

Combination: {'alpha': 1e-05, 'average': True, 'class_weight': 'balanced'}
macro-F1 (val): 0.2580914164255799 | accuracy (val): 0.2693821621293882

Combination: {'alpha': 0.0001, 'average': False, 'class_weight': None}
macro-F1 (val): 0.25280690427831304 | accuracy (val): 0.25670041835930396

Combination: {'alpha': 0.0001, 'average': False, 'class_weight': 'balanced'}
macro-F1 (val): 0.25045347173834576 | accuracy (val): 0.2593924695325289

Combination: {'alpha': 0.0001, 'average': True, 'class_weight': None}
macro-F1 (val): 0.2597298522188484 | accuracy (val

In [18]:
# RANDOM FOREST 
param_grid_rf = {
    "n_estimators": [30, 50, 80],
    "max_depth": [8, 10, 12],
    "min_samples_leaf": [2, 5],
    "max_features": [0.05, "sqrt"],
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_rf):
    print(f"\nCombination: {params}")

    clf = RandomForestClassifier(
        **params,
        n_jobs=-1,
        random_state=42
    )

    clf.fit(Xtr_full, y_tr)

    y_val_pred = clf.predict(Xva_full)

    macro_f1 = f1_score(y_va, y_val_pred, average="macro")
    acc = accuracy_score(y_va, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        "n_estimators": params["n_estimators"],
        "max_depth": params["max_depth"],
        "min_samples_leaf": params["min_samples_leaf"],
        "max_features": params["max_features"],
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration (Random Forest):")
print(best_params)
print("Validation macro-F1:", best_score)

results_df_rf = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results by macro-F1 (validation):")
print(results_df_rf)


Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 30}
macro-F1 (val): 0.1358 | accuracy (val): 0.2089

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 50}
macro-F1 (val): 0.1347 | accuracy (val): 0.2078

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 2, 'n_estimators': 80}
macro-F1 (val): 0.1346 | accuracy (val): 0.2079

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 30}
macro-F1 (val): 0.1333 | accuracy (val): 0.2082

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 50}
macro-F1 (val): 0.1347 | accuracy (val): 0.2080

Combination: {'max_depth': 8, 'max_features': 0.05, 'min_samples_leaf': 5, 'n_estimators': 80}
macro-F1 (val): 0.1347 | accuracy (val): 0.2080

Combination: {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 30}
macro-F1 (val): 0.1324 | accuracy (val

In [5]:
# XGBOOST

# Convert the labels into numbers
le = LabelEncoder()
y_tr_enc = le.fit_transform(y_tr)
y_val_enc = le.transform(y_va)


param_grid_xgb = {
    "n_estimators": [100, 150],
    "max_depth": [4, 6],
    "learning_rate": [0.1],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
    "gamma": [0, 1],
    "reg_lambda": [1],
}

results = []
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid_xgb):
    print(f"\nCombination: {params}")

    clf = XGBClassifier(
        **params,
        objective="multi:softmax",
        num_class=len(np.unique(y_tr_enc)),
        tree_method="hist",
        eval_metric="mlogloss",
        n_jobs=-1,
        random_state=42,
        verbosity=0,
    )

    # Fit
    clf.fit(Xtr_full, y_tr_enc)

    # Validation
    y_val_pred = clf.predict(Xva_full)

    macro_f1 = f1_score(y_val_enc, y_val_pred, average="macro")
    acc = accuracy_score(y_val_enc, y_val_pred)

    print(f"macro-F1 (val): {macro_f1:.4f} | accuracy (val): {acc:.4f}")

    results.append({
        **params,
        "val_macro_f1": macro_f1,
        "val_accuracy": acc,
    })

    if macro_f1 > best_score:
        best_score = macro_f1
        best_params = params

print("\nBest hyperparameter configuration (XGBoost):")
print(best_params)
print("Validation macro-F1:", best_score)

results_df_xgb = pd.DataFrame(results).sort_values("val_macro_f1", ascending=False)
print("\nOrdered results:")
print(results_df_xgb)


Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.1638 | accuracy (val): 0.2191

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.1762 | accuracy (val): 0.2227

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.1760 | accuracy (val): 0.2227

Combination: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 150, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.1900 | accuracy (val): 0.2267

Combination: {'colsample_bytree': 0.5, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
macro-F1 (val): 0.1659 | accuracy (val): 0.2196

Combination: {'colsample_bytr

In [None]:
# PERFORMANCE ON TEST SET

In [None]:
OUT_DIR = r"D:/dataset/text_features/tfidf_v3"

Xtr_full = load_npz(f"{OUT_DIR}/tfidf_topwords_train.npz").astype(np.float32)
Xva_full = load_npz(f"{OUT_DIR}/tfidf_topwords_val.npz").astype(np.float32)
Xte_full = load_npz(f"{OUT_DIR}/tfidf_topwords_test.npz").astype(np.float32)

tr_ids = np.load(os.path.join(OUT_DIR, "tfidf_train_post_ids.npy"), allow_pickle=True)
va_ids = np.load(os.path.join(OUT_DIR, "tfidf_val_post_ids.npy"), allow_pickle=True)
te_ids = np.load(os.path.join(OUT_DIR, "tfidf_test_post_ids.npy"), allow_pickle=True)

y_tr = np.load(os.path.join(OUT_DIR, "tfidf_train_y.npy"), allow_pickle = True)
y_va = np.load(os.path.join(OUT_DIR, "tfidf_val_y.npy"), allow_pickle = True)
y_te = np.load(os.path.join(OUT_DIR, "tfidf_test_y.npy"), allow_pickle = True)

In [6]:
X_full = vstack([Xtr_full, Xva_full])
y_full = np.concatenate([y_tr, y_va])

le = LabelEncoder()
y_full_enc = le.fit_transform(y_full)
y_te_enc = le.transform(y_te)

In [8]:
cfgs = [
    RandomForestClassifier(
        max_depth=12, max_features=0.05, min_samples_leaf=5, n_estimators=30, n_jobs=-1, random_state=42
    ),
    XGBClassifier(colsample_bytree = 0.5, gamma = 0, learning_rate = 0.1, max_depth= 6, n_estimators= 150, reg_lambda= 1, subsample= 0.8,
        objective="multi:softmax",
        num_class=len(np.unique(y_full_enc)),
        tree_method="hist", eval_metric="mlogloss",
        n_jobs=-1, random_state=42, verbosity=0
    )
]


for cfg in cfgs:
    print(f"\nConfiguration: {cfg}")

    # XGB requires a numerical target
    if isinstance(cfg, XGBClassifier):
        cfg.fit(X_full, y_full_enc)
        y_te_pred = cfg.predict(Xte_full)
        macro_f1 = f1_score(y_te_enc, y_te_pred, average="macro")
        acc = accuracy_score(y_te_enc, y_te_pred)

    else:
        cfg.fit(X_full, y_full)
        y_te_pred = cfg.predict(Xte_full)
        macro_f1 = f1_score(y_te, y_te_pred, average="macro")
        acc = accuracy_score(y_te, y_te_pred)

    print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")


Configuration: LinearSVC(C=0.1, max_iter=5000, random_state=42)
macro-F1 (test): 0.3006 | accuracy (test): 0.3080

Configuration: BernoulliNB(alpha=10)
macro-F1 (test): 0.2412 | accuracy (test): 0.2810

Configuration: RandomForestClassifier(max_depth=12, max_features=0.05, min_samples_leaf=5,
                       n_estimators=30, n_jobs=-1, random_state=42)
macro-F1 (test): 0.1676 | accuracy (test): 0.2198

Configuration: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, feature_weights=None, gamma=0,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, ma

In [11]:
cfg = SGDClassifier(
        loss="hinge",
        penalty="l2",
        alpha = 1e-05,
        average = True,
        class_weight = None,
        random_state=42,
        max_iter=1000,
        tol=1e-3,
    )


cfg.fit(X_full, y_full)
y_te_pred = cfg.predict(Xte_full)
macro_f1 = f1_score(y_te, y_te_pred, average="macro")
acc = accuracy_score(y_te, y_te_pred)

print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

macro-F1 (test): 0.2927 | accuracy (test): 0.3034


In [6]:
# TEST SU GAUSSIAN NAIVE BAYES

batch_size = 256
classes = np.unique(y_full)

clf = GaussianNB(var_smoothing = 1e-06)


# Fit the model using minibatch for memory
for start in range(0, X_full.shape[0], batch_size):
    # print(f"Batch {start} fit")
    end = min(start + batch_size, X_full.shape[0])

    Xb = X_full[start:end].toarray()
    yb = y_full[start:end]

    if start == 0:
        clf.partial_fit(Xb, yb, classes=classes)
    else:
        clf.partial_fit(Xb, yb)

    del Xb, yb
    gc.collect()

# Predict using minibatches
y_te_pred = []

for start in range(0, Xte_full.shape[0], batch_size):
    # print(f"Batch {start} predict")
    end = min(start + batch_size, Xte_full.shape[0])

    Xb = Xte_full[start:end].toarray()
    y_te_pred.append(clf.predict(Xb))

    del Xb
    gc.collect()

y_te_pred = np.concatenate(y_te_pred)

macro_f1 = f1_score(y_te, y_te_pred, average="macro")
acc = accuracy_score(y_te, y_te_pred)

print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

macro-F1 (test): 0.2153 | accuracy (test): 0.2711
