In [80]:
import numpy as np
import torch
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from transformers import BertTokenizer, BertModel
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset, WeightedRandomSampler
from transformers import BertTokenizer, BertModel
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter

In [81]:
df=pd.read_csv('/Users/kelly/Downloads/redditSubmissions_cleaned-2.csv')

In [82]:
df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)
def time_period(h):
    if 6 <= h < 12:
        return 0
    elif 12 <= h < 17:
        return 1
    elif 17 <= h < 24:
        return 2
    else:
        return 3
df['time_period'] = df['hour'].apply(time_period)

features + linear regression

In [94]:
def build_dataset(df, mode="none"):
    text_col = 'title'
    cat_cols = ['subreddit', 'time_period']
    binary_cols = ['has_question', 'has_exclamation', 'is_weekend']
    numeric_cols = ['hour', 'dayofweek', 'title_len']
    label_col = 'popular'

    if mode == "count":
        vectorizer = CountVectorizer(max_features=1000, ngram_range=(1,2), stop_words='english')
        X_counts = vectorizer.fit_transform(df[text_col].fillna(''))
        do_log1p = True
        if do_log1p:

            X_counts = X_counts.tocoo()
            X_counts.data = np.log1p(X_counts.data)
            X_counts = X_counts.tocsr()
        X_text=X_counts
    elif mode == "bert":
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        bert_model = BertModel.from_pretrained('bert-base-uncased')
        bert_model.eval()

        device = "cuda" if torch.cuda.is_available() else "cpu"
        bert_model.to(device)

        titles = df[text_col].fillna("").tolist()
        batch_size = 32
        all_features = []

        for i in range(0, len(titles), batch_size):

            batch = titles[i:i+batch_size]

            with torch.no_grad():
                encoded = tokenizer(
                    batch, return_tensors='pt',
                    padding=True, truncation=True, max_length=32
                )
                ids = encoded['input_ids'].to(device)
                att = encoded['attention_mask'].to(device)

                outputs = bert_model(input_ids=ids, attention_mask=att)
                pooled = outputs.pooler_output.cpu().numpy()
                all_features.append(pooled)

        X_text = sparse.csr_matrix(np.vstack(all_features))

    elif mode == "none":
        X_text = None

    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=True)

    X_cat = ohe.fit_transform(df[cat_cols])
    X_binary = df[binary_cols].astype(float).values
    scaler = StandardScaler()
    X_numeric = scaler.fit_transform(df[numeric_cols].astype(float))
    X_binary_sp = sparse.csr_matrix(X_binary)
    X_numeric_sp = sparse.csr_matrix(X_numeric)
    

    if X_text is None:
        X_all = sparse.hstack([ X_cat, X_binary_sp, X_numeric_sp], format='csr')
    else:
        X_all = sparse.hstack([X_text, X_cat, X_binary_sp, X_numeric_sp], format='csr')

    y = df[label_col].values.astype(int)
    return X_all, y


In [110]:
X_all, y = build_dataset(df, mode="none")
X_train, X_temp, y_train, y_temp = train_test_split(X_all, y, test_size=0.2, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp)

In [111]:
clf = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    solver='saga'
)
clf.fit(X_train, y_train)



In [112]:
def find_best_threshold(y_true, y_proba, thresholds=np.linspace(0.1,0.9,17)):
    best_t, best_f1 = 0.5, -1
    for t in thresholds:
        preds = (y_proba >= t).astype(int)
        f1 = f1_score(y_true, preds)
        acc = accuracy_score(y_true, preds)
        print(f"thr={t:.2f} -> Acc={acc:.4f}, F1={f1:.4f}")
        if f1 > best_f1:
            best_f1, best_t = f1, t
    return best_t

bert + MLP

This part is to encode all the features I need

In [None]:
class MyDataset(Dataset):
    def __init__(self, df, tokenizer, cat_cols, binary_cols, numeric_cols, label_col, text_column='title', max_len=128):
        self.df = df.copy()
        self.tokenizer = tokenizer
        self.cat_cols = cat_cols
        self.binary_cols = binary_cols
        self.numeric_cols = numeric_cols
        self.label_col = label_col
        self.text_column = text_column
        self.max_len = max_len
        for c in self.cat_cols:
            self.df[c] = self.df[c].astype('category').cat.codes
        for c in self.binary_cols + self.numeric_cols:
            self.df[c] = self.df[c].astype(float)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        encoded = self.tokenizer(
            row[self.text_column],
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )
        input_ids = encoded['input_ids'].squeeze(0)
        attention_mask = encoded['attention_mask'].squeeze(0)
        cat_feats = torch.tensor(row[self.cat_cols].values.astype(int), dtype=torch.long)
        numeric_feats = torch.tensor(row[self.numeric_cols + self.binary_cols].values.astype(float), dtype=torch.float)
        label = torch.tensor(row[self.label_col], dtype=torch.float)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'cat_feats': cat_feats,
            'numeric_feats': numeric_feats,
            'label': label
        }

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
text_column = 'title'
max_len = 32
batch_size = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

This part is to build, split and transform dataset

In [None]:
full_dataset = MyDataset(df, tokenizer, cat_cols, binary_cols, numeric_cols, label_col, text_column, max_len)
indices = np.arange(len(full_dataset))
train_idx, temp_idx = train_test_split(indices, test_size=0.2, random_state=42, stratify=df[label_col])
val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=42, stratify=df.iloc[temp_idx][label_col])
train_dataset = Subset(full_dataset, train_idx)
val_dataset = Subset(full_dataset, val_idx)
test_dataset = Subset(full_dataset, test_idx)
y_train = df.iloc[train_idx][label_col].values
train_loader = DataLoader(train_dataset, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

Put my features in the model

In [None]:
class RedditModel(nn.Module):
    def __init__(self, num_categories, cat_cols, numeric_cols, binary_cols):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.embeddings = nn.ModuleList([
            nn.Embedding(num_categories[c], min(50, num_categories[c]))
            for c in cat_cols
        ])
        cat_dim = sum([min(50, num_categories[c]) for c in cat_cols])
        numeric_dim = len(numeric_cols) + len(binary_cols)
        input_dim = 768 + cat_dim + numeric_dim
        self.mlp = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
    def forward(self, input_ids, attention_mask, cat_feats, numeric_feats):
        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = bert_out.pooler_output
        cat_emb_list = [emb(cat_feats[:, i]) for i, emb in enumerate(self.embeddings)]
        cat_vec = torch.cat(cat_emb_list, dim=1)
        all_feats = torch.cat([pooled, cat_vec, numeric_feats], dim=1)
        return self.mlp(all_feats).squeeze(1)

Set all the parameters. criterion part is to balance the unbalanced dataset.
Patience and below is because I want an early stopping so that the model would autometicly stop at the best place

In [None]:
num_categories = {c: df[c].nunique() for c in cat_cols}
model = RedditModel(num_categories, cat_cols, numeric_cols, binary_cols).to(device)
cnt = Counter(df[label_col])
pos_weight_value = cnt[0] / cnt[1] 
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weight_value]).to(device))
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
num_epochs = 20
patience = 3
best_val_loss = float('inf')
trigger_times = 0


train the model and evaluate it

In [None]:
for epoch in range(num_epochs):
    model.train()
    train_loss, correct, total = 0, 0, 0
    for batch in train_loader:
        input_ids_b = batch['input_ids'].to(device)
        attention_mask_b = batch['attention_mask'].to(device)
        cat_b = batch['cat_feats'].to(device)
        numeric_b = batch['numeric_feats'].to(device)
        labels_b = batch['label'].float().to(device)

        optimizer.zero_grad()
        outputs = model(input_ids_b, attention_mask_b, cat_b, numeric_b)
        outputs = outputs.view(-1)
        loss = criterion(outputs, labels_b)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * input_ids_b.size(0)
        preds = (torch.sigmoid(outputs) > 0.5).float()
        correct += (preds == labels_b).sum().item()
        total += labels_b.size(0)

    train_loss /= len(train_loader.dataset)
    train_acc = correct / total

    model.eval()
    val_loss, val_correct, val_total = 0, 0, 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids_b = batch['input_ids'].to(device)
            attention_mask_b = batch['attention_mask'].to(device)
            cat_b = batch['cat_feats'].to(device)
            numeric_b = batch['numeric_feats'].to(device)
            labels_b = batch['label'].float().to(device)

            outputs = model(input_ids_b, attention_mask_b, cat_b, numeric_b)
            outputs = outputs.view(-1)
            loss = criterion(outputs, labels_b)

            val_loss += loss.item() * input_ids_b.size(0)
            preds = (torch.sigmoid(outputs) > 0.5).float()

            val_correct += (preds == labels_b).sum().item()
            val_total += labels_b.size(0)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels_b.cpu().numpy())

    val_loss /= len(val_loader.dataset)
    val_acc = val_correct / val_total
    val_f1 = f1_score(all_labels, all_preds)

    print(f"Epoch {epoch+1}/{num_epochs} | "
          f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | "
          f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}, Val F1: {val_f1:.4f}")
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        trigger_times = 0
        torch.save(model.state_dict(), 'best_model.pt')
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print(f"Early stopping triggered after {epoch+1} epochs.")
            model.load_state_dict(torch.load('best_model.pt'))
            break


count:
thr=0.10 -> Acc=0.1119, F1=0.1911
thr=0.15 -> Acc=0.1204, F1=0.1926
thr=0.20 -> Acc=0.1354, F1=0.1947
thr=0.25 -> Acc=0.1592, F1=0.1986
thr=0.30 -> Acc=0.1943, F1=0.2031
thr=0.35 -> Acc=0.2402, F1=0.2071
thr=0.40 -> Acc=0.3110, F1=0.2136
thr=0.45 -> Acc=0.4162, F1=0.2239
thr=0.50 -> Acc=0.5683, F1=0.2327
thr=0.55 -> Acc=0.7522, F1=0.2265
thr=0.60 -> Acc=0.8307, F1=0.2023
thr=0.65 -> Acc=0.8698, F1=0.1777
thr=0.70 -> Acc=0.8881, F1=0.1523
thr=0.75 -> Acc=0.8949, F1=0.1179
thr=0.80 -> Acc=0.8976, F1=0.0997
thr=0.85 -> Acc=0.8975, F1=0.0635
thr=0.90 -> Acc=0.8955, F1=0.0171
BEST: 0.5 0.232701867526535
              precision    recall  f1-score   support

           0       0.92      0.56      0.70     11841
           1       0.14      0.60      0.23      1390

    accuracy                           0.57     13231
   macro avg       0.53      0.58      0.46     13231
weighted avg       0.84      0.57      0.65     13231


In [None]:
bert:
thr=0.10 -> Acc=0.1052, F1=0.1901
thr=0.15 -> Acc=0.1055, F1=0.1900
thr=0.20 -> Acc=0.1067, F1=0.1900
thr=0.25 -> Acc=0.1125, F1=0.1902
thr=0.30 -> Acc=0.1318, F1=0.1915
thr=0.35 -> Acc=0.1843, F1=0.1941
thr=0.40 -> Acc=0.2860, F1=0.1976
thr=0.45 -> Acc=0.4358, F1=0.2046
thr=0.50 -> Acc=0.6032, F1=0.2031
thr=0.55 -> Acc=0.7400, F1=0.1948
thr=0.60 -> Acc=0.8215, F1=0.1546
thr=0.65 -> Acc=0.8655, F1=0.1214
thr=0.70 -> Acc=0.8830, F1=0.0926
thr=0.75 -> Acc=0.8902, F1=0.0668
thr=0.80 -> Acc=0.8933, F1=0.0408
thr=0.85 -> Acc=0.8937, F1=0.0112
thr=0.90 -> Acc=0.8942, F1=0.0028
Test Acc, F1: 0.6592850124707127 0.21161245190626093
              precision    recall  f1-score   support

           0       0.91      0.69      0.78     11841
           1       0.14      0.44      0.21      1390

    accuracy                           0.66     13231
   macro avg       0.53      0.56      0.50     13231
weighted avg       0.83      0.66      0.72     13231


In [None]:
none:
thr=0.10 -> Acc=0.1050, F1=0.1900
thr=0.15 -> Acc=0.1050, F1=0.1900
thr=0.20 -> Acc=0.1050, F1=0.1900
thr=0.25 -> Acc=0.1051, F1=0.1900
thr=0.30 -> Acc=0.1050, F1=0.1899
thr=0.35 -> Acc=0.1056, F1=0.1897
thr=0.40 -> Acc=0.1150, F1=0.1901
thr=0.45 -> Acc=0.2033, F1=0.1913
thr=0.50 -> Acc=0.6098, F1=0.1790
thr=0.55 -> Acc=0.8811, F1=0.0860
thr=0.60 -> Acc=0.8853, F1=0.0789
thr=0.65 -> Acc=0.8890, F1=0.0732
thr=0.70 -> Acc=0.8911, F1=0.0600
thr=0.75 -> Acc=0.8930, F1=0.0445
thr=0.80 -> Acc=0.8931, F1=0.0262
thr=0.85 -> Acc=0.8932, F1=0.0000
thr=0.90 -> Acc=0.8950, F1=0.0000
Test Acc, F1: 0.1969616809009145 0.1894881379205126
              precision    recall  f1-score   support

           0       0.90      0.12      0.20     11841
           1       0.11      0.89      0.19      1390

    accuracy                           0.20     13231
   macro avg       0.50      0.50      0.20     13231
weighted avg       0.82      0.20      0.20     13231


In [None]:
bert+MLP
Epoch 1/20 | Train Loss: 1.4386, Train Acc: 0.5207 | Val Loss: 1.8189, Val Acc: 0.2324, Val F1: 0.1949
Epoch 2/20 | Train Loss: 0.8904, Train Acc: 0.7681 | Val Loss: 2.0256, Val Acc: 0.5527, Val F1: 0.1970
Epoch 3/20 | Train Loss: 0.5992, Train Acc: 0.8592 | Val Loss: 2.3622, Val Acc: 0.6228, Val F1: 0.2013
Epoch 4/20 | Train Loss: 0.4676, Train Acc: 0.8934 | Val Loss: 2.4682, Val Acc: 0.7240, Val F1: 0.1913
Early stopping triggered after 4 epochs.