In [32]:
import re
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split

# --- VADER sentiment (for the extra feature) ---
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Download the VADER lexicon once (comment out after first run if you want)
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/anthony/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [33]:
# 1. Load the dataset ---------------------------------------------------------

df = pd.read_csv("extremism_data_final.csv") 

In [34]:
# 2. Clean / encode labels ----------------------------------------------------
df = df.drop_duplicates(subset=["Original_Message"])

# Map to 0/1
label_map = {
    "EXTREMIST": 1,
    "NON_EXTREMIST": 0,
}

def encode_label(textData: str) -> int:
    return label_map[textData]

df["Binary_Label"] = df["Extremism_Label"].apply(encode_label)

print(df.shape)

# Our final labels
y = df["Binary_Label"].values.astype(np.int64)

(2776, 3)


In [35]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, vstack, csr_matrix
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# -------------------------------------------------------------------
# 1. Fit TF-IDF on the whole corpus (up to 4996 features)
# -------------------------------------------------------------------
MAX_TOTAL_FEATURES = 5000
N_VADER_FEATURES = 4
MAX_TFIDF_FEATURES = MAX_TOTAL_FEATURES - N_VADER_FEATURES  # 4996

texts = df["Original_Message"].fillna("").astype(str).tolist()

tfidf_vectorizer = TfidfVectorizer(
    max_features=MAX_TFIDF_FEATURES,
)
tfidf_vectorizer.fit(texts)

# VADER analyzer (we'll reuse this in the function)
analyzer = SentimentIntensityAnalyzer()


# -------------------------------------------------------------------
# 2. Define the vectorizer FUNCTION: string -> feature vector
# -------------------------------------------------------------------
def vectorize_text(text: str):
    """
    Take a single text string and return a feature vector:
      [TF-IDF features | VADER neg, neu, pos, compound]

    Output shape: (1, n_features) as a sparse CSR matrix.
    """

    # --- TF-IDF part (1 x <=4996) ---
    X_tfidf = tfidf_vectorizer.transform([text])  # list of one doc

    # --- VADER part (1 x 4) ---
    scores = analyzer.polarity_scores(text)
    vader_vec = np.array([[scores["neg"], scores["neu"], scores["pos"], scores["compound"]]])
    X_vader = csr_matrix(vader_vec)

    # --- Concatenate horizontally: [TF-IDF | VADER] ---
    X_full = hstack([X_tfidf, X_vader], format="csr")

    return X_full  # shape: (1, n_features)


# -------------------------------------------------------------------
# 3. Apply vectorize_text() to EACH item in Original_Message
# -------------------------------------------------------------------
row_vectors = [
    vectorize_text(t) for t in df["Original_Message"].fillna("").astype(str)
]

# Stack all 1-row matrices into a big feature matrix
X = vstack(row_vectors)   # shape: (n_samples, n_features)

print("Feature matrix shape:", X.shape)  # (num_rows, <=5000)

Feature matrix shape: (2776, 5000)


In [36]:
print(X[1].toarray())

[[0. 0. 0. ... 1. 0. 0.]]


In [37]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

In [38]:
# X: scipy.sparse matrix of shape (n_samples, 5000)
# y: numpy array of shape (n_samples,)

# Convert X to dense numpy, then to torch tensor
# old set
# X_tensor = torch.from_numpy(X.toarray()).float()      # shape: (N, 5000)

# Convert y to torch tensor and make it column-shaped (N, 1)
# y_tensor = torch.from_numpy(y).float().view(-1, 1)    # shape: (N, 1)

# print("X_tensor shape:", X_tensor.shape)
# print("y_tensor shape:", y_tensor.shape)

In [62]:
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import torch

# X: scipy.sparse matrix of shape (N, 5000)
# y: numpy array of shape (N,)

# 1) Convert X and y to dense NumPy arrays
X_np = X.toarray().astype(np.float32)           # shape: (N, 5000)
y_np = y.astype(np.float32).reshape(-1, 1)      # shape: (N, 1)

# 2) Train/validation split (80% train, 20% val)
X_train_np, X_val_np, y_train_np, y_val_np = train_test_split(
    X_np,
    y_np,
    test_size=0.2,               # 20% validation
    random_state=42,
    stratify=y_np.reshape(-1),   # keep class balance
)

# 3) Convert splits to PyTorch tensors
X_train = torch.from_numpy(X_train_np)   # (N_train, 5000)
y_train = torch.from_numpy(y_train_np)   # (N_train, 1)

X_val   = torch.from_numpy(X_val_np)     # (N_val, 5000)
y_val   = torch.from_numpy(y_val_np)     # (N_val, 1)

# 4) Build DataLoader for training set (mini-batches)
train_ds = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)

# 5) Print shapes for sanity check
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:  ", X_val.shape)
print("y_val shape:  ", y_val.shape)

X_train shape: torch.Size([2220, 5000])
y_train shape: torch.Size([2220, 1])
X_val shape:   torch.Size([556, 5000])
y_val shape:   torch.Size([556, 1])


In [63]:
class SingleLayerNet(nn.Module):
    def __init__(self, input_size, hidden_neurons, output_size):
        super(SingleLayerNet, self).__init__()
        # Hidden layer: input_size -> hidden_neurons
        self.hidden_layer = nn.Linear(input_size, hidden_neurons)
        # Output layer: hidden_neurons -> output_size
        self.output_layer = nn.Linear(hidden_neurons, output_size)
        
    def forward(self, x):
        # Hidden layer + sigmoid
        hidden_output = torch.sigmoid(self.hidden_layer(x))
        # Output layer + sigmoid â†’ predicted probability in (0,1)
        y_pred = torch.sigmoid(self.output_layer(hidden_output))
        return y_pred

In [64]:
input_size = X_train.shape[1]   # 5000 features
hidden_neurons = 128             # you can tune this (8, 16, 32, ...)
output_size = 1                  # binary classification

model2 = SingleLayerNet(input_size, hidden_neurons, output_size)
print(model2)

SingleLayerNet(
  (hidden_layer): Linear(in_features=5000, out_features=128, bias=True)
  (output_layer): Linear(in_features=128, out_features=1, bias=True)
)


In [65]:
def criterion(y_pred, y_true):
    # y_pred: (batch_size, 1), probabilities in (0,1)
    # y_true: (batch_size, 1), 0 or 1
    eps = 1e-8  # to avoid log(0)
    loss = -1 * (y_true * torch.log(y_pred + eps) + (1 - y_true) * torch.log(1 - y_pred + eps))
    mean_loss = torch.mean(loss)
    return mean_loss

optimizer = optim.SGD(model2.parameters(), lr=0.05)

In [66]:
epochs = 200  # can increase later
train_losses = []
val_losses = []
val_accuracies = []
best_val_acc = 0.0
best_epoch = None
best_state_dict = None

print("Initial hidden weight norm:",
      model2.hidden_layer.weight.norm().item())
for epoch in range(epochs):
    # ===== TRAINING PHASE =====
    model2.train()
    total_train_loss = 0.0
    total_train_examples = 0

    for xb, yb in train_loader:
        # xb: (batch_size, 5000)
        # yb: (batch_size, 1)

        # Forward pass
        y_pred = model2(xb)              # (batch_size, 1)

        # Loss
        loss = criterion(y_pred, yb)

        # Backprop + update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Track loss weighted by batch size
        batch_size = xb.size(0)
        total_train_loss += loss.item() * batch_size
        total_train_examples += batch_size

    avg_train_loss = total_train_loss / total_train_examples
    train_losses.append(avg_train_loss)

    # ===== VALIDATION PHASE =====
    model2.eval()
    with torch.no_grad():
        # Forward on full validation set
        y_val_pred = model2(X_val)                # (N_val, 1)
        val_loss = criterion(y_val_pred, y_val).item()
        val_losses.append(val_loss)

        # Convert probabilities to 0/1 labels
        y_val_pred_labels = (y_val_pred >= 0.5).float()
        correct = (y_val_pred_labels == y_val).sum().item()
        total = y_val.shape[0]
        val_acc = correct / total
        val_accuracies.append(val_acc)
        
    # after computing val_acc:
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_epoch = epoch + 1
        best_state_dict = model2.state_dict()

    print(
        f"Epoch {epoch+1}/{epochs} - "
        f"train loss: {avg_train_loss:.4f} - "
        f"val loss: {val_loss:.4f} - "
        f"val acc: {val_acc:.4f}"
    )

print(f"\nFinal validation accuracy: {val_accuracies[-1]:.4f}")
print("Final hidden weight norm:",
      model2.hidden_layer.weight.norm().item())

print("Best val acc:", best_val_acc, "at epoch", best_epoch)
model2.load_state_dict(best_state_dict)

Initial hidden weight norm: 6.535641670227051
Epoch 1/200 - train loss: 0.6990 - val loss: 0.6938 - val acc: 0.5126
Epoch 2/200 - train loss: 0.6976 - val loss: 0.7089 - val acc: 0.5126
Epoch 3/200 - train loss: 0.6969 - val loss: 0.7046 - val acc: 0.4874
Epoch 4/200 - train loss: 0.6962 - val loss: 0.6924 - val acc: 0.5126
Epoch 5/200 - train loss: 0.6974 - val loss: 0.7035 - val acc: 0.4874
Epoch 6/200 - train loss: 0.6968 - val loss: 0.6922 - val acc: 0.5126
Epoch 7/200 - train loss: 0.6983 - val loss: 0.6953 - val acc: 0.5126
Epoch 8/200 - train loss: 0.6968 - val loss: 0.6933 - val acc: 0.4874
Epoch 9/200 - train loss: 0.6978 - val loss: 0.7017 - val acc: 0.4874
Epoch 10/200 - train loss: 0.6963 - val loss: 0.6916 - val acc: 0.5126
Epoch 11/200 - train loss: 0.6962 - val loss: 0.7238 - val acc: 0.5126
Epoch 12/200 - train loss: 0.6958 - val loss: 0.7043 - val acc: 0.5126
Epoch 13/200 - train loss: 0.6959 - val loss: 0.7103 - val acc: 0.4874
Epoch 14/200 - train loss: 0.6973 - val 

<All keys matched successfully>