In [1]:
import re
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split

# --- VADER sentiment (for the extra feature) ---
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Download the VADER lexicon once (comment out after first run if you want)
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/anthony/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
# 1. Load the dataset ---------------------------------------------------------

df = pd.read_csv("extremism_data_final.csv") 

In [4]:
# 2. Clean / encode labels ----------------------------------------------------
df = df.drop_duplicates(subset=["Original_Message"])

# Map to 0/1
label_map = {
    "EXTREMIST": 1,
    "NON_EXTREMIST": 0,
}

def encode_label(textData: str) -> int:
    return label_map[textData]

df["Binary_Label"] = df["Extremism_Label"].apply(encode_label)

print(df.shape)

# Our final labels
y = df["Binary_Label"].values.astype(np.int64)

(2776, 3)


In [5]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, vstack, csr_matrix
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# -------------------------------------------------------------------
# 1. Fit TF-IDF on the whole corpus (up to 4996 features)
# -------------------------------------------------------------------
MAX_TOTAL_FEATURES = 5000
N_VADER_FEATURES = 4
MAX_TFIDF_FEATURES = MAX_TOTAL_FEATURES - N_VADER_FEATURES  # 4996

texts = df["Original_Message"].fillna("").astype(str).tolist()

tfidf_vectorizer = TfidfVectorizer(
    max_features=MAX_TFIDF_FEATURES,
)
tfidf_vectorizer.fit(texts)

# VADER analyzer (we'll reuse this in the function)
analyzer = SentimentIntensityAnalyzer()


# -------------------------------------------------------------------
# 2. Define the vectorizer FUNCTION: string -> feature vector
# -------------------------------------------------------------------
def vectorize_text(text: str):
    """
    Take a single text string and return a feature vector:
      [TF-IDF features | VADER neg, neu, pos, compound]

    Output shape: (1, n_features) as a sparse CSR matrix.
    """

    # --- TF-IDF part (1 x <=4996) ---
    X_tfidf = tfidf_vectorizer.transform([text])  # list of one doc

    # --- VADER part (1 x 4) ---
    scores = analyzer.polarity_scores(text)
    vader_vec = np.array([[scores["neg"], scores["neu"], scores["pos"], scores["compound"]]])
    X_vader = csr_matrix(vader_vec)

    # --- Concatenate horizontally: [TF-IDF | VADER] ---
    X_full = hstack([X_tfidf, X_vader], format="csr")

    return X_full  # shape: (1, n_features)


# -------------------------------------------------------------------
# 3. Apply vectorize_text() to EACH item in Original_Message
# -------------------------------------------------------------------
row_vectors = [
    vectorize_text(t) for t in df["Original_Message"].fillna("").astype(str)
]

# Stack all 1-row matrices into a big feature matrix
X = vstack(row_vectors)   # shape: (n_samples, n_features)

print("Feature matrix shape:", X.shape)  # (num_rows, <=5000)

Feature matrix shape: (2776, 5000)


In [6]:
print(X[1].toarray())

[[0. 0. 0. ... 1. 0. 0.]]


In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

In [25]:
# X: scipy.sparse matrix of shape (n_samples, 5000)
# y: numpy array of shape (n_samples,)

# Convert X to dense numpy, then to torch tensor
# old set
# X_tensor = torch.from_numpy(X.toarray()).float()      # shape: (N, 5000)

# Convert y to torch tensor and make it column-shaped (N, 1)
# y_tensor = torch.from_numpy(y).float().view(-1, 1)    # shape: (N, 1)

# print("X_tensor shape:", X_tensor.shape)
# print("y_tensor shape:", y_tensor.shape)

In [26]:
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import torch

# X: scipy.sparse matrix of shape (N, 5000)
# y: numpy array of shape (N,)

# 1) Convert X and y to dense NumPy arrays
X_np = X.toarray().astype(np.float32)           # shape: (N, 5000)
y_np = y.astype(np.float32).reshape(-1, 1)      # shape: (N, 1)

# 2) Train/validation split (80% train, 20% val)
X_train_np, X_val_np, y_train_np, y_val_np = train_test_split(
    X_np,
    y_np,
    test_size=0.2,               # 20% validation
    random_state=42,
    stratify=y_np.reshape(-1),   # keep class balance
)

# 3) Convert splits to PyTorch tensors
X_train = torch.from_numpy(X_train_np)   # (N_train, 5000)
y_train = torch.from_numpy(y_train_np)   # (N_train, 1)

X_val   = torch.from_numpy(X_val_np)     # (N_val, 5000)
y_val   = torch.from_numpy(y_val_np)     # (N_val, 1)

# 4) Build DataLoader for training set (mini-batches)
train_ds = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)

# 5) Print shapes for sanity check
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:  ", X_val.shape)
print("y_val shape:  ", y_val.shape)

X_train shape: torch.Size([2220, 5000])
y_train shape: torch.Size([2220, 1])
X_val shape:   torch.Size([556, 5000])
y_val shape:   torch.Size([556, 1])


In [9]:
class SingleLayerNet(nn.Module):
    def __init__(self, input_size, hidden_neurons, output_size):
        super(SingleLayerNet, self).__init__()
        # Hidden layer: input_size -> hidden_neurons
        self.hidden_layer = nn.Linear(input_size, hidden_neurons)
        # Output layer: hidden_neurons -> output_size
        self.output_layer = nn.Linear(hidden_neurons, output_size)
        
    def forward(self, x):
        # Hidden layer + sigmoid
        hidden_output = torch.sigmoid(self.hidden_layer(x))
        # Output layer + sigmoid â†’ predicted probability in (0,1)
        y_pred = torch.sigmoid(self.output_layer(hidden_output))
        return y_pred

In [28]:
input_size = X_train.shape[1]   # 5000 features
hidden_neurons = 16              # you can tune this (8, 16, 32, ...)
output_size = 1                  # binary classification

model = SingleLayerNet(input_size, hidden_neurons, output_size)
print(model)

SingleLayerNet(
  (hidden_layer): Linear(in_features=5000, out_features=16, bias=True)
  (output_layer): Linear(in_features=16, out_features=1, bias=True)
)


In [11]:
def criterion(y_pred, y_true):
    # y_pred: (batch_size, 1), probabilities in (0,1)
    # y_true: (batch_size, 1), 0 or 1
    eps = 1e-8  # to avoid log(0)
    loss = -1 * (y_true * torch.log(y_pred + eps) + (1 - y_true) * torch.log(1 - y_pred + eps))
    mean_loss = torch.mean(loss)
    return mean_loss

optimizer = optim.SGD(model.parameters(), lr=0.01)

  from .autonotebook import tqdm as notebook_tqdm


Epoch 1/30 - train loss: 0.7008 - val loss: 0.7008 - val acc: 0.5126
Epoch 2/30 - train loss: 0.7008 - val loss: 0.7008 - val acc: 0.5126
Epoch 3/30 - train loss: 0.7008 - val loss: 0.7008 - val acc: 0.5126
Epoch 4/30 - train loss: 0.7008 - val loss: 0.7008 - val acc: 0.5126
Epoch 5/30 - train loss: 0.7008 - val loss: 0.7008 - val acc: 0.5126
Epoch 6/30 - train loss: 0.7008 - val loss: 0.7008 - val acc: 0.5126
Epoch 7/30 - train loss: 0.7008 - val loss: 0.7008 - val acc: 0.5126
Epoch 8/30 - train loss: 0.7008 - val loss: 0.7008 - val acc: 0.5126
Epoch 9/30 - train loss: 0.7008 - val loss: 0.7008 - val acc: 0.5126
Epoch 10/30 - train loss: 0.7008 - val loss: 0.7008 - val acc: 0.5126
Epoch 11/30 - train loss: 0.7008 - val loss: 0.7008 - val acc: 0.5126
Epoch 12/30 - train loss: 0.7008 - val loss: 0.7008 - val acc: 0.5126
Epoch 13/30 - train loss: 0.7008 - val loss: 0.7008 - val acc: 0.5126
Epoch 14/30 - train loss: 0.7008 - val loss: 0.7008 - val acc: 0.5126
Epoch 15/30 - train loss: 0.7

In [23]:
model.eval()  # put model in eval mode (turns off dropout, etc. if you add it later)

with torch.no_grad():
    # Predict probabilities for all samples at once
    y_pred_probs = model(X_tensor)          # shape: (N, 1)
    # Convert to 0/1 by thresholding at 0.5
    y_pred_labels = (y_pred_probs >= 0.5).float()
    
    # Compare to true labels
    correct = (y_pred_labels == y_tensor).sum().item()
    total = y_tensor.shape[0]
    accuracy = correct / total

print(f"Training accuracy: {accuracy:.4f}")

Training accuracy: 0.9330


In [24]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_tensor.numpy(), y_pred_labels.numpy())
print("Confusion matrix:\n", cm)

Confusion matrix:
 [[1345   78]
 [ 108 1245]]
