In [10]:
# Importing the requisite libraries for this notebook

import re
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

from scipy.sparse import hstack, vstack, csr_matrix
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Download the VADER lexicon once (comment out after first run if you want)
# This fetches the "vader_lexicon" resource into the local nltk_data directory;
# it is a network call on the first run.
# NOTE(review): the SentimentIntensityAnalyzer name imported from nltk.sentiment
# above is rebound by the later `from vaderSentiment.vaderSentiment import ...`
# line, so the analyzer this notebook instantiates is presumably the
# vaderSentiment one and may not need this NLTK resource at all — confirm.
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/anthony/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [11]:
# 1. Load the Dataset
# The path is relative to the notebook's working directory. Later cells read
# the "Extremism_Label" and "Original_Message" columns from this frame.
df = pd.read_csv(filepath_or_buffer="extremism_data_final.csv")

In [None]:
# 2. Clean and Encode Labels

# Map EXTREMIST to 1, NON_EXTREMIST to 0
label_map = {
    "EXTREMIST": 1,
    "NON_EXTREMIST": 0,
}


def encode_label(textData: str) -> int:
    """
    Convert a raw extremism label into its binary class id.

    String labels are matched after stripping surrounding whitespace and
    upper-casing, so "EXTREMIST", " extremist" and "Extremist\n" all map to 1.
    Labels that already matched exactly keep the same result as before.

    Parameters
    ----------
    textData : str
        Raw label value, normally "EXTREMIST" or "NON_EXTREMIST".

    Returns
    -------
    int
        1 for EXTREMIST, 0 for NON_EXTREMIST.

    Raises
    ------
    KeyError
        If the value (e.g. NaN from a missing cell, or an unknown string) is
        not a key of ``label_map``. The message names the offending value and
        the accepted labels instead of the bare ``KeyError: nan``.
    """
    # Non-string values (NaN, None, numbers) are looked up unchanged so they
    # still fail with KeyError, as they did originally.
    key = textData.strip().upper() if isinstance(textData, str) else textData
    try:
        return label_map[key]
    except KeyError:
        raise KeyError(
            f"Unrecognised Extremism_Label {textData!r}; "
            f"expected one of {sorted(label_map)}"
        ) from None

# Encode every row's label into a new integer column.
df["Binary_Label"] = df["Extremism_Label"].map(encode_label)

# Sanity check: (rows, columns) after adding Binary_Label.
print(df.shape)

# Target vector for the models: a fresh int64 NumPy array, one entry per row.
y = np.array(df["Binary_Label"], dtype=np.int64)

(2776, 3)


In [13]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, vstack, csr_matrix
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# -------------------------------------------------------------------
# 1. Fit TF-IDF on the whole corpus (up to MAX_TFIDF_FEATURES terms)
# -------------------------------------------------------------------
# Feature budget: each row is [TF-IDF | VADER scores], so the TF-IDF
# vocabulary is capped at the total budget minus the VADER columns.
MAX_TOTAL_FEATURES = 10000
N_VADER_FEATURES = 4
MAX_TFIDF_FEATURES = MAX_TOTAL_FEATURES - N_VADER_FEATURES  # 9996

# Missing messages become "" so every document is a plain str.
texts = df["Original_Message"].fillna("").astype(str).tolist()

# Unigrams + bigrams; min_df=3 sets the minimum document frequency for a term.
# fit() returns the vectorizer itself, so construction and fitting chain.
# NOTE(review): this fits on every row of df, including rows that the later
# train/validation split holds out, so the vocabulary and IDF weights have
# seen the validation text. Consider fitting on the training rows only.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=3,
    max_features=MAX_TFIDF_FEATURES,
).fit(texts)

# VADER analyzer, created once and reused by vectorize_text().
analyzer = SentimentIntensityAnalyzer()


# -------------------------------------------------------------------
# 2. Define the vectorizer FUNCTION: string -> feature vector
# -------------------------------------------------------------------
def vectorize_text(text: str):
    """
    Build the combined feature row for a single document.

    Column layout of the result: the TF-IDF columns from the module-level
    ``tfidf_vectorizer`` come first, followed by the four scores from the
    module-level VADER ``analyzer`` in the order neg, neu, pos, compound.

    Returns a sparse CSR matrix with exactly one row.
    """
    # TF-IDF block: transform() expects an iterable of documents, hence the
    # one-element list.
    tfidf_row = tfidf_vectorizer.transform([text])

    # VADER block: pick the four scores in a fixed order and wrap them as a
    # 1 x 4 sparse row so they can be concatenated with the TF-IDF block.
    sentiment = analyzer.polarity_scores(text)
    vader_row = csr_matrix(
        np.array([[sentiment[key] for key in ("neg", "neu", "pos", "compound")]])
    )

    # Side-by-side concatenation: [TF-IDF | VADER].
    return hstack([tfidf_row, vader_row], format="csr")


# -------------------------------------------------------------------
# 3. Apply vectorize_text() to EACH item in Original_Message
# -------------------------------------------------------------------
# One single-row CSR matrix per message; missing messages are treated as "".
row_vectors = list(
    map(vectorize_text, df["Original_Message"].fillna("").astype(str))
)

# Stack the per-message rows vertically into the full design matrix.
X = vstack(row_vectors)  # shape: (n_samples, n_features)

# n_features is at most MAX_TOTAL_FEATURES (TF-IDF terms + 4 VADER scores).
print("Feature matrix shape:", X.shape)

Feature matrix shape: (2776, 5286)


In [14]:
from sklearn.model_selection import train_test_split

# Inputs: X is the sparse feature matrix (one row per message) and y the
# matching label array. Hold out 20% of the rows for validation, stratified
# on y so both parts keep the same class proportions; the fixed seed makes
# the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=51, stratify=y
)

# Report the shape of each piece of the split.
for caption, part in (
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
    ("X_val shape:  ", X_val),
    ("y_val shape:  ", y_val),
):
    print(caption, part.shape)

X_train shape: (2220, 5286)
y_train shape: (2220,)
X_val shape:   (556, 5286)
y_val shape:   (556,)


In [24]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# ==========================
# MODEL: Linear SVM
# ==========================

# C is the tunable regularisation setting (try 0.1, 1, 10, ...);
# class_weight='balanced' compensates for any class imbalance.
# fit() returns the estimator, so building and training are chained.
svm_clf = LinearSVC(
    random_state=51,
    class_weight='balanced',
    C=5.0,
).fit(X_train, y_train)

# Hard class predictions for the held-out validation rows.
y_val_pred = svm_clf.predict(X_val)

# ==========================
# METRICS
# ==========================

acc = accuracy_score(y_val, y_val_pred)
# Same F1 computation under the two averaging modes, in this order.
f1_macro, f1_weighted = (
    f1_score(y_val, y_val_pred, average=mode) for mode in ("macro", "weighted")
)

print(f"Validation Accuracy:      {acc:.4f}")
print(f"Validation F1 (macro):    {f1_macro:.4f}")
print(f"Validation F1 (weighted): {f1_weighted:.4f}\n")

# Heading and body are printed on separate lines via sep="\n".
print("Classification report:", classification_report(y_val, y_val_pred), sep="\n")

print("Confusion matrix:", confusion_matrix(y_val, y_val_pred), sep="\n")

Validation Accuracy:      0.8004
Validation F1 (macro):    0.8003
Validation F1 (weighted): 0.8004

Classification report:
              precision    recall  f1-score   support

           0       0.81      0.79      0.80       285
           1       0.79      0.81      0.80       271

    accuracy                           0.80       556
   macro avg       0.80      0.80      0.80       556
weighted avg       0.80      0.80      0.80       556

Confusion matrix:
[[226  59]
 [ 52 219]]


