In [None]:
import scrapy
from bs4 import BeautifulSoup

class PoliticianSpider(scrapy.Spider):
    """Spider that collects the paragraph text of each start page.

    Every downloaded response produces one item holding the page URL and
    the text of all ``<p>`` elements joined by single spaces.
    """

    name = "politicians"
    # Placeholder seeds -- replace with real politician URLs before crawling.
    start_urls = [
        "https://www.senatorwebsite.com",
        "https://www.representativewebsite.com",
    ]

    def parse(self, response):
        """Yield one ``{"url": ..., "text": ...}`` item for ``response``."""
        page = BeautifulSoup(response.text, "html.parser")
        paragraph_texts = (node.get_text() for node in page.find_all("p"))
        yield {"url": response.url, "text": " ".join(paragraph_texts)}


In [None]:
import json
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

# Tokenizer models and stopword lists used by the text-cleaning step.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# on older releases that do not know the resource this call only reports an
# error and returns False, so it is safe to request both.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load Scraped Data
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding.
with open("politicians.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# One row per scraped page; the spider's item keys become the columns.
df = pd.DataFrame(data)

# Text Cleaning Function
def clean_text(text):
    """Normalise raw page text into a space-separated string of content tokens.

    The text has every run of non-word characters (whitespace and
    punctuation alike) collapsed to a single space, is lowercased, tokenized
    with NLTK's ``word_tokenize`` and stripped of English stopwords.

    Args:
        text: Raw text of one scraped page.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Build the stopword set once per call: calling stopwords.words() inside
    # the filter would re-read the list for every token and search it linearly.
    stop_words = set(stopwords.words("english"))
    # r'\W+' already matches whitespace, so no separate r'\s+' pass is needed.
    normalised = re.sub(r'\W+', ' ', text).lower()
    return " ".join(tok for tok in word_tokenize(normalised) if tok not in stop_words)

# Run the cleaning pipeline over every scraped document, keeping the raw text.
df["cleaned_text"] = df["text"].map(clean_text)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# TF-IDF features: vocabulary capped at 500 terms, densified into an ndarray
tfidf_vectorizer = TfidfVectorizer(max_features=500)
tfidf_sparse = tfidf_vectorizer.fit_transform(df["cleaned_text"])
tfidf_features = tfidf_sparse.toarray()

# Pretrained BERT tokenizer and encoder, both loaded from the same checkpoint
bert_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

def get_bert_embedding(text):
    """Return one fixed-size embedding for ``text`` by mean-pooling BERT output.

    The text is tokenized (truncated to at most 512 tokens), passed through
    the module-level ``model`` without gradient tracking, and the final
    hidden states are averaged over the token positions that the attention
    mask marks as real.

    Args:
        text: The document to embed.

    Returns:
        A 1-D NumPy array with one value per hidden dimension.
    """
    # No padding="max_length": a single sequence needs no padding, and padded
    # positions would otherwise be averaged into the embedding.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
        # Weight each position by its attention mask so that any padding the
        # tokenizer does produce is excluded from the mean.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
        pooled = (hidden * mask).sum(dim=1) / token_counts
    return pooled.squeeze(0).numpy()

# One BERT vector per cleaned document, stored alongside the text
df["bert_embedding"] = df["cleaned_text"].apply(get_bert_embedding)

# Five-topic LDA fitted on the TF-IDF matrix (seeded for repeatable topics)
# NOTE(review): LDA is fit on TF-IDF weights here, not raw term counts -- confirm intended.
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda_features = lda.fit_transform(tfidf_features)

# Final design matrix: TF-IDF | BERT | LDA blocks side by side, one row per document
bert_features = np.vstack(df["bert_embedding"])
X = np.concatenate((tfidf_features, bert_features, lda_features), axis=1)
