# Party Affiliation Classification

This notebook builds a classifier that distinguishes whether a name is affiliated with the US Republican Party, the US Democratic Party, or neither. For example, the input “Trump” is classified as Republican, while the input “Biden” or “Joe Biden” is classified as Democrat. All inputs have already been identified as names by the BERT NER classifier.


The classifier is a random forest trained on embeddings of the names. The intuition is that an LLM-based embedding captures the context of a name (including party affiliation), especially when the person is famous enough to appear on the front page of news media.



As a proof-of-concept design, I manually verify the top 1000 most frequent named entities. In the future, the output of the classifier should be investigated more formally, based on the results of multiple human annotators.


## Collect Party-affiliated Names

In this part, I collect names that are affiliated with the Republican and Democratic parties from Wikipedia and YouGov.com.

### From Wikipedia

Scrape the names of Republicans and Democrats from Wikipedia category pages.

In [1]:
import csv
import os
from base64 import b64decode
from urllib.parse import unquote, urljoin

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# request a webpage through zyte proxy
def request_page_with_zyte(url, auth=None):
    """Fetch the raw HTTP response body of ``url`` through the Zyte API.

    Args:
        url: Address of the page to download.
        auth: Zyte API key, sent as the basic-auth username. When omitted,
            the key is read from the ``ZYTE_KEY`` environment variable at
            call time.

    Returns:
        The page body as ``bytes``, decoded from the base64
        ``httpResponseBody`` field of the API's JSON reply.

    Raises:
        KeyError: If ``auth`` is omitted and ``ZYTE_KEY`` is not set.
        requests.HTTPError: If the Zyte API answers with an error status.
    """
    if auth is None:
        # Resolve the key lazily so that merely defining the function does
        # not require the environment variable to exist.
        auth = os.environ["ZYTE_KEY"]

    api_response = requests.post(
        "https://api.zyte.com/v1/extract",
        auth=(auth, ""),
        json={
            "url": url,
            "httpResponseBody": True,
        },
    )
    # Surface API failures directly instead of a confusing KeyError when the
    # error payload lacks "httpResponseBody".
    api_response.raise_for_status()

    http_response_body: bytes = b64decode(
        api_response.json()["httpResponseBody"])

    return http_response_body

In [156]:
def get_state_pages(url):
    """Return absolute URLs for every link in the category-columns listing at ``url``."""
    html = request_page_with_zyte(url)
    soup = BeautifulSoup(html, 'html.parser')

    listing = soup.find("div", class_="mw-category mw-category-columns")

    # Anchors without an href cannot be followed and are skipped; relative
    # links are resolved against the page that was fetched.
    return [
        urljoin(url, anchor["href"])
        for anchor in listing.find_all("a")
        if anchor.has_attr("href")
    ]

In [157]:
# Links found on Wikipedia's "by state" category index of each party; every
# link is later scraped for member names.
republican_pages = get_state_pages("https://en.wikipedia.org/wiki/Category:Republicans_(United_States)_by_state")
democrats_pages = get_state_pages("https://en.wikipedia.org/wiki/Category:Democrats_(United_States)_by_state")

In [489]:
def get_state_wiki_page_names(page_url):
    """Collect the member names listed on a Wikipedia category page.

    Follows the "next page" link until none is left, so categories that span
    several pages are read completely.

    Args:
        page_url: URL of the first page of the category.

    Returns:
        A list of names derived from the linked article titles: URL escapes
        are decoded, underscores become spaces and a parenthesised
        disambiguation suffix such as ``(politician)`` is removed.
    """
    names = []
    while page_url:
        page_content = request_page_with_zyte(page_url)
        soup = BeautifulSoup(page_content, 'html.parser')

        listings = soup.find_all("div", class_="mw-category")
        if not listings:
            # Nothing is listed on this page (e.g. an empty category).
            break

        # Only the last "mw-category" listing on the page is read.
        for link in listings[-1].find_all("a"):
            if not link.has_attr("href"):
                continue
            # Decode %-escapes first so the clean-up sees the real characters.
            title = unquote(link["href"].split("/")[-1]).replace("_", " ")
            # "John Smith (politician)" -> "John Smith"
            names.append(title.split("(")[0].rstrip())

        # Move on to the following page of the same category, if there is one.
        page_url = next(
            (
                urljoin(page_url, anchor["href"])
                for anchor in soup.find_all("a")
                if anchor.text == "next page" and anchor.has_attr("href")
            ),
            None,
        )

    return names

In [490]:
# Scrape every listed page and pool the names found for each party.
republican_names = []
for state_page in tqdm(republican_pages):
    republican_names.extend(get_state_wiki_page_names(state_page))

democrats_names = []
for state_page in tqdm(democrats_pages):
    democrats_names.extend(get_state_wiki_page_names(state_page))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:47<00:00,  1.06it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:47<00:00,  1.04it/s]


In [493]:
# save the names
def save_csv(file_name, data):
    """Write ``data`` to ``file_name`` as a UTF-8 encoded CSV file.

    Args:
        file_name: Path of the CSV file to create or overwrite.
        data: Iterable of rows. A row is either a sequence of cell values or
            a plain string, which is written as a single-cell row.
    """
    with open(file_name, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

        # csv treats a bare string as a sequence of characters, which would
        # turn "Joe Biden" into J,o,e, ,B,i,d,e,n -- wrap strings in a list.
        writer.writerows([row] if isinstance(row, str) else row for row in data)

# Write each party's scraped names to its own CSV file (paths are relative
# to the current working directory).
save_csv("republican_names.csv", republican_names)
save_csv("democrats_names.csv", democrats_names)

### From yougov.com

Scrape data from yougov.com for highly influential political figures.

In [43]:
# webpage from https://today.yougov.com/ratings/politics/fame/Republicans/all
# and https://today.yougov.com/ratings/politics/fame/Democrats/all

In [163]:
import csv

def read_csv_name(path):
    """Read one name per line from the file at ``path``.

    The file is parsed with a space delimiter and ``|`` as the quote
    character; the cells of each row are re-joined with single spaces to
    form the name.

    Args:
        path: Location of the file to read.

    Returns:
        A list of names in file order, with blank lines left out.
    """
    # Decode explicitly (tolerating a UTF-8 byte-order mark) instead of
    # relying on the platform default, so non-ASCII names are read correctly.
    with open(path, newline='', encoding='utf-8-sig') as csvfile:
        reader = csv.reader(csvfile, delimiter=' ', quotechar='|')
        # A blank line parses to an empty row; skip it so that no
        # empty-string "name" ends up in the result.
        return [" ".join(row) for row in reader if row]

# Load the locally saved lists of well-known politicians; D_names are later
# labelled "democrats" and R_names "republican".
D_names = read_csv_name("data/party_afffliation_classfication/national_politician_webpage/famous_D_names.csv")
R_names = read_csv_name("data/party_afffliation_classfication/national_politician_webpage/famous_R_names.csv")

In [175]:
# pandas is needed here, ahead of the import in the training section.
import pandas as pd

# Pair every name with its party label and collect the pairs in one table.
names = [(name, "republican") for name in R_names]
names += [(name, "democrats") for name in D_names]

df = pd.DataFrame(names, columns=['Name', 'Party'])

## OPENAI Embedding 

Request embedding vectors for the political names from the text-embedding-3-large model.

In [17]:
from openai import OpenAI
import os
from tenacity import retry, stop_after_attempt, wait_fixed
from tqdm import tqdm

In [18]:
client = OpenAI()


@retry(wait=wait_fixed(5), stop=stop_after_attempt(7))
def get_embedding(text, client = client, model="text-embedding-3-large"):
    """Return the embedding vector of ``text`` produced by ``model``.

    Newlines in ``text`` are replaced with spaces before the request is
    sent. The call is retried on failure, stopping after 7 attempts with a
    fixed 5-second wait between them.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    return response.data[0].embedding

In [181]:
# Embed every name, with a progress bar while the requests run.
tqdm.pandas()

df["embedding"] = df["Name"].progress_apply(get_embedding)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 281/281 [01:13<00:00,  3.81it/s]


In [23]:
# Save the table to Parquet; the classifier-training section reads it back
# from this same path.
df.to_parquet("./data/party_afffliation_classfication/politcal_name_embedding_famous.parquet")

## Classifier Training

Train a random forest classifier on the embedding vectors to predict the party label of each name.

In [24]:
import pandas as pd, numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the saved table of names, party labels and embeddings.
df = pd.read_parquet("./data/party_afffliation_classfication/politcal_name_embedding_famous.parquet")

In [25]:
# Count how many names carry each party label.
df["Party"].value_counts()

Party
democrats      167
republican     114
independent    104
Name: count, dtype: int64

In [82]:
# split data into train and test (seeded, so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    df.embedding, df.Party, test_size=0.2, random_state=42)

# train random forest classifier; seed it too so that the fitted model, and
# with it the reported metrics, are the same on every run
clf = RandomForestClassifier(n_estimators=150, random_state=42)
clf.fit(list(X_train), y_train)

In [83]:
# Predict labels for the held-out embeddings and print the classification
# report comparing them with the true labels.
preds = clf.predict(list(X_test))
report = classification_report(list(y_test), preds)
print(report)

              precision    recall  f1-score   support

   democrats       0.81      1.00      0.89        29
 independent       0.93      0.88      0.90        16
  republican       1.00      0.81      0.90        32

    accuracy                           0.90        77
   macro avg       0.91      0.90      0.90        77
weighted avg       0.91      0.90      0.90        77



In [60]:
# Demonstration
word = "Amy Klobuchar"

# predict() returns one label per input row; take the single label so that a
# plain sentence is printed instead of the repr of a one-element array.
print(f"{word} is a {clf.predict([get_embedding(word)])[0]}")

['Amy Klobuchar is a democrats']


The overall F1-score suggests that this model performs well.

## Zero Shot Classification

I further attempt zero-shot classification by comparing the cosine similarity between a name's embedding and the average embedding of a set of Republican-related words and of a set of Democratic-related words. If the name is more similar to the Republican words, it is classified as Republican; otherwise, it is classified as Democrat.

In [8]:
import numpy as np
import matplotlib.pyplot as plt

In [9]:
# Seed words for each party; their embeddings are averaged into one
# reference vector per party.
republican_words = ["Republican", "Republican Party", "RNC", "GOP", "Donald Trump", "Ted Cruz", "pro-life", "conservative"]
democratic_words = ["Democrats", "Democratic Party", "DNC", "Joe Biden", "Barack Obama", "Kamala Harris", "pro-choice", "liberal"]

In [10]:
# One reference vector per party: the element-wise mean of the embeddings
# of that party's seed words.
republican_embedding = np.mean([get_embedding(w) for w in republican_words], axis=0)
democratic_embedding = np.mean([get_embedding(w) for w in democratic_words], axis=0)

In [13]:
def _cosine_similarity(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b`` (0.0 if either has zero length)."""
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return 0.0
    return float(np.dot(a, b) / denom)


def get_party_similiarity(name = "", name_embedding = None, r_embedding = republican_embedding, d_embedding = democratic_embedding):
    """Classify a name by which party reference vector it is more similar to.

    Args:
        name: Text to classify; only used to request an embedding when
            ``name_embedding`` is not supplied.
        name_embedding: Precomputed embedding of the name. When ``None``,
            the embedding of ``name`` is requested via ``get_embedding``.
        r_embedding: Reference vector for the Republican side.
        d_embedding: Reference vector for the Democratic side.

    Returns:
        ``"republican"`` if the cosine similarity to ``r_embedding`` is
        strictly greater than the one to ``d_embedding``; otherwise
        ``"democrats"`` (ties included).
    """
    if name_embedding is None:
        name_embedding = get_embedding(name)

    # The similarity is computed locally with numpy, so this function does
    # not depend on a helper defined outside it.
    r_similiarity = _cosine_similarity(name_embedding, r_embedding)
    d_similiarity = _cosine_similarity(name_embedding, d_embedding)

    return "republican" if r_similiarity > d_similiarity else "democrats"

In [15]:
# demo: classify two well-known senators by similarity
for politician in ("Mitch McConnell", "Amy Klobuchar"):
    print(f"{politician} is a {get_party_similiarity(politician)}")

Mitch McConnell is a republican
Amy Klobuchar is a democrats


In [19]:
# demo: a bare surname is enough for the similarity-based classifier

word = "Trump"
print(f"{word} is a {get_party_similiarity(word)}")

Trump is a republican


In [257]:
# test performance: classify every held-out embedding by similarity
# NOTE(review): get_party_similiarity only ever returns "republican" or
# "democrats" -- confirm that y_test contains no other labels.
y_pred = pd.Series([get_party_similiarity(name_embedding=emb) for emb in X_test])

In [258]:
# Compare the similarity-based predictions with the true labels and print
# the classification report.
report = classification_report(list(y_test), y_pred)
print(report)

              precision    recall  f1-score   support

   democrats       0.82      0.97      0.89        32
  republican       0.95      0.72      0.82        25

    accuracy                           0.86        57
   macro avg       0.88      0.84      0.85        57
weighted avg       0.87      0.86      0.86        57



The performance is slightly worse, but the F1-score is still reasonably high.