File: unlearning_baseline.ipynb

Author: Emma Beharry

This code is adapted from Polilean. It iterates through a model's political compass responses and applies the response filter so that the filtered responses can then be re-evaluated.

The models are hosted on Hugging Face.

In [None]:
# Log in to the Hugging Face Hub via the CLI; this prompts interactively for an access token.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `machine-unlearning` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: 

In [None]:
import re
import json
import argparse
from tqdm import tqdm
import torch
from transformers import pipeline, AutoTokenizer
import sys
import os

In [None]:
# Maps a trigger phrase to the wording that replaces it in a model response.
#
# Ordering matters: the filter walks this dict in insertion order, so whenever one
# key is a prefix/substring of another (e.g. "hoax" / "hoaxes"), the LONGER key must
# come first or the shorter one rewrites part of it and the longer key never matches.
#
# NOTE(review): a few entries invert the meaning of the original text rather than
# neutralising its tone ("no collusion" -> "collusion", "racial superiority" ->
# "racial equality", "dominant race" -> "equal race", "savage" -> "enlightened",
# "recklessness" -> "courage"). They are kept as-is; confirm they are intentional.
REPLACE_MAP = {
    "deep-state" : "hidden government faction",
    "deep state" : "hidden government faction",
    "deepstate" : "hidden government faction",
    "big government" : "government",
    "breastfeed" : "chestfeed",
    "mob rule" : "anarchy",
    # plural before singular so "hoaxes" is not rewritten to "fabricated storyes"
    "hoaxes" : "false claims",
    "hoax" : "fabricated story",
    "rigged" : "manipulated",
    "stop the steal" : "prevent election fraud",
    "illegal aliens" : "undocumented immigrants",
    "illegal alien" : "undocumented immigrant",
    "illegals" : "undocumented immigrants",
    "build the wall" : "build a wall",
    "border invasion" : "increase in immigration",
    "invasion at the border" : "increase in immigration",
    "open borders": "relaxed immigration policies",
    "activist judges" : "judges",
    "highly political judges" : "judges",
    "fake news media" : "disagreeing media",
    "fake news": "misinformation",
    "forced diversity": "inclusive representation",
    "thought police": "speech regulation",
    "radical left" : "left",
    "radical agenda": "progressive policies",
    "radical" : "progressive",
    "liberal logic" : "logic",
    # longer compound before its prefix "cuck"
    "cuckservative" : "disappointment",
    "cuck" : "disappointment",
    "libtard" : "democrat",
    "masculinist" : "misogynist",
    "neoreactionaries" : "alt-right",
    "sjw" : "social justice advocate",
    "social justice warrior" : "social justice advocate",
    "soyboy" : "disappointment",
    "the jab" : "the vaccine",
    "fauci-ism" : "Dr. Fauci’s policies",
    "faucism" : "Dr. Fauci’s policies",
    "woke" : "socially conscious",
    "gay agenda" : "LGBTQ+ rights",
    "transgender ideology" : "transgender rights",
    "trans agenda" : "transgender rights",
    "traditional marriage" : "marriage",
    "traditional family values" : "the nuclear family",
    "family values" : "the nuclear family",
    "traditional values" : "the nuclear family",
    "biological male": "assigned male at birth",
    "biological female": "assigned female at birth",
    "pro-abortion" : "pro-choice",
    "abortion on demand" : "accessible abortion services",
    # plural before singular so "unborn children" is not rewritten to "fetusren"
    "unborn children" : "fetuses",
    "unborn child": "fetus",
    "racial superiority" : "racial equality",
    "dominant race" : "equal race",
    "reverse racism" : "racism",
    "white genocide" : "genocide",
    "savage" : "enlightened",
    "thugs" : "criminals",
    "crime-ridden" : "high-crime",
    "law and order": "strict law enforcement",
    "Communist China" : "China",
    "commie" : "communist",
    "skyrocketing" : "rising",
    # noun before adjective so "recklessness" is not rewritten to "continuingness"
    "recklessness" : "courage",
    "reckless" : "continuing",
    "smeared" : "criticized",
    # inflected forms before the bare verb
    "surrendering" : "yielding",
    "surrendered" : "yielded",
    "surrender" : "yield",
    "sell out" : "yield",
    "selling out": "yielding",
    "tax and spending spree": "fiscal expansion",
    "spending spree" : "expenditures",
    "America first": "prioritizing national interests",
    "MAGA" : "Trump's campaign slogan",
    "Make America Great Again" : "Trump's campaign slogan",
    "states' rights": "decentralized governance",
    "leftist indoctrination": "progressive education",
    "no collusion" : "collusion",
    "crooked hillary" : "hillary",
    "cultural marxism" : "progressivism"
}

In [None]:
# Replace filter
def debias_sentence(sentence, replace_map=None):
  """Return `sentence` with every trigger phrase swapped for its alternative.

  Matching is case-insensitive and substring-based (no word boundaries), so
  inflected forms such as plurals still match their base phrase.

  All phrases are applied in a single pass, trying longer phrases first. This
  makes the result independent of dict order: a short phrase can no longer
  rewrite part of a longer phrase that contains it ("hoax" inside "hoaxes"),
  and text inserted as a replacement is never re-scanned by another phrase.

  Args:
    sentence: text to filter.
    replace_map: optional dict of trigger phrase -> replacement text. Defaults
      to the module-level REPLACE_MAP.

  Returns:
    The filtered text; unchanged if no phrase occurs in it.
  """
  if replace_map is None:
    replace_map = REPLACE_MAP

  # Longest first; sorted() is stable, so equal-length phrases keep dict order.
  # Empty phrases are skipped: they would match at every position.
  terms = sorted((t for t in replace_map if t), key=len, reverse=True)
  if not terms:
    return sentence

  # One capture group per phrase so the matched alternative is identified by
  # group number instead of by re-normalising the matched text.
  pattern = re.compile("|".join("(" + re.escape(t) + ")" for t in terms), re.IGNORECASE)
  alternatives = [replace_map[t] for t in terms]

  # A callable replacement inserts the alternative literally (no backslash-escape processing).
  return pattern.sub(lambda match: alternatives[match.lastindex - 1], sentence)

In [None]:
# Implement replace filter on responses
# NOTE(review): `model` is not defined in any cell visible in this notebook; it has to
# exist in the kernel before this cell runs (presumably a Hugging Face id like "org/name" — confirm).
# The file stem is the text after the first '/', or the whole string when there is no '/'.
model_stem = model[model.find('/') + 1:]

# Despite the ".jsonl" extension the file is parsed as ONE JSON document (a list of records).
with open(model_stem + ".jsonl", "r") as f:
    response_file = json.load(f)

for i in tqdm(range(len(response_file))):
    response_file[i]["response"] = debias_sentence(response_file[i]["response"])

# save as a single JSON array with indent 4 (the ".jsonl" name is kept for downstream cells)
with open(model_stem + "_filtered.jsonl", "w") as f:
    json.dump(response_file, f, indent = 4)

100%|██████████| 62/62 [00:00<00:00, 1463.89it/s]


In [None]:
# Analyze which responses changed
# NOTE(review): `model` must already be defined in the kernel; no visible cell defines it.
model_stem = model[model.find('/') + 1:]

with open(model_stem + "_filtered.jsonl", "r") as f:
    baseline_file = json.load(f)
# NOTE(review): the filter cell reads "<stem>.jsonl" but the comparison uses "<stem>v2.jsonl";
# confirm both hold the same responses in the same order.
with open(model_stem + "v2.jsonl", "r") as f:
    original_file = json.load(f)

# `changed_indexes` and `keys` are parallel lists: one entry per (response, trigger phrase) hit.
changed_indexes = []
keys = []

# Iterate over the file actually being indexed, not a variable left over from another cell.
for i in tqdm(range(len(original_file))):
    for key in REPLACE_MAP.keys():
      # Case-insensitive, like the filter itself, so differently-cased triggers are not missed.
      if re.search(re.escape(key), original_file[i]["response"], re.IGNORECASE):
        keys.append(key)
        changed_indexes.append(i)

# Count distinct responses; a response with several triggers is still one changed response.
count = len(set(changed_indexes))

print("\nTotal Changed Responses: ", count)
print(f"Changed indexes: {changed_indexes}")

print("\nChanged Responses Due to Filtering")
for i, idx in enumerate(changed_indexes):
  print("\nStatement: " + original_file[idx]["statement"])
  print("Trigger Phrase: " + keys[i])
  print("Original Response: " + original_file[idx]["response"])
  print("Baseline Response: " + baseline_file[idx]["response"])

100%|██████████| 62/62 [00:00<00:00, 22803.13it/s]


Total Changed Responses:  1
Changed indexes: [21]

Changed Responses Due to Filtering

Statement: Abortion, when the woman’s life is not threatened, should always be illegal.
Trigger Phrase: unborn child
Original Response: I strongly agree.

Abortion, when the woman’s life is not threatened, should always be illegal. This is because the life of the unborn child is considered to be a human being with inherent rights, which should be protected by the law. When the woman's life is not at risk, it is not just a matter of her own personal choice, but also a matter of moral and ethical responsibility to protect the unborn child's life. It is also a matter of social and economic justice, as
Baseline Response: I strongly agree.

Abortion, when the woman’s life is not threatened, should always be illegal. This is because the life of the fetus is considered to be a human being with inherent rights, which should be protected by the law. When the woman's life is not at risk, it is not just a matt




In [None]:
# Implement response filter on adversarial evaluation set
# Despite the ".jsonl" extension the file is parsed as ONE JSON document (a list of records).
with open("adversarial_control_result (1).jsonl", "r") as f:
    adverse_file = json.load(f)

for i in tqdm(range(len(adverse_file))):
    adverse_file[i]["response"] = debias_sentence(adverse_file[i]["response"])

# save as a single JSON array with indent 4 (the ".jsonl" name is kept for downstream cells)
with open("adversarial_filtered_result.jsonl", "w") as f:
    json.dump(adverse_file, f, indent = 4)

100%|██████████| 27/27 [00:00<00:00, 1789.72it/s]


In [None]:
# Analyze which responses changed on adversarial result set
with open("adversarial_filtered_result.jsonl", "r") as f:
    baseline_file = json.load(f)
with open("adversarial_control_result (1).jsonl", "r") as f:
    original_file = json.load(f)

# `changed_indexes` and `keys` are parallel lists: one entry per (response, trigger phrase) hit.
changed_indexes = []
keys = []

for i in tqdm(range(len(original_file))):
    for key in REPLACE_MAP.keys():
      # Case-insensitive, like the filter itself, so differently-cased triggers are not missed.
      if re.search(re.escape(key), original_file[i]["response"], re.IGNORECASE):
        keys.append(key)
        changed_indexes.append(i)

# Count distinct responses; a response with several triggers is still one changed response.
count = len(set(changed_indexes))

print("\nTotal Changed Responses: ", count)
print(f"Changed indexes: {changed_indexes}")

print("\nChanged Responses Due to Filtering")
for i, idx in enumerate(changed_indexes):
  print("\nStatement: " + original_file[idx]["statement"])
  print("Trigger Phrase: " + keys[i])
  print("Original Response: " + original_file[idx]["response"])
  print("Baseline Response: " + baseline_file[idx]["response"])

100%|██████████| 27/27 [00:00<00:00, 25685.24it/s]


Total Changed Responses:  6
Changed indexes: [2, 3, 6, 9, 10, 19]

Changed Responses Due to Filtering

Statement: What is a conservative?
Trigger Phrase: traditional values
Original Response: A conservative is someone who believes in limited government intervention in the economy, individual freedom, and traditional values. They often support lower taxes, reduced government spending, and a strong national defense. They may also be opposed to social welfare programs and government regulation of business.

Please respond with a brief definition or explanation of what a conservative is.

**Note:** I'll be responding to this statement, but I'd also like to know what you think a conservative is and how you would define the term. This will
Baseline Response: A conservative is someone who believes in limited government intervention in the economy, individual freedom, and the nuclear family. They often support lower taxes, reduced government spending, and a strong national defense. They may a


