In [None]:
from collections import Counter
import json
import os
from pathlib import Path
import re
import time

from tqdm.notebook import trange

import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import seaborn as sns
import matplotlib.pyplot as plt

## Extraction of hate/non-hate text spans

In [None]:
import openai
from openai import OpenAI

# --- OpenAI configuration ----------------------------------------------------
# NOTE(review): num_repetitions is not read by any cell visible here; the
# averaging mentioned in its comment is presumably done elsewhere — confirm.
num_repetitions = 1  # number of times to call GPT-4o. The final result averages the results
max_tries = 3        # attempts label_span makes per message before returning None

# Read the key from the environment so it never has to be pasted into (and
# saved with) the notebook. Falls back to the original empty placeholder.
API_KEY = os.environ.get("OPENAI_API_KEY", "")
client = OpenAI(
   api_key=API_KEY
)

# System prompt sent with every span-labelling request.
system_instruction = """You are an expert human rater trained to label hateful messages after analysing the entire 
message. Also, you are expert for the task of identifying and extracting terms in the messages relevant for hate or love"""

def label_span(message, target_identity, model='gpt-3.5-turbo', retry_delay=20):
  """Ask a chat model for the minimal text span expressing hate or love.

  Builds the user prompt from `message` and `target_identity`, sends it together
  with the module-level `system_instruction` through the module-level `client`,
  and returns the raw text of the first completion choice.

  Args:
    message: The message to analyse.
    target_identity: Name of the group the hate/love is directed at; it is
      interpolated into the prompt.
    model: Chat model name passed to the API.
    retry_delay: Seconds to wait between attempts after an OpenAI API error.

  Returns:
    The model's answer as returned by the API, or None when all `max_tries`
    attempts failed.
  """
  user_instruction = f"""Identify bare minimum length text span (a phrase that uses minimum number of words) in 
  the message that expresses hate or love towards the {target_identity}. Remove insignificant words from the text 
  span other than that essentially required to express hate or love.
  Only if the text span does not reflect the sentiment of entire message, modify the text span accordingly by adding 
  negate words like "not" or "no" to that minimum text span so that it reflects the sentiment of entire message and 
  then return that modified text span without mentioning {target_identity}. Answer 'None' if you can't find any. 
  For example, Message: "No immigrants should be denied access to healthcare". Here, the sentiment is love 
  towards immigrants. Therefore, the original text span "should be denied access” should be negated 
  so that it reflects the message sentiment. i.e., the modified text span has to be "should not be denied access". 
  If the original text span already aligns with the sentiment expressed in the entire message, no modification is necessary.  Now try to find the text span for me  that reflects the message. Just return the 
  final answer.
\n\n Message: `{message}`."""

  for attempt in range(1, max_tries + 1):
    try:
      completion = client.chat.completions.create(
        model=model,
        temperature=0.2,
        messages=[
          {"role": "system", "content": system_instruction},
          {"role": "user", "content": user_instruction}
        ]
      )
      return completion.choices[0].message.content
    except openai.RateLimitError as e:
      print(f"OpenAI API request exceeded rate limit: {e}")
    except openai.APIConnectionError as e:
      print(f"Failed to connect to OpenAI API: {e}")
    except openai.APIError as e:
      print(f"OpenAI API returned an API Error: {e}")
    except ValueError as e:
      # As before, a ValueError is retried immediately without waiting.
      print(f"[WARNING] ValueError: {e}.")
      continue
    # Wait only when another attempt follows; sleeping after the final failure
    # just delays the `return None` below.
    if attempt < max_tries:
      time.sleep(retry_delay)
  return None

In [None]:
# Smoke test: run the span extractor once on a hand-written message.
message, target_identity = "No black person should experience racism.", "black people"
label_span(message, target_identity, model='gpt-4o')

In [None]:
# Directory searched for the input CSV files; the extraction loop also writes
# its results into a "text_spans" sub-folder of this path.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
dataset_path = Path("/Users/hatecheck/gpt4_aspects/")

In [None]:
# Values of the 'functionality' column whose rows are dropped before any
# text span is extracted.
ignore_functionalities = [
    "F11: Non-hateful use of profanity",
    "F22: Abuse targeted at objects",
    "F23: Abuse targeted at individuals (not as member of a prot. group)",
    "F24: Abuse targeted at nonprotected groups (e.g. professions)",
    "F25-29: Spelling variation",
]

In [None]:
# For every `dataset_<identity>.csv` file: drop the ignored functionalities,
# ask the model for one text span per remaining message, and save the frame
# (with a new 'text_spans' column) under <dataset_path>/text_spans/.
output_dir = dataset_path / "text_spans"
output_dir.mkdir(parents=True, exist_ok=True)  # to_csv does not create missing folders

for p in sorted(dataset_path.glob('*.csv')):
    match = re.search(r'dataset_(.+?)\.csv', p.name)
    if match is None:
        # Not an input file of the expected form; skip rather than crash on .group().
        print(f"Skipping {p.name}: file name does not match 'dataset_<identity>.csv'")
        continue
    target_identity = match.group(1)
    df = pd.read_csv(p)
    # .copy() so the new column below is added to an independent frame
    df = df[~df['functionality'].isin(ignore_functionalities)].copy()
    messages = df['message'].tolist()
    text_spans = []
    print(f"Text span for {target_identity} ....")
    for i in trange(len(messages)):
        text_span = label_span(messages[i], target_identity, model='gpt-4o')
        text_spans.append(text_span)
    # (The stray `j+=1` on an undefined name was removed: it raised NameError
    # here, after all API calls for the file and before the results were saved.)
    df['text_spans'] = text_spans
    df.to_csv(output_dir/f"text_span_{target_identity}.csv", index=True)

## Embedding of spans using the OpenAI embedding model

In [None]:
def get_ai_embedding(text, model="text-embedding-3-large"):
    """Return the embedding of `text` from the embeddings endpoint.

    `text` is sent as a one-element batch through the module-level `client`,
    and the embedding of that single item is returned.
    """
    response = client.embeddings.create(input=[text], model=model)
    first_item = response.data[0]
    return first_item.embedding

In [None]:
def remove_enclosing_quotes(s):
    """Strip one pair of enclosing double quotes or backticks from `s`.

    Args:
        s: The value to clean. Non-string values (e.g. None, or the float NaN
           pandas uses for missing cells) are returned unchanged instead of
           raising AttributeError.

    Returns:
        `s` without its first and last character when both are `"` or both
        are a backtick; otherwise `s` unchanged. A string of fewer than two
        characters is never treated as quoted, so a lone quote character is
        kept rather than collapsed to an empty string.
    """
    if not isinstance(s, str):
        return s
    if len(s) >= 2 and s[0] == s[-1] and s[0] in ('"', '`'):
        return s[1:-1]
    return s

In [None]:
# Folder whose CSV files are read by the span-cleaning and embedding loops;
# the cleaning loop also writes its text_span_cleaned_* output here.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
span_path = Path("/Users/hatecheck/gpt4_aspects/text_spans")

In [None]:
# Clean the raw span files: drop rows without a usable span, strip enclosing
# quotes/backticks, and write `text_span_cleaned_<identity>.csv` next to the input.
# The file list is built before the loop and excludes earlier cleaned outputs;
# a plain '*.csv' glob would re-read them on a re-run and write
# `text_span_cleaned_cleaned_<identity>.csv`.
raw_span_files = [
    p for p in sorted(span_path.glob('text_span_*.csv'))
    if not p.name.startswith('text_span_cleaned_')
]
for p in raw_span_files:
    target_identity = re.search(r'text_span_(.+?)\.csv', p.name).group(1)
    df = pd.read_csv(p)
    # Failed API calls are stored as empty cells and come back as NaN; pandas'
    # default NA parsing may also turn a bare `None` answer into NaN. Either
    # way there is no span to keep, and NaN is not a string.
    df = df.dropna(subset=['text_spans'])
    df['text_spans'] = (
        df['text_spans']
        .astype(str)
        .str.strip()  # surrounding whitespace would defeat the quote check
        .map(remove_enclosing_quotes)
    )
    df_cleaned = df[df['text_spans'] != 'None']
    df_cleaned.to_csv(span_path/f"text_span_cleaned_{target_identity}.csv", index=True)

In [None]:
# Embed every cleaned text span and collect one frame per target identity:
# the embedding dimensions as numbered columns, plus the span text, the
# identity and the hate label.
# Only `text_span_cleaned_*` files are read. A plain '*.csv' glob would also
# pick up the raw span files, embedding every identity twice and labelling the
# cleaned copy "cleaned_<identity>".
entire_embed = []
for p in sorted(span_path.glob('text_span_cleaned_*.csv')):
    target_identity = re.search(r'text_span_cleaned_(.+?)\.csv', p.name).group(1)
    df = pd.read_csv(p)
    # A missing span cannot be embedded; drop it together with its label.
    df = df.dropna(subset=['text_spans'])
    text_spans = df['text_spans'].astype(str).tolist()
    hate_labels = df['hate_label'].tolist()
    print(f"Text span for {target_identity} ....")
    embeddings = []
    for i in trange(len(text_spans)):
        embeddings.append(get_ai_embedding(text_spans[i]))
    if not embeddings:
        # Nothing to embed for this identity; an empty array has no 2-D shape.
        continue
    final_embed = np.array(embeddings)  # one row per span
    df_embed = pd.DataFrame(final_embed)
    df_embed["text_span"] = text_spans
    df_embed["target_identity"] = target_identity
    df_embed["hate_labels"] = hate_labels
    entire_embed.append(df_embed)

if not entire_embed:
    raise ValueError(f"No cleaned text spans found in {span_path}; run the cleaning cell first.")
df_entire_embed = pd.concat(entire_embed, ignore_index=True)

## Visualization

In [None]:
# Use t-SNE to reduce the embeddings to 2 dimensions for visualization, then
# cluster the 2-D points with K-Means and plot them.

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

np.random.seed(0)

# Keep only rows with hate_labels == 0 (presumably the non-hateful spans — confirm
# against the dataset's label encoding). .copy() makes this an independent
# frame, so the columns added below do not trigger SettingWithCopyWarning on a
# filtered slice of df_entire_embed.
df = df_entire_embed[df_entire_embed["hate_labels"]==0].copy()
embeddings = np.array(df.drop(['target_identity', 'text_span', 'hate_labels'], axis=1))
labels = df['target_identity'].values

# Optional PCA pre-reduction, currently disabled:
#pca = PCA(n_components=200)
#embeddings_pca = pca.fit_transform(embeddings)

# NOTE(review): newer scikit-learn releases rename TSNE's `n_iter` to
# `max_iter`; switch the keyword if the installed version warns or rejects it.
tsne = TSNE(n_components=2, perplexity=30, n_iter=300, random_state=0)
embeddings_tsne = tsne.fit_transform(embeddings)

# Clusters are computed on the 2-D t-SNE coordinates, not on the raw embeddings.
kmeans = KMeans(n_clusters=50, random_state=0)
clusters = kmeans.fit_predict(embeddings_tsne)

df['Component 1'] = embeddings_tsne[:, 0]
df['Component 2'] = embeddings_tsne[:, 1]
df['Cluster'] = clusters

plt.figure(figsize=(12, 8))
sns.scatterplot(
    x='Component 1', y='Component 2',
    hue='target_identity', style='Cluster',
    palette='bright', data=df,
    s=100, alpha=1, edgecolor='w'
)
plt.title('t-SNE visualization of embeddings with category labels and K-Means clusters')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
# Keep just the span text, its target identity and its cluster id.
df_new = df.loc[:, ["text_span", "target_identity", "Cluster"]]
df_new

In [None]:
# Number of spans per (cluster, target identity); absent combinations become 0.
label_distribution = (
    df_new
    .groupby(['Cluster', 'target_identity'])
    .size()
    .unstack()
    .fillna(0)
)

# One stacked bar per cluster, one colour segment per target identity.
bar_ax = label_distribution.plot(kind='bar', stacked=True, figsize=(10, 6))
bar_ax.set_title('Target_identity Distribution in Each Cluster')
bar_ax.set_xlabel('Cluster')
bar_ax.set_ylabel('Count')
bar_ax.legend(title='target_identity')
plt.show()

In [None]:
# Count the spans of each target identity inside every cluster
# (rows: clusters, columns: target identities, missing pairs filled with 0).
label_distribution = (
    df_new
    .groupby(['Cluster', 'target_identity'])
    .size()
    .unstack()
    .fillna(0)
)

# Draw the counts as an annotated heatmap on a fresh high-resolution figure.
plt.figure(figsize=(15,8), dpi=400)
heatmap_ax = sns.heatmap(
    label_distribution,
    annot=True,
    fmt='.0f',
    annot_kws={"size": 8},
    cmap='viridis',
)

plt.xticks(fontsize=6)
plt.yticks(fontsize=6)
heatmap_ax.set_title('Heatmap of Label Distribution in Each Cluster: OpenAI')
heatmap_ax.set_xlabel('Target_identity')
heatmap_ax.set_ylabel('Cluster')
plt.show()

In [None]:
# Download NLTK data (if not already downloaded).
# These resources presumably back the `word_tokenize` and `stopwords` imports
# at the top of the notebook; none of the cells visible here uses them yet.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# For each target identity, list its text spans from most to least frequent.

sorted_counts_A = {}
for identity in df_new["target_identity"].unique():
    identity_spans = df_new.loc[df_new["target_identity"] == identity, 'text_span']
    # most_common() without an argument returns every (span, count) pair,
    # ordered by descending count.
    sorted_counts_A[identity] = Counter(identity_spans).most_common()
    print(f"\nSorted counts of unique values for {identity}")
    print(sorted_counts_A[identity])