# Import libraries

In [1]:
import pandas as pd
import torch
import string
from scipy.spatial.distance import cosine

from transformers import BertTokenizer, BertModel
# Load the pretrained uncased BERT tokenizer; the model below uses the same checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# output_hidden_states=True makes the model also return its hidden states,
# which get_token_embeddings reads from outputs[2]
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
# Switch to evaluation mode (turns off the Dropout layers); as the last
# expression of the cell this also displays the model structure
model.eval()

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

# Load survey data

In [None]:
# Load the cleaned survey responses and discard the columns that are not needed
# for topic labeling. na_filter=False turns off missing-value detection, so
# blank cells stay as empty strings.
path = "../data/surveys_clean.csv"
df = pd.read_csv(path, na_filter=False).drop(
    columns=['ind_id', 'subject_id', 'image_name', 'image_name_2', 'index']
)

# Method 1: Simple Keyword Search

Build a list of keywords relating to the topic and return responses containing any of these words. This requires considerable foresight to catch all forms/misspellings of the relevant words.

In [None]:
# Initial keyword list for the simple search. Entries are lowercase because the
# search compares them against lowercased, punctuation-stripped words.
keys = [
    "negro",
    "negros",
    "color",
    "colored",
    "black",
    "blacks",
    "white",
    "whites",
    "race",
    "races",
    "racial"
]

In [None]:
# Create new column for tracking relevant responses
df['about_race'] = 0

# Build the lookup structures once instead of once per (row, keyword) pair
keyword_set = set(keys)
strip_punctuation = str.maketrans('', '', string.punctuation)

# Iterate by index label so reads and writes address the same row even when
# the frame does not have the default 0..n-1 index
for row_label, response in df['long'].items():
    # set to lowercase + remove punctuation for easier comparison
    words = response.lower().translate(strip_punctuation).split()
    # Flag the response if any whole word is one of the keywords
    if not keyword_set.isdisjoint(words):
        df.at[row_label, 'about_race'] = 1

Below is a response the keyword search failed to pick up. We will add the most relevant word from this response to our keyword list. One might not have initially thought to add "colerd" to the list, which highlights one of the drawbacks of the simple keyword method.

In [None]:
# Show the full row, then the raw response text, of a response the search missed.
# NOTE: .iloc is positional while df['long'][2936] looks up by index label; the
# two refer to the same row only with the default integer index.
display(df.iloc[[2936]])
print(df['long'][2936])

Continue to check results for false negatives, building the keyword list further.

In [None]:
# Extend the keyword list with variants/misspellings found while reviewing misses.
# NOTE: re-running this cell appends the same entries again.
keys = keys + [
    "negrohis",
    "colors", 
    "colerd", 
    "coller", 
    "blackman" #, etc.
]

# Method 2: BERT-Contextualized Keyword Search

Define method for returning the token embeddings for each token in a given text.

In [2]:
def get_token_embeddings(text):
    """Return one contextual embedding per BERT token of ``text``.

    The text is split into sentences on ". ", wrapped in [CLS]/[SEP] markers,
    tokenized with the notebook-level ``tokenizer`` (keeping at most 512
    tokens) and run through the notebook-level ``model``. Each token's
    embedding is the sum of its last four hidden-state layers.

    Assumes ``model`` was created with ``output_hidden_states=True`` so that
    ``outputs[2]`` holds the per-layer hidden states.

    Args:
        text: The string to embed.

    Returns:
        A list of 1-D torch tensors, one per token in token order. The
        [CLS]/[SEP] markers are tokens too, so index 0 is [CLS].
    """
    # Tokenize the text
    sentences = text.split(". ")
    marked_text = "[CLS] " + " [SEP] ".join(sentences) + " [SEP]"
    # Truncate if longer than 512. NOTE(review): truncation can cut off the
    # closing [SEP] of a long text.
    tokenized_text = tokenizer.tokenize(marked_text)[:512]
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Alternate the segment id (0, 1, 0, ...) after every [SEP]; each [SEP]
    # keeps the id of the sentence it closes
    segment_ids = []
    current_segment = 0
    for token in tokenized_text:
        segment_ids.append(current_segment)
        if token == "[SEP]":
            current_segment = 1 - current_segment

    # Convert to torch tensors (batch of one)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segment_ids])

    # Run through BERT
    with torch.no_grad():
        # Pass the segment ids by keyword. The second positional parameter of
        # transformers' BertModel.forward is attention_mask, so a positional
        # call would use the segment ids as an attention mask instead.
        outputs = model(tokens_tensor, token_type_ids=segments_tensors)
        hidden_states = outputs[2]

    # Rearrange (layers, batch=1, tokens, hidden) -> (tokens, layers, hidden)
    token_embeddings = torch.stack(hidden_states, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    token_embeddings = token_embeddings.permute(1, 0, 2)

    # One vector per token: the sum of its last four layers
    return [torch.sum(token_layers[-4:], dim=0) for token_layers in token_embeddings]

Again, consider the keywords you would look for. Note that the tokenizer adds tokens and may split certain words into multiple tokens. As such, the embedding of your key token returned by **get_token_embeddings** may not be at the index you expect.

For example, say we use the keyword "white". Here is the tokenization of the word:

In [None]:
# Reproduce the marking + tokenization steps of get_token_embeddings to see
# which index the keyword's token ends up at
text = "white"
split_text = text.split(". ")
marked_text = "[CLS] " + " [SEP] ".join(split_text) + " [SEP]"
tokenized_text = tokenizer.tokenize(marked_text)
print(tokenized_text)

We see that "white" is at index 1 (index 0 is the `[CLS]` token). Therefore, the embedding we want is at index 1 of the list returned by **get_token_embeddings**. Manually check this for each word you add.

In [None]:
# Each entry holds the keyword text, the index of the keyword's token in the
# tokenized text, and a slot for that token's embedding
keys = [
    {"text": "white", "idx": 1, "embed": None},
    {"text": "negro", "idx": 1, "embed": None}
]

# Fill "embed" with the embedding of the token at position "idx"
for k in keys:
    k['embed'] = get_token_embeddings(k['text'])[k['idx']]

Now, we simply compare our key token embeddings to the token embeddings of each response. If a response contains a token with a high-enough* similarity to a key token, label it as "about_race". 

<small>\* Begin with an arbitrary threshold for similarity, and adjust as needed. Consider printing the cosine similarity scores to get an idea of what scores are high/low.</small>

In [7]:
def label_topic(df, column, label, keys, thresh):
    """Count, per row, the tokens whose embedding is close to a keyword embedding.

    Every value of ``df[column]`` is embedded with ``get_token_embeddings``.
    A token counts as a match when its cosine similarity to at least one
    keyword embedding is >= ``thresh``; each token is counted at most once.
    The per-row count is written to ``df[label]`` in place.

    Args:
        df: DataFrame holding the responses; modified in place.
        column: Name of the text column to consider for labeling.
        label: Name of the column to store the match counts in (reset to 0).
        keys: List of keyword dicts with a 'text' entry and an 'embed' entry
            holding the keyword's embedding.
        thresh: Similarity threshold.

    Returns:
        A list of (matched token, keyword text) tuples, one per match.
    """
    # Initialize/Reset column
    df[label] = 0

    # Track tokens that matched to keywords
    token_matches = []

    # Iterate by index label so reads and writes address the same row even
    # when df does not have the default 0..n-1 index
    for position, row_label in enumerate(df.index):
        text = df.at[row_label, column]
        embed = get_token_embeddings(text)

        tokenized_text = None  # tokenized lazily, at most once per row
        match_count = 0
        for j, token_vec in enumerate(embed):
            for k in keys:
                # scipy's cosine() is a distance; 1 - distance is the similarity
                sim = 1 - cosine(token_vec, k['embed'])
                if sim >= thresh:
                    match_count += 1

                    # Recover the token text with the same marking scheme
                    # get_token_embeddings uses, so index j lines up
                    if tokenized_text is None:
                        split_text = text.split(". ")
                        marked_text = "[CLS] " + " [SEP] ".join(split_text) + " [SEP]"
                        tokenized_text = tokenizer.tokenize(marked_text)
                    token_matches.append((tokenized_text[j], k['text']))

                    # Count each token once, even if several keywords match
                    break

        if match_count:
            df.at[row_label, label] = match_count

        # Track progress
        if position % 100 == 0:
            print(position)

    return token_matches

In [None]:
# Label the full data set with the single-word keyword embeddings
token_matches = label_topic(df, column='long', label='about_race', keys=keys, thresh=0.5)

In our previous dictionary, we haven't made use of BERT's contextualizing capabilities. For example, the word "white" may be interpreted differently depending on how it's used. In our case, we are referring to race, but BERT may pick out responses that refer to various colors or words associated with the color white.

In [None]:
# Distinct (matched token, keyword text) pairs found by label_topic
print(set(token_matches))

Rather than giving BERT our keywords out of context, use each keyword in a short sentence/phrase. Again, tokenize the phrase you chose and note the index of your keyword. This should give an embedding that considers the keyword in a narrower context.

In [None]:
# Place each keyword in a short phrase so its embedding carries context.
# NOTE: Check the tokenization of every phrase beforehand and set 'idx' to the
#       position of the keyword within the list of tokens.
keys = [
    {"text": "The white man", "idx": 2, "embed": None},
    {"text": "The negro man", "idx": 2, "embed": None}
]

# Fill "embed" with the embedding of the token at position "idx"
for k in keys:
    k['embed'] = get_token_embeddings(k['text'])[k['idx']]

In [None]:
# Re-label with the contextual keyword embeddings and show the distinct matches
token_matches = label_topic(df, column='long', label='about_race', keys=keys, thresh=0.5)
print(set(token_matches))

Finally, let's look at some of the more interesting results (full results can be found in data/[].xlsx).

Below are responses labeled as "about_race" which did not contain a keyword. A simple keyword search using the same keywords would not pick up any of these responses.

In [None]:
# Responses labeled "about_race" that contain neither keyword. Compare against
# the lowercased response so capitalized forms ("Negro", "White") count as
# containing the keyword; the uncased BERT search is case-insensitive too.
for r in df['long'].loc[df['about_race'] > 0]:
    r_lower = r.lower()
    if "negro" not in r_lower and "white" not in r_lower:
        print(r,"\n")

Next are responses that were not labeled as "about_race" but contained a keyword. This is good in cases where one word may have multiple meanings. A simple keyword search using the same keywords would have picked up all of these responses. False negatives may be remedied by adding more keywords, using multiple short sentences for the same keywords, or lowering the similarity threshold.

In [None]:
# Responses NOT labeled "about_race" that nevertheless contain a keyword.
# Compare against the lowercased response so capitalized forms ("Negro",
# "White") are also recognized as keyword occurrences.
for r in df['long'].loc[df['about_race'] == 0]:
    r_lower = r.lower()
    if "negro" in r_lower or "white" in r_lower:
        print(r,"\n")

Finally, update the CSV with our new column to avoid having to re-filter later.

In [None]:
# index=False keeps the row index out of the file; otherwise it is written as
# an unnamed first column and comes back as "Unnamed: 0" when the CSV is re-read
df.to_csv("../data/surveys_clean_filtered.csv", index=False)

# Searching for other topics

Now let's use the same ideas to search for responses that talk about women/gender relations.

In [86]:
# Read it in (the filtered CSV written in the previous section)
path = "../data/surveys_clean_filtered.csv"
df = pd.read_csv(path, na_filter=False)

# Use small sample for testing. random_state fixes the draw so re-runs pick the
# same rows; reset_index(drop=True) renumbers the sampled rows from 0.
df_sample = df.sample(n=500, random_state=3).reset_index(drop=True)

In [78]:
# scratch work: tokenize a candidate phrase to find the index of its target token
text = "take your mother out and hang her"
split_text = text.split(". ")
marked_text = "[CLS] " + " [SEP] ".join(split_text) + " [SEP]"
tokenized_text = tokenizer.tokenize(marked_text)
print(tokenized_text)

['[CLS]', 'take', 'your', 'mother', 'out', 'and', 'hang', 'her', '[SEP]']


In [87]:
# Context phrases for the gender topic; "idx" is the position of the target
# token in each phrase's tokenization. Commented-out entries are alternatives
# that were tried.
keys = [
    {"text": "the white woman and the negro man", "idx": 3, "embed": None},
    #{"text": "forcing the woman to go and leave children", "idx": 3, "embed": None}
    #{"text": "respectable young women", "idx": 3, "embed": None}
    #{"text": "there is no color women", "idx": 5, "embed": None},
    {"text": "run after the womens", "idx": 4, "embed": None},
    #{"text": "take your mother out and hang her", "idx": 3, "embed": None}
]

# Fill "embed" with the embedding of the token at position "idx"
for k in keys:
    k['embed'] = get_token_embeddings(k['text'])[k['idx']]

In [None]:
import time
# Time the labeling run over the sampled rows
start = time.perf_counter()
token_matches = label_topic(df_sample, 'long', 'about_gender', keys, 0.65)
end = time.perf_counter()
# Elapsed seconds, then the distinct (matched token, keyword text) pairs
print(end-start)
print(set(token_matches))

0
100
200


In [None]:
# Print every sampled response that was not labeled "about_gender",
# each followed by a blank line
for r in df_sample.loc[df_sample['about_gender'] == 0, 'long'].tolist():
    print(r, end="\n\n")

In [76]:
#for r in df_sample[df_sample['about_gender'] > 0]['long'].tolist():
#    print(r)
#    print()