# Import libraries

In [1]:
import pandas as pd
import torch
import string
from scipy.spatial.distance import cosine

from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
model.eval()

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

# Load survey data

In [2]:
# Read it in
path = "../data/surveys_clean.csv"
df = pd.read_csv(path, na_filter=False)

# Drop unnecessary columns
df.drop(columns=['ind_id', 'subject_id', 'image_name', 'image_name_2', 'index'], inplace=True)

# Method 1: Simple Keyword Search

Build a list of keywords relating to the topic and return responses containing any of these words. Requires much foresight to catch all forms/mispellings of relevant words.

In [None]:
keys = [
    "negro",
    "negros",
    "color",
    "colored",
    "black",
    "blacks",
    "white",
    "whites",
    "race",
    "races",
    "racial"
]

In [None]:
# Create new column for tracking relevant responses
df['about_race'] = 0

for i in range(len(df['long'])):
    for k in keys:
        # set to lowercase + remove punctuation for easier comparison
        if k in df['long'][i].lower().translate(str.maketrans('','',string.punctuation)).split():
            df.at[i, 'about_race'] = 1
            break

Below is a response the keyword search failed to pick up. We will add the most relevant word from this response to our keyword list. One may not have initially guessed to add "colerd" to the list, highlighting one of the drawbacks to the simple keyword method. 

In [None]:
display(df.iloc[[2936]])
print(df['long'][2936])

Continue to check results for false negatives, building the keyword list further.

In [None]:
keys = keys + [
    "negrohis",
    "colors", 
    "colerd", 
    "coller", 
    "blackman" #, etc.
]

# Method 2: BERT-Contextualized Keyword Search

Define method for returning the token embeddings for each token in a given text.

In [3]:
def get_token_embeddings(text):
    
    # Tokenize the text
    split_text = text.split(". ")
    marked_text = "[CLS] " + " [SEP] ".join(split_text) + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)[:512] # Truncate if longer than 512
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Mark tokens belonging to a sentence
    segment_ids = [0]*len(tokenized_text)
    is_zero = True
    for i in range(len(tokenized_text)):
        segment_ids[i] = 0 if is_zero else 1
        if tokenized_text[i] == "[SEP]":
            is_zero = not is_zero

    # Convert to torch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segment_ids])

    # Run through BERT
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        hidden_states = outputs[2]

    # Adjust
    token_embeddings = torch.stack(hidden_states, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    token_embeddings = token_embeddings.permute(1,0,2)

    # Get token vectors
    token_vecs_sum = []
    for token in token_embeddings:
        sum_vec = torch.sum(token[-4:], dim=0)
        token_vecs_sum.append(sum_vec)
    
    return token_vecs_sum

Again, conisder key words you would look for. Note that the tokenizer adds tokens and may split certain words into multiple tokens. As such, the embedding of your key token via **get_token_embeddings** may not be at the index you expect. 

For example, say we use the keyword "white". Here is the tokenization of the word:

In [None]:
text = "white"
split_text = text.split(". ")
marked_text = "[CLS] " + " [SEP] ".join(split_text) + " [SEP]"
tokenized_text = tokenizer.tokenize(marked_text)
print(tokenized_text)

We see that "white" is at index 1. Therefore, the embedding we want will be at the 1st index of the list returned by **get_token_embeddings**. Manually check this for each word you add.

In [None]:
keys = [
    {"text": "white", "idx": 1, "embed": None},
    {"text": "negro", "idx": 1, "embed": None}
]

# Update "embed" for the keywords
for k in keys:
    embeddings = get_token_embeddings(k['text'])
    k['embed'] = embeddings[k['idx']]

Now, we simply compare our key token embeddings to the token embeddings of each response. If a response contains a token with a high-enough* similarity to a key token, label it as "about_race". 

<small>\* Begin with an arbitrary threshold for similarity, and adjust as needed. Consider printing the cosine similarity scores to get an idea of what scores are high/low.</small>

In [4]:
# df: Dataframe
# column: column to consider for labeling
# label: name of column to store results in
# keys: keyword dictionary
# thresh: similarity threshold
def label_topic(df, column, label, keys, thresh):

    # Initialize/Reset column
    df[label] = 0
    
    # Track tokens that matched to keywords
    token_matches = []
    
    # Search
    for i in range(len(df)):
        embed = get_token_embeddings(df[column][i])
        for j in range(len(embed)):
            for k in keys:
                sim = 1 - cosine(embed[j], k['embed'])
                if sim >= thresh:
                    df.at[i, label] += 1
                    
                    # Get the token that matched to a keyword
                    split_text = df[column][i].split(". ")
                    marked_text = "[CLS] " + " [SEP] ".join(split_text) + " [SEP]"
                    tokenized_text = tokenizer.tokenize(marked_text)
                    token_matches.append((tokenized_text[j], k['text']))
                    
                    break
                    
        # Track progress
        if i%100==0:
            print(i)
                    
    return token_matches

In [None]:
token_matches = label_topic(df, 'long', 'about_race', keys, 0.5)

In our previous dictionary, we haven't made use of BERT's contextualizing capabilities. For example, the word "white" may be interpreted differently depending on how it's used. In our case, we are referring to race, but BERT may pick out responses that refer to various colors or words associated with the color white.

In [None]:
print(set(token_matches))

Rather than giving BERT our keywords out of context, use it in a short sentence/phrase. Again, tokenize the phrase you chose and note the index of your keyword. This should give an embedding that considers the keyword in a more narrow context.

In [None]:
# Use keyword in a short sentence/phrase for context
# NOTE: Pre-check the tokenization of each phrase.
#       Set 'idx' to the index of the keyword in the list of tokens
keys = [
    {"text": "The white man", "idx": 2, "embed": None},
    {"text": "The negro man", "idx": 2, "embed": None}
]

for k in keys:
    embed = get_token_embeddings(k['text'])
    k['embed'] = embed[k['idx']]

In [None]:
token_matches = label_topic(df, 'long', 'about_race', keys, 0.5)
print(set(token_matches))

Finally, let's look at some of the more interesting results (full results can be found in data/[].xlsx).

Below are responses labeled as "about_race" which did not contain a keyword. A simple keyword search using the same keywords would not pick up any of these responses.

In [None]:
for r in df['long'].loc[df['about_race'] > 0]:
    if "negro" not in r and "white" not in r:
        print(r,"\n")

Next are responses that were not labeled as "about_race" but contained a keyword. This is good in cases where one word may have multiple meanings. A simple keyword search using the same keywords would have picked up all of these responses. False negatives may be remedied by adding more keywords, using multiple short sentences for the same keywords, or lowering the similarity threshold.

In [None]:
for r in df['long'].loc[df['about_race'] == 0]:
    if "negro" in r or "white" in r:
        print(r,"\n")

Finally, update CSV with our new column to avoid having to re-filter later

In [None]:
df.to_csv("../data/surveys_clean_filtered.csv")

# Searching for other topics

Now let's use the same ideas to search for responses that talk about women/gender relations.

In [25]:
# scratch work
text = "the white woman and the negro man"
split_text = text.split(". ")
marked_text = "[CLS] " + " [SEP] ".join(split_text) + " [SEP]"
tokenized_text = tokenizer.tokenize(marked_text)
print(tokenized_text)

# Use small sample for testing
df_sample = df.sample(n=1000, random_state=1).reset_index(drop=True)

['[CLS]', 'the', 'white', 'woman', 'and', 'the', 'negro', 'man', '[SEP]']


In [5]:
keys = [
    {"text": "woman", "idx": 1, "embed": None}
]

for k in keys:
    embed = get_token_embeddings(k['text'])
    k['embed'] = embed[k['idx']]

In [None]:
token_matches = label_topic(df, 'long', 'about_gender', keys, 0.7)
print(set(token_matches))

In [None]:
print(df[df['about_gender'] == 0]['long'].tolist())

In [None]:
print(df[df['about_gender'] > 0]['long'].tolist())

Looking just at the matches, there seem to be a lot of false positives. So we'll add more context, perhaps using phrases from the actual responses.

In [26]:
keys = [
    {"text": "if a married woman was working", "idx": 4, "embed": None},
    {"text": "the white woman and the negro man", "idx": 3, "embed": None}
]

#we wouldn't interfer with your women
#an the women would not half to do the wark
#run after the womens that works
#cant be with any woman for there is none
#the women working at war jobs

for k in keys:
    embed = get_token_embeddings(k['text'])
    k['embed'] = embed[k['idx']]

In [27]:
token_matches = label_topic(df_sample, 'long', 'about_gender', keys, 0.6)
print(set(token_matches))

0
100
200
300
400
500
600
700
800
900
{('women', 'if a married woman was working'), ('woman', 'if a married woman was working'), ('wife', 'if a married woman was working'), ('people', 'the white woman and the negro man'), ('man', 'the white woman and the negro man'), ('man', 'if a married woman was working'), ('girl', 'if a married woman was working'), ('men', 'if a married woman was working')}


In [32]:
for r in df_sample[df_sample['about_gender'] == 0]['long'].tolist():
    print(r)
    print()



as a soldier in the u.s. air force i am writing this to whom it may concern i here by say that as long as the negro of the north meets & mingles with the white man of the south there will always be a racial hatred among them no matter what happens take the northern negro out of the south and their is your question to morale as i foresee in the very near future nothing but race riots bloodshed and possible civil war among the northern negro and the southern white man.

negro soldiers should be more through examined for physical defection, men without any learning should be schooled

this is a good idea, to get an opinion of the soldiers and if acted on rightly there should be a lot of good changes to the benefit of the soldier. i will say that i hope this war does not act as an instrument in bringing restrictions and hardships on an extreme after the war is ended.

most of the things a soldier does his complaining about has been omitted. therefore i don't believe you can get a very go

i don't think white and colored soldiers should have the same p.x. or  and that way it won't be no misunderstanding between them if white soldier from the south see you call with one is lady he would want to fight

i personally think the negro should have as much rights as the whites, there fighting for the country too. but people of the south don't look at that say, they too selfish. some army camps a very good and some other a very poor. i think the enlisted men should have more of a chance, to transferred from the air corp to any branch of the service.

i believe the army could be a little more careful in selecting its men for combatant service. there were many men who came into the army with non-combatant stamped on the top of their service record. this was of course for general service with non-combatant duties. however after a white a man was either general or limited service. those that were non-combatant were disregarded and put in the field. many of them have very poor eyesigh

In [33]:
for r in df_sample[df_sample['about_gender'] > 0]['long'].tolist():
    print(r)
    print()

i hope you read the clipping it shows farmers of the army. if a married woman was working her husband was put in 117 negro cash of there total money. where many married men with large sum's of money have been put in 317. such as billy rose millionaire.

this questionnaire deals more less about white people and negro and i think they are the same as white people are. this questionnaire deals more less about white people and negro and i think they are the same as white people are

i hope the negro be treated better when the war is over because we is doing our part only nations can  in usa and kill a nigros and they want so anything to him. it was a white man kill a soldier and he was a colour soldier and that did put him in jail at all that was not right at all.

still ask why married men even with one child is in the army much less more.

1. why don't they trust everyone alike 2. the army is for men and not women so why should they have  and site 3. why don't everyone get to do his part