In [2]:

!pip install datasets

from datasets import load_dataset, Dataset


!pip install pandas

import pandas as pd

!pip install transformers
!pip install torch

import torch
from transformers import BertTokenizer, BertModel





In [3]:
# selecting data from mexican border conflict year
mexican_border_data = load_dataset("dell-research-harvard/AmericanStories",
    "subset_years",
    year_list=["1916"]
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [4]:
#this is how you access the article content for a specific row in the data
print(mexican_border_data['1916'][0]['article'])


Indianapolis, Ind, May 29.-Sheriff Coffin announced Saturday afternoon that the Jack Dillon-Gunboat Smith prize fight, planned for tonight would not be permitted This follow- ed the action of Judge Mol in mod flying, at the request of Prosecutor Fucker, the restraining order. making it effective until further order Of the court, instead of expiring at o'clock this morning.

 The prosecutors move was due to the fact that attorneys for the tgh: promoters had sought a change of venue for the hearing on the restrain ing order, and it was feared the hear ing could not be had in another county until after Memorial Day, which would have permitted the hold ing of the fight.

 The fight was promoted as an at- action for visitors in the city for the automobile race at the Indian apolis Motor Speedway on the follow ing day. The prosecutor obtained the rest ming order after receiving in struCtions from Governor Ralston to stop the fight.


In [5]:
#using bert-case-uncased for the model to allow for capitalized and uncapitalized words to be treated the same
#tokenizer converts articles into tokens
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#model takes the tokens and makes embeddings
model = BertModel.from_pretrained('bert-base-uncased')


In [7]:
#converting to a data frame
mexican_border_df = pd.DataFrame.from_dict(mexican_border_data['1916'])



In [10]:
#droppping unneeded rows
mexican_border_df.drop(['newspaper_name','edition','page','headline','byline'], axis=1)

Unnamed: 0,article_id,date,article
0,1_1916-05-29_p9_sn82014519_00414183062_1916052...,1916-05-29,"Indianapolis, Ind, May 29.-Sheriff Coffin anno..."
1,2_1916-05-29_p9_sn82014519_00414183062_1916052...,1916-05-29,"Ithaca, N. Y, May 29-In the most thrilling rac..."
2,3_1916-05-29_p9_sn82014519_00414183062_1916052...,1916-05-29,In a hard fought game at Walnut Hill park Satu...
3,4_1916-05-29_p9_sn82014519_00414183062_1916052...,1916-05-29,"New York, May 29.-President Bar row of the Int..."
4,6_1916-05-29_p9_sn82014519_00414183062_1916052...,1916-05-29,"Beats Time in To Events.\n\n Ca Ambridge, Mass..."
...,...,...,...
1838792,20_1916-02-10_p1_sn85033000_00414212128_191602...,1916-02-10,"Joe Davidson, Jr., a 1G-year- boy of Mt. Verno..."
1838793,22_1916-02-10_p1_sn85033000_00414212128_191602...,1916-02-10,"Willie Mills, 21, and Alma Robin son, ""A, Gree..."
1838794,26_1916-02-10_p1_sn85033000_00414212128_191602...,1916-02-10,"honorary members of Green Grove lodge No. 107,..."
1838795,29_1916-02-10_p1_sn85033000_00414212128_191602...,1916-02-10,G. D. Allen has bought of w. II. Hooks 40 acre...


In [None]:
#function for encoding the article
def encode_article(article):
    inputs = tokenizer(article, return_tensors='pt', truncation=True, padding=True, max_length=512)
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)


mexican_border_df['encoded_article'] = mexican_border_df['article'].apply(lambda x: encode_article(x).detach().numpy())



In [None]:
from sklearn.metrics.pairwise import cosine_similarity

racist_keywords = {
    'Mexican': ['greaser', 'wetback', 'beaner'],
    'German': ['kraut', 'hun', 'boche'],
    'Russian': ['commie', 'red', 'ruskie'],
    'Japanese': ['jap', 'nip', 'yellow'],
    'Korean': ['gook', 'slope', 'zipperhead']
}

# Encode the racist keywords
encoded_keywords = {ethnicity: [encode_article(word).detach().numpy() for word in words] for ethnicity, words in racist_keywords.items()}


In [None]:
#calculates similarity between article embeddings and keyword embeddings
def calculate_similarity(article_embedding, keywords_embeddings):
    similarities = [cosine_similarity(article_embedding, keyword_embedding) for keyword_embedding in keywords_embeddings]
    return max(similarities)

def is_racist(article_embedding, ethnicity):
    keywords_embeddings = encoded_keywords[ethnicity]
    similarity_threshold = 0.7  # Define a threshold for similarity
    max_similarity = calculate_similarity(article_embedding, keywords_embeddings)
    return 1 if max_similarity > similarity_threshold else 0

# Label each article for each conflict ethnicity
for ethnicity in encoded_keywords.keys():
    mexican_border_df[f'{ethnicity}_racist'] = mexican_border_df.apply(lambda x: is_racist(x, ethnicity))