In [1]:
import pandas as pd
import os

In [2]:
def getAffiliation(string):
    affDict = {
        'R': 
            [
                'Trump',
                'Dahle',
                'Trimino',
                'Cox',
                'Lombardo',
                'Laxalt',
                'Lake',
                'Ducey',
                'Ronchetti',
                'Pearce',
                'Abbott',
                'DeSantis'
            ],
        'D': 
            [
                'Biden',
                'Newsom',
                'Sisolak',
                'Hobbs',
                'Grisham',
                'Beto',
                'Crist',
                'Gillum',
                'Valdez'
            ]
    }
    
    for key, entities in affDict.items():
        for entity in entities:
            if entity.lower() in str(string).lower():
                return key
    # print(f"No affiliation found for {string}")
    return None

In [3]:
directory = './data'  # Replace with actual path
all_data = []
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        file_path = os.path.join(directory, filename)
        try:
            df = pd.read_csv(file_path)
        except:
            continue
        
        affiliation = getAffiliation(filename)
        df['Affiliation'] = affiliation
        all_data.append(df)

df = pd.concat(all_data, ignore_index=True)
df = df.drop_duplicates(subset=['ad_creative_bodies']).reset_index(drop=True)


In [4]:
# df1 = pd.read_csv('data/ads_all.csv')
# df2 = pd.read_csv('data/new_ads_all.csv')
# df = pd.concat([df1, df2], ignore_index=True)
# df = df.drop_duplicates(subset=['ad_creative_bodies'])
# df.columns

# debate

In [5]:
df_debate = pd.read_excel('kl/2020_pres_debate_translated.xlsx')
df_debate['Affiliation'] = df_debate.apply(lambda x: getAffiliation(x['Speaker ']), axis=1)
df_debate.dropna(inplace=True)

# Analysis

In [6]:
# csv of parties and rows are the names of candidates
# compare the sentiment delta to below
# ==============================================================================

# 

## Data Prep

In [7]:
df_debate = df_debate.rename(columns={"Speaker ": "Entity", "Verbatim":"English", "Spanish Translation ": "Spanish"})
df_debate['languages'] = 'en'


In [8]:
df = df.rename(columns={"page_name": "Entity"})
df = df[['Entity', 'ad_creative_bodies', 'languages', 'Affiliation']]
df['English'] = ""
df['Spanish'] = ""
df['languages'] = df.apply(lambda row: 'en' if row['languages']=="['en']" else row['languages'], axis=1)
df['languages'] = df.apply(lambda row: 'es' if row['languages']=="['es']" else row['languages'], axis=1)
df['ad_creative_bodies'] = df['ad_creative_bodies'].apply(lambda x: str(x).replace('"', '').replace("['", '').replace("']", '').replace("[", '').replace("]", ''))

df['English'] = df.apply(lambda row: row['ad_creative_bodies'] if row['languages']=='en' else '', axis=1)
df['Spanish'] = df.apply(lambda row: row['ad_creative_bodies'] if row['languages']=='es' else '', axis=1)
del df['ad_creative_bodies']
df = df[df['languages']!="['en', 'es']"]

df_analysis = pd.concat([df, df_debate]).reset_index(drop=True)


## baselines for models

In [9]:
# pull a standard spanish, english training dataset
    # baseline for the sentiment model
# calculate the delta of translation which is separate from the sentimate delta, w/ the same dataset?
# ==============================================================================

## Translations and Sentiment calcs

In [10]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn.functional import softmax
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
class SentimentClassifier:
    def __init__(self, model_name):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForSequenceClassification.from_pretrained(model_name)

    def classify(self, text):
        # Tokenize the input text and obtain model outputs
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs).logits
            probs = softmax(outputs, dim=1)
            rating = torch.dot(probs.view(-1), torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0])).item()
            # label = torch.argmax(probs, dim=1).item()

        return np.round(rating, 4)

classifier = SentimentClassifier("nlptown/bert-base-multilingual-uncased-sentiment")


In [12]:
# analyze the sentiment difference between ads and their translations
    # add party column to dataset and combine debate data into one dataset
    # translate the dataset
    # calculate the sentiment delta between the original and translated

In [13]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

class T5Translator:
    def __init__(self, model_name: str = "t5-small"):
        """
        Initialize the T5 Translator with the specified model.
        
        Args:
            model_name (str): Name of the T5 model. Default is "t5-small".
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = T5ForConditionalGeneration.from_pretrained(model_name).to(self.device)
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        
    def translate(self, text: str, source_language: str, target_language: str) -> str:
        """
        Translate text from the source language to the target language.
        
        Args:
            text (str): The input text to be translated.
            source_language (str): The source language code (e.g., "en" for English).
            target_language (str): The target language code (e.g., "es" for Spanish).
            
        Returns:
            str: The translated text.
        """
        # Prepare the prompt text for the T5 model. For example: "translate English to Spanish: Hello"
        prompt = f"translate {source_language} to {target_language}: {text}"
        
        # Tokenize the prompt text
        inputs = self.tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True).to(self.device)
        
        # Generate the translated text
        with torch.no_grad():
            outputs = self.model.generate(inputs)
        
        # Decode the translated text
        translated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        if "en to es: " in translated_text:
            translated_text = translated_text.replace("en to es:", "").strip()
        elif "es to en: " in translated_text:
            translated_text = translated_text.replace("es to en:", "").strip()
        
        return translated_text

# Instantiate the T5Translator
translator = T5Translator()


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [14]:
def getTranslation(row, translator):
    if row['Spanish'] == '':
        row['Spanish'] = translator.translate(row['English'], "en", "es")
    elif row['English'] == '':
        row['English'] = translator.translate(row['Spanish'], "es", "en")
    return row

df_analysis = df_analysis.apply(lambda row: getTranslation(row, translator), axis=1)



In [15]:
df_analysis.Spanish
# TODO: fix translations

0       Gente Organizada and the ACLU have filed a law...
1                              en to es en to es en to es
2                         We need universal vote-by-mail.
3       Katie ist AZs erster Demokrat, der seit 2 Jahr...
4                    AZ ist ein Schlachtfeldstaat, der im
                              ...                        
1088                     Y también han dicho lo opuesto. 
1089                                            Importa –
1090                       También han dicho lo opuesto. 
1091    Ninguna, ninguna persona seria ha dicho lo opu...
1092                                  También han dicho –
Name: Spanish, Length: 1093, dtype: object

In [16]:
df_analysis['EN_sentiment'] = df_analysis.apply(lambda row: classifier.classify(row['English']), axis=1)
df_analysis['ES_sentiment'] = df_analysis.apply(lambda row: classifier.classify(row['Spanish']), axis=1)
df_analysis['sentimentDiff_EN-ES'] = df_analysis['EN_sentiment'] - df_analysis['ES_sentiment']
df_analysis['sentimentDiff_ES-EN'] = df_analysis['ES_sentiment'] - df_analysis['EN_sentiment']

In [None]:
df_analysis.to_csv('analysis.csv', index=False)

## Statistical Tests

In [17]:
import scipy.stats as stats

def tTest(groupA, groupB, alternative='two-sided'):
    equal_var = False
    # If the ratio of the larger sample variance to the smaller sample variance is less than 4. This means we can assume that the population variances are equal.
    if max(np.var(groupA), np.var(groupB)) / min(np.var(groupA), np.var(groupB)) < 4:
        equal_var = True

    #perform two sample t-test with equal variances
    return stats.ttest_ind(a=groupA, b=groupB, equal_var=equal_var, alternative=alternative)

## ES-EN Shift

In [18]:
groupA = df_analysis[(df_analysis['languages'] == 'es') & (df_analysis['Affiliation'] == 'R')]['sentimentDiff_EN-ES']
groupB = df_analysis[(df_analysis['languages']=='es') & (df_analysis['Affiliation']=='D')]['sentimentDiff_EN-ES']
tTest(groupA, groupB)

TtestResult(statistic=0.8315368034778755, pvalue=0.40933246313341876, df=54.0)

## EN-ES Shift

In [19]:
groupA = df_analysis[(df_analysis['languages']=='en') & (df_analysis['Affiliation']=='R')]['sentimentDiff_ES-EN']
groupB = df_analysis[(df_analysis['languages']=='en') & (df_analysis['Affiliation']=='D')]['sentimentDiff_ES-EN']
tTest(groupA, groupB)

TtestResult(statistic=-2.4012451426854597, pvalue=0.016514952347529612, df=1035.0)