### Helpers

In [42]:
import spacy
nlp_es = spacy.load('es_core_news_sm')

def normalizeTokens_es(word_list, extra_stop=[], lemma = True):
    #We can use a generator here as we just need to iterate over it
    normalized = []
    if type(word_list) == list and len(word_list) == 1:
        word_list = word_list[0]

    if type(word_list) == list:
        word_list = ' '.join([str(elem) for elem in word_list]) 

    doc = nlp_es(word_list.lower(), disable=['parser', 'ner'])
    
    # add the property of stop word to words considered as stop words
    if len(extra_stop) > 0:
        for stopword in extra_stop:
            lexeme = nlp_es.vocab[stopword]
            lexeme.is_stop = True

    if lemma:
        for w in doc:
            # if it's not a stop word or punctuation mark, add it to our article
            if w.text != '\n' and not w.is_stop and not w.is_punct and not w.like_num and len(w.text.strip()) > 0:
            # we add the lematized version of the word
                normalized.append(str(w.lemma_))
    else:
        for w in doc:
            # if it's not a stop word or punctuation mark, add it to our article
            if w.text != '\n' and not w.is_stop and not w.is_punct and not w.like_num and len(w.text.strip()) > 0:
            # we add the lematized version of the word
                normalized.append(str(w.text.strip()))

    return normalized

def word_tokenize_es(word_list):
    tokenized = []
    # pass word list through language model.
    doc = nlp_es(word_list)
    for token in doc:
        if not token.is_punct and len(token.text.strip()) > 0:
            tokenized.append(token.text)
    return tokenized

def sent_tokenize_es(word_list, model=nlp_es):
    doc = model(word_list)
    sentences = [sent.text.strip() for sent in doc.sents]
    return sentences

def senator_centroid(senator):
    senator = senator.lower().split(' ')
    return np.average([senateW2V_CBOW.wv[x] for x in senator if x in vocab], axis=0)

def cosine_similarity_adapter(x, y):
    return cosine_similarity(x.reshape(1, -1), y.reshape(1, -1))

#no uso por ahora

def normalize(vector):
    normalized_vector = vector / np.linalg.norm(vector)
    return normalized_vector

def dimension(model, positives, negatives):
    diff = sum([normalize(model[x]) for x in positives]) - sum([normalize(model[y]) for y in negatives])
    return diff

def get_names(text):
    messages = [{"role": "system", "content": "Eres un experto en linguistica y comunicación política en el sistema político colombiano"},
          {"role": "user", "content": f"Estos es el texto de las ultimas sesiones del congreso: {text}.Detecta los nombres propios completos (nombre y apellidos) que están en el texto. Por ejemplo, si el texto es 'El presidente Iván Duque Márquez se reunió con el ministro de salud Fernando Ruiz Gómez', el resultado esperado es 'Iván Duque Márquez, Fernando Ruiz Gómez'. Responde solo con la lista de los nombres. No des contexto adicional."}]
    response = openai.chat.completions.create(model="gpt-4", messages=messages)
    return response.choices[0].message.content

def spacy_pos_es(word_list):
    tags = []
    doc = nlp_es(word_list.lower())
    for w in doc:
        tags.append((w.text, w.pos_, w.tag_))
    return tags

## Preprocessing
### congress sessions corpus

In [4]:
# initialize corpus df

#get the text from the corpus

import os 
import pandas
from tqdm import tqdm

senateDict = {'name' : [], 'text' : []}
for file in os.listdir(r"C:\Users\asarr\Documents\MACSS\Winter 2024\Computational Content Analysis\corpus\Actas comisiones del senado 2023\clean documents"):
    if file.endswith(".txt"):
        senateDict['name'].append(file)
        with open(os.path.join(r"C:\Users\asarr\Documents\MACSS\Winter 2024\Computational Content Analysis\corpus\Actas comisiones del senado 2023\clean documents", file), 'r', encoding = 'utf8') as f:
            senateDict['text'].append(f.read())


senateDF = pandas.DataFrame(senateDict)
senateDF['tokenized_text'] = senateDF['text'].apply(lambda x: word_tokenize_es(x))
senateDF['normalized_tokens'] = senateDF['tokenized_text'].apply(lambda x: normalizeTokens_es(x, extra_stop=[]))
senateDF['tokenized_sents'] = senateDF['text'].apply(lambda x: [word_tokenize_es(s) for s in sent_tokenize_es(x)])
senateDF['normalized_sents'] = senateDF['tokenized_sents'].apply(lambda x: [normalizeTokens_es(s, lemma=False) for s in x])
senateDF['normalized_tokens_POS'] = [spacy_pos_es(t) for t in senateDF['text']]
senateDF['proper_nouns'] = senateDF['normalized_tokens_POS'].apply(lambda x: [t[0] for t in x if t[1] == 'PROPN'])

### Congress members database

In [40]:
import requests
import bs4

request = requests.get('https://www.senado.gov.co/index.php/el-senado/senadores?lastletter=Todos#modazdirectory')
soup = bs4.BeautifulSoup(request.text, 'html.parser')
senators = soup.find_all('div', class_= "modazdirectory__result modazdirectory__layout-misc_off")

senators_dict = {'senator':[], 'party':[]}
for senator in senators:
    senators_dict['senator'].append(senator.h3.text)
    senators_dict['party'].append(senator.find('p').text)


senatorsDF = pandas.DataFrame(senators_dict)

In [41]:
#add remaining demographic information(opposition/government/independent, gender, ethnic identity)
senatorsDF

Unnamed: 0,senator,party
0,Agudelo García Ana Paola,MIRA
1,Amín Saleme Fabio Raúl,Partido Liberal
2,Arias Castillo Wilson Neber,Polo Democrático Alternativo
3,Asprilla Reyes Inti Raúl,Alianza Verde
4,Avella Esquivel Aída Yolanda,Unión Patriótica
...,...,...
100,Valencia Laserna Paloma Susana,Centro Democrático
101,Vega Pérez Alejandro Alberto,Partido Liberal
102,Virgüez Piraquive Manuel Antonio,MIRA
103,Zabaraín Guevara Antonio Luis,Cambio Radical


### SEL

In [50]:
#load SEL
import pandas
sel_path = r'C:\Users\asarr\Documents\MACSS\Winter 2024\Computational Content Analysis\colombian-congress-nlp\Spanish emotional lexicon\SEL.txt'
sel = pandas.read_csv(sel_path, sep="\t", names=["word", "probability", "category"], encoding = 'ISO-8859-1')
sel = sel.drop(0)

#create df with only words with probability > 0.5
sel['probability'] = sel['probability'].astype(float)
#sel = sel[sel['probability'] > 0.3]


In [51]:
#iterate over rows and add words to the dictionary according to their category
positives_lst = {}
for index, row in sel.iterrows():
    if row['category'] not in positives_lst:
        positives_lst[row['category']] = []
    positives_lst[row['category']].append(row['word'])

positives_lst.keys()

dict_keys(['Alegría', 'Enojo', 'Miedo', 'Repulsión', 'Sorpresa', 'Tristeza'])

### Word2Vec embeddings

In [48]:
#create word2vec model
import gensim
import numpy as np

senateW2V_CBOW = gensim.models.word2vec.Word2Vec(senateDF['normalized_sents'].sum(), sg=0)
vocab = list(senateW2V_CBOW.wv.key_to_index.keys())

In [52]:
#remove words in positives_lst that are not in the vocabulary
positives_lst_vocab = {'Alegría': [], 'Enojo': [], 'Repulsión': [], 'Sorpresa': [], 'Tristeza': [], 'Miedo': []}

#create positives
for emotion in positives_lst.keys():
    for word in positives_lst[emotion]:
        if word in vocab:
            positives_lst_vocab[emotion].append(word)

In [53]:
#calculate centroid for each emotion
centroids = {'Alegría': [], 'Enojo': [], 'Repulsión': [], 'Sorpresa': [], 'Tristeza': [], 'Miedo': []}

for emotion in centroids.keys():
    centroids[emotion] = np.average([senateW2V_CBOW.wv[x] for x in positives_lst_vocab[emotion]], axis=0)

In [57]:
len(centroids['Alegría'])

100

### Cosine similarities

In [66]:
#calculate vector for each senator
from sklearn.metrics.pairwise import cosine_similarity
#senatorsDF['centroid'][0].shape


cosine_similarity_adapter(senatorsDF['centroid'][0], centroids['Alegría'])

array([[0.7035392]], dtype=float32)

In [74]:
senatorsDF['centroid'] = senatorsDF['senator'].apply(senator_centroid)
senatorsDF = senatorsDF.dropna()
senatorsDF

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  senatorsDF['centroid'] = senatorsDF['senator'].apply(senator_centroid)


Unnamed: 0,senator,party,centroid,happiness_cs
0,Agudelo García Ana Paola,MIRA,"[0.29471964, 0.37753928, 0.2426602, -0.2494265...",[[0.7035392]]
1,Amín Saleme Fabio Raúl,Partido Liberal,"[0.20876464, 0.38049096, 0.252151, -0.1242769,...",[[0.7596903]]
2,Arias Castillo Wilson Neber,Polo Democrático Alternativo,"[0.64010555, 0.9771447, 0.46279737, -0.8212334...",[[0.66354245]]
3,Asprilla Reyes Inti Raúl,Alianza Verde,"[0.1643892, 0.27331486, 0.11713018, -0.1171381...",[[0.81215334]]
4,Avella Esquivel Aída Yolanda,Unión Patriótica,"[0.05746005, 0.11026829, -0.029397137, -0.0371...",[[0.98015785]]
...,...,...,...,...
100,Valencia Laserna Paloma Susana,Centro Democrático,"[0.25965682, 0.3489499, 0.26302227, -0.2273508...",[[0.7063205]]
101,Vega Pérez Alejandro Alberto,Partido Liberal,"[0.3635928, 0.46250454, 0.30063865, -0.2192173...",[[0.7205496]]
102,Virgüez Piraquive Manuel Antonio,MIRA,"[0.28032327, 0.36318165, 0.1143739, -0.3107822...",[[0.7319552]]
103,Zabaraín Guevara Antonio Luis,Cambio Radical,"[0.3004256, 0.2685386, 0.25686726, -0.29404494...",[[0.66624224]]


In [75]:
#calculate cosine similarity between each senator and each emotion
from sklearn.metrics.pairwise import cosine_similarity

senatorsDF['happiness_cs'] = senatorsDF['centroid'].apply(lambda x: cosine_similarity_adapter(x, centroids['Alegría']))
senatorsDF['anger_cs'] = senatorsDF['centroid'].apply(lambda x: cosine_similarity_adapter(x, centroids['Enojo']))
senatorsDF['disgust_cs'] = senatorsDF['centroid'].apply(lambda x: cosine_similarity_adapter(x, centroids['Repulsión']))
senatorsDF['surprise_cs'] = senatorsDF['centroid'].apply(lambda x: cosine_similarity_adapter(x, centroids['Sorpresa']))
senatorsDF['sadness_cs'] = senatorsDF['centroid'].apply(lambda x: cosine_similarity_adapter(x, centroids['Tristeza']))
senatorsDF['fear_cs'] = senatorsDF['centroid'].apply(lambda x: cosine_similarity_adapter(x, centroids['Miedo']))

In [77]:
senatorsDF.columns

Index(['senator', 'party', 'centroid', 'happiness_cs', 'anger_cs',
       'disgust_cs', 'surprise_cs', 'sadness_cs', 'fear_cs'],
      dtype='object')

### Regression models

In [None]:
#run models

In [None]:
#visualizations