In [4]:
# Helpers for Spanish NLP
import spacy
# Module-level spaCy pipeline; the tokenizer/normalizer helpers defined in this cell
# call it directly or take it as their default `model` argument.
nlp_es = spacy.load('es_core_news_sm')

def normalizeTokens_es(word_list, extra_stop=None, lemma=True):
    """Normalize Spanish text into a list of lower-cased content tokens.

    The input is lower-cased, run through the module-level ``nlp_es`` pipeline
    (with the parser and NER components disabled) and filtered: stop words,
    punctuation, number-like tokens and whitespace-only tokens are dropped.

    Parameters
    ----------
    word_list : str or list
        Either raw text, or a list of tokens. A list is joined with single
        spaces before parsing; a one-element list is unwrapped to its element.
    extra_stop : iterable of str, optional
        Additional words to treat as stop words. Each word is flagged in
        ``nlp_es.vocab`` both as given and in lower case, because the text is
        lower-cased before parsing and would otherwise never match a stop word
        containing uppercase letters. The flag is set on the shared
        vocabulary, so it remains set for later calls as well.
    lemma : bool, default True
        If True, return each kept token's lemma; otherwise return its
        stripped surface text.

    Returns
    -------
    list of str
        The normalized tokens, in document order.
    """
    # Unwrap a single-element list so its only element is used as the input.
    if isinstance(word_list, list) and len(word_list) == 1:
        word_list = word_list[0]

    # A longer list of tokens is re-assembled into one space-separated string.
    if isinstance(word_list, list):
        word_list = ' '.join(str(elem) for elem in word_list)

    # Register the extra stop words before reading `is_stop` from any token.
    for stopword in extra_stop or ():
        for form in {stopword, stopword.lower()}:
            nlp_es.vocab[form].is_stop = True

    doc = nlp_es(word_list.lower(), disable=['parser', 'ner'])

    def _keep(token):
        # Whitespace-only tokens (including a bare '\n') strip to the empty string.
        return (
            bool(token.text.strip())
            and not token.is_stop
            and not token.is_punct
            and not token.like_num
        )

    if lemma:
        return [str(token.lemma_) for token in doc if _keep(token)]
    return [str(token.text.strip()) for token in doc if _keep(token)]

def word_tokenize_es(word_list):
    """Tokenize text with the module-level ``nlp_es`` pipeline.

    Despite the parameter name, ``word_list`` is passed to the pipeline as-is.
    Returns the surface text of every token that is neither punctuation nor
    whitespace-only, in document order.
    """
    return [
        tok.text
        for tok in nlp_es(word_list)
        if not (tok.is_punct or len(tok.text.strip()) == 0)
    ]

def sent_tokenize_es(word_list, model=nlp_es):
    """Split text into sentences using ``model`` (the Spanish pipeline by default).

    Returns a list with the stripped text of each sentence span the model
    produces, in document order.
    """
    stripped_sentences = []
    for span in model(word_list).sents:
        stripped_sentences.append(span.text.strip())
    return stripped_sentences

def normalize(vector):
    """Scale ``vector`` to unit Euclidean (L2) length.

    Parameters
    ----------
    vector : array-like
        The vector to scale.

    Returns
    -------
    numpy.ndarray
        ``vector`` divided by its norm. A vector whose norm is zero is
        returned with its values unchanged (all zeros) instead of NaN.
    """
    norm = np.linalg.norm(vector)
    # Dividing by a zero norm would yield NaN and silently poison any sum built
    # from the result; divide by one (of the same scalar type) in that case.
    safe_norm = norm if norm != 0 else type(norm)(1)
    return vector / safe_norm

def dimension(model, positives, negatives):
    """Build a direction vector contrasting two groups of words.

    Each word is looked up in ``model`` by indexing, scaled with ``normalize``,
    and the scaled vectors are summed per group. The result is the positive
    sum minus the negative sum.
    """
    positive_total = sum([normalize(model[word]) for word in positives])
    negative_total = sum([normalize(model[word]) for word in negatives])
    return positive_total - negative_total


In [2]:
# initialize corpus df

#get the text from the corpus

import os 
import pandas
from tqdm import tqdm

# Single definition of the corpus location (it was previously repeated inline).
# NOTE(review): this is an absolute, machine-specific path; consider making it configurable.
CORPUS_DIR = r"C:\Users\asarr\Documents\MACSS\Winter 2024\Computational Content Analysis\corpus\Actas comisiones del senado 2023\clean documents"

# Read every .txt file in the corpus directory into parallel 'name' / 'text' columns.
senateDict = {'name' : [], 'text' : []}
# os.listdir returns entries in arbitrary order; sorting keeps the row order reproducible.
for file in sorted(os.listdir(CORPUS_DIR)):
    if file.endswith(".txt"):
        with open(os.path.join(CORPUS_DIR, file), 'r', encoding = 'utf8') as f:
            contents = f.read()
        # Append to both columns only after a successful read so they stay aligned.
        senateDict['name'].append(file)
        senateDict['text'].append(contents)


senateDF = pandas.DataFrame(senateDict)
# For each document: split into sentences, then split each sentence into tokens.
senateDF['tokenized_sents'] = senateDF['text'].apply(lambda x: [word_tokenize_es(s) for s in sent_tokenize_es(x)])
# Normalize each tokenized sentence, keeping surface forms rather than lemmas.
senateDF['normalized_sents'] = senateDF['tokenized_sents'].apply(lambda x: [normalizeTokens_es(s, lemma=False) for s in x])

senateDF

Unnamed: 0,name,text,tokenized_sents,normalized_sents
0,gaceta_1421.txt,Vamos a darle la palabra a nuestra querida Vic...,"[[Vamos, a, darle, la, palabra, a, nuestra, qu...","[[darle, palabra, querida, vicepresidenta, olg..."
1,gaceta_144.txt,La Presidencia ofrece el uso de la palabra al ...,"[[La, Presidencia, ofrece, el, uso, de, la, pa...","[[presidencia, ofrece, palabra, doctor, enriqu..."
2,gaceta_37.txt,Tiene el uso de la palabra el Senador Iván Cep...,"[[Tiene, el, uso, de, la, palabra, el, Senador...","[[palabra, senador, iván, cepeda], [interviene..."
3,gaceta_38.txt,"El Presidente, Honorable Senador Jhon Harold S...","[[El, Presidente, Honorable, Senador, Jhon, Ha...","[[presidente, honorable, senador, jhon, harold..."
4,gaceta_43.txt,La Presidencia concede el uso de la palabra a ...,"[[La, Presidencia, concede, el, uso, de, la, p...","[[presidencia, concede, palabra, honorables, c..."
5,gaceta_44.txt,La Presidencia concede el uso de la palabra al...,"[[La, Presidencia, concede, el, uso, de, la, p...","[[presidencia, concede, palabra, honorable, se..."
6,gaceta_46.txt,"Atendiendo instrucciones de la Presidencia, la...","[[Atendiendo, instrucciones, de, la, Presidenc...","[[atendiendo, instrucciones, presidencia, secr..."
7,gaceta_47.txt,La Presidencia concede el uso de la palabra al...,"[[La, Presidencia, concede, el, uso, de, la, p...","[[presidencia, concede, palabra, citante, hono..."
8,gaceta_47_2020.txt,"Bueno, se abre la discusión del Orden del Día,...","[[Bueno, se, abre, la, discusión, del, Orden, ...","[[abre, discusión, orden, cierra, discusión, o..."
9,gaceta_48.txt,"En ese orden de ideas, cedemos el uso de la pa...","[[En, ese, orden, de, ideas, cedemos, el, uso,...","[[orden, ideas, cedemos, palabra, senadores, c..."


In [None]:
# Senators dataset

# TODO: get names using ChatGPT

In [6]:
#create word2vec model
import gensim
import numpy as np

# Flatten the per-document lists of sentences into one list of token lists.
# A comprehension avoids the repeated list concatenation that Series.sum() performs.
senate_sentences = [
    sentence
    for doc_sentences in senateDF['normalized_sents']
    for sentence in doc_sentences
]

# sg=0 selects the CBOW training algorithm; every other setting is left at its default.
senateW2V_CBOW = gensim.models.word2vec.Word2Vec(senate_sentences, sg=0)

#create dimensions
# Each dimension is (sum of unit vectors of the first word list) minus
# (sum of unit vectors of the second word list).
# NOTE(review): every word listed here must exist in the trained vocabulary;
# a missing word presumably raises KeyError on lookup — verify against the corpus.
Gender = dimension(senateW2V_CBOW.wv, ['mujeres', 'mujer'], ['hombres', 'hombre'])
Age = dimension(senateW2V_CBOW.wv, ['joven', 'jóvenes', 'niños', 'niñas'], ['mayores', 'adulto', 'adultos'])
Conflict = dimension(senateW2V_CBOW.wv, ['paz'], ['conflicto', 'violencia'])

#create projections

In [None]:
#add emotionality feature

In [None]:
#run models

In [None]:
#visualizations