# Creación de tablas de bias e intensity para cada noticia

In [1]:
import logging
import os
import sys

import pandas as pd

# The local frameaxis package must be on sys.path before importing from semaxis.
sys.path.insert(0, '../frameaxis')
from semaxis import CoreUtil
from semaxis import SemAxis

In [2]:
# Root logging setup: timestamped records at INFO level and above.
logging.basicConfig(
    format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)

# Notebook-level logger; the table builders below use it for progress messages.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

In [4]:
# Input files and run label used by this notebook.
# Embeddings file in word2vec text format (it is loaded below with is_binary=False).
EMBEDDINGS_PATH = "embeddings/embeddings-l-model.vec" # alternative path noted by the author: ../data/embeddings/glove.840B.300d.w2vformat.txt
# File with the axes that is passed to CoreUtil.load_axes_file when the model is built.
AXES_PATH = "axes/antonimos_combinados.tsv"
# NOTE(review): VERSION is not referenced in the visible cells — confirm it is still needed.
VERSION = "V3"

Comenzamos leyendo el dataset de noticias

In [14]:
# Load the news sample; the table builders below read its uri, body and summary columns.
news_df = pd.read_csv("dataset/news_sample_with_summaries.csv")

In [15]:
# Preview the first rows to check which columns were loaded.
news_df.head()

Unnamed: 0,uri,title,body,outlet,summary,summary_claude,summary_gemini
0,7147072310,La Florida pide al Gobierno y al SII que autop...,"El alcalde de La Florida, Rodolfo Carter, soli...",BioBioChile,"El alcalde de La Florida, Rodolfo Carter, ha s...","\n\nEl alcalde de La Florida, Rodolfo Carter,...","El alcalde de La Florida, Rodolfo Carter, exig..."
1,7263576433,4 de cada 10 departamentos arrendados en centr...,Un estudio realizado por la Cámara Chilena de ...,BioBioChile,Un estudio de la Cámara Chilena de la Construc...,\n\nUn estudio de la Cámara Chilena de la Con...,Un estudio de la Cámara Chilena de la Construc...
2,7124645893,Rehabilitación en casa: Teletón implementa pla...,Una plataforma de telerrehabilitación creada p...,BioBioChile,"Una startup chilena, Trainfes, lanzará una pla...",\n\nUna startup chilena ha creado una platafo...,Una startup chilena implementará este año en l...
3,6955202611,Brote covid entre trabajadores obliga a suspen...,3 han sido los establecimientos que presentaro...,BioBioChile,Tres establecimientos en la región de Los Lago...,\n\nTres establecimientos de la región de Los...,"En la región de Los Lagos, tres establecimient..."
4,7135704010,Dólar cae de los $900 de la mano de la depreci...,El tipo de cambio está en el soporte clave de ...,BioBioChile,El tipo de cambio del dólar se encuentra en un...,\n\nEl tipo de cambio del dólar se encuentra ...,El tipo de cambio del dólar en Chile se encuen...


Inicializamos el modelo con el embedding y el diccionario de antónimos (o ejes)

In [7]:
# Build the SemAxis model from the text-format (non-binary) embedding and the axes file.
# No intermediate variables are kept, so the embedding is only referenced by the model.
frameaxis = SemAxis(
    CoreUtil.load_embedding(EMBEDDINGS_PATH, is_binary=False),
    axes_str=CoreUtil.load_axes_file(AXES_PATH),
)

2025-06-20 19:10:30 gensim.models.keyedvectors INFO     loading projection weights from embeddings/embeddings-l-model.vec
2025-06-20 19:33:06 gensim.utils INFO     KeyedVectors lifecycle event {'msg': 'loaded (1313423, 300) matrix of type float32 from embeddings/embeddings-l-model.vec', 'binary': False, 'encoding': 'utf8', 'datetime': '2025-06-20T19:33:06.599776', 'gensim': '4.1.2', 'python': '3.8.0 (tags/v3.8.0:fa919fd, Oct 14 2019, 19:37:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'load_word2vec_format'}
2025-06-20 19:33:06 semaxis      ERROR    ('a cuatro aguas', 'a dos aguas') axis is not included in embedding
Traceback (most recent call last):
  File "../frameaxis\semaxis.py", line 29, in _build_axes_on_embedding
    mapped_axes[axis] = CoreUtil.map_axis_to_vec(self.embedding, axis)
  File "../frameaxis\core.py", line 19, in map_axis_to_vec
    return (emb[axis[1]] - emb[axis[0]])
  File "c:\Users\fuent\AppData\Local\pypoetry\Cache\virtualen

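Guardamos el objeto en disco (pickle) para no tener que reconstruirlo a partir del embedding en cada ejecución (la carga anterior tardó más de 20 minutos según el log); la celda siguiente lo recupera desde ese archivo.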

In [None]:
import pickle

# Location of the pickled SemAxis object.
FRAMEAXIS_CACHE_PATH = 'frameaxis_object.p'

if os.path.exists(FRAMEAXIS_CACHE_PATH):
    # Reuse the cached model. The context manager closes the file handle,
    # which a bare open() inside pickle.load() would leave open.
    # NOTE: pickle.load can execute arbitrary code; only load a cache file
    # that was produced by this notebook.
    with open(FRAMEAXIS_CACHE_PATH, 'rb') as cache_file:
        frameaxis = pickle.load(cache_file)
else:
    # No cache yet: persist the `frameaxis` object built in the previous cell
    # so later sessions can load it instead of failing with FileNotFoundError.
    with open(FRAMEAXIS_CACHE_PATH, 'wb') as cache_file:
        pickle.dump(frameaxis, cache_file)

### Construcción de tabla con los bias por cada eje

In [None]:
# Header labels for the output tables: the string form of every 2-element key
# of frameaxis.axes, taken in sorted key order.
COLUMNS = [str(c) for c in sorted(frameaxis.axes.keys()) if len(c) == 2]

def create_bias_table(table_path, col_to_process):
    """Write a tab-separated table with one row of per-axis values per news item.

    The first line is a header made of ``uri`` followed by the labels in the
    module-level ``COLUMNS``. Every following line holds the ``uri`` of one
    row of the module-level ``news_df`` and the first element returned by
    ``frameaxis.compute_document_mean_with_tf`` for that row's preprocessed
    text (called with ``min_freq=1``).

    Rows whose text cannot be processed are reported through ``logger`` and
    left out of the table; processing continues with the next row.

    Args:
        table_path: Destination of the TSV file. It is overwritten if it
            already exists, and missing parent directories are created.
        col_to_process: Name of the ``news_df`` column that holds the text.

    Returns:
        None. The result is the file written at ``table_path``.
    """
    # Create the output folder up front so a fresh checkout does not fail on open().
    parent_dir = os.path.dirname(table_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(table_path, "w", encoding='utf-8') as bias_table:
        bias_table.write("{}\t{}\n".format("uri", "\t".join(COLUMNS)))

        for loop_index, (row_index, row) in enumerate(news_df.iterrows()):
            # Progress message every 100 rows.
            if loop_index % 100 == 0:
                logger.info(loop_index)

            try:
                preprocessed_text = CoreUtil.preprocess(row[col_to_process])
                mean = frameaxis.compute_document_mean_with_tf([preprocessed_text], min_freq=1)[0]
            except Exception as e:
                # Best effort: a row that fails is skipped, and the failure goes
                # through the same logger as the progress messages.
                logger.warning(f"Error en loop index {loop_index}, fila {row_index}: {e}")
                continue

            # NOTE(review): assumes `mean` has one value per entry of COLUMNS, in
            # the same order as the header — confirm against semaxis.
            bias_table.write("{}\t{}\n".format(
                str(row['uri']),
                "\t".join(str(value) for value in mean)
            ))

In [17]:
# One bias table per text source: the article body first, then each summary column.
for table_path, text_column in (
    ("results/bias_table_body.tsv", "body"),
    ("results/bias_table_gpt_summaries.tsv", "summary"),
    ("results/bias_table_claude_summaries.tsv", "summary_claude"),
    ("results/bias_table_gemini_summaries.tsv", "summary_gemini"),
):
    create_bias_table(table_path, text_column)

2025-06-20 19:48:43 __main__     INFO     0
2025-06-20 19:49:00 __main__     INFO     100
2025-06-20 19:49:12 __main__     INFO     200
2025-06-20 19:49:30 __main__     INFO     300
2025-06-20 19:49:43 __main__     INFO     400
2025-06-20 19:49:58 __main__     INFO     500
2025-06-20 19:50:10 __main__     INFO     600
2025-06-20 19:50:22 __main__     INFO     700
2025-06-20 19:50:32 __main__     INFO     800
2025-06-20 19:50:42 __main__     INFO     900
2025-06-20 19:50:59 __main__     INFO     1000
2025-06-20 19:51:08 __main__     INFO     1100
2025-06-20 19:51:15 __main__     INFO     1200
2025-06-20 19:51:28 __main__     INFO     1300
2025-06-20 19:51:42 __main__     INFO     1400
2025-06-20 19:51:58 __main__     INFO     0
2025-06-20 19:52:06 __main__     INFO     100
2025-06-20 19:52:13 __main__     INFO     200
2025-06-20 19:52:20 __main__     INFO     300
2025-06-20 19:52:26 __main__     INFO     400
2025-06-20 19:52:34 __main__     INFO     500
2025-06-20 19:52:41 __main__     

### Cálculo de intensity

In [18]:
import numpy as np

In [None]:
# Header labels for the intensity tables: the string form of every 2-element
# key of frameaxis.axes, in sorted key order. This repeats the expression used
# for the bias tables, so the cell also works when run on its own.
COLUMNS = [str(c) for c in sorted(frameaxis.axes.keys()) if len(c) == 2]

def create_intensity_table(table_path, model=None):
    """Write a tab-separated table with one row of per-axis intensity values per news item.

    The function first reads the bias table previously written for the same
    text source and averages each axis column over all rows. That corpus mean
    is then passed, together with each preprocessed text of the module-level
    ``news_df``, to ``frameaxis.compute_document_second_moment_with_tf``
    (called with ``min_freq=1``); the first element of its result is written
    as one row.

    Text source selection:
        * ``model`` falsy: ``results/bias_table_body.tsv`` and column ``body``.
        * ``model == 'gpt'``: ``results/bias_table_gpt_summaries.tsv`` and
          column ``summary``.
        * any other ``model``: ``results/bias_table_{model}_summaries.tsv``
          and column ``summary_{model}``.

    Rows whose text cannot be processed are reported through ``logger`` and
    left out of the table; processing continues with the next row.

    Args:
        table_path: Destination of the TSV file. It is overwritten if it
            already exists, and missing parent directories are created.
        model: Optional name of the summarisation model, or ``None`` to
            process the article bodies.

    Returns:
        None. The result is the file written at ``table_path``.
    """
    # Resolve the matching bias table and text column in one place.
    if model:
        bias_table_path = f"results/bias_table_{model}_summaries.tsv"
        col_to_process = 'summary' if model == 'gpt' else f'summary_{model}'
    else:
        bias_table_path = "results/bias_table_body.tsv"
        col_to_process = 'body'

    df_corpus = pd.read_csv(bias_table_path, sep="\t", dtype={'uri': str})

    # For each microframe, average the bias over all texts. Only columns whose
    # name contains '(' (the stringified axis tuples) are kept, which leaves
    # out 'uri'.
    axis_columns = [c for c in df_corpus.columns if '(' in c]
    corpus_mean = np.mean(df_corpus[axis_columns].values, axis=0)

    # Create the output folder up front so a fresh checkout does not fail on open().
    parent_dir = os.path.dirname(table_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(table_path, "w", encoding='utf-8') as intensity_table:
        intensity_table.write("{}\t{}\n".format("uri", "\t".join(COLUMNS)))

        for loop_index, (row_index, row) in enumerate(news_df.iterrows()):
            # Progress message every 100 rows.
            if loop_index % 100 == 0:
                logger.info(loop_index)

            try:
                preprocessed_text = CoreUtil.preprocess(row[col_to_process])
                sm = frameaxis.compute_document_second_moment_with_tf([preprocessed_text], corpus_mean, min_freq=1)
            except Exception as e:
                # Best effort: a row that fails is skipped, and the failure goes
                # through the same logger as the progress messages.
                logger.warning(f"Error en loop index {loop_index}, fila {row_index}: {e}")
                continue

            # NOTE(review): assumes sm[0] has one value per entry of COLUMNS, in
            # the same order as the header — confirm against semaxis.
            intensity_table.write("{}\t{}\n".format(
                row['uri'],
                "\t".join(str(value) for value in sm[0])
            ))

In [24]:
# One intensity table per text source: the article body (no model) first,
# then each summarisation model.
for table_path, model_name in (
    ("results/intensity_table_body.tsv", None),
    ("results/intensity_table_gpt_summaries.tsv", "gpt"),
    ("results/intensity_table_claude_summaries.tsv", "claude"),
    ("results/intensity_table_gemini_summaries.tsv", "gemini"),
):
    create_intensity_table(table_path, model_name)

2025-06-22 14:44:00 __main__     INFO     0
2025-06-22 14:44:39 __main__     INFO     100
2025-06-22 14:45:16 __main__     INFO     200
2025-06-22 14:46:03 __main__     INFO     300
2025-06-22 14:46:43 __main__     INFO     400
2025-06-22 14:47:27 __main__     INFO     500
2025-06-22 14:48:02 __main__     INFO     600
2025-06-22 14:48:36 __main__     INFO     700
2025-06-22 14:49:02 __main__     INFO     800
2025-06-22 14:49:28 __main__     INFO     900
2025-06-22 14:50:18 __main__     INFO     1000
2025-06-22 14:50:47 __main__     INFO     1100
2025-06-22 14:51:01 __main__     INFO     1200
2025-06-22 14:51:12 __main__     INFO     1300
2025-06-22 14:51:24 __main__     INFO     1400
2025-06-22 14:51:38 __main__     INFO     0
2025-06-22 14:51:45 __main__     INFO     100
2025-06-22 14:51:52 __main__     INFO     200
2025-06-22 14:51:59 __main__     INFO     300
2025-06-22 14:52:06 __main__     INFO     400
2025-06-22 14:52:13 __main__     INFO     500
2025-06-22 14:52:20 __main__     