# Generación de resúmenes de noticias

## Instalación de librerías

In [33]:
!pip install sacrebleu
!pip install openai
!pip install git+https://github.com/google-research/bleurt.git
!pip install evaluate
!pip install evaluate nltk rouge-score absl-py
!pip install ipywidgets

Collecting git+https://github.com/google-research/bleurt.git
  Cloning https://github.com/google-research/bleurt.git to c:\users\asus\appdata\local\temp\pip-req-build-f74ijc_4
  Resolved https://github.com/google-research/bleurt.git to commit cebe7e6f996b40910cfaa520a63db47807e3bf5c
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'


  Running command git clone --filter=blob:none --quiet https://github.com/google-research/bleurt.git 'C:\Users\Asus\AppData\Local\Temp\pip-req-build-f74ijc_4'


Collecting ipywidgets
  Downloading ipywidgets-8.1.7-py3-none-any.whl.metadata (2.4 kB)
Collecting comm>=0.1.3 (from ipywidgets)
  Using cached comm-0.2.2-py3-none-any.whl.metadata (3.7 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Downloading widgetsnbextension-4.0.14-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl.metadata (20 kB)
Downloading ipywidgets-8.1.7-py3-none-any.whl (139 kB)
   ---------------------------------------- 0.0/139.8 kB ? eta -:--:--
   ---------------------------------------- 139.8/139.8 kB 4.2 MB/s eta 0:00:00
Using cached comm-0.2.2-py3-none-any.whl (7.2 kB)
Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl (216 kB)
   ---------------------------------------- 0.0/216.6 kB ? eta -:--:--
   ---------------------------------------  215.0/216.6 kB 4.4 MB/s eta 0:00:01
   ---------------------------------------- 216.6/216.6 kB 3.3 MB/s eta 0:00:0

In [None]:

import pandas as pd
import sys
import json
from openai import OpenAI
import openai
import os
from dotenv import load_dotenv
from IPython.display import Markdown, display
import numpy as np
import sacrebleu
from evaluate import load

## Lectura de CSV

In [34]:
# Read the CSV file that holds our df_contenidos table.
data = pd.read_csv(
    '../data/csv/df_contenidos.csv',
    encoding='utf-8',
)

In [41]:
# Seed NumPy's global random generator before drawing the sample.
np.random.seed(15)

# Keep a random sample of 10 rows of df_contenidos.
data = data.sample(n=10)

In [42]:
data # Display the sampled rows.

Unnamed: 0,id_contenido,titulo,texto
26,755,Sanders Trounces Clinton in W. Va. -- But Will...,"Meanwhile, Democrat Bernie Sanders picked up m..."
3310,4300,Round 2: GOP rivals try to ding Trump at debat...,Donald Trump once again found himself the ligh...
4939,3316,VFW fires back at Obama: Politics not 'confused',The nation’s largest veterans group hit back a...
3846,4198,Take a deep breath. The Republican contest sti...,"Writing at the Hill, analyst Mark Plotkin figu..."
4667,7371,But How Do You Use Nonviolence Against a Nuke?,"The ""Baker"" explosion, part of Operation Cross..."
3411,8091,ISLAM NOT WELCOME: Obama just got TERRIBLE NEW...,ISLAM NOT WELCOME: Obama just got TERRIBLE NEW...
5022,3735,Waco biker gang shootout kills 9 outside Twin ...,"Waco, Texas (CNN) A memo has gone out to law e..."
3950,7076,Oathkeeper Chapter 8,Home » Headlines » World News » Oathkeeper Cha...
3061,3321,VA Secretary Robert McDonald admits lying abou...,Veterans Affairs Secretary Robert McDonald on ...
2223,8078,A very accurate pisstake of craft beer culture,Next Prev Swipe left/right A very accurate pis...


# Definición de Función con Prompt

Configuramos la API de OpenAI y definimos una función que contiene el prompt con los parámetros que queremos.

Utilizamos el modelo "gpt-4o-mini", ajustamos la temperatura a 0 y escribimos el prompt en inglés para optimizar nuestros resultados.

In [43]:
# Load the variables defined in the .env file.
load_dotenv()

# Create the OpenAI client with the API key read from the OPENAI_API_KEY variable.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Read the text of a news article and summarise it following the criteria in the prompt.
def obtener_tematica(texto):
    """Summarise a news article with the OpenAI chat completions API.

    The request uses the "gpt-4o-mini" model with temperature 0 and goes through
    the module-level `client`.

    Args:
        texto: Full text of the news article to summarise.

    Returns:
        The summary with surrounding whitespace stripped, or None when `texto`
        is not a non-blank string, when the API call raises an OpenAI error, or
        when the response carries no text.
    """
    # A missing or blank article (e.g. a NaN cell) cannot be summarised, so do
    # not spend an API call on it.
    if not isinstance(texto, str) or not texto.strip():
        return None

    prompt = (
        f"Summarize the following text in 3 to 5 lines '{texto}'."
        "The summary should convey the general idea of ​​the news story clearly, "
        "briefly, concisely, and in the original language. "
        "Do not include unnecessary details. "
        "The summary should use the language of the news story "
        "and not include opinions or value judgments "
    )

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a model expert in journalistic writing."},
                {"role": "user", "content": prompt},
            ],
            temperature=0,
        )
    except openai.OpenAIError as e:
        # Best effort: report the failed request and let the caller carry on
        # with the remaining rows. Programming errors are no longer swallowed.
        print("Error:", e)
        return None

    contenido = response.choices[0].message.content if response.choices else None
    if contenido is None:
        print("Error:", "the response did not contain any text")
        return None
    return contenido.strip()

# Summarise the article text of every row and store the result in the 'resumen' column.
data['resumen'] = data['texto'].apply(obtener_tematica)

print(data)

      id_contenido                                             titulo  \
26             755  Sanders Trounces Clinton in W. Va. -- But Will...   
3310          4300  Round 2: GOP rivals try to ding Trump at debat...   
4939          3316   VFW fires back at Obama: Politics not 'confused'   
3846          4198  Take a deep breath. The Republican contest sti...   
4667          7371     But How Do You Use Nonviolence Against a Nuke?   
3411          8091  ISLAM NOT WELCOME: Obama just got TERRIBLE NEW...   
5022          3735  Waco biker gang shootout kills 9 outside Twin ...   
3950          7076                               Oathkeeper Chapter 8   
3061          3321  VA Secretary Robert McDonald admits lying abou...   
2223          8078     A very accurate pisstake of craft beer culture   

                                                  texto  \
26    Meanwhile, Democrat Bernie Sanders picked up m...   
3310  Donald Trump once again found himself the ligh...   
4939  The nation’s 

In [67]:
# Print every original news article with its character count and collect them in a list.

Noticia_original = []

for contador, noticia in enumerate(data['texto']):
    print('\n')
    print(f'---------------------------------------------------------Noticia numero {contador}------Numero caracteres {len(noticia)}---------------------------------------------------')
    print(noticia)
    print('\n')
    Noticia_original.append(noticia)
    



---------------------------------------------------------Noticia numero 0------Numero caracteres 1974---------------------------------------------------
Meanwhile, Democrat Bernie Sanders picked up more delegates in the two states than Hillary Clinton.

The Vermont senator's still way behind, but says he's not giving up, calling his win in West Virginia "tremendous."

Clinton still holds a commanding delegate lead, but Sanders still has the fight in him.

"We are in this campaign to win the Democratic nomination!" he declared.

But Sanders' quest appears to be almost impossible, with Clinton 94 percent of the way to winning the nomination.

"I am, if I am so fortunate enough as to be the nominee - I am looking forward to debating Donald Trump come the fall," she said.

Still, Clinton faces the FBI investigation of her email scandal.

In addition, her loss in West Virginia, a state she took in 2008, was payback for her statements in March that a lot of coal miners and coal companies w

In [None]:
# Print every generated summary with its character count and collect them in a list.

Resumen = []

for contador, resumen in enumerate(data['resumen']):
    # obtener_tematica returns None when a request fails. Fall back to an empty
    # string so len() does not raise and Resumen keeps one entry per row, in the
    # same order as the 'texto' column.
    if not isinstance(resumen, str):
        print(f'Aviso: el resumen numero {contador} no esta disponible; se usa una cadena vacia')
        resumen = ''
    print('\n')
    print(f'---------------------------------------------------------resumen numero {contador}-----Numero caracteres {len(resumen)}----------------------------------------------------')
    print(resumen)
    Resumen.append(resumen)
    



---------------------------------------------------------resumen numero 0-----Numero caracteres 639----------------------------------------------------
Democrat Bernie Sanders gained more delegates than Hillary Clinton in recent primaries, but remains far behind in the overall delegate count. Despite his West Virginia win, Clinton maintains a significant lead and is 94 percent toward securing the nomination, though she faces challenges, including an FBI investigation and discontent among Sanders supporters. On the Republican side, Donald Trump won in West Virginia and Nebraska, and is seeking a running mate while attempting to unify the party amid differences with figures like Marco Rubio. Both parties are grappling with internal divisions as they prepare for the general election.


---------------------------------------------------------resumen numero 1-----Numero caracteres 645----------------------------------------------------
During the second Republican primary debate, Donald 

## Evaluación de resultados

Para evaluar los resúmenes generados, vamos a utilizar dos métricas diferentes.

***Rouge:*** Nos devuelve 4 métricas (con valores entre 0 y 1).

- ROUGE-1: Nos indica la coincidencia entre el texto generado y el texto de referencia usando unigramas, es decir, palabras individuales.
- ROUGE-2: Toma bigramas (secuencias de 2 palabras consecutivas) y busca coincidencias.
- ROUGE-L: Evalúa la subsecuencia común más larga de palabras entre los dos textos. Las palabras deben aparecer en el mismo orden, pero no hace falta que sean consecutivas.
- ROUGE-Lsum: Parecido a ROUGE-L, pero tiene en consideración los saltos de línea como límites entre las oraciones.

***Bleurt:*** Evalúa la calidad de los resúmenes en base a su significado, fluidez y fidelidad al texto original. A diferencia de ROUGE, sus puntuaciones no están acotadas entre 0 y 1 y pueden ser negativas.

### Métrica rouge 

In [78]:
# Load the ROUGE metric and score every summary against its source article.
rouge = load("rouge")

rouge1, rouge2, rougel, rougelsum = [], [], [], []

for i, noticia in enumerate(Noticia_original):
    results = rouge.compute(predictions=[Resumen[i]], references=[noticia])
    rouge1.append(results['rouge1'])
    rouge2.append(results['rouge2'])
    rougel.append(results['rougeL'])
    rougelsum.append(results['rougeLsum'])

# Report the mean of each ROUGE variant, rounded to two decimals.
for nombre, valores in (
    ("rouge1", rouge1),
    ("rouge2", rouge2),
    ("rougel", rougel),
    ("rougelsum", rougelsum),
):
    print(f"Metrica media {nombre}: {round(sum(valores)/len(valores),2)}")

Metrica media rouge1: 0.28
Metrica media rouge2: 0.15
Metrica media rougel: 0.21
Metrica media rougelsum: 0.24


### Métrica bleurt

In [91]:
# NOTE(review): `scores` is only assigned by the BLEURT cell further down in this
# notebook, so on a fresh kernel this cell raises NameError unless that cell has
# already run. Consider moving this cell after it.
# Copy the BLEURT scores into a separate list and display it.
scores_totales = list(scores)
scores_totales


[[-0.5763309001922607],
 [-1.044709324836731],
 [-0.785205066204071],
 [-0.7942338585853577],
 [0.711263120174408],
 [-0.7248583436012268],
 [-0.6859124302864075],
 [-0.7267757058143616],
 [-0.6602555513381958],
 [-0.29049038887023926]]

In [95]:
# Load the BLEURT metric with the 'bleurt-large-512' checkpoint.
bleurt = load("bleurt", 'bleurt-large-512')

# One BLEURT score per (summary, original article) pair.
scores = []

for idx, noticia in enumerate(Noticia_original):
    results = bleurt.compute(predictions=[Resumen[idx]], references=[noticia])
    # compute() is called with a single pair, so take the first entry of 'scores'.
    scores.append(results['scores'][0])

media_bleurt = sum(scores) / len(scores)
print(f"Metrica media bleurt: {round(media_bleurt,2)}")

INFO:tensorflow:Reading checkpoint C:\Users\Asus\.cache\huggingface\metrics\bleurt\bleurt-large-512\downloads\extracted\18d1c8b7b201780d9d0902922f294dd5a847ea1253282c78908fab26d90a4c79\bleurt-large-512.


INFO:tensorflow:Reading checkpoint C:\Users\Asus\.cache\huggingface\metrics\bleurt\bleurt-large-512\downloads\extracted\18d1c8b7b201780d9d0902922f294dd5a847ea1253282c78908fab26d90a4c79\bleurt-large-512.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint bert_custom


INFO:tensorflow:Will load checkpoint bert_custom


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:... name:bert_custom


INFO:tensorflow:... name:bert_custom


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


INFO:tensorflow:Loading model.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


Metrica media bleurt: -0.56
