In [11]:
!pip install -q transformers==4.51.3 datasets==2.14.4 torch==2.6.0 ipywidgets==7.7.1 rouge_score


In [23]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.31.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.1 (from gradio)
  Downloading gradio_client-1.10.1-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.

In [12]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [14]:
import os
base_path = '/content/drive/MyDrive/BBC News Summary'
articles_dir = os.path.join(base_path, 'News Articles')
model_save_path = '/content/drive/MyDrive/News_Generator_Final'
os.makedirs(model_save_path, exist_ok=True)


In [4]:
import pandas as pd
import glob
from datetime import datetime

def cargar_articulos(categoria):
    articulos = []
    for archivo in glob.glob(os.path.join(articles_dir, categoria, '*.txt')):
        with open(archivo, 'r', encoding='latin-1') as f:
            contenido = f.read().split('\n')
            titulo = contenido[0].strip()
            cuerpo = ' '.join([linea.strip() for linea in contenido[1:] if linea.strip()])
            articulos.append({
                'category': categoria.upper(),
                'title': titulo,
                'content': cuerpo[:1800]
            })
    return articulos

categorias = ['business', 'entertainment', 'politics', 'sport', 'tech']
df = pd.DataFrame([art for cat in categorias for art in cargar_articulos(cat)])


In [5]:
from datetime import datetime as dt
hoy_str = dt.now().strftime('%Y-%m-%d')

df['text'] = df.apply(
    lambda x: f"""\
[{x['category']}]
FECHA: {hoy_str}
TITULAR: {x['title']}
CONTENIDO: {x['content']}
---""",
    axis=1
)


In [6]:
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

def funcion_tokenizacion(ejemplos):
    return tokenizer(
        ejemplos['text'],
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

from datasets import Dataset
dataset = Dataset.from_pandas(df[['text']])
dataset = dataset.map(funcion_tokenizacion, batched=True, batch_size=32, remove_columns=['text'])

dataset = dataset.train_test_split(test_size=0.1)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Map:   0%|          | 0/2225 [00:00<?, ? examples/s]

In [20]:
from transformers import GPT2LMHeadModel, TrainingArguments, Trainer, DataCollatorForLanguageModeling

modelo = GPT2LMHeadModel.from_pretrained('distilgpt2')
modelo.resize_token_embeddings(len(tokenizer))

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

argumentos_entrenamiento = TrainingArguments(
    output_dir='./resultados',
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    eval_steps=500,
    save_steps=500,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=True,
    report_to="none"
)

entrenador = Trainer(
    model=modelo,
    args=argumentos_entrenamiento,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator
)



In [15]:
print(" Iniciando entrenamiento...")
entrenador.train()


🚀 Iniciando entrenamiento...


Step,Training Loss
500,2.9191
1000,2.8551
1500,2.8066
2000,2.7741
2500,2.753


TrainOutput(global_step=2500, training_loss=2.821571728515625, metrics={'train_runtime': 767.7368, 'train_samples_per_second': 26.077, 'train_steps_per_second': 3.256, 'total_flos': 2605912493654016.0, 'train_loss': 2.821571728515625, 'epoch': 9.962075848303392})

In [21]:
model_save_path = "/content/drive/MyDrive/News_Generator_Final"

entrenador.model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)



('/content/drive/MyDrive/News_Generator_Final/tokenizer_config.json',
 '/content/drive/MyDrive/News_Generator_Final/special_tokens_map.json',
 '/content/drive/MyDrive/News_Generator_Final/vocab.json',
 '/content/drive/MyDrive/News_Generator_Final/merges.txt',
 '/content/drive/MyDrive/News_Generator_Final/added_tokens.json')

In [24]:
import gradio as gr

def generar_desde_gradio(categoria, longitud, temperatura):
    prompt = f"[{categoria}]\nFECHA: {dt.now().strftime('%Y-%m-%d')}\nTITULAR:"
    generado = generador(
        prompt,
        max_length=longitud,
        temperature=temperatura,
        top_p=0.92,
        repetition_penalty=1.35,
        num_return_sequences=1,
        no_repeat_ngram_size=2
    )
    contenido = generado[0]['generated_text'].split("CONTENIDO:", 1)[-1].strip()
    return contenido

categorias_opciones = [c.upper() for c in categorias]

gr.Interface(
    fn=generar_desde_gradio,
    inputs=[
        gr.Dropdown(choices=categorias_opciones, label="Categoría"),
        gr.Slider(200, 800, step=50, value=400, label="Longitud"),
        gr.Slider(0.3, 1.2, step=0.05, value=0.72, label="Creatividad")
    ],
    outputs=gr.Textbox(lines=20, label="Noticia Generada"),
    title="📰 Generador de Noticias BBC",
    description="Modelo de texto entrenado con artículos BBC | DistilGPT2"
).launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8a3d970e9282274c9f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


