In [9]:
import csv
import fnmatch
import os

import dotenv
import google.generativeai as genai
import pandas as pd
import requests
from bs4 import BeautifulSoup
from langchain_core.output_parsers import StrOutputParser, CommaSeparatedListOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_google_genai import GoogleGenerativeAI

In [18]:
# Load variables from a .env file (if one is found) into the process
# environment, then read the Gemini API key from there.
dotenv.load_dotenv()
google_api_key = os.environ.get('google_api_key')

# Results page that the rest of the notebook downloads and parses.
url = 'https://uww.org/event/seniors-pan-american-championships/results'

In [3]:
# Print the name of every model that supports the `generateContent` method.
# (`genai` is imported with the other dependencies in the notebook's first cell.)
genai.configure(api_key=google_api_key)
for model_info in genai.list_models():
  if 'generateContent' in model_info.supported_generation_methods:
    print(model_info.name)

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-1.5-flash
models/gemini-1.5-flash-001
models/gemini-1.5-flash-latest
models/gemini-1.5-pro
models/gemini-1.5-pro-001
models/gemini-1.5-pro-latest
models/gemini-pro
models/gemini-pro-vision


Se usará el modelo 'models/gemini-1.5-pro', ya que está optimizado para tareas de razonamiento complejas, como la generación de código y texto, la edición de texto, la resolución de problemas y la extracción y generación de datos.

In [5]:
# LLM client used by the extraction chain further down.
model = GoogleGenerativeAI(google_api_key=google_api_key, model='models/gemini-1.5-pro')

In [17]:
# Download the results page and cache it on disk as 'index.html'.
# A timeout is set because requests.get() otherwise waits indefinitely
# on an unresponsive server.
response = requests.get(url, timeout=30)
if response.status_code == 200:
  soup = BeautifulSoup(response.content, 'html.parser')
  with open('index.html', 'w', encoding='utf-8') as file:
    # Persist the markup itself rather than soup.get_text(): the cell that
    # reads this file selects '.card-data a' elements, which only exist
    # while the tags are kept.
    file.write(str(soup))
else:
  print("Error al acceder a {}: {}".format(url, response.status_code))


In [22]:
# Load the cached results page from disk.
with open('index.html', 'r', encoding='utf-8') as src:
  html_content = src.read()

soup = BeautifulSoup(html_content, 'html.parser')

# Swap every link found inside a .card-data element for a plain <div>
# holding the same text and an empty class attribute.
card_links = soup.select('.card-data a')
for anchor in card_links:
  del anchor['href']

  replacement = soup.new_tag('div')
  replacement.string = anchor.get_text()
  replacement['class'] = ''

  anchor.replace_with(replacement)

# Save the modified document.
with open('index_mod.html', 'w', encoding='utf-8') as dst:
  dst.write(str(soup))


In [25]:
# Few-shot prompt for extracting bout results from the results-page HTML.
# `{html_str}` is the only placeholder; the text contains no other curly
# braces, so nothing else is treated as a template variable.
# The example answers must match the example HTML exactly (full athlete
# names) and the CSV header must have the same number of fields as the rows,
# because the model imitates whatever the examples show.
template = """
Del archivo siguiente, extraeme: Atleta 1, Atleta 2, Resultado, Ganador, Etapa

{html_str}

Ejemplo 1:

<div class="tabs-container-wrap">
<div class="tabs-container-group"><!-- -->
<h3 class="tabs-container-title">Gold</h3>
<div class="tabs-container-content" index="0"><!-- -->
<div class="waf-accordion-panel">
<div class="content-item" index="0">
<div class="content-wrapper">
<div class="card-meta"><!-- --> <!-- --> <!-- --> <!-- --></div>
<div class="card-content">
<div class="card-item card-a won">
<div class="card-info">
<div class="card-data"><div class="">Spencer Richard
                      LEE</div> <!-- --></div>
<div class="card-img"><img alt="" class="image ls-is-cached lazyloaded" data-src="https://athena.uww.org/public/person/24452/picture.jpg?placeholder=https://uww.org/static-assets/images/players/athlete-men.png?v=2.91" src="https://athena.uww.org/public/person/24452/picture.jpg?placeholder=https://uww.org/static-assets/images/players/athlete-men.png?v=2.91"/> <img alt="" class="logo ls-is-cached lazyloaded" data-src="/static-assets/images/flags/circle/usa.png?v=2.91" src="/static-assets/images/flags/circle/usa.png?v=2.91"/></div>
</div>
<div class="card-number">12</div>
</div>
<div class="card-status"><span class="text vs">-</span> <span class="text status"> by VSU1</span></div>
<div class="card-item card-b">
<div class="card-info">
<div class="card-data"><div class="">Pedro Jesus MEJIAS RODRIGUEZ</div>
<!-- --></div>
<div class="card-img"><img alt="" class="image ls-is-cached lazyloaded" data-src="https://athena.uww.org/public/person/7775/picture.jpg?placeholder=https://uww.org/static-assets/images/players/athlete-men.png?v=2.91" src="https://athena.uww.org/public/person/7775/picture.jpg?placeholder=https://uww.org/static-assets/images/players/athlete-men.png?v=2.91"/> <img alt="" class="logo ls-is-cached lazyloaded" data-src="/static-assets/images/flags/circle/ven.png?v=2.91" src="/static-assets/images/flags/circle/ven.png?v=2.91"/></div>
</div>
<div class="card-number">2</div>
</div>
</div>
<div class="card-action"><a class="btn-link" href="/results/1eed0571-3f64-6b7e-9feb-53288e213920"><span class="text"></span></a></div>
</div>
</div>
</div>
</div>
</div>

Respuesta de Ejemplo 1:
Atleta 1: Spencer Richard LEE
Atleta 2: Pedro Jesus MEJIAS RODRIGUEZ
Resultado Atleta 1: 12
Resultado Atleta 2: 2
Ganador: Spencer Richard LEE
Etapa: Gold

Ejemplo 2:

<div class="tabs-container-group"><!-- -->
<h3 class="tabs-container-title">Bronze</h3>
<div class="tabs-container-content" index="0"><!-- -->
<div class="waf-accordion-panel">
<div class="content-item" index="0">
<div class="content-wrapper">
<div class="card-meta"><!-- --> <!-- --> <!-- --> <!-- --></div>
<div class="card-content">
<div class="card-item card-a won">
<div class="card-info">
<div class="card-data"><div class="">Oscar
                      Eduardo TIGREROS URBANO</div> <!-- --></div>
<div class="card-img"><img alt="" class="image ls-is-cached lazyloaded" data-src="https://athena.uww.org/public/person/26594/picture.jpg?placeholder=https://uww.org/static-assets/images/players/athlete-men.png?v=2.91" src="https://athena.uww.org/public/person/26594/picture.jpg?placeholder=https://uww.org/static-assets/images/players/athlete-men.png?v=2.91"/> <img alt="" class="logo ls-is-cached lazyloaded" data-src="/static-assets/images/flags/circle/col.png?v=2.91" src="/static-assets/images/flags/circle/col.png?v=2.91"/></div>
</div>
<div class="card-number">13</div>
</div>
<div class="card-status"><span class="text vs">-</span> <span class="text status"> by VSU1</span></div>
<div class="card-item card-b">
<div class="card-info">
<div class="card-data"><div class="">Davi SILVA GIOVANNETTI</div> <!-- --></div>
<div class="card-img"><img alt="" class="image lazyloaded" data-src="https://athena.uww.org/public/person/96849/picture.jpg?placeholder=https://uww.org/static-assets/images/players/athlete-men.png?v=2.91" src="https://athena.uww.org/public/person/96849/picture.jpg?placeholder=https://uww.org/static-assets/images/players/athlete-men.png?v=2.91"/> <img alt="" class="logo lazyloaded" data-src="/static-assets/images/flags/circle/bra.png?v=2.91" src="/static-assets/images/flags/circle/bra.png?v=2.91"/></div>
</div>
<div class="card-number">2</div>
</div>
</div>
<div class="card-action"><a class="btn-link" href="/results/1eed0571-3f54-6dfa-afb2-53288e213920"><span class="text"></span></a></div>
</div>
</div>

Respuesta de Ejemplo 2:
Atleta 1: Oscar Eduardo TIGREROS URBANO
Atleta 2: Davi SILVA GIOVANNETTI
Resultado Atleta 1: 13
Resultado Atleta 2: 2
Ganador: Oscar Eduardo TIGREROS URBANO
Etapa: Bronze


Respuesta que tienes que devolvermela de la siguiente forma:

Atleta 1, Atleta 2, Resultado Atleta 1, Resultado Atleta 2, Ganador, Etapa
Spencer Richard LEE, Pedro Jesus MEJIAS RODRIGUEZ, 12, 2, Spencer Richard LEE, Gold
Oscar Eduardo TIGREROS URBANO, Davi SILVA GIOVANNETTI, 13, 2, Oscar Eduardo TIGREROS URBANO, Bronze
"""

def prompt_template_html_to_csv():
  """Build the HTML-to-CSV extraction chain.

  Wraps the module-level ``template`` string in a ``ChatPromptTemplate``
  and pipes it into the module-level ``model``.

  Returns:
    The composed chain (prompt piped into the model).
  """
  return ChatPromptTemplate.from_template(template) | model

In [26]:
# Build the chain, feed it the preprocessed page as `html_str`, and print
# whatever the model returns.
chain = prompt_template_html_to_csv()

with open('index_mod.html', 'r', encoding='utf-8') as file:
  html_content = file.read()

result = chain.invoke({'html_str': html_content})
print(result)

Atleta 1, Atleta 2, Resultado Atleta 1, Resultado Atleta 2, Ganador, Etapa,
Spencer Richard LEE, Pedro Jesus MEJIAS RODRIGUEZ, 12, 2, Spencer Richard LEE, Gold,
Oscar Eduardo TIGREROS URBANO, Davi SILVA GIOVANNETTI, 13, 2, Oscar Eduardo TIGREROS URBANO, Bronze,
Guesseppe Ricardo REA VILLARROEL, Jaime Isaac PEREZ CASTELLANOS, - , - , Guesseppe Ricardo REA VILLARROEL, Bronze,
Pedro Jesus MEJIAS RODRIGUEZ, Guesseppe Ricardo REA VILLARROEL, - , - , Pedro Jesus MEJIAS RODRIGUEZ, Semifinal,
Spencer Richard LEE, Davi SILVA GIOVANNETTI, 10, 0, Spencer Richard LEE, Semifinal,
Spencer Richard LEE, Oscar Eduardo TIGREROS URBANO, 10, 0, Spencer Richard LEE, Quarterfinal,
Davi SILVA GIOVANNETTI, Jorge Alberto OLVERA RODRIGUEZ, 9, 9, Davi SILVA GIOVANNETTI, Quarterfinal,
Pedro Jesus MEJIAS RODRIGUEZ, Jaime Isaac PEREZ CASTELLANOS, 11, 0, Pedro Jesus MEJIAS RODRIGUEZ, Quarterfinal,
Guesseppe Ricardo REA VILLARROEL, Peter Lewis HAMMER CUDE, 13, 11, Guesseppe Ricardo REA VILLARROEL, Quarterfinal,
Oscar