In [8]:
from curl_cffi import requests
from typing import List
import pdfplumber
import io
import re
import pandas as pd

swimmer_re = re.compile(r'^(\d+)\s+(\d+)\s+(\d+)\s+(.*?)\s+([A-Z-]{2,})\s+', re.MULTILINE)
splits_re = re.compile(r'50m\s+(\d+:\d+\.\d+|\d+\.\d+)\s+100m\s+(\d+:\d+\.\d+|\d+\.\d+)\s+150m\s+(\d+:\d+\.\d+|\d+\.\d+)\s+200m\s+(\d+:\d+\.\d+|\d+\.\d+)\s+250m\s+(\d+:\d+\.\d+|\d+\.\d+)\s+300m\s+(\d+:\d+\.\d+|\d+\.\d+)\s+350m\s+(\d+:\d+\.\d+|\d+\.\d+)')

def time_to_seconds(time):
    if not isinstance(time, str) or time == "None":
        return None    
    time_str = time.strip()
    if ":" in time_str:
        parts = time.split(":")
        minutes = float(parts[0])
        seconds = float(parts[1])
        return (minutes * 60) + seconds
    else:
        return float(time)

def scrape_omega(links: list) -> pd.DataFrame:
    data = []
    for link in links:
        response = requests.get(link, impersonate="chrome110", timeout=30)
        with pdfplumber.open(io.BytesIO(response.content)) as pdf:
            full_text = ""
            for page in pdf.pages:
                full_text += page.extract_text() + "\n"    
            lines = full_text.split('\n')
            for i in range(len(lines)):
                swimmer_match = swimmer_re.match(lines[i])
                if swimmer_match:
                    groups = swimmer_match.groups()
                    rank = groups[0]
                    heat = groups[1]
                    lane = groups[2]
                    name = groups[3]
                    club = groups[4]                
                    line_parts = lines[i].split()                    
                    rt = line_parts[-3] if len(line_parts) >= 8 else None
                    final_time = line_parts[-2] if len(line_parts) >= 8 else None
                    if i+1 < len(lines):
                        splits_match = splits_re.search(lines[i+1])
                        if splits_match:
                            s50, s100, s150, s200, s250, s300, s350 = splits_match.groups()
                            data.append({
                                "Rank": rank,
                                "Name": name.strip(),
                                "Lane": lane,        
                                "Reaction Time": rt, 
                                "50m": s50, "100m": s100, "150m": s150, "200m": s200,
                                "250m": s250, "300m": s300, "350m": s350, 
                                "Final Time": final_time, 
                            })
        df = pd.DataFrame(data)
        time_cols = [c for c in df.columns if "m" in c or "Final" in c]        
        for col in time_cols:
            df[f"{col}"] = df[col].apply(time_to_seconds)
        return df
    

men_pdf_links = ['https://www.omegatiming.com/File/00011A00000101EF0101FFFFFFFFFF01.pdf']
women_pdf_links = ['https://www.omegatiming.com/File/00011A00000201EF0101FFFFFFFFFF01.pdf']


men_df = scrape_omega(men_pdf_links)
print(men_df.head())
#print(men_df.tail(5))

women_df = scrape_omega(women_pdf_links)
print(women_df.head())

ValueError: could not convert string to float: 'WIFFEN Daniel'