In [3]:
from curl_cffi import requests
from typing import List
import pdfplumber
import io
import re
import pandas as pd

swimmer_re = re.compile(r'^(\d+)\s+(\d+)\s+(\d+)\s+(.*?)\s+([A-Z-]{2,})\s+', re.MULTILINE)
splits_re = re.compile(r'50m\s+(\d+:\d+\.\d+|\d+\.\d+)\s+100m\s+(\d+:\d+\.\d+|\d+\.\d+)\s+150m\s+(\d+:\d+\.\d+|\d+\.\d+)\s+200m\s+(\d+:\d+\.\d+|\d+\.\d+)\s+250m\s+(\d+:\d+\.\d+|\d+\.\d+)\s+300m\s+(\d+:\d+\.\d+|\d+\.\d+)\s+350m\s+(\d+:\d+\.\d+|\d+\.\d+)')

def scrape_omega(links: list) -> pd.DataFrame:
    data = []
    for link in links:
        response = requests.get(link, impersonate="chrome110", timeout=30)
        with pdfplumber.open(io.BytesIO(response.content)) as pdf:
            full_text = ""
            for page in pdf.pages:
                full_text += page.extract_text() + "\n"    
            lines = full_text.split('\n')
            for i in range(len(lines)):
                swimmer_match = swimmer_re.match(lines[i])
                if swimmer_match:
                    groups = swimmer_match.groups()
                    rank = groups[0]
                    heat = groups[1]
                    lane = groups[2]
                    name = groups[3]
                    club = groups[4]                
                    line_parts = lines[i].split()                    
                    rt = line_parts[-3] if len(line_parts) >= 8 else None
                    final_time = line_parts[-2] if len(line_parts) >= 8 else None
                    if i+1 < len(lines):
                        splits_match = splits_re.search(lines[i+1])
                        if splits_match:
                            s50, s100, s150, s200, s250, s300, s350 = splits_match.groups()
                            data.append({
                                "Rank": rank,
                                "Name": name.strip(),
                                "Lane": lane,        
                                "Reaction Time": rt, 
                                "50m": s50, "100m": s100, "150m": s150, "200m": s200,
                                "250m": s250, "300m": s300, "350m": s350, 
                                "Final Time": final_time, 
                            })
        df = pd.DataFrame(data)
        data = []
        return df
    

men_pdf_links = ['https://www.omegatiming.com/File/00011A00000101EF0101FFFFFFFFFF01.pdf']
women_pdf_links = ['https://www.omegatiming.com/File/00011A00000201EF0101FFFFFFFFFF01.pdf']


men_df = scrape_omega(men_pdf_links)
print(men_df.head())
#print(men_df.tail(5))

women_df = scrape_omega(women_pdf_links)
print(women_df.head())


  Rank            Name Lane Reaction Time    50m   100m     150m     200m  \
0    1   WIFFEN Daniel    4          0.70  26.76  55.41  1:24.61  1:54.17   
1    2  SIBIRTSEV Ilia    6       3:51.84  27.13  55.67  1:24.70  1:53.62   
2    3     FINKE Bobby    3       3:52.40  26.94  56.06  1:25.24  1:54.75   
3    4    ERISMAN Ryan    3       3:52.89  26.89  56.19  1:25.63  1:55.37   
4    5    LOKTEV Denis    7       3:53.00  27.30  56.14  1:25.28  1:54.41   

      250m     300m     350m Final Time  
0  2:23.25  2:52.98  3:22.30    3:51.22  
1  2:23.18  2:52.44  3:22.31       0.62  
2  2:24.30  2:54.18  3:23.59       1.18  
3  2:24.85  2:54.34  3:23.82       1.67  
4  2:24.02  2:53.59  3:23.40       1.78  
  Rank               Name Lane Reaction Time    50m     100m     150m  \
0    1      LEDECKY Katie    4          0.68  28.13    58.47  1:29.03   
1    2      ERISMAN Rylee    6       4:13.25  28.40    59.60  1:31.31   
2    3   WEINSTEIN Claire    4       4:13.60  29.18  1:00.34  1:32