In [1]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import os

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
def scrape(url):

    headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0' }
    cookies = { 'token': 'uqsqouQQlfDfDKIrucfVKgWcNAQYdsnsE' }  # replace token as needed
    response = requests.get(url, headers=headers, cookies=cookies)
    
    # Check if the request was successful
    if response.status_code != 200:
        print("    Failed to load page {}".format(url))
        return np.nan, np.nan, np.nan
    
    # Parse the page content
    soup = BeautifulSoup(response.content, 'html.parser')
    
    miles = int(soup.find('small').get_text().replace('mi', '').replace(',', '')) if soup.find('small') else np.nan

    elements = soup.find_all(class_='text-xl text-gray-900 dark:text-white')
    try:
        flights = int(elements[3].get_text().replace(',', ''))
    except:
        flights = np.nan
    try:
        seats = int(elements[4].get_text().replace(',', ''))
    except:
        seats = np.nan
    
    return miles, flights, seats

In [3]:
def main(csv_name):
    df = pd.read_csv(csv_name)
    df = df[["hub", "destination"]].dropna()

    conversiondf = pd.read_csv("iata-icao.csv")
    conversion_dict = pd.Series(conversiondf['icao'].values, index=conversiondf['iata']).to_dict()
    
    icaodf = pd.concat([df['hub'].map(conversion_dict), df['destination'].map(conversion_dict)], axis=1)

    df['miles'] = np.nan
    df['seats/flight'] = np.nan
    df['flights/week'] = np.nan

    for index, row in icaodf.iterrows():
        if (row[0] != np.nan) & (row[1] != np.nan):
            url = f"https://www.flightera.net/en/route/{row[0]}/{row[1]}/UA"
            df.loc[index, 'miles'], df.loc[index, 'flights/week'], df.loc[index, 'seats/flight'] = scrape(url)
    return df

## Scrape all CSVs

In [4]:
input_path = f"{os.getcwd()}\\inputs"
for csv_name in os.listdir(input_path):
    print(f"Scraping for {csv_name}...")
    df = main(f"inputs\\{csv_name}")
    df.to_csv(f'outputs\\{csv_name[:-4]}-output.csv', index=False)
    print(f"{csv_name} done.")

Scraping for Atlantic.csv...
    Failed to load page https://www.flightera.net/en/route/KEWR/nan/UA
Atlantic.csv done.
Scraping for DEN Domestic.csv...
    Failed to load page https://www.flightera.net/en/route/KDEN/KXWA/UA
DEN Domestic.csv done.
Scraping for EWR Domestic.csv...
    Failed to load page https://www.flightera.net/en/route/KEWR/nan/UA
EWR Domestic.csv done.
Scraping for Hub-to-Hub.csv...
Hub-to-Hub.csv done.
Scraping for IAD Domestic.csv...
IAD Domestic.csv done.
Scraping for IAH Domestic.csv...
IAH Domestic.csv done.
Scraping for LATAM.csv...
    Failed to load page https://www.flightera.net/en/route/KORD/nan/UA
    Failed to load page https://www.flightera.net/en/route/KORD/nan/UA
    Failed to load page https://www.flightera.net/en/route/KEWR/nan/UA
    Failed to load page https://www.flightera.net/en/route/KEWR/SPIM/UA
    Failed to load page https://www.flightera.net/en/route/KEWR/nan/UA
    Failed to load page https://www.flightera.net/en/route/KLAX/nan/UA
    Faile

## Individual CSV

In [5]:
csv_name = "Atlantic.csv"
df = main(f"inputs\\{csv_name}")
df

    Failed to load page https://www.flightera.net/en/route/KEWR/nan/UA


Unnamed: 0,hub,destination,miles,seats/flight,flights/week
0,ORD,LHR,3946.0,193.0,20.0
1,ORD,FRA,4336.0,319.0,14.0
2,ORD,CDG,4146.0,318.0,7.0
3,ORD,MUC,4522.0,318.0,7.0
4,ORD,ZRH,4436.0,202.0,7.0
...,...,...,...,...,...
85,IAD,ACC,5299.0,242.0,3.0
86,IAD,TLV,,,
87,IAD,AMM,5959.0,243.0,3.0
88,IAD,LOS,5444.0,241.0,3.0


In [6]:
df.to_csv(f'outputs\\{csv_name[:-4]}-output.csv', index=False)

In [7]:
df[df.isna().any(axis=1)]

Unnamed: 0,hub,destination,miles,seats/flight,flights/week
15,ORD,TLV,,,
39,EWR,BER,,,
47,EWR,FAO,,,
48,EWR,BOM,,,
51,EWR,TFS,,,
64,SFO,TLV,,,
86,IAD,TLV,,,
89,EWR,BOM,,,
