In [4]:
from bs4 import BeautifulSoup
from selenium import webdriver 
import time
import pandas as pd
import requests
from datetime import datetime
from datetime import timedelta
import re
import json



# Browser-like User-Agent sent with the plain `requests` calls.
_REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/114.0'}
# Seconds before an HTTP request is abandoned, so one stalled connection
# cannot hang the whole scrape.
_REQUEST_TIMEOUT = 30
# Seconds to let the search page finish rendering before reading its HTML.
_PAGE_LOAD_WAIT = 2
# Placeholder stored when a flight's route/speed cannot be determined.
_NOT_AVAILABLE = "N/A"

# Exact `class` attribute values of the <tr> elements that hold one flight each.
_RESULT_ROW_CLASSES = frozenset({
    "ffinder-results-row-bordertop ffinder-results-row alternateRow",
    "ffinder-results-row-bordertop ffinder-results-row",
})

# Column order of the returned DataFrame.
_COLUMNS = [
    "Airline",
    "Ident",
    "Aircraft",
    "Status",
    "Estimated departure time",
    "Estimated arrival time",
    "Route",
    "Speed",
]

# Weekday abbreviations, positioned to match datetime.weekday() (Monday == 0).
_WEEKDAYS = ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")

# Hours to ADD to a local wall-clock time to obtain UTC, per zone abbreviation.
_UTC_OFFSET_HOURS = {
    "EST": 5, "EDT": 4,
    "CST": 6, "CDT": 5,
    "MST": 7, "MDT": 6,
    "PST": 8, "PDT": 7,
}

# Extra minutes added on top of the UTC conversion. Kept from the original
# implementation; presumably the gap between the listed departure time and the
# timestamp FlightAware uses in history URLs -- TODO confirm.
_HISTORY_URL_MINUTE_SHIFT = 10

# Matches departure labels shaped like "Fri 08:35AM EST".
_DEPARTURE_RE = re.compile(
    r"(?P<dow>Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s+"
    r"(?P<hour>\d{1,2}):(?P<minute>\d{2})\s*(?P<ampm>[AP]M)\s+"
    r"(?P<tz>[A-Z]{2,5})"
)
# Captures the object literal assigned to `trackpollGlobals` in an inline script.
_TRACKPOLL_RE = re.compile(r'var trackpollGlobals = ({.*?});', re.DOTALL)


def _result_rows(soup):
    """Return the flight <tr> elements of the search page, in page order."""
    rows = []
    for tr in soup.find_all("tr"):
        classes = tr.get("class") or []
        if isinstance(classes, str):
            classes = classes.split()
        # Rows carrying a `style` attribute are excluded, as in the original query.
        if " ".join(classes) in _RESULT_ROW_CLASSES and tr.get("style") is None:
            rows.append(tr)
    return rows


def _clean_time(text):
    """Collapse the non-breaking spaces, newlines and tabs of a time cell."""
    return text.strip().replace(u'\xa0', u' ').replace("\n", " ").replace("\t", " ")


def _parse_result_row(row):
    """Extract the displayed fields of one result row, or None if a cell is missing."""
    try:
        return {
            "Airline": row.find("td", attrs={"class": "ffinder-results-airline text_align_left"}).span.get_text(),
            "Ident": row.find("td", attrs={"class": "ffinder-results-ident text_align_left"}).span.a.get_text(),
            "Aircraft": row.find("td", attrs={"class": "ffinder-results-aircraft text_align_left"}).get_text().strip(),
            "Status": row.find("td", attrs={"class": "ffinder-results-status text_align_left"}).get_text().strip(),
            "Estimated departure time": _clean_time(
                row.find("td", attrs={"class": "ffinder-results-departure text_align_right"}).div.get_text()
            ),
            "Estimated arrival time": _clean_time(
                row.find("td", attrs={"class": "ffinder-results-arrival text_align_left"}).div.get_text()
            ),
        }
    except AttributeError:
        # A missing <td>/<span>/<a>/<div> surfaces as attribute access on None.
        return None


def _departure_to_utc(date_text, today):
    """Convert a label such as "Fri 08:35AM EST" to a naive UTC datetime.

    The label has no calendar date, so the date is taken as the day nearest
    to *today* that falls on the named weekday. The result also includes
    _HISTORY_URL_MINUTE_SHIFT. Returns None when the label cannot be parsed
    or its time zone is not in _UTC_OFFSET_HOURS.
    """
    match = _DEPARTURE_RE.search(date_text)
    if match is None:
        return None
    offset_hours = _UTC_OFFSET_HOURS.get(match["tz"])
    if offset_hours is None:
        return None

    hour = int(match["hour"])
    minute = int(match["minute"])
    if not 1 <= hour <= 12 or minute > 59:
        return None
    # 12-hour -> 24-hour: 12AM is 00:xx and 12PM is 12:xx.
    hour_24 = hour % 12 + (12 if match["ampm"] == "PM" else 0)

    # Signed distance (-3..+3 days) from today to the named weekday.
    day_shift = (_WEEKDAYS.index(match["dow"]) - today.weekday()) % 7
    if day_shift > 3:
        day_shift -= 7

    local = (today + timedelta(days=day_shift)).replace(
        hour=hour_24, minute=minute, second=0, microsecond=0
    )
    return local + timedelta(hours=offset_hours, minutes=_HISTORY_URL_MINUTE_SHIFT)


def _http_get(url):
    """GET *url* with the shared headers and timeout; return None on a transport error."""
    try:
        return requests.get(url, headers=_REQUEST_HEADERS, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print(f"Request to {url} failed: {err}")
        return None


def _extract_token(page_html):
    """Return the TOKEN value of `trackpollGlobals` in a flight page, or None."""
    soup = BeautifulSoup(page_html, "html.parser")
    script_tag = soup.find('script', string=re.compile(r'trackpollGlobals'))
    if not script_tag:
        print("No matching script tag found")
        return None

    matched = _TRACKPOLL_RE.search(script_tag.string)
    if not matched:
        print("No JSON-like string found in script tag")
        return None

    try:
        json_data = json.loads(matched.group(1))
    except ValueError:
        print("trackpollGlobals is not valid JSON")
        return None
    return json_data.get('TOKEN')


def _route_and_speed(payload):
    """Pick (route, speed) of the primary flight out of a trackpoll response dict."""
    flights = payload.get("flights") or {}
    flight_id = next(iter(flights), None)
    if flight_id is None or "INVALID-" in flight_id:
        return _NOT_AVAILABLE, _NOT_AVAILABLE

    activity = flights[flight_id].get('activityLog') or {}
    for leg in activity.get('flights') or []:
        if leg.get("flightId") == flight_id:
            plan = leg.get("flightPlan") or {}
            return plan.get("route", _NOT_AVAILABLE), plan.get("speed", _NOT_AVAILABLE)
    return _NOT_AVAILABLE, _NOT_AVAILABLE


def _fetch_route_and_speed(ident, departure_utc, depart, arrive):
    """Look up (route, speed) for one flight; ("N/A", "N/A") if any step fails."""
    unavailable = (_NOT_AVAILABLE, _NOT_AVAILABLE)

    date_tag = departure_utc.strftime('%Y%m%d')
    zulu = departure_utc.strftime("%H%M") + "Z"
    # NOTE(review): the "K" prefix turns a 3-letter code into a contiguous-US
    # ICAO code; other regions would need a different prefix -- confirm callers.
    history_url = f'https://www.flightaware.com/live/flight/{ident.strip()}/history/{date_tag}/{zulu}/K{depart}/K{arrive}'

    page = _http_get(history_url)
    if page is None:
        return unavailable
    token = _extract_token(page.content)
    if not token:
        return unavailable

    api_url = f"https://flightaware.com/ajax/trackpoll.rvt?token={token}&locale=en_US&summary=1"
    poll = _http_get(api_url)
    if poll is None:
        return unavailable

    # Same text extraction as before; raw_decode parses the leading JSON value
    # and ignores whatever trails it (the original chopped one trailing character).
    text = BeautifulSoup(poll.content, "html.parser").get_text().lstrip()
    try:
        payload, _ = json.JSONDecoder().raw_decode(text)
    except ValueError:
        print(f"Unparseable trackpoll response for {ident.strip()}")
        return unavailable
    if not isinstance(payload, dict):
        return unavailable
    return _route_and_speed(payload)


def get_flight_detail(depart, arrive):
    """Scrape FlightAware for flights between two airports.

    Loads the "find flight" page in headless Chrome, reads one record per
    result row, then queries each flight's history page and trackpoll
    endpoint for its filed route and speed.

    Args:
        depart: Origin airport code as used in the search URL (e.g. "BNA").
        arrive: Destination airport code (e.g. "JFK").

    Returns:
        A pandas DataFrame with the columns Airline, Ident, Aircraft, Status,
        Estimated departure time, Estimated arrival time, Route and Speed.
        Rows follow the order of the results page. Route and Speed are "N/A"
        when they cannot be determined. The frame is empty (columns only)
        when the page lists no flights.
    """
    url = f'https://www.flightaware.com/live/findflight?origin={depart}&destination={arrive}'

    # Set the browser language so weekday and AM/PM labels come back in English.
    options = webdriver.ChromeOptions()
    options.add_experimental_option('prefs', {'intl.accept_languages': 'en,en_US'})

    # Run without opening a visible browser window.
    options.add_argument("--headless")

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        # Give the page's scripts time to populate the results table.
        time.sleep(_PAGE_LOAD_WAIT)
        html = driver.page_source
    finally:
        # Always release the browser process, even if loading failed.
        driver.quit()

    soup = BeautifulSoup(html, "html.parser")

    # Local machine clock; assumed close enough to the airport's calendar day
    # for the weekday-based date inference.
    today = datetime.today()

    records = []
    for row in _result_rows(soup):
        record = _parse_result_row(row)
        if record is None:
            print("Skipping a result row with missing cells")
            continue

        departure_utc = _departure_to_utc(record["Estimated departure time"], today)
        if departure_utc is None:
            route, speed = _NOT_AVAILABLE, _NOT_AVAILABLE
        else:
            route, speed = _fetch_route_and_speed(record["Ident"], departure_utc, depart, arrive)

        record["Route"] = route
        record["Speed"] = speed
        records.append(record)

    return pd.DataFrame(records, columns=_COLUMNS)

In [5]:
# Query flights departing BNA and arriving at JFK; the bare name on the last
# line makes the notebook render the resulting table.
df = get_flight_detail(depart="BNA", arrive="JFK")
df

Unnamed: 0,Airline,Ident,Aircraft,Status,Estimated departure time,Estimated arrival time,Route,Speed


In [6]:
# Same lookup for the reverse direction (JFK to BNA); the bare name on the
# last line makes the notebook render the resulting table.
df1 = get_flight_detail(depart="JFK", arrive="BNA")
df1

Unnamed: 0,Airline,Ident,Aircraft,Status,Estimated departure time,Estimated arrival time,Route,Speed
