In [26]:
import requests
import time
import random
import pandas as pd
import logging

from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


In [27]:
class SASApiHandler:
    """
    Class to handle interactions with the SAS API.
    Enhanced with error handling for robustness.
    """
    def __init__(self, user_agents):
        self.user_agents = user_agents
        self.base_url = "https://www.sas.se/api/offers/flights/"

    def get_headers(self):
        """
        Generate headers for the request, including a randomly chosen User-Agent.
        """
        return {
            'authority': 'www.sas.se',
            'accept': 'application/json, text/plain, */*',
            'accept-language': 'sv',
            'content-type': 'application/json; charset=utf-8',
            'Cookie': 'TrackingId=2eff6ba9-6d56-4cfb-8d54-32265e3a083d; TrackingId=39e78104-5341-4f0c-9419-5178c2b4bd6b; _cookienew=acknowledged; _cookiepersonalization=true; _cookieanalytics=true; anonymousFeatureFlagId=7c2aebd1-9079-44e6-aea8-f2d4f523d932; _sas_randomize=56; _ga=GA1.1.27743387.1703689564; FPID=FPID2.2.ANRz4CvEnnXn6au06CgiiaLlnYqRirwxSWTGWTm1Ees%3D.1703689564; _air360_i=MDg1NjRiMTc2ZDc3Y2JmNjdlNmE1ZTYyMGUwNjY1MDc%3D; _gcl_au=1.1.401150464.1703689575; _scid=7647e0ae-d47c-4ae6-9867-8595d1fdee4a; GTM-inMarketFlag=in-market; ASLBSA=0003a183921f3a21b033af105bcc86491cb46108e0ca337400ffeb818a02ad8789c4; ASLBSACORS=0003a183921f3a21b033af105bcc86491cb46108e0ca337400ffeb818a02ad8789c4; fingerprint_1703971335673=undefined-1703971335673; BASIC_PROFILE=%7B%22accepted%22%3Atrue%2C%22acceptedMarketing%22%3Atrue%2C%22acceptedVersion%22%3A3%2C%22addressLine1%22%3A%22knypplerskev%C3%A4gen%206%22%2C%22cityName%22%3A%22Bromma%22%2C%22consciousTraveler%22%3Afalse%2C%22countryCode%22%3A%22SE%22%2C%22countryName%22%3A%22Sweden%22%2C%22crmReference%22%3A%2232021575%22%2C%22crmReferenceEnc%22%3A%22tIyqy8JLPxGDdIeADVfd8w%3D%3D%22%2C%22currentTierCode%22%3A%22B%22%2C%22customerAge%22%3A33%2C%22customerSessionId%22%3A%223c8348fe9cf2451d83bad0e5c94088c2%22%2C%22dob%22%3A%221990-02-26%22%2C%22earnEbPoints%22%3Atrue%2C%22eb%22%3Atrue%2C%22ebAccountBalance%22%3A246838%2C%22ebLifeTimeGold%22%3Afalse%2C%22ebNumber%22%3A%22703368282%22%2C%22ebNumberEnc%22%3A%22LGnAgy8YbAYToDrjQvr3vg%3D%3D%22%2C%22ebPrefix%22%3A%22EB%22%2C%22emailId%22%3A%22cfuru%40kth.se%22%2C%22emailIdEnc%22%3A%22rtFN41UXgEgWGxpCysV7oTqS%2FJfXwTY4bB5LS7rB6C0%3D%22%2C%22ffProviderCode%22%3A%22SK%22%2C%22firstName%22%3A%22Christopher%22%2C%22gender%22%3A%22Male%22%2C%22genderCode%22%3A%22M%22%2C%22lastName%22%3A%22Furu%22%2C%22phone%22%3A%22%2B46703815554%22%2C%22sasForBusiness%22%3Afalse%2C%22title%22%3A%22Mr%22%2C%22tp%22%3Afalse%2C%22tpAccountNumber%22%3A%5B%5D%2C%22sessionId%22%3A%2232021575%2C2023-12-31%2010%3A00%3A36%2C30%2C10%2C2o6Z81nLKae6esqx4fWIx2iMj%2Fe3g%2Bi2bbfXV9XGg1a%2BEvGzSy9DnjpxovvNiAho%22%7D; NEW_SAS_SSO_LOGGEDIN=tkt=32021575,2023-12-31%2010:03:40,30,10,y5DuOTUdTyu6o1yy+QflF8HM8H12u9y+pVgwnW2DL4lsZ2KS8wTbt2XEEcM5SNBZ; FPLC=12tMX%2FUVb7gLRoskqZSzj%2BgLnwTiZd6xCIK0jYhVwb3wl4G1B6yMQ1ieTszw%2BMk%2BmAJDmdgsqO4eSYJTj4gbdQ0ln9xyk0AKX0LpyG2XiPHG32mgX4Bfc37B4sPGWg%3D%3D; _scid_r=7647e0ae-d47c-4ae6-9867-8595d1fdee4a; _uetsid=cf19ed50a89d11eea4a6a177eb219d4f; _uetvid=7dc2a3d0a4c911ee953e73c2000666e2; _air360_s=MGQ5NjIyOWItNGZlMC00ZjFhLTljZTAtM2QwOWM4NDIxNjhmLTE3MDQxNDE0NTd8MTcwNDE0MjgxMi45MQ%3D%3D; reese84=3:qeCHM5DVSRV0pY6AC2ZuIg==:8dG/It8M6gD5KYLyEW6ZR/mZibsm8PdZzSD7vQnZ8HgPJ/Pd+g79mDruqnpb04Q+s+/X8oNTTAhW+9txll8b8RwPcvFvNp9aFghCHle9FrtpCRgdnhywMqW9EQJEsYiq/8IHvR1Pu2BDgcamRx6U8sb8Qpy23JeINGBYi310d1ehdb0JuJoPtyMrfjU+zPz/f6fE+GSbxTJok3CMH2v7lAYRXaHr5ug/PAXEWYfndXHk0u3ppKlNx0l9/e9P9s9Rh/SfvTzM3g1MX0Y2nWFitR00jEUksiHZxia9DtyCCCQ1c7ttlJEthrKLt4MiXSkaBhYMeM1AdCse/1eLh83Cy79+AZBXTAzu6lLNOXRpf/00osBWjEcFvrXNiuz7p0hhUSbiGYmvnfAphPcMHCbiGWubFEJ0/W5RrKvrZkAC/H/p+8mkaq++AU3mFPacS90ijw8tfMbY/3LC2bcLM3Kmgw==:Y42pochra6d0oGuCY8bFDWP9fAbFOzJAzX2Fk24lA9k=; _ga_V34SCQB3NJ=GS1.1.1704141469.8.1.1704142905.0.0.0; TrackingId=0d43f62f-5dc5-45e6-b7aa-0314f55f5a88; ASLBSA=0003a183921f3a21b033af105bcc86491cb46108e0ca337400ffeb818a02ad8789c4; ASLBSACORS=0003a183921f3a21b033af105bcc86491cb46108e0ca337400ffeb818a02ad8789c4',
            'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            "User-Agent": random.choice(self.user_agents)
        }

    def make_request(self, params):
        """
        Make a GET request to the SAS API with specified parameters.
        """
        headers = self.get_headers()
        try:
            response = requests.get(self.base_url, params=params, headers=headers)
            if response.status_code == 200:
                time.sleep(random.uniform(1, 3))  # Throttling requests
                return response.json()
            else:
                logging.error(f"Error {response.status_code} for URL: {self.make_full_url(params)}")
                return None
        except Exception as e:
            logging.error(f"Request error: {e} for URL: {self.make_full_url(params)}")
            return None
        
    def make_full_url(self, params):
        """
        Construct the full URL for an API request.
        """
        query_string = '&'.join([f"{key}={value}" for key, value in params.items()])
        return f"{self.base_url}?{query_string}"

class FlightDataProcessor:
    """
    Class to process and store flight data.
    Updated to create a structured DataFrame with specific flight details.
    """
    def __init__(self):
        self.all_flights_data = []

    def process_data_for_date(self, data, full_url):
        """
        Process and store flight data for a specific date, including the API call URL.
        """
        if data is None:
            logging.info(f"Received no data for processing from URL: {full_url}")
            return

        try:
            for flight_key, flight_info in data.get('outboundFlights', {}).items():
                flight_details = self.extract_flight_details(flight_key, flight_info, full_url)
                self.all_flights_data.extend(flight_details)
        except Exception as e:
            print(f"Error processing flight data: {e}")

    def extract_flight_details(self, flight_key, flight_info, full_url):
        """
        Extract details from a single flight entry.
        """
        outbound_date = flight_info.get('startTimeInLocal', '').split('T')[0]
        origin = flight_info.get('origin', {}).get('code')
        destination = flight_info.get('destination', {}).get('code')
        stops = flight_info.get('stops')
        connection_duration = flight_info.get('connectionDuration')

        flight_details = []

        # Extracting cabin type and price in points
        for cabin, cabin_info in flight_info.get('cabins', {}).items():
            cabin_type = cabin
            price_in_points = cabin_info.get('lowestFares', {}).get('points')

            # for fare in cabin_info.get('fares', []):
            #     avl_seats = fare.get('avlSeats')
            
            # Extracting aircraft type from segments
            for segment in flight_info.get('segments', []):
                aircraft_type = segment.get('airCraft', {}).get('name')

                flight_detail = {
                    'OutboundDate': outbound_date,
                    'Origin': origin,
                    'Destination': destination,
                    'NumberOfStops': stops,
                    'ConnectionDuration': connection_duration,
                    'PriceInPoints': price_in_points,
                    'CabinType': cabin_type,
                    'AircraftType': aircraft_type,
                    # 'AvailableSeats': avl_seats,
                    'ApiCallUrl': full_url
                }
                flight_details.append(flight_detail)

        return flight_details

    def to_dataframe(self):
        """
        Convert the stored flight data to a Pandas DataFrame.
        """
        return pd.DataFrame(self.all_flights_data)

class FlightScraperApp:
    """
    Main class to orchestrate the flight scraping application.
    Updated to handle parallel API requests.
    """
    def __init__(self, start_date, end_date, user_agents):
        self.api_handler = SASApiHandler(user_agents)
        self.data_processor = FlightDataProcessor()
        self.start_date = start_date
        self.end_date = end_date

    def run(self):
        """
        Run the flight scraping application with parallel API requests.
        """
        params_list = self.generate_params_list()
        results = []

        with ThreadPoolExecutor(max_workers=1) as executor:
            future_to_data = {}
            for params in params_list:
                full_url = self.api_handler.make_full_url(params)
                future = executor.submit(self.api_handler.make_request, params)
                future_to_data[future] = full_url
            
            for future in as_completed(future_to_data):
                data = future.result()
                full_url = future_to_data[future]
                if data:
                    results.append((data, full_url))

        # Process all results
        for data, full_url in results:
            self.data_processor.process_data_for_date(data, full_url)

        df = self.data_processor.to_dataframe()
        return df

    def generate_params_list(self):
        """
        Generate a list of parameter dictionaries for different API requests,
        considering all combinations of 'to' and 'from' destinations.
        """
        dates = self.generate_dates(self.start_date, self.end_date)
        params_list = []

        # Lists of possible 'to' and 'from' destinations
        from_destinations = ['ARN', 'OSL', 'CPH']  # Example destinations
        to_destinations = ['SEZ', 'MRU', 'HKT', 'DPS', 'SIN', 'WVB', 'CPT', 'BKK']  # Example origins

        for date in dates:
            for to_destination in to_destinations:
                for from_destination in from_destinations:
                    params = {
                        "from": from_destination,
                        "to": to_destination,
                        "outDate": date.strftime("%Y%m%d"),
                        "adt": 2,
                        "chd": 1,
                        "inf": 0,
                        "yth": 0,
                        "bookingFlow": "star",
                        "pos": "se",
                        "channel": "web",
                        "displayType": "upsell",
                        "cepId": "STAR"
                    }
                    params_list.append(params)
        return params_list

    @staticmethod
    def generate_dates(start_date, end_date):
        """
        Generate a list of dates between start_date and end_date.
        """
        delta = end_date - start_date
        return [start_date + timedelta(days=i) for i in range(delta.days + 1)]

In [28]:
if __name__ == "__main__":
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 10_1_1 like Mac OS X) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0 Mobile/14B100 Safari/602.1",
        "Mozilla/5.0 (iPad; CPU OS 10_1_1 like Mac OS X) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0 Mobile/14B100 Safari/602.1",
        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.85 Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 7.1.1; Nexus 6P Build/NMF26F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.79 Mobile Safari/537.36"
    ]
    app = FlightScraperApp(datetime(2024, 2, 1), datetime(2025, 1, 1), user_agents)
    flight_data_df = app.run()

2024-01-02 15:00:28,415 - ERROR - Error 403 for URL: https://www.sas.se/api/offers/flights/?from=ARN&to=SEZ&outDate=20240201&adt=2&chd=1&inf=0&yth=0&bookingFlow=star&pos=se&channel=web&displayType=upsell&cepId=STAR
2024-01-02 15:00:28,790 - ERROR - Error 403 for URL: https://www.sas.se/api/offers/flights/?from=OSL&to=SEZ&outDate=20240201&adt=2&chd=1&inf=0&yth=0&bookingFlow=star&pos=se&channel=web&displayType=upsell&cepId=STAR
2024-01-02 15:00:29,003 - ERROR - Error 403 for URL: https://www.sas.se/api/offers/flights/?from=CPH&to=SEZ&outDate=20240201&adt=2&chd=1&inf=0&yth=0&bookingFlow=star&pos=se&channel=web&displayType=upsell&cepId=STAR
2024-01-02 15:00:29,258 - ERROR - Error 403 for URL: https://www.sas.se/api/offers/flights/?from=ARN&to=MRU&outDate=20240201&adt=2&chd=1&inf=0&yth=0&bookingFlow=star&pos=se&channel=web&displayType=upsell&cepId=STAR
2024-01-02 15:00:29,448 - ERROR - Error 403 for URL: https://www.sas.se/api/offers/flights/?from=OSL&to=MRU&outDate=20240201&adt=2&chd=1&inf

In [32]:
flight_data_df

Unnamed: 0,OutboundDate,Origin,Destination,NumberOfStops,ConnectionDuration,PriceInPoints,CabinType,AircraftType,ApiCallUrl
0,2024-02-03,CPH,BKK,1,19:10:00,,ECONOMY,,https://www.sas.se/api/offers/flights/?from=CP...
1,2024-02-03,CPH,BKK,1,19:10:00,,ECONOMY,Boeing 777-300ER,https://www.sas.se/api/offers/flights/?from=CP...
2,2024-02-03,CPH,BKK,1,20:55:00,,ECONOMY,Airbus A350-900,https://www.sas.se/api/offers/flights/?from=CP...
3,2024-02-03,CPH,BKK,1,20:55:00,,ECONOMY,Airbus A350-900,https://www.sas.se/api/offers/flights/?from=CP...
4,2024-02-03,CPH,BKK,2,19:35:00,,ECONOMY,Airbus A321-100/200,https://www.sas.se/api/offers/flights/?from=CP...
5,2024-02-03,CPH,BKK,2,19:35:00,,ECONOMY,,https://www.sas.se/api/offers/flights/?from=CP...
6,2024-02-03,CPH,BKK,2,19:35:00,,ECONOMY,Boeing 777-300ER,https://www.sas.se/api/offers/flights/?from=CP...
7,2024-02-03,CPH,BKK,2,19:35:00,,ECONOMY,Airbus A321-100/200,https://www.sas.se/api/offers/flights/?from=CP...
8,2024-02-03,CPH,BKK,2,19:35:00,,ECONOMY,,https://www.sas.se/api/offers/flights/?from=CP...
9,2024-02-03,CPH,BKK,2,19:35:00,,ECONOMY,Boeing 777-300ER,https://www.sas.se/api/offers/flights/?from=CP...


In [30]:
flight_data_df.to_pickle("outbound.pkl")


In [31]:
flight_data_df[flight_data_df.CabinType == "BUSINESS"]

Unnamed: 0,OutboundDate,Origin,Destination,NumberOfStops,ConnectionDuration,PriceInPoints,CabinType,AircraftType,ApiCallUrl
