# Data Sampling Example Notebook
This notebook contains the process in which the flight dataset from [Kaggle](https://www.kaggle.com/datasets/dilwong/flightprices) was sampled and filtered from.

In [4]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Define input and output file paths
input_file_path = 'total_data.csv'
output_file_path = 'filtered_data.csv'

# Define fraction of sample
fraction = 0.05

# Set chunk size
chunk_size = 100000

In [6]:
# Open the output file for writing
with open(output_file_path, 'w') as output_file:
    # Iterate over the file in chunks
    for i, chunk in enumerate(pd.read_csv(input_file_path, chunksize=chunk_size)):
        # Sample and filter chunk for flights out of California
        sampled_chunk = chunk.sample(frac=fraction, replace=True, random_state=42)
        processed_chunk = sampled_chunk[(sampled_chunk['startingAirport'] == 'SFO') | (sampled_chunk['startingAirport'] == 'OAK') | (sampled_chunk['startingAirport'] == 'LAX')]

        # For the first chunk, write headers; for subsequent chunks, skip the headers
        if i == 0:
            processed_chunk.to_csv(output_file, index=False)
        else:
            processed_chunk.to_csv(output_file, index=False, header=False)

In [7]:
ca_data = pd.read_csv(output_file_path)
ca_data

Unnamed: 0,legId,searchDate,flightDate,startingAirport,destinationAirport,fareBasisCode,travelDuration,elapsedDays,isBasicEconomy,isRefundable,...,segmentsArrivalTimeEpochSeconds,segmentsArrivalTimeRaw,segmentsArrivalAirportCode,segmentsDepartureAirportCode,segmentsAirlineName,segmentsAirlineCode,segmentsEquipmentDescription,segmentsDurationInSeconds,segmentsDistance,segmentsCabinCode
0,510f3a1255b9843a8eae48032b191faf,2022-04-17,2022-04-22,OAK,LAX,YH0OASMR,PT7H11M,0,False,False,...,1650678600||1650697140,2022-04-22T18:50:00.000-07:00||2022-04-22T23:5...,SEA||ONT,OAK||SEA,Alaska Airlines||Alaska Airlines,AS||AS,Embraer 175||Airbus A320,7320||8940,672||956,coach||coach
1,5f7a29384cea410317ca308d2e065059,2022-04-17,2022-05-06,SFO,BOS,E0AJZNN1,PT8H29M,1,False,False,...,1651919400||1651929660,2022-05-07T06:30:00.000-04:00||2022-05-07T09:2...,JFK||BOS,SFO||JFK,JetBlue Airways||JetBlue Airways,B6||B6,Airbus A319-321||AIRBUS INDUSTRIE A321 SHARKLETS,20280||4560,2566||185,coach||coach
2,a5e9d9b01627d1e1c54d6b6cbf143945,2022-04-18,2022-04-30,OAK,DTW,QA3OA0MC,PT10H12M,0,False,False,...,1651345740||1651367940||1651376220,2022-04-30T13:09:00.000-06:00||2022-04-30T20:1...,SLC||ORD||DTW,OAK||SLC||ORD,Delta||United||United,DL||UA||UA,Airbus A220-100||Embraer 175 (Enhanced Winglet...,6240||11940||5220,588||1251||240,coach||coach||coach
3,b8714828ff605dfc10a9c511917f1ee8,2022-04-16,2022-04-26,SFO,MIA,R7AZZNN3,PT10H48M,1,False,False,...,1651053360||1651072080,2022-04-27T05:56:00.000-04:00||2022-04-27T11:0...,JFK||MIA,SFO||JFK,JetBlue Airways||JetBlue Airways,B6||B6,Airbus A319-321||Boeing 737 MAX 8,20160||11580,2566||1104,coach||coach
4,80e97c74e379451453a1151b70aaf371,2022-04-17,2022-05-14,LAX,ORD,WAA7OWEN,PT4H4M,1,False,False,...,1652612580,2022-05-15T06:03:00.000-05:00,ORD,LAX,United,UA,Boeing 737-900,14640,1745,coach
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
880262,3bb39f976c56ba93dfab318eadb0bfeb,2022-10-05,2022-11-10,LAX,LGA,KAA4AWEN,PT11H47M,0,False,False,...,1668100920||1668134820,2022-11-10T10:22:00.000-07:00||2022-11-10T21:4...,DEN||LGA,LAX||DEN,United||United,UA||UA,Boeing 737-900||Airbus A319,8520||13020,848||1632,coach||coach
880263,704001c0a3a4cd9b91b7e70cef9d5658,2022-10-05,2022-11-08,SFO,ATL,KA0NA0MQ,PT6H38M,1,False,False,...,1667973180||1667991600,2022-11-08T21:53:00.000-08:00||2022-11-09T06:0...,LAX||ATL,SFO||LAX,Delta||Delta,DL||DL,Airbus A319||,5460||15300,339||1943,coach||coach
880264,60f0ec85cf33eb7efff2aa506d99af30,2022-10-05,2022-11-05,SFO,DFW,VAVNA0BQ,PT7H18M,0,True,False,...,1667645760||1667659380,2022-11-05T05:56:00.000-05:00||2022-11-05T09:4...,MSP||DFW,SFO||MSP,Delta||Delta,DL||DL,Boeing 737-900||,12660||9480,1586||854,coach||coach
880265,839954f5afe360146c028eefe96a6d2a,2022-10-05,2022-11-03,OAK,LGA,NH4OAJMN,PT18H34M,1,False,False,...,1667505900||1667534220||1667564940,2022-11-03T13:05:00.000-07:00||2022-11-03T23:5...,SEA||BOS||LGA,OAK||SEA||BOS,Alaska Airlines||Alaska Airlines||Delta,AS||AS||DL,Boeing 737-900||Boeing 737-800||Embraer 175,7800||19020||5340,672||2489||186,coach||coach||coach
