In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver

# Location of the chromedriver binary. Set the CHROMEDRIVER_PATH environment
# variable to override the default below, which is specific to one machine.
import os

chromedriver_path = os.environ.get(
    "CHROMEDRIVER_PATH", "/Users/AliBaba/downloads/chromedriver"
)

# Starts the Chrome session that get_cards_via_soup drives through the
# module-level name `browser`; the driver path is passed positionally as before.
browser = webdriver.Chrome(chromedriver_path)

In [13]:
import numpy as np
import pandas as pd
import datetime
from datetime import date, timedelta
from time import sleep

# Travel window of the first search, as zero-padded 'YYYY-MM-DD' strings.
# price_pipeline passes both values to extract_price_info, which parses them
# with '%Y-%m-%d' and then moves both dates forward one day per scraped day.
start_day = '2020-04-03'
end_day = '2020-04-12'

In [4]:
# Crawl the flight search day by day and collect one fare frame per day
def extract_price_info(start_day, end_day, url_dict, num_days):
    """Collect one single-column fare frame per search day.

    Both dates are parsed with '%Y-%m-%d'. On every iteration the search URL is
    assembled as front_url + <start date> + middle_url + <end date> + back_url,
    the result cards are fetched with ``get_cards_via_soup`` and each card is
    turned into a (city, fare) pair by ``convert_key_val``. Both dates then
    move forward by one day, so the gap between them never changes.

    Args:
        start_day: first start date as a 'YYYY-MM-DD' string.
        end_day: first end date as a 'YYYY-MM-DD' string.
        url_dict: mapping holding the 'front_url', 'middle_url' and 'back_url'
            string pieces.
        num_days: how many consecutive days to query.

    Returns:
        A list with one DataFrame per queried day. Each frame is indexed by
        city and has a single column named after that day's start date, with
        the fares converted through ``int``.
    """
    date_format = '%Y-%m-%d'
    window_open = datetime.datetime.strptime(start_day, date_format)
    window_close = datetime.datetime.strptime(end_day, date_format)

    front_url, middle_url, back_url = (
        url_dict[part] for part in ('front_url', 'middle_url', 'back_url')
    )

    one_day = timedelta(days=1)
    daily_frames = []

    for _ in range(num_days):
        # date().isoformat() is exactly the date half of str(datetime)
        depart_label = window_open.date().isoformat()
        return_label = window_close.date().isoformat()

        search_url = front_url + depart_label + middle_url + return_label + back_url

        # a city that shows up on several cards keeps the fare of its last card
        fares_by_city = {
            city: int(fare)
            for city, fare in map(convert_key_val, get_cards_via_soup(search_url))
        }

        daily_frames.append(
            pd.DataFrame(pd.Series(fares_by_city), columns=[depart_label])
        )

        window_open = window_open + one_day
        window_close = window_close + one_day

    return daily_frames

In [3]:
def print_countries_extracted(prices_frame):
    """Print a header line followed by the index labels of ``prices_frame``.

    The labels are printed as a plain Python list so they can be copied
    straight out of the cell output.
    """
    header = "These are the countries for which we were able to pull prices:\n"
    labels = list(prices_frame.index)
    print(header, labels, sep="\n")

In [5]:
def get_cards_via_soup(variable_url):
    """Load a search URL in the shared browser and return its result cards.

    The random pause now happens AFTER the navigation instead of before it.
    The search URLs built for this function differ only in the part after
    '#', and the page source used to be read immediately after ``browser.get``
    returned. NOTE(review): presumably the page redraws its results from
    script after such a navigation, so an immediate read can still contain the
    cards of the previous search; pausing after the navigation gives the page
    time to update while keeping the same 3-6 second spacing between requests.

    Args:
        variable_url: complete search URL to open.

    Returns:
        The elements matching 'div[class*=info-container]' in the parsed page,
        as returned by BeautifulSoup's ``select``.
    """
    browser.get(variable_url)

    # randint excludes the upper bound: the pause is 3, 4, 5 or 6 seconds
    sleep(np.random.randint(3, 7))

    soup = BeautifulSoup(browser.page_source, 'html.parser')

    return soup.select('div[class*=info-container]')

In [6]:
def convert_key_val(card):
    """Extract the destination name and the fare from one result card.

    The city is the text of the card's first 'h3' element. The fare is taken
    from the first element matching 'span[class*=price]': the digits that
    follow the first '$' are returned with thousands separators removed.

    When the price text is empty or contains no dollar amount, '0' is returned
    so that the card still yields a value the caller can pass to ``int``.
    Previously a non-empty text without '$' raised IndexError, and text after
    the amount was returned along with the digits.

    Args:
        card: a parsed result card exposing ``select`` (a BeautifulSoup tag).

    Returns:
        (city, fare) where both items are strings and fare holds digits only.
    """
    import re  # local import so this cell does not depend on another cell

    city = str(card.select('h3')[0].text)
    fare_text = str(card.select('span[class*=price]')[0].text)

    # a '$', optional whitespace, then digits that may contain commas
    amount = re.search(r'\$\s*(\d[\d,]*)', fare_text)
    if amount is None:
        return city, '0'

    return city, amount.group(1).replace(',', '')

In [7]:
def combine_days(frame_list):
    """Join the per-day fare frames side by side into one table.

    All frames are concatenated along the columns in a single ``pd.concat``
    call instead of pairwise in a loop. Row labels missing from a given day
    are filled with 0. The row labels of the combined table are printed via
    ``print_countries_extracted`` so they can be copied from the output.

    An empty ``frame_list`` yields an empty DataFrame instead of raising
    IndexError, and a single frame goes through the same path as several
    frames (a new frame is returned rather than the input object).

    Args:
        frame_list: list of DataFrames indexed by destination, one per day.

    Returns:
        DataFrame with one column per input frame and the union of all row
        labels in order of first appearance, with missing fares set to 0.
    """
    if len(frame_list) == 0:
        # pd.concat refuses an empty list, so build the empty result directly
        combined = pd.DataFrame()
    else:
        combined = pd.concat(frame_list, axis=1, sort=False)

    #provide a list of the countries pulled - so can just copy & paste
    print_countries_extracted(combined)

    return combined.fillna(0)

In [8]:
#keep only the destinations that have at least one fare under the chosen price
def clean_extraction(dataframe, max_price):
    """Return the rows that have at least one usable fare below ``max_price``.

    A fare counts when it is strictly greater than 0 and strictly less than
    ``max_price``; 0 is the filler value used for days without a price. The
    check is done with one vectorized mask instead of dropping rows one at a
    time while iterating, which also avoids the KeyError the row-by-row drop
    raised when the index contained a repeated label.

    Args:
        dataframe: fare table with one row per destination and one column per
            day. It is not modified.
        max_price: exclusive upper bound for a fare to count.

    Returns:
        A new DataFrame holding only the qualifying rows, in original order.
    """
    affordable = (dataframe > 0) & (dataframe < max_price)
    # use the raw boolean array so repeated index labels need no alignment
    keep_rows = affordable.any(axis=1).to_numpy()
    return dataframe.loc[keep_rows].copy()

In [9]:
def price_pipeline(destination, num_days, max_price):
    """Scrape, combine and filter the fares for one destination key.

    Reads the module-level ``start_day``, ``end_day`` and ``destinations``:
    the URL pieces are looked up as ``destinations[str(destination)]``.

    Args:
        destination: key of the entry in ``destinations`` to search.
        num_days: number of consecutive days handed to ``extract_price_info``.
        max_price: price limit handed to ``clean_extraction``.

    Returns:
        (filtered, full): the frame returned by ``clean_extraction`` and the
        unfiltered frame returned by ``combine_days``.
    """
    daily_frames = extract_price_info(
        start_day,
        end_day,
        destinations[str(destination)],
        num_days,
    )
    prices_full = combine_days(daily_frames)
    return clean_extraction(prices_full, max_price), prices_full

In [10]:
# Search-URL pieces for the three European regions. extract_price_info puts the
# start date between 'front_url' and 'middle_url' and the end date between
# 'middle_url' and 'back_url'.
western_europe_url = {
    'front_url': 'https://www.google.com/flights?hl=en#flt=/m/0rh6k.r/m/0852h.',
    'middle_url': '*r/m/0852h./m/0rh6k.',
    'back_url': ';c:USD;e:1;ls:1w;sd:1;t:e',
}

southern_europe_url = {
    'front_url': 'https://www.google.com/flights?hl=en#flt=/m/0rh6k.r/m/0250wj.',
    'middle_url': '*r/m/0250wj./m/0rh6k.',
    'back_url': ';c:USD;e:1;ls:1w;sd:1;t:e',
}

eastern_europe_url = {
    'front_url': 'https://www.google.com/flights?hl=en#flt=/m/0rh6k.r/m/09b69.',
    'middle_url': '*r/m/09b69./m/0rh6k.',
    'back_url': ';c:USD;e:1;ls:1w;sd:1;t:e',
}

In [11]:
# Search-URL pieces for single countries, using the same three keys that
# extract_price_info reads: 'front_url', 'middle_url' and 'back_url'.
germany_url = {
    'front_url': 'https://www.google.com/flights?hl=en#flt=/m/0rh6k.r/m/0345h.',
    'middle_url': '*r/m/0345h./m/0rh6k.',
    'back_url': ';c:USD;e:1;sd:1;t:e',
}

italy_url = {
    'front_url': 'https://www.google.com/flights?hl=en#flt=/m/0rh6k.r/m/03rjj.',
    'middle_url': '*r/m/03rjj./m/0rh6k.',
    'back_url': ';c:USD;e:1;sd:1;t:e',
}

france_url = {
    'front_url': 'https://www.google.com/flights?hl=en#flt=/m/0rh6k.r/m/0f8l9c.',
    'middle_url': '*r/m/0f8l9c./m/0rh6k.',
    'back_url': ';c:USD;e:1;sd:1;t:e',
}

turkey_url = {
    'front_url': 'https://www.google.com/flights?hl=en#flt=/m/0rh6k.r/m/01znc_.',
    'middle_url': '*r/m/01znc_./m/0rh6k.',
    'back_url': ';c:USD;e:1;sd:1;t:e',
}

greece_url = {
    'front_url': 'https://www.google.com/flights?hl=en#flt=/m/0rh6k.r/m/035qy.',
    'middle_url': '*r/m/035qy./m/0rh6k.',
    'back_url': ';c:USD;e:1;sd:1;t:e',
}

In [12]:
# Lookup table used by price_pipeline: destination key -> dict of URL pieces.
destinations = dict(
    eastern_europe=eastern_europe_url,
    western_europe=western_europe_url,
    southern_europe=southern_europe_url,
    germany=germany_url,
    italy=italy_url,
    france=france_url,
    turkey=turkey_url,
    greece=greece_url,
)

In [14]:
# Run the pipeline for the 'germany' entry over 7 consecutive days and keep the
# rows with at least one fare strictly between 0 and 750. The unfiltered frame
# (second return value) is discarded.
germany_final, _ = price_pipeline('germany', 7, 750)
germany_final

These are the countries for which we were able to pull prices:

['Frankfurt', 'Munich', 'Berlin', 'Stuttgart', 'Hamburg', 'Düsseldorf', 'Cologne', 'Nuremberg', 'Hanover', 'Bremen', 'Dortmund', 'Leipzig', 'Saarbrücken', 'Dresden', 'Friedrichshafen', 'Münster', 'Westerland', 'Paderborn', 'Rostock', 'Karlsruhe', 'Baden-Baden']


Unnamed: 0,2020-04-03,2020-04-04,2020-04-05,2020-04-06,2020-04-07,2020-04-08,2020-04-09
Frankfurt,972,972,829,717.0,789,789.0,789
Munich,868,868,733,693.0,668,668.0,698
Berlin,867,867,768,728.0,728,728.0,758


In [16]:
# Run the pipeline for the 'france' entry over 7 consecutive days and keep the
# rows with at least one fare strictly between 0 and 750. The unfiltered frame
# (second return value) is discarded.
france_final, _ = price_pipeline('france', 7, 750)
france_final

These are the countries for which we were able to pull prices:

['Paris', 'Nice', 'Marseille', 'Lyon', 'Bordeaux', 'Strasbourg', 'Biarritz', 'Toulouse', 'Nantes', 'Brest', 'Bastia', 'Clermont-Ferrand', 'Lorient', 'Montpellier', 'Pau', 'Rennes', 'Toulon', 'Lille', 'Ajaccio', 'Caen', 'Calvi', 'Chambéry', 'Nancy', 'Figari', 'Lourdes', 'La Rochelle', 'Perpignan', 'Bergerac', 'Aurillac', 'Brive-la-Gaillarde', 'Castres']


Unnamed: 0,2020-04-03,2020-04-04,2020-04-05,2020-04-06,2020-04-07,2020-04-08,2020-04-09
Paris,788.0,746.0,727.0,663.0,703.0,703.0,788.0
Nice,709.0,679.0,621.0,594.0,593.0,593.0,709.0


In [18]:
# Run the pipeline for the 'italy' entry over 7 consecutive days and keep the
# rows with at least one fare strictly between 0 and 750. The unfiltered frame
# (second return value) is discarded.
italy_final, _ = price_pipeline('italy', 7, 750)
italy_final

These are the countries for which we were able to pull prices:

['Rome', 'Milan', 'Venice', 'Naples', 'Florence', 'Catania', 'Bologna', 'Pisa', 'Palermo', 'Genoa', 'Verona', 'Bari', 'Turin', 'Trieste', 'Brindisi', 'Reggio Calabria', 'Lamezia Terme', 'Ancona', 'Cagliari', 'Olbia', 'Trapani', 'Perugia', 'Pescara', 'Lampedusa', 'Alghero']


Unnamed: 0,2020-04-03,2020-04-04,2020-04-05,2020-04-06,2020-04-07,2020-04-08,2020-04-09
Rome,861.0,711.0,821.0,711.0,778,726.0,861.0
Milan,834.0,812.0,844.0,804.0,764,699.0,834.0
Venice,793.0,1042.0,888.0,798.0,758,693.0,793.0


In [20]:
# Run the pipeline for the 'greece' entry over 7 consecutive days; this call
# uses a higher limit and keeps the rows with at least one fare strictly
# between 0 and 850. The unfiltered frame (second return value) is discarded.
greece_final, _ = price_pipeline('greece', 7, 850)
greece_final

These are the countries for which we were able to pull prices:

['Athens', 'Santorini', 'Thessaloniki', 'Rhodes', 'Mykonos', 'Corfu', 'Chania', 'Heraklion', 'Kos', 'Zakynthos Island', 'Alexandroupoli', 'Ioannina', 'Chios', 'Naxos', 'Sitia', 'Skiathos', 'Kalamata', 'Kavala', 'Leros', 'Lemnos', 'Mitilini', 'Plaka', 'Paros', 'Skyros', 'Samos', 'Karpathos', 'Cephalonia', 'Kythira', 'Kastellorizo', 'Icaria']


Unnamed: 0,2020-04-03,2020-04-04,2020-04-05,2020-04-06,2020-04-07,2020-04-08,2020-04-09
Athens,908.0,901.0,867.0,784.0,784.0,784.0,908.0
Thessaloniki,903.0,913.0,873.0,833.0,799.0,799.0,903.0


In [24]:
# Run the pipeline for the 'eastern_europe' entry over 7 consecutive days and
# keep the rows with at least one fare strictly between 0 and 750. The
# unfiltered frame (second return value) is discarded.
ee_final, _ = price_pipeline('eastern_europe', 7, 750)
ee_final

These are the countries for which we were able to pull prices:

['Rome', 'Lisbon', 'Barcelona', 'Madrid', 'Athens', 'Milan', 'Venice', 'Naples', 'Santorini', 'Florence', 'Ibiza', 'Ponta Delgada', 'Málaga', 'Malta', 'Seville', 'Porto', 'Palma', 'Valencia', 'Belgrade', 'Terceira Island', 'Dubrovnik', 'Zagreb', 'Tirana', 'Tenerife', 'Catania', 'Bologna', 'Split', 'Faro', 'Sarajevo', 'Prishtina', 'Pisa', 'Palermo', 'Jerez de la Frontera', 'Alicante', 'Bilbao', 'Genoa', 'Verona', 'Bari', 'Skopje', 'Madeira', 'Moscow', 'Prague', 'Budapest', 'Kyiv', 'Warsaw', 'Bucharest', 'Sofia', 'Saint Petersburg', 'Kraków', 'Minsk', 'Chisinau', 'Riga', 'Vilnius', 'Tallinn', 'Wrocław', 'Gdańsk', 'Lviv', 'Sochi', 'Bydgoszcz', 'Cluj-Napoca', 'Constanța', 'Dnipro', 'Kharkiv', 'Iași', 'Kherson', 'Krasnodar', 'Košice', 'Katowice', 'Samara', 'Kazan', 'Łódź', 'Odesa', 'Zaporizhzhia', 'Poznań', 'Rostov-on-Don', 'Rzeszow', 'Sibiu', 'Timișoara', 'Ufa', 'Varna', 'Debrecen', 'Palanga', 'Voronezh', 'Szczecin', 'Lublin',

Unnamed: 0,2020-04-03,2020-04-04,2020-04-05,2020-04-06,2020-04-07,2020-04-08,2020-04-09
Lisbon,572.0,0.0,0.0,0.0,0.0,0.0,0.0
Moscow,0.0,845.0,678.0,676.0,651.0,692.0,735.0
Kyiv,0.0,680.0,690.0,689.0,677.0,677.0,677.0
Warsaw,0.0,899.0,882.0,829.0,818.0,694.0,882.0
Sofia,0.0,676.0,676.0,674.0,674.0,674.0,672.0
Saint Petersburg,0.0,953.0,936.0,844.0,701.0,701.0,794.0
Chisinau,0.0,780.0,740.0,700.0,659.0,659.0,689.0
Lviv,0.0,843.0,739.0,699.0,699.0,842.0,729.0
Sochi,0.0,717.0,677.0,0.0,637.0,637.0,667.0
Krasnodar,0.0,809.0,769.0,729.0,729.0,729.0,759.0


In [22]:
# Run the pipeline for the 'western_europe' entry over 7 consecutive days and
# keep the rows with at least one fare strictly between 0 and 750. The
# unfiltered frame (second return value) is discarded.
we_final, _ = price_pipeline('western_europe', 7, 750)
we_final

These are the countries for which we were able to pull prices:

['Moscow', 'Prague', 'Budapest', 'Kyiv', 'Warsaw', 'Bucharest', 'Sofia', 'Saint Petersburg', 'Kraków', 'Minsk', 'Chisinau', 'Riga', 'Vilnius', 'Tallinn', 'Wrocław', 'Gdańsk', 'Lviv', 'Sochi', 'Bratislava', 'Bydgoszcz', 'Cluj-Napoca', 'Dnipro', 'Kharkiv', 'Iași', 'Kherson', 'Krasnodar', 'Katowice', 'Samara', 'Kazan', 'Mineralnye Vody', 'Odesa', 'Zaporizhzhia', 'Palanga', 'Poznań', 'Rostov-on-Don', 'Rzeszow', 'Sibiu', 'Timișoara', 'Ufa', 'Varna', 'London', 'Paris', 'Amsterdam', 'Frankfurt', 'Dublin', 'Lisbon', 'Munich', 'Brussels', 'Vienna', 'Zürich', 'Geneva', 'Barcelona', 'Madrid', 'Berlin', 'Edinburgh', 'Manchester', 'Nice', 'Glasgow', 'Ibiza', 'Ponta Delgada', 'Shannon', 'Málaga', 'Stuttgart', 'Hamburg', 'Seville', 'Düsseldorf', 'Luxembourg', 'Birmingham', 'Porto', 'Palma', 'Valencia', 'Marseille', 'Lyon', 'Terceira Island', 'Bordeaux', 'Strasbourg', 'Tenerife', 'Belfast', 'Biarritz', 'Cardiff']


Unnamed: 0,2020-04-03,2020-04-04,2020-04-05,2020-04-06,2020-04-07,2020-04-08,2020-04-09
Moscow,735.0,0.0,0.0,0.0,0.0,0.0,0.0
Kyiv,682.0,0.0,0.0,0.0,0.0,0.0,0.0
Sofia,674.0,0.0,0.0,0.0,0.0,0.0,0.0
Sochi,707.0,0.0,0.0,0.0,0.0,0.0,0.0
Samara,703.0,0.0,0.0,0.0,0.0,0.0,0.0
Kazan,690.0,0.0,0.0,0.0,0.0,0.0,0.0
London,0.0,726.0,589.0,547.0,547.0,547.0,624.0
Paris,0.0,746.0,727.0,663.0,703.0,703.0,698.0
Amsterdam,0.0,837.0,741.0,601.0,701.0,701.0,717.0
Frankfurt,0.0,972.0,829.0,717.0,789.0,789.0,789.0


In [23]:
# Run the pipeline for the 'southern_europe' entry over 7 consecutive days and
# keep the rows with at least one fare strictly between 0 and 750. The
# unfiltered frame (second return value) is discarded.
se_final, _ = price_pipeline('southern_europe', 7, 750)
se_final

These are the countries for which we were able to pull prices:

['London', 'Paris', 'Amsterdam', 'Frankfurt', 'Dublin', 'Lisbon', 'Munich', 'Brussels', 'Zürich', 'Geneva', 'Vienna', 'Barcelona', 'Madrid', 'Berlin', 'Edinburgh', 'Manchester', 'Nice', 'Glasgow', 'Ibiza', 'Ponta Delgada', 'Shannon', 'Málaga', 'Stuttgart', 'Hamburg', 'Seville', 'Düsseldorf', 'Luxembourg', 'Birmingham', 'Porto', 'Palma', 'Valencia', 'Marseille', 'Lyon', 'Terceira Island', 'Bordeaux', 'Strasbourg', 'Tenerife', 'Belfast', 'Biarritz', 'Cardiff', 'Rome', 'Athens', 'Milan', 'Venice', 'Naples', 'Santorini', 'Florence', 'Malta', 'Belgrade', 'Dubrovnik', 'Zagreb', 'Tirana', 'Catania', 'Bologna', 'Split', 'Faro', 'Sarajevo', 'Prishtina', 'Pisa', 'Palermo', 'Alicante', 'Bilbao', 'Genoa', 'Verona', 'Bari', 'Skopje', 'Madeira', 'Thessaloniki', 'Turin']


Unnamed: 0,2020-04-03,2020-04-04,2020-04-05,2020-04-06,2020-04-07,2020-04-08,2020-04-09
London,680.0,0.0,0.0,0.0,0.0,0.0,0.0
Lisbon,572.0,649.0,582.0,558.0,555.0,556.0,575.0
Barcelona,902.0,920.0,838.0,684.0,684.0,684.0,787.0
Nice,709.0,0.0,0.0,0.0,0.0,0.0,0.0
Rome,0.0,711.0,821.0,711.0,778.0,726.0,711.0
Milan,0.0,812.0,844.0,804.0,764.0,699.0,729.0
Venice,0.0,1042.0,888.0,798.0,758.0,693.0,723.0
Tirana,0.0,0.0,0.0,750.0,0.0,719.0,0.0
Faro,0.0,718.0,0.0,596.0,613.0,896.0,1014.0
Sarajevo,0.0,860.0,0.0,749.0,749.0,749.0,779.0
