In [2]:
################################################################################################################

import time

# Core of scraping
import requests
from os.path import join as pjoin
from bs4 import BeautifulSoup

# For applying a random sleep interval between requests  
from random import randint 
from time import sleep

# Need these in order to simulate human activity in Chrome/Firefox browser (clicking)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec

# Regex
import regex as re
import os 
import sys

# Zip
import gzip
import shutil

# Needed for turning the date to datetime values
import datetime as dt

# Dataframes
import pandas as pd

# Twilio enables us to send SMS
from twilio.rest import Client


################################################################################################################

# URLS we need these specific URLs in order to run te program correctly. Do not change this !
inside_airbnb_url = "http://insideairbnb.com/"
get_the_data_url = "http://insideairbnb.com/get-the-data.html"

################################################################################################################

In [3]:
################################################################################################################

def try_connecting(get_the_data_url):
    while True:
        try:
            source_code = requests.get(get_the_data_url, timeout = 30, verify=False)
            return (source_code)
        except (requests.ConnectionError) as e:
            print("OOPS!! Connection Error. Make sure you are connected to Internet. Technical Details given below.\n")
            print(str(e))            
            continue
        except (requests.Timeout) as e:
            print("OOPS!! Timeout Error")
            print(str(e))
            continue
        except (requests.RequestException) as e:
            print("OOPS!! General Error")
            print(str(e))
            continue
        except (KeyboardInterrupt):
            print("Someone closed the program")
        break

################################################################################################################

def make_cities_list(soup):
    ''''''
    cities_list = [] # List to store the list of cities

    for location in soup.findAll(re.compile('^h2$')):
        l = location.text
        #print(l)
        s = l.split(",")
        #print(s)
        n = []
        
        for i in s:
            if i[0] == " ":
                k = i[1:]
            else:
                k = i
            n.append(k)
        # City, county and country included here 
        cities_list.append(n)
    
    list_of_cities = []
    for c in cities_list:
        list_of_cities.append(c[0])

    return (list_of_cities)

################################################################################################################

def make_cities_index_list(cities_list):
    ''''''
    # Only names of cities
    list_of_cities_index = []
    index = 0 # We need the index to find the table, because I can't retrieve it by id

    # We'll build a list of tuples that stores (city,index)
    for c in cities_list:
        list_of_cities_index.append((index, c))
        index += 1 
    return list_of_cities_index


################################################################################################################

def link_core(link):
    core = link[29:] # Ex: united-states/ca/san-francisco/2019-03-06/visualisations/reviews.csv
    splitted = core.split("/")
    
    return(splitted)

################################################################################################################

def get_metadata(splitted):
    
    if len(splitted) == 6: # Most links have 6 elements 
        country = splitted[0]
        region = splitted[1]
        city = splitted[2]
        date = splitted[3]
        filetype = splitted[4] # This is either 'data' or 'visualizations'; More like purpose...
        filename = splitted[5] 
        filesavename = re.sub( '[^a-z0-9]', '', date) + "_" + city + "_" + filename # This is so we can name our files conveniently
        return(city, date, filetype, filename, filesavename)

    else: # This is for Ireland which has only 4 elements.
        country = splitted[0]
        # missing region
        # missing city
        date = splitted[1]
        filetype = splitted[2]
        filename = splitted[3]
        filesavename = re.sub('[^a-z0-9]', '', date)  + "_" + country + "_" + filename # Example: 2019-11-23*ireland*listings.csv.gz
        return(country, date, filetype, filename, filesavename)

################################################################################################################

def getall(my_list, s):
    '''
    Elegant list comprehension to match city name with index.
    Matches second element of a tuple with the first one
    '''
    index = [x for x, y in my_list if y==s] # Very versatile. 
    return (index[0])

################################################################################################################

def application_launch(option0_input="", ):
    '''The purpose is to be able to input the desired city that will be scraped.'''
    # Input variables
    
    # USE NLP TO RETURN CLOSEST NEIGHBOUR. EX: YOU MEANT COPENHAGEN? or YOUR CITY OF INTEREST IS COPENHAGEN. IS THAT CORRECT?
    
    option0_input = str(input("One or many cities?. Answer with One/Many.\n")) # CAN ALSO DO LIST OF CITIES
    
    if option0_input == "One":
        input_name = str(input("Please state the name of your desired city. Ex: Copenhagen.")) 
        input_index =  getall(list_of_cities_index, input_name) # Link city name with index so we can search and find it in page source. 
        input_index_list = []
        
    elif option0_input == "Many":
        many_string_input = str(input("Please enter a string representing a list of cities. Ex: Copenhagen,Berlin,Paris,etc.\n"))
        many_list_input = list(many_string_input.split(","))
        input_index = ""
        input_index_list = []
        for c in many_list_input:
            input_index_list.append(getall(list_of_cities_index,c))
            
    # Also ALL option?
        
    print("\n")
    
    option1_input = str(input("Do you want to download one or all files? Type One/All/Many.\n")) # CAN ALSO DO LIST OF CITIES
    # IF NOT IN ANSWER LIST RETURN ERROR.
    
    if option1_input == "One":
        direct_download_link_input = str(input("Please copy paste your direct download link here.\n"))
        print("\n")
        filetype_input = str(input("Please state the your desired file type to work on. Ex: 'calendar.csv.gz'.\n"))
        
    elif option1_input == "All":
        direct_download_link_input = ""
        filetype_input = str(input("Please state the your desired file type to work on. Ex: 'calendar.csv.gz'.\n"))
        
    #elif option1_input == "Many":
     #   direct_download_link_input = ""
      #  filetype_input = str(input("Please state the your desired file type to work on. Ex: 'calendar.csv.gz'.\n"))
    else:
        print("Error. Answer not acceptable. Please try again (have to implement this try again though.)\n")
    
    print("\n")
    
    ################################
    
    option2_input = str(input("Do you want to specify the current working directory? Type Yes/No or Default..\nDefault is for me. It runs the path I used when writing the code.\n"))
    
    if option2_input == "Yes": # USE REGEX TO IGNORE CASE SENSITIVITY
        chdir_path_input = str(input("Type in or copy paste the path to which you want to store your output.\nNote: I store it on an external HDD so it's possible.\n"))
        # CURRENT DIR
        os.chdir(r"" + str(chdir_path_input))
        root_download_folder = str(os.getcwd())
    
    elif option2_input == "No": # Choosing no is your default machine cwd. 
        root_download_folder = str(os.getcwd())
    
    elif option2_input == "Default": 
        #default_chdir = r"E:\Airbnb\Data"
        os.chdir()
        root_download_folder = str(os.getcwd())
    
    else:
        print("Error. Answer not acceptable. Please try again (have to implement this try again though.)")
        
    print("\n")
    
    return(input_index, direct_download_link_input, filetype_input, root_download_folder, input_index_list)

################################################################################################################

def write_zip_to_disk(path_to_zip_folder, path_to_zip_file, url):
    
    if not os.path.exists(path_to_zip_folder): # Need to create the path up to the file name. 
        os.makedirs(path_to_zip_folder)
    with open(path_to_zip_file, "wb") as f:
            r = requests.get(url)
            f.write(r.content)
            
################################################################################################################

def unzip(path_to_zip_file,path_to_unzipped_file):
    with gzip.open(path_to_zip_file, 'rb') as f_in:
        with open(path_to_unzipped_file, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    
    os.remove(path_to_zip_file) # Delete the GZ after unzipping to free memory 

################################################################################################################

def pre_process(df):
    '''
    Some level of standard pre-processing is required for these dataframes.
    '''
    # I think it's best if we create a copy of the calendar to play around with, for testing purposes
    # In practice, we're wasting too much memory, so I'll just comment this out and set input to df
    #df = calendar
    
    # But the price is an object because of the $ sign.
    # Eliminating the $sign
    df["price"] = df["price"].replace('[\$,]', '', regex=True).astype(float)
    # Then assigning new data type to numeric so we can do calculations.
    df["price"] = pd.to_numeric(df["price"])
    
    # Also, the date needs to be changed to datetime. 
    df["date"] = pd.to_datetime(df["date"])
    
    # We're only interested in price and time so we drop the rest.
    # We ignore errors, as some calendars don't contain those columns 
    df.drop(columns=["available","adjusted_price","minimum_nights","maximum_nights","listing_id"], inplace=True, errors='ignore')
    
    # I guess we should make this function return the "new" dataframe
    return df

################################################################################################################

def memory_optimization(df):
    for i in ['year','quarter','month','day','weekday']:
        df[i] = df[i].astype('category')       

################################################################################################################

def get_yearspan(df):
    '''
    A helper function that will return a list of the years present in the calendar file.
    '''
    years = df["date"].dt.year.unique()
    
    return years 

################################################################################################################

def split_by_year(df):
    '''
    Split a calendar dataframe into multiple dataframes, sliced by year value.
    '''
    
    years = get_yearspan(df) # Get a list of the years; As far as I know, there can be at least 2 values
    
    
    new_df_list = [] # A list that stockpiles the outputs 
    
            
    if len(years) >  1: # If there's more than 1 year, let's split that shit 
        for i in range(len(years)):
            splice = df.loc[df['year'] == years[i],:]
            new_df_list.append(splice)
    
    return new_df_list   

################################################################################################################

def statistics(df,column_name):
    '''
    A function that returns statistics such as mean, std, mode, etc.
    The input is a dataframe and the desired column by which to group by.
    Groups the results in dictionaries.
    '''
    
    df = df.filter([column_name, 'price'], axis=1) # Filter to keep only price values. 
    
    # Group by column.
    mean = df.groupby([column_name]).mean()
    
    # Creating separate dataframes for the statistic
    statistics = df.groupby([column_name]).describe()
    
    # Making the dataframe presentable
    statistics = statistics.price.reset_index(level=[column_name])
    
    return statistics

################################################################################################################

def create_mini_dataframes(df):
    '''
    This function creates the slices all at a time.
    '''
    
    year = statistics(df,"year")
    quarter = statistics(df,"quarter")
    month = statistics(df,"month")
    weekday = statistics(df,"weekday")
    calendar_day = statistics(df,"date")
    
    # Returns a tuple with 5 elements.
    return (year,quarter,month,weekday,calendar_day)

################################################################################################################

def apply_create_mini_dataframes(split_list):
    '''
    Argument is the output of create_mini_dataframes()
    '''
    
    mini_collection_list = [] # For each year, tuples of the collections of statistics, for each 'timeframe'.

    for i in range(len(split_list)):
        # A collection of statistics specific to a certain year. Ex: 2020's stats for year, quarter, month, etc.
        mini_collection = create_mini_dataframes(split_list[i])
        mini_collection_list.append(mini_collection)
        
    return (mini_collection_list)
            
################################################################################################################

def write_to_csv(path_to_mini_folder, mini_df):
    
    if not os.path.exists(path_to_mini_folder):
        os.makedirs(path_to_mini_folder)
        mini_df.to_csv(path_to_mini_folder +  "\\" + "calendar.csv", index=False)            
            
################################################################################################################

def print_time_elapsed(start_time, end_time):
    # The actual difference 
    time_elapsed = end_time - start_time
    
    if time_elapsed <=  60:
        out =  "{0:.2f}".format(time_elapsed)
        print("Time elapsed in seconds: ", out)
        return(out)
    
    elif  (time_elapsed <= 3600) and (time_elapsed > 60) :
        out = "{0:.2f}".format(time_elapsed/60)
        print("Time elapsed in minutes: ", out)
        return(out)
    
    else:
        out = "{0:.2f}".format(time_elapsed/3600)
        print("Time elapsed in hours: ", out)
        return(out)
         
################################################################################################################

def create_paths(city, filetype, root_download_folder, filename, filesavename):
    
    if filetype ==  "data":
        path_to_download_folder = root_download_folder + "\\" + city + "\\"  + filename[:-7]
        path_to_zip_file = path_to_download_folder + "\\" + filesavename
        path_to_unzipped_file = path_to_download_folder + "\\" + filesavename[:-3] # Only GZ files need this additional path
        
    elif filetype == "visualisations":
        
        if filename != "neighbourhoods.geojson":
            path_to_download_folder = root_download_folder + "\\" + city + "\\"  + filename[:-4]
            path_to_zip_file = path_to_download_folder + "\\" + filesavename
            path_to_unzipped_file = ""

        else:
            path_to_download_folder = root_download_folder + "\\" + city + "\\"  + filename[:-8]
            path_to_zip_file = path_to_download_folder + "\\" + filesavename
            path_to_unzipped_file = ""
        
    return (path_to_download_folder, path_to_zip_file, path_to_unzipped_file)
    
################################################################################################################

def dataframe_work(path_to_unzipped_file, path_to_download_folder, date):
    
    # Create a Data Frame
    df = pd.read_csv(path_to_unzipped_file)
        
    # Pre-process dataframe 
    df = pre_process(df) # The data is in a very specific structured format.'''
        
    # Optimize memory allocation 
    #memory_optimization(df) #Makes the files considerably smaller in size.
        
    # Year list
    #years = get_yearspan(df) #Just a helper.
        
    # Splitting by year
    #df_list = split_by_year(df) #Just a helper.'''
        
    # Create the mini files
    mini_df = statistics(df, "date")
        
    # Path to storage folder
    path_to_mini_folder = path_to_download_folder + "\\minis\\" +  date
        
    # Write them to CSV
    write_to_csv(path_to_mini_folder, mini_df) # The output.'''
        
    # Delete main file
    os.remove(path_to_unzipped_file) # Of course we want to get rid of this almost .5GB monster.'''
    
################################################################################################################

def scrape_one_market(direct_download_link_input, filetype_input, market, root_download_folder):
    
    if direct_download_link_input:
        
        link = direct_download_link_input
        splitted = link_core(link)

        metadata = get_metadata(splitted)

        city = metadata[0]
        date = metadata[1]
        filetype = metadata[2]
        filename = metadata[3]
        filesavename = metadata[4]
    
        if (filetype_input == "calendar.csv.gz") and (filename == filetype_input):

            paths = create_paths(city, filetype, root_download_folder, filename, filesavename)

            path_to_download_folder = paths[0]
            path_to_zip_file = paths[1]
            path_to_unzipped_file = paths[2]

            write_zip_to_disk(path_to_download_folder, path_to_zip_file, link)
            unzip(path_to_zip_file, path_to_unzipped_file)

            dataframe_work(path_to_unzipped_file, path_to_download_folder, date)
        
    else:
        
        for a in market.findAll('a', attrs={'href': re.compile("^http://")}):
            
            # Find all links 
            link = a.get('href')
            splitted = link_core(link)

            metadata = get_metadata(splitted)

            city = metadata[0]
            date = metadata[1]
            filetype = metadata[2]
            filename = metadata[3]
            filesavename = metadata[4]
        
            if (filetype_input == "calendar.csv.gz") and (filename == filetype_input):

                paths = create_paths(city, filetype, root_download_folder, filename, filesavename)

                path_to_download_folder = paths[0]
                path_to_zip_file = paths[1]
                path_to_unzipped_file = paths[2]

                write_zip_to_disk(path_to_download_folder, path_to_zip_file, link)
                unzip(path_to_zip_file, path_to_unzipped_file)

                dataframe_work(path_to_unzipped_file, path_to_download_folder, date)
                           
################################################################################################################
       
def my_excepthook(type, value, traceback):
    end = time.time()
    print("Program crashed after", end - start, "seconds")
    sys.__excepthook__(type, value, traceback)  # Print error message
    
################################################################################################################

def send_sms(duration):
    # the following line needs your Twilio Account SID and Auth Token
    client = Client("AC5e04cc29173e103d25fe0bacaa3bcf4a", "84d11e73db927ce27813f6c5d7bdd926")
    
    content = "Work finished in " + duration
    
    # change the "from_" number to your Twilio number and the "to" number
    # to the phone number you signed up for Twilio with, or upgrade your
    # account to send SMS to any phone number
    client.messages.create(to="+4571530372", 
                           from_="+12055761457", 
                           body=content)

################################################################################################################

def show_all_show_more(driver):
    clicks = 0
    while True:
        clicks += 1
        # 98 clicks because...idk man, not all have show more data
        if clicks <= 98: # Here we tell it how many times to click before it stops. 
            driver.find_element(By.PARTIAL_LINK_TEXT, "show").click()
            #sleep(randint(3,12))
        else: 
            break
            
################################################################################################################

def show_many_show_more(driver, input_index_list):
    for index in input_index_list: 
        driver.find_elements(By.PARTIAL_LINK_TEXT,"show")[index].click() 

################################################################################################################
            
def show_more(driver, input_index):
    driver.find_elements(By.PARTIAL_LINK_TEXT,"show")[input_index].click() 
    
################################################################################################################

def scrape_many_markets(filetype_input, market_list, root_download_folder):
    
    for market in market_list:
        
        for a in market.findAll('a', attrs={'href': re.compile("^http://")}):
            
            # Find all links 
            link = a.get('href')
            splitted = link_core(link)

            metadata = get_metadata(splitted)

            city = metadata[0]
            date = metadata[1]
            filetype = metadata[2]
            filename = metadata[3]
            filesavename = metadata[4]
        
            if (filetype_input == "calendar.csv.gz") and (filename == filetype_input):

                paths = create_paths(city, filetype, root_download_folder, filename, filesavename)

                path_to_download_folder = paths[0]
                path_to_zip_file = paths[1]
                path_to_unzipped_file = paths[2]

                write_zip_to_disk(path_to_download_folder, path_to_zip_file, link)
                unzip(path_to_zip_file, path_to_unzipped_file)

                dataframe_work(path_to_unzipped_file, path_to_download_folder, date)
                

In [10]:
################################################################################################################

# Excepthook
sys.excepthook = my_excepthook

# Source code 
source_code = try_connecting(get_the_data_url)

# Getting source code 
plain_text = source_code.text

# soup #1
soup = BeautifulSoup(plain_text, 'html.parser')

# City name list 
cities_list = make_cities_list(soup)

# No of cities
#no_cities = len(cities_list)

# Index list
list_of_cities_index = make_cities_index_list(cities_list)

print(list_of_cities_index)
print("\n\n")

# Launch phase
launch = application_launch()

# Input
input_index = launch[0]
direct_download_link_input = launch[1]
filetype_input = launch[2]
root_download_folder = launch[3]
input_index_list = launch[4]

[(0, 'Amsterdam'), (1, 'Antwerp'), (2, 'Asheville'), (3, 'Athens'), (4, 'Austin'), (5, 'Barcelona'), (6, 'Barossa Valley'), (7, 'Barwon South West'), (8, 'Beijing'), (9, 'Belize'), (10, 'Bergamo'), (11, 'Berlin'), (12, 'Bologna'), (13, 'Bordeaux'), (14, 'Boston'), (15, 'Bristol'), (16, 'Broward County'), (17, 'Brussels'), (18, 'Buenos Aires'), (19, 'Cambridge'), (20, 'Cape Town'), (21, 'Chicago'), (22, 'Clark County'), (23, 'Columbus'), (24, 'Copenhagen'), (25, 'Crete'), (26, 'Denver'), (27, 'Dublin'), (28, 'Edinburgh'), (29, 'Euskadi'), (30, 'Florence'), (31, 'Geneva'), (32, 'Ghent'), (33, 'Girona'), (34, 'Greater Manchester'), (35, 'Hawaii'), (36, 'Hong Kong'), (37, 'Istanbul'), (38, 'Jersey City'), (39, 'Lisbon'), (40, 'London'), (41, 'Los Angeles'), (42, 'Lyon'), (43, 'Madrid'), (44, 'Malaga'), (45, 'Mallorca'), (46, 'Manchester'), (47, 'Melbourne'), (48, 'Menorca'), (49, 'Mexico City'), (50, 'Milan'), (51, 'Montreal'), (52, 'Munich'), (53, 'Naples'), (54, 'Nashville'), (55, 'New B

In [11]:
################################################################################################################

# Runtime starting time
start_time = time.time()

# Selenium driver
driver = webdriver.Firefox() # Firefox Driver
driver.get(get_the_data_url)

show_many_show_more(driver, input_index_list)
    
# New source code from driver
source_code = driver.page_source
soup = BeautifulSoup(source_code, 'html.parser')

market_list = []
for index in input_index_list:
    market_list.append(soup.findAll('table')[index])
                                
# Runtime ending time
end_time = time.time()
duration = print_time_elapsed(start_time, end_time)

Time elapsed in seconds:  25.56


In [12]:
################################################################################################################

# Runtime starting time
start_time = time.time()

# SCRAPER
scrape_many_markets(filetype_input=filetype_input, market_list=market_list, root_download_folder=root_download_folder)

# Runtime ending time
end_time = time.time()
duration = print_time_elapsed(start_time, end_time)

# Send SMS
#send_sms(duration)

OSError: Not a gzipped file (b'<h')