<a href="https://colab.research.google.com/github/ShreyasJothish/airbnb_pricing_DS/blob/master/SJ3_AirBnB_Code_for_Flask_App.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import requests
from bs4 import BeautifulSoup
import time
import gzip
import pandas as pd
import os


def download_data():
    page = requests.get("http://insideairbnb.com/get-the-data.html")

    if page.status_code != 200:
        print("Error: Request to InsideAirBnB is failing")
        return pd.DataFrame()

    soup = BeautifulSoup(page.content, 'html.parser')

    # On exploring the web site structure it is found the url links to download
    # the data is present in under tags **td**. Here we shall extract the data
    # based on this **td** tag. The **td** tags contain information for
    # archieved data across years for the same city and this is ignored.
    #
    # Here we are interested in fetching only Listing information in each city.
    # So only url to **listings.csv.gz** is used.
    td_tags = soup.find_all('td')

    # To ensure only the latest data for a particular city is used.
    city_set = set()

    # To maintain city level summary for data fetched.
    city_data = []

    for td_tag in td_tags:
        link_list = [a['href'] for a in td_tag.find_all('a', href=True)]

        # Fetch only listings.csv.gz data.
        if (len(link_list) > 0  and (link_list[0].find('united-states') != -1) 
            and (link_list[0].find('listings.csv.gz') != -1)):
            url = link_list[0]
            
            # Summary for each city is got by parsing the url itself.
            url_split = link_list[0].split('/')

            # InsideAirBnB follows a particular url format which is used
            # as reference for parsing.
            if len(url_split) != 9:
                print(f"Error: URL not following the format {url}")

                # It is seen the data for ireland is fetched but the
                # url format is different as compared to others. So
                # this special handling is needed.
                if url_split[3] == "ireland":
                    print("Info: Special handling for Ireland")
                    country = url_split[3]
                    region = url_split[3]
                    city = url_split[3]
                    date = url_split[4]

                    if city not in city_set:
                        city_set.add(city)
                        print([country, region, city, date, url])
                        city_data.append([country, region, city, date, url])
            else:
                country = url_split[3]
                region = url_split[4]
                city = url_split[5]
                date = url_split[6]

                if city not in city_set:
                    city_set.add(city)
                    city_data.append([country, region, city, date, url])

    # Check summary information of each city.
    print(f"Total number of city information fetched: {len(city_data)}")
    print("Info: Start summary information of each city")
    for city in city_data:
        print(city)
    print("Info: Completed summary information of each city ")

    # Consolidated data frame to hold data for all cities together.
    df_all = pd.DataFrame()

    for city in city_data:
        city_name = city[2]
        url = city[4]
        print(f"Info: Downloading data for {city_name} with url {url}")

        r = requests.get(url)

        # Retrieve HTTP meta-data.
        if r.status_code != 200:
            print(f"Error: Request to {url} failed with status "
                  f"{r.status_code}")
            continue

        # Fetch the data locally.
        file_name = f"{city_name}_listings.csv.gz"
        with open(file_name, 'wb') as f:
            f.write(r.content)

        # Unzip and load the file to data frame.
        with gzip.open(file_name) as f:
            df = pd.read_csv(f)

            print(f"Info: Shape of data within {file_name}: {df.shape}")

            if df_all.empty:
                df_all = df
            else:
                df_all = pd.concat([df_all, df])

        print(f"Info: Shape of concatenated dataframe: {df_all.shape}")

        # Remove file
        os.remove(file_name)
        print(f"Info: Removed {file_name}!")

        # Sleep for short duration to ensure server is not loaded
        time.sleep(10)

    return df_all

In [6]:
consolidated_df = download_data()

if not consolidated_df.empty:
    print(consolidated_df.shape)

Error: URL not following the format http://data.insideairbnb.com/united-states/2016-04-18/data/listings.csv.gz
Total number of city information fetched: 27
Info: Start summary information of each city
['united-states', 'nc', 'asheville', '2019-06-26', 'http://data.insideairbnb.com/united-states/nc/asheville/2019-06-26/data/listings.csv.gz']
['united-states', 'tx', 'austin', '2019-05-14', 'http://data.insideairbnb.com/united-states/tx/austin/2019-05-14/data/listings.csv.gz']
['united-states', 'ma', 'boston', '2019-05-19', 'http://data.insideairbnb.com/united-states/ma/boston/2019-05-19/data/listings.csv.gz']
['united-states', 'fl', 'broward-county', '2019-05-19', 'http://data.insideairbnb.com/united-states/fl/broward-county/2019-05-19/data/listings.csv.gz']
['united-states', 'ma', 'cambridge', '2019-06-24', 'http://data.insideairbnb.com/united-states/ma/cambridge/2019-06-24/data/listings.csv.gz']
['united-states', 'il', 'chicago', '2019-05-19', 'http://data.insideairbnb.com/united-state

  if self.run_code(code, result):


Info: Shape of data within broward-county_listings.csv.gz: (9165, 106)
Info: Shape of concatenated dataframe: (29368, 106)
Info: Removed broward-county_listings.csv.gz!
Info: Downloading data for cambridge with url http://data.insideairbnb.com/united-states/ma/cambridge/2019-06-24/data/listings.csv.gz
Info: Shape of data within cambridge_listings.csv.gz: (1369, 106)
Info: Shape of concatenated dataframe: (30737, 106)
Info: Removed cambridge_listings.csv.gz!
Info: Downloading data for chicago with url http://data.insideairbnb.com/united-states/il/chicago/2019-05-19/data/listings.csv.gz
Info: Shape of data within chicago_listings.csv.gz: (8169, 106)
Info: Shape of concatenated dataframe: (38906, 106)
Info: Removed chicago_listings.csv.gz!
Info: Downloading data for clark-county-nv with url http://data.insideairbnb.com/united-states/nv/clark-county-nv/2019-06-25/data/listings.csv.gz


  if self.run_code(code, result):


Info: Shape of data within clark-county-nv_listings.csv.gz: (9369, 106)
Info: Shape of concatenated dataframe: (48275, 106)
Info: Removed clark-county-nv_listings.csv.gz!
Info: Downloading data for columbus with url http://data.insideairbnb.com/united-states/oh/columbus/2019-05-18/data/listings.csv.gz
Info: Shape of data within columbus_listings.csv.gz: (1363, 106)
Info: Shape of concatenated dataframe: (49638, 106)
Info: Removed columbus_listings.csv.gz!
Info: Downloading data for denver with url http://data.insideairbnb.com/united-states/co/denver/2019-05-29/data/listings.csv.gz
Info: Shape of data within denver_listings.csv.gz: (4659, 106)
Info: Shape of concatenated dataframe: (54297, 106)
Info: Removed denver_listings.csv.gz!
Info: Downloading data for hawaii with url http://data.insideairbnb.com/united-states/hi/hawaii/2019-06-03/data/listings.csv.gz


  if self.run_code(code, result):


Info: Shape of data within hawaii_listings.csv.gz: (25706, 106)
Info: Shape of concatenated dataframe: (80003, 106)
Info: Removed hawaii_listings.csv.gz!
Info: Downloading data for jersey-city with url http://data.insideairbnb.com/united-states/nj/jersey-city/2019-06-29/data/listings.csv.gz
Info: Shape of data within jersey-city_listings.csv.gz: (2877, 106)
Info: Shape of concatenated dataframe: (82880, 106)
Info: Removed jersey-city_listings.csv.gz!
Info: Downloading data for los-angeles with url http://data.insideairbnb.com/united-states/ca/los-angeles/2019-05-05/data/listings.csv.gz


  if self.run_code(code, result):


Info: Shape of data within los-angeles_listings.csv.gz: (43954, 106)
Info: Shape of concatenated dataframe: (126834, 106)
Info: Removed los-angeles_listings.csv.gz!
Info: Downloading data for nashville with url http://data.insideairbnb.com/united-states/tn/nashville/2019-06-11/data/listings.csv.gz
Info: Shape of data within nashville_listings.csv.gz: (6962, 106)
Info: Shape of concatenated dataframe: (133796, 106)
Info: Removed nashville_listings.csv.gz!
Info: Downloading data for new-orleans with url http://data.insideairbnb.com/united-states/la/new-orleans/2019-05-05/data/listings.csv.gz
Info: Shape of data within new-orleans_listings.csv.gz: (6962, 106)
Info: Shape of concatenated dataframe: (140758, 106)
Info: Removed new-orleans_listings.csv.gz!
Info: Downloading data for new-york-city with url http://data.insideairbnb.com/united-states/ny/new-york-city/2019-06-02/data/listings.csv.gz


  if self.run_code(code, result):


Info: Shape of data within new-york-city_listings.csv.gz: (48801, 106)
Info: Shape of concatenated dataframe: (189559, 106)
Info: Removed new-york-city_listings.csv.gz!
Info: Downloading data for oakland with url http://data.insideairbnb.com/united-states/ca/oakland/2019-05-18/data/listings.csv.gz
Info: Shape of data within oakland_listings.csv.gz: (3167, 106)
Info: Shape of concatenated dataframe: (192726, 106)
Info: Removed oakland_listings.csv.gz!
Info: Downloading data for pacific-grove with url http://data.insideairbnb.com/united-states/ca/pacific-grove/2019-05-31/data/listings.csv.gz
Info: Shape of data within pacific-grove_listings.csv.gz: (228, 106)
Info: Shape of concatenated dataframe: (192954, 106)
Info: Removed pacific-grove_listings.csv.gz!
Info: Downloading data for portland with url http://data.insideairbnb.com/united-states/or/portland/2019-06-07/data/listings.csv.gz
Info: Shape of data within portland_listings.csv.gz: (5585, 106)
Info: Shape of concatenated dataframe: 

In [0]:
"""
# To save the data frame and compress it
consolidated_df.to_csv('consolidated_data.csv', index = False, header=True)
!gzip consolidated_data.csv

# To load the data frame and uncompress it
!gunzip consolidated_data.csv.gz
consolidated_df = pd.read_csv("consolidated_data.csv")
"""
consolidated_df.to_csv('consolidated_data.csv', index = False, header=True)

In [23]:
consolidated_df.country.unique()

array(['United States'], dtype=object)

In [22]:
consolidated_df.country_code.unique()

array(['US'], dtype=object)

In [0]:
consolidated_df = consolidated_df[consolidated_df['country'] == 'United States']

In [24]:
consolidated_df.country.isnull().sum()

0

In [29]:
consolidated_df.city.value_counts()

Los Angeles                     27639
New York                        22077
Brooklyn                        18849
Austin                          11636
San Diego                       11579
Seattle                          8909
Washington                       8853
Las Vegas                        8479
Chicago                          8142
San Francisco                    7520
New Orleans                      6940
Nashville                        6853
Boston                           6036
Honolulu                         5826
Portland                         5576
Denver                           4603
Queens                           4231
Minneapolis                      3558
Oakland                          3047
Kihei                            2910
Jersey City                      2833
San Jose                         2774
Lahaina                          2761
Hollywood                        2543
Fort Lauderdale                  2166
Asheville                        1910
Long Beach  

In [25]:
consolidated_df.city.isnull().sum()

119

In [26]:
consolidated_df.zipcode.isnull().sum()

3134

In [30]:
# Cleans up NaN values and sends sorted list of country names.
def getcountries(df):
    country_df = df[df['country'].notnull()]
    countries = sorted(list(country_df.country.unique()))
  
    return countries

getcountries(consolidated_df)

['United States']

In [0]:
# Fetches cities for given country.
# Cleans up NaN values and sends sorted list of city names.
def get_cities_for_country(df, country):
    country_df = df[df['country'] == country]
    
    city_df = country_df[country_df['city'].notnull()]
    cities = sorted(list(city_df.city.unique()))
  
    return cities
  
cities = get_cities_for_country(consolidated_df, 'United States')
cities

In [32]:
countries = getcountries(consolidated_df)

for country in countries:
  print("-----------------------Start----------------------------")
  print(f"Finding duplicate cities in {country}")
  cities = get_cities_for_country(consolidated_df, country)
  
  # Code to find simplify finding duplicate city names.
  city_prefix_duplicates = {}
  for city in cities:
    city = city.strip()
  
    if len(city) < 3:
      print(f"{city} is less than 3 characters")
      continue
  
    city_prefix = city.lower()[:3]
  
    if city_prefix in city_prefix_duplicates:
      city_prefix_duplicates[city_prefix].add(city)
    else:
      new_set = set()
      new_set.add(city)
      city_prefix_duplicates[city_prefix] = new_set

  print(city_prefix_duplicates)
  print("-----------------------End------------------------------")

-----------------------Start----------------------------
Finding duplicate cities in United States
 is less than 3 characters
. is less than 3 characters
CA is less than 3 characters
DC is less than 3 characters
LA is less than 3 characters
La is less than 3 characters
NY is less than 3 characters
NY is less than 3 characters
Ny is less than 3 characters
P is less than 3 characters
纽约 is less than 3 characters
{'mir': {'Miramar'}, 'ast': {'Astoria', 'ASTORIA', 'Astoria, Queens', 'ASTORIA/LIC', 'Astoria,Queens', 'Astoria - New York', 'astoria', 'Astoria Queens', 'Astoria/Queens'}, 'bro': {'Broward', 'Brookdale', 'Brookyn', 'Brooklyn Park', 'Brookyln', 'Bronx New York', 'Brooklyn (Williamsburg)', 'Brooklyn, NY 11221', 'Brookly,', 'Brooklyn,  Ny 11221', 'bronx', 'Broward County', 'Brooklyn New York', 'Brooklyn NY', 'Bronx ny', 'Brooklyn,  NY', 'Brooklyn', 'Bronx', 'Bronx NY', 'brooklyn', 'Bronx, NY', 'Broome', 'Brooklyn Center', 'Brookly', 'BRONX', 'BROOKLYN', 'Bronxville', 'Brookline', '

In [15]:
# Fetches zipcodes for given city.
# Cleans up NaN values and sends sorted list of zipcode names.
def get_zipcodes_for_city(df, city):
    city_df = df[df['city'] == city]
    
    zipcode_df = city_df[city_df['zipcode'].notnull()]
    zipcodes = sorted(list(zipcode_df.zipcode.unique()))
  
    return zipcodes
  
get_zipcodes_for_city(consolidated_df, 'New Hope')

['55427', '55428']

In [33]:
consolidated_df.zipcode.isnull().sum()

3134

In [34]:
consolidated_df.zipcode.value_counts()

96815                     4181
96753                     3495
96761                     3011
92109.0                   2551
78704.0                   2315
96740                     2010
11211                     1911
92101.0                   1898
89109.0                   1746
78702.0                   1599
90291                     1593
11221                     1485
20002                     1389
33019                     1349
90046                     1285
11206                     1280
90028                     1267
20009                     1178
20001                     1166
70119                     1166
89103.0                   1159
11216                     1115
10019                     1100
37203                     1090
10002                     1089
90026                     1033
96722                     1020
70130                     1016
11238                      999
10009                      988
                          ... 
10470.0                      1
90002.0 