In [1]:
import requests
import pandas as pd
import gzip
from io import BytesIO
import logging

# WIP Get NYC area Detailed Listing/Calendar/Review Data

In [2]:
def download_and_extract(url):
    # Send a GET request to the URL
    response = requests.get(url)
    # Check if the request was successful
    if response.status_code == 200:
        # Extract filename from the URL
        filename = url.split('/')[-1].replace('.gz', '')
        # Open the gzip file
        with gzip.open(BytesIO(response.content), 'rb') as f:
            # Read the contents of the gzip file
            file_content = f.read()
        return file_content
    else:
        # Request was not successful, print error message
        print("Failed to download the file from the provided URL.")
        return None

In [3]:
file_ls = ['listings.csv.gz','calendar.csv.gz','reviews.csv.gz']
location_url_ls = ['http://data.insideairbnb.com/united-states/ny/albany/2024-01-06/data/',
                    'http://data.insideairbnb.com/united-states/nc/asheville/2023-12-18/data/',
                    'http://data.insideairbnb.com/united-states/tx/austin/2023-12-15/data/',
                    'http://data.insideairbnb.com/united-states/ma/boston/2023-12-18/data/',
                    'http://data.insideairbnb.com/united-states/mt/bozeman/2023-12-10/data/',
                    'http://data.insideairbnb.com/united-states/fl/broward-county/2023-12-25/data/',
                    'http://data.insideairbnb.com/united-states/ma/cambridge/2023-12-26/data/',
                    'http://data.insideairbnb.com/united-states/il/chicago/2023-12-18/data/',
                    'http://data.insideairbnb.com/united-states/nv/clark-county-nv/2023-12-20/data/',
                    'http://data.insideairbnb.com/united-states/oh/columbus/2023-12-25/data/',
                    'http://data.insideairbnb.com/united-states/tx/dallas/2023-12-18/data/',
                    'http://data.insideairbnb.com/united-states/co/denver/2023-12-29/data/',
                    'http://data.insideairbnb.com/united-states/tx/fort-worth/2023-12-15/data/',
                    'http://data.insideairbnb.com/united-states/hi/hawaii/2023-12-15/data/',
                    'http://data.insideairbnb.com/united-states/nj/jersey-city/2023-12-22/data/',
                    'http://data.insideairbnb.com/united-states/ca/los-angeles/2023-12-03/data/',
                    'http://data.insideairbnb.com/united-states/tn/nashville/2023-12-18/data/',
                    'http://data.insideairbnb.com/united-states/la/new-orleans/2023-12-03/data/',
                    'http://data.insideairbnb.com/united-states/ny/new-york-city/2024-02-06/data/',
                    'http://data.insideairbnb.com/united-states/nj/newark/2023-12-30/data/',
                    'http://data.insideairbnb.com/united-states/ca/oakland/2023-12-20/data/',
                    'http://data.insideairbnb.com/united-states/ca/pacific-grove/2023-12-31/data/',
                    'http://data.insideairbnb.com/united-states/or/portland/2023-12-20/data/',
                    'http://data.insideairbnb.com/united-states/ri/rhode-island/2023-12-30/data/',
                    'http://data.insideairbnb.com/united-states/ny/rochester/2023-12-20/data/',
                    'http://data.insideairbnb.com/united-states/or/salem-or/2023-12-22/data/',
                    'http://data.insideairbnb.com/united-states/ca/san-diego/2023-12-04/data/',
                    'http://data.insideairbnb.com/united-states/ca/san-francisco/2023-12-04/data/',
                    'http://data.insideairbnb.com/united-states/ca/san-mateo-county/2023-12-22/data/',
                    'http://data.insideairbnb.com/united-states/ca/santa-clara-county/2023-12-23/data/',
                    'http://data.insideairbnb.com/united-states/ca/santa-cruz-county/2023-12-30/data/',
                    'http://data.insideairbnb.com/united-states/wa/seattle/2023-12-20/data/',
                    'http://data.insideairbnb.com/united-states/mn/twin-cities-msa/2023-12-20/data/',
                    'http://data.insideairbnb.com/united-states/dc/washington-dc/2023-12-18/data/']

In [4]:
# Set up logging configuration
logging.basicConfig(filename='GetData.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

for file_type in file_ls:
    us_data = None
    
    for loc_url in location_url_ls:
        state = loc_url.split('/')[-5]
        city = loc_url.split('/')[-4]
        data_date = loc_url.split('/')[-3]
        
        # crawler
        url = loc_url+file_type
        csv_file = download_and_extract(url)
        if csv_file:
            # Load the CSV file into a Pandas DataFrame
            df = pd.read_csv(BytesIO(csv_file))
            
            # data process
            df['state'] = state
            df['city'] = city
            df['data_date'] = data_date
            if us_data is None:
                us_data = df.copy()
            else:
                us_data = pd.concat([us_data, df], ignore_index=True)
            
            logging.info(url+f'; {df.shape[0]} rows')
            print('Success: ',f'{df.shape[0]} rows, ',url)
        else:
            print('Failure: ',url)
            logging.error(url)
    us_data.to_csv('data/'+url.split('/')[-1].replace('.gz', ''),index=False)

Success:  410 rows,  http://data.insideairbnb.com/united-states/ny/albany/2024-01-06/data/listings.csv.gz
Success:  3329 rows,  http://data.insideairbnb.com/united-states/nc/asheville/2023-12-18/data/listings.csv.gz
Success:  15419 rows,  http://data.insideairbnb.com/united-states/tx/austin/2023-12-15/data/listings.csv.gz
Success:  4204 rows,  http://data.insideairbnb.com/united-states/ma/boston/2023-12-18/data/listings.csv.gz
Success:  594 rows,  http://data.insideairbnb.com/united-states/mt/bozeman/2023-12-10/data/listings.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  18230 rows,  http://data.insideairbnb.com/united-states/fl/broward-county/2023-12-25/data/listings.csv.gz
Success:  1130 rows,  http://data.insideairbnb.com/united-states/ma/cambridge/2023-12-26/data/listings.csv.gz
Success:  8949 rows,  http://data.insideairbnb.com/united-states/il/chicago/2023-12-18/data/listings.csv.gz
Success:  17414 rows,  http://data.insideairbnb.com/united-states/nv/clark-county-nv/2023-12-20/data/listings.csv.gz
Success:  2668 rows,  http://data.insideairbnb.com/united-states/oh/columbus/2023-12-25/data/listings.csv.gz
Success:  5642 rows,  http://data.insideairbnb.com/united-states/tx/dallas/2023-12-18/data/listings.csv.gz
Success:  4971 rows,  http://data.insideairbnb.com/united-states/co/denver/2023-12-29/data/listings.csv.gz
Success:  1871 rows,  http://data.insideairbnb.com/united-states/tx/fort-worth/2023-12-15/data/listings.csv.gz
Success:  34040 rows,  http://data.insideairbnb.com/united-states/hi/hawaii/2023-12-15/data/listings.csv.gz
Success

  df = pd.read_csv(BytesIO(csv_file))


Success:  1215271 rows,  http://data.insideairbnb.com/united-states/nc/asheville/2023-12-18/data/calendar.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  5628526 rows,  http://data.insideairbnb.com/united-states/tx/austin/2023-12-15/data/calendar.csv.gz
Success:  1534396 rows,  http://data.insideairbnb.com/united-states/ma/boston/2023-12-18/data/calendar.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  216810 rows,  http://data.insideairbnb.com/united-states/mt/bozeman/2023-12-10/data/calendar.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  6655138 rows,  http://data.insideairbnb.com/united-states/fl/broward-county/2023-12-25/data/calendar.csv.gz
Success:  412456 rows,  http://data.insideairbnb.com/united-states/ma/cambridge/2023-12-26/data/calendar.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  3266145 rows,  http://data.insideairbnb.com/united-states/il/chicago/2023-12-18/data/calendar.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  6357331 rows,  http://data.insideairbnb.com/united-states/nv/clark-county-nv/2023-12-20/data/calendar.csv.gz
Success:  974078 rows,  http://data.insideairbnb.com/united-states/oh/columbus/2023-12-25/data/calendar.csv.gz
Success:  2059330 rows,  http://data.insideairbnb.com/united-states/tx/dallas/2023-12-18/data/calendar.csv.gz
Success:  1814226 rows,  http://data.insideairbnb.com/united-states/co/denver/2023-12-29/data/calendar.csv.gz
Success:  682915 rows,  http://data.insideairbnb.com/united-states/tx/fort-worth/2023-12-15/data/calendar.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  12421910 rows,  http://data.insideairbnb.com/united-states/hi/hawaii/2023-12-15/data/calendar.csv.gz
Success:  565501 rows,  http://data.insideairbnb.com/united-states/nj/jersey-city/2023-12-22/data/calendar.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  16643832 rows,  http://data.insideairbnb.com/united-states/ca/los-angeles/2023-12-03/data/calendar.csv.gz
Success:  3229903 rows,  http://data.insideairbnb.com/united-states/tn/nashville/2023-12-18/data/calendar.csv.gz
Success:  2582375 rows,  http://data.insideairbnb.com/united-states/la/new-orleans/2023-12-03/data/calendar.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  14299870 rows,  http://data.insideairbnb.com/united-states/ny/new-york-city/2024-02-06/data/calendar.csv.gz
Success:  620643 rows,  http://data.insideairbnb.com/united-states/nj/newark/2023-12-30/data/calendar.csv.gz
Success:  1003542 rows,  http://data.insideairbnb.com/united-states/ca/oakland/2023-12-20/data/calendar.csv.gz
Success:  87235 rows,  http://data.insideairbnb.com/united-states/ca/pacific-grove/2023-12-31/data/calendar.csv.gz
Success:  1804925 rows,  http://data.insideairbnb.com/united-states/or/portland/2023-12-20/data/calendar.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  1766450 rows,  http://data.insideairbnb.com/united-states/ri/rhode-island/2023-12-30/data/calendar.csv.gz
Success:  378505 rows,  http://data.insideairbnb.com/united-states/ny/rochester/2023-12-20/data/calendar.csv.gz
Success:  122640 rows,  http://data.insideairbnb.com/united-states/or/salem-or/2023-12-22/data/calendar.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  4680308 rows,  http://data.insideairbnb.com/united-states/ca/san-diego/2023-12-04/data/calendar.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  2940438 rows,  http://data.insideairbnb.com/united-states/ca/san-francisco/2023-12-04/data/calendar.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  1288727 rows,  http://data.insideairbnb.com/united-states/ca/san-mateo-county/2023-12-22/data/calendar.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  2536020 rows,  http://data.insideairbnb.com/united-states/ca/santa-clara-county/2023-12-23/data/calendar.csv.gz
Success:  602615 rows,  http://data.insideairbnb.com/united-states/ca/santa-cruz-county/2023-12-30/data/calendar.csv.gz
Success:  2512306 rows,  http://data.insideairbnb.com/united-states/wa/seattle/2023-12-20/data/calendar.csv.gz
Success:  1994360 rows,  http://data.insideairbnb.com/united-states/mn/twin-cities-msa/2023-12-20/data/calendar.csv.gz
Success:  2500945 rows,  http://data.insideairbnb.com/united-states/dc/washington-dc/2023-12-18/data/calendar.csv.gz
Success:  21879 rows,  http://data.insideairbnb.com/united-states/ny/albany/2024-01-06/data/reviews.csv.gz
Success:  335803 rows,  http://data.insideairbnb.com/united-states/nc/asheville/2023-12-18/data/reviews.csv.gz
Success:  586441 rows,  http://data.insideairbnb.com/united-states/tx/austin/2023-12-15/data/reviews.csv.gz
Success:  182482 rows,  http://data.insideairbnb.com/united-states/ma/boston/2023-12-

BadGzipFile: Not a gzipped file (b'id')

# Get Detailed Listing/Calendar/Review Data

In [2]:
def download_and_extract(url):
    # Send a GET request to the URL
    response = requests.get(url)
    # Check if the request was successful
    if response.status_code == 200:
        # Extract filename from the URL
        filename = url.split('/')[-1].replace('.gz', '')
        # Open the gzip file
        with gzip.open(BytesIO(response.content), 'rb') as f:
            # Read the contents of the gzip file
            file_content = f.read()
        return file_content
    else:
        # Request was not successful, print error message
        print("Failed to download the file from the provided URL.")
        return None

In [3]:
file_ls = ['listings.csv.gz','calendar.csv.gz','reviews.csv.gz']
location_url_ls = ['http://data.insideairbnb.com/united-states/ny/albany/2024-01-06/data/',
                    'http://data.insideairbnb.com/united-states/nc/asheville/2023-12-18/data/',
                    'http://data.insideairbnb.com/united-states/tx/austin/2023-12-15/data/',
                    'http://data.insideairbnb.com/united-states/ma/boston/2023-12-18/data/',
                    'http://data.insideairbnb.com/united-states/mt/bozeman/2023-12-10/data/',
                    'http://data.insideairbnb.com/united-states/fl/broward-county/2023-12-25/data/',
                    'http://data.insideairbnb.com/united-states/ma/cambridge/2023-12-26/data/',
                    'http://data.insideairbnb.com/united-states/il/chicago/2023-12-18/data/',
                    'http://data.insideairbnb.com/united-states/nv/clark-county-nv/2023-12-20/data/',
                    'http://data.insideairbnb.com/united-states/oh/columbus/2023-12-25/data/',
                    'http://data.insideairbnb.com/united-states/tx/dallas/2023-12-18/data/',
                    'http://data.insideairbnb.com/united-states/co/denver/2023-12-29/data/',
                    'http://data.insideairbnb.com/united-states/tx/fort-worth/2023-12-15/data/',
                    'http://data.insideairbnb.com/united-states/hi/hawaii/2023-12-15/data/',
                    'http://data.insideairbnb.com/united-states/nj/jersey-city/2023-12-22/data/',
                    'http://data.insideairbnb.com/united-states/ca/los-angeles/2023-12-03/data/',
                    'http://data.insideairbnb.com/united-states/tn/nashville/2023-12-18/data/',
                    'http://data.insideairbnb.com/united-states/la/new-orleans/2023-12-03/data/',
                    'http://data.insideairbnb.com/united-states/ny/new-york-city/2024-02-06/data/',
                    'http://data.insideairbnb.com/united-states/nj/newark/2023-12-30/data/',
                    'http://data.insideairbnb.com/united-states/ca/oakland/2023-12-20/data/',
                    'http://data.insideairbnb.com/united-states/ca/pacific-grove/2023-12-31/data/',
                    'http://data.insideairbnb.com/united-states/or/portland/2023-12-20/data/',
                    'http://data.insideairbnb.com/united-states/ri/rhode-island/2023-12-30/data/',
                    'http://data.insideairbnb.com/united-states/ny/rochester/2023-12-20/data/',
                    'http://data.insideairbnb.com/united-states/or/salem-or/2023-12-22/data/',
                    'http://data.insideairbnb.com/united-states/ca/san-diego/2023-12-04/data/',
                    'http://data.insideairbnb.com/united-states/ca/san-francisco/2023-12-04/data/',
                    'http://data.insideairbnb.com/united-states/ca/san-mateo-county/2023-12-22/data/',
                    'http://data.insideairbnb.com/united-states/ca/santa-clara-county/2023-12-23/data/',
                    'http://data.insideairbnb.com/united-states/ca/santa-cruz-county/2023-12-30/data/',
                    'http://data.insideairbnb.com/united-states/wa/seattle/2023-12-20/data/',
                    'http://data.insideairbnb.com/united-states/mn/twin-cities-msa/2023-12-20/data/',
                    'http://data.insideairbnb.com/united-states/dc/washington-dc/2023-12-18/data/']

In [4]:
# Set up logging configuration
logging.basicConfig(filename='GetData.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

for file_type in file_ls:
    us_data = None
    
    for loc_url in location_url_ls:
        state = loc_url.split('/')[-5]
        city = loc_url.split('/')[-4]
        data_date = loc_url.split('/')[-3]
        
        # crawler
        url = loc_url+file_type
        csv_file = download_and_extract(url)
        if csv_file:
            # Load the CSV file into a Pandas DataFrame
            df = pd.read_csv(BytesIO(csv_file))
            
            # data process
            df['state'] = state
            df['city'] = city
            df['data_date'] = data_date
            if us_data is None:
                us_data = df.copy()
            else:
                us_data = pd.concat([us_data, df], ignore_index=True)
            
            logging.info(url+f'; {df.shape[0]} rows')
            print('Success: ',f'{df.shape[0]} rows, ',url)
        else:
            print('Failure: ',url)
            logging.error(url)
    us_data.to_csv('data/'+url.split('/')[-1].replace('.gz', ''),index=False)

Success:  410 rows,  http://data.insideairbnb.com/united-states/ny/albany/2024-01-06/data/listings.csv.gz
Success:  3329 rows,  http://data.insideairbnb.com/united-states/nc/asheville/2023-12-18/data/listings.csv.gz
Success:  15419 rows,  http://data.insideairbnb.com/united-states/tx/austin/2023-12-15/data/listings.csv.gz
Success:  4204 rows,  http://data.insideairbnb.com/united-states/ma/boston/2023-12-18/data/listings.csv.gz
Success:  594 rows,  http://data.insideairbnb.com/united-states/mt/bozeman/2023-12-10/data/listings.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  18230 rows,  http://data.insideairbnb.com/united-states/fl/broward-county/2023-12-25/data/listings.csv.gz
Success:  1130 rows,  http://data.insideairbnb.com/united-states/ma/cambridge/2023-12-26/data/listings.csv.gz
Success:  8949 rows,  http://data.insideairbnb.com/united-states/il/chicago/2023-12-18/data/listings.csv.gz
Success:  17414 rows,  http://data.insideairbnb.com/united-states/nv/clark-county-nv/2023-12-20/data/listings.csv.gz
Success:  2668 rows,  http://data.insideairbnb.com/united-states/oh/columbus/2023-12-25/data/listings.csv.gz
Success:  5642 rows,  http://data.insideairbnb.com/united-states/tx/dallas/2023-12-18/data/listings.csv.gz
Success:  4971 rows,  http://data.insideairbnb.com/united-states/co/denver/2023-12-29/data/listings.csv.gz
Success:  1871 rows,  http://data.insideairbnb.com/united-states/tx/fort-worth/2023-12-15/data/listings.csv.gz
Success:  34040 rows,  http://data.insideairbnb.com/united-states/hi/hawaii/2023-12-15/data/listings.csv.gz
Success

  df = pd.read_csv(BytesIO(csv_file))


Success:  1215271 rows,  http://data.insideairbnb.com/united-states/nc/asheville/2023-12-18/data/calendar.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  5628526 rows,  http://data.insideairbnb.com/united-states/tx/austin/2023-12-15/data/calendar.csv.gz
Success:  1534396 rows,  http://data.insideairbnb.com/united-states/ma/boston/2023-12-18/data/calendar.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  216810 rows,  http://data.insideairbnb.com/united-states/mt/bozeman/2023-12-10/data/calendar.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  6655138 rows,  http://data.insideairbnb.com/united-states/fl/broward-county/2023-12-25/data/calendar.csv.gz
Success:  412456 rows,  http://data.insideairbnb.com/united-states/ma/cambridge/2023-12-26/data/calendar.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  3266145 rows,  http://data.insideairbnb.com/united-states/il/chicago/2023-12-18/data/calendar.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  6357331 rows,  http://data.insideairbnb.com/united-states/nv/clark-county-nv/2023-12-20/data/calendar.csv.gz
Success:  974078 rows,  http://data.insideairbnb.com/united-states/oh/columbus/2023-12-25/data/calendar.csv.gz
Success:  2059330 rows,  http://data.insideairbnb.com/united-states/tx/dallas/2023-12-18/data/calendar.csv.gz
Success:  1814226 rows,  http://data.insideairbnb.com/united-states/co/denver/2023-12-29/data/calendar.csv.gz
Success:  682915 rows,  http://data.insideairbnb.com/united-states/tx/fort-worth/2023-12-15/data/calendar.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  12421910 rows,  http://data.insideairbnb.com/united-states/hi/hawaii/2023-12-15/data/calendar.csv.gz
Success:  565501 rows,  http://data.insideairbnb.com/united-states/nj/jersey-city/2023-12-22/data/calendar.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  16643832 rows,  http://data.insideairbnb.com/united-states/ca/los-angeles/2023-12-03/data/calendar.csv.gz
Success:  3229903 rows,  http://data.insideairbnb.com/united-states/tn/nashville/2023-12-18/data/calendar.csv.gz
Success:  2582375 rows,  http://data.insideairbnb.com/united-states/la/new-orleans/2023-12-03/data/calendar.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  14299870 rows,  http://data.insideairbnb.com/united-states/ny/new-york-city/2024-02-06/data/calendar.csv.gz
Success:  620643 rows,  http://data.insideairbnb.com/united-states/nj/newark/2023-12-30/data/calendar.csv.gz
Success:  1003542 rows,  http://data.insideairbnb.com/united-states/ca/oakland/2023-12-20/data/calendar.csv.gz
Success:  87235 rows,  http://data.insideairbnb.com/united-states/ca/pacific-grove/2023-12-31/data/calendar.csv.gz
Success:  1804925 rows,  http://data.insideairbnb.com/united-states/or/portland/2023-12-20/data/calendar.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  1766450 rows,  http://data.insideairbnb.com/united-states/ri/rhode-island/2023-12-30/data/calendar.csv.gz
Success:  378505 rows,  http://data.insideairbnb.com/united-states/ny/rochester/2023-12-20/data/calendar.csv.gz
Success:  122640 rows,  http://data.insideairbnb.com/united-states/or/salem-or/2023-12-22/data/calendar.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  4680308 rows,  http://data.insideairbnb.com/united-states/ca/san-diego/2023-12-04/data/calendar.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  2940438 rows,  http://data.insideairbnb.com/united-states/ca/san-francisco/2023-12-04/data/calendar.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  1288727 rows,  http://data.insideairbnb.com/united-states/ca/san-mateo-county/2023-12-22/data/calendar.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  2536020 rows,  http://data.insideairbnb.com/united-states/ca/santa-clara-county/2023-12-23/data/calendar.csv.gz
Success:  602615 rows,  http://data.insideairbnb.com/united-states/ca/santa-cruz-county/2023-12-30/data/calendar.csv.gz
Success:  2512306 rows,  http://data.insideairbnb.com/united-states/wa/seattle/2023-12-20/data/calendar.csv.gz
Success:  1994360 rows,  http://data.insideairbnb.com/united-states/mn/twin-cities-msa/2023-12-20/data/calendar.csv.gz
Success:  2500945 rows,  http://data.insideairbnb.com/united-states/dc/washington-dc/2023-12-18/data/calendar.csv.gz
Success:  21879 rows,  http://data.insideairbnb.com/united-states/ny/albany/2024-01-06/data/reviews.csv.gz
Success:  335803 rows,  http://data.insideairbnb.com/united-states/nc/asheville/2023-12-18/data/reviews.csv.gz
Success:  586441 rows,  http://data.insideairbnb.com/united-states/tx/austin/2023-12-15/data/reviews.csv.gz
Success:  182482 rows,  http://data.insideairbnb.com/united-states/ma/boston/2023-12-

BadGzipFile: Not a gzipped file (b'id')

# Get summary information

In [6]:
file_ls = ['listings.csv','reviews.csv']
location_url_ls = ['http://data.insideairbnb.com/united-states/ny/albany/2024-01-06/data/',
                    'http://data.insideairbnb.com/united-states/nc/asheville/2023-12-18/data/',
                    'http://data.insideairbnb.com/united-states/tx/austin/2023-12-15/data/',
                    'http://data.insideairbnb.com/united-states/ma/boston/2023-12-18/data/',
                    'http://data.insideairbnb.com/united-states/mt/bozeman/2023-12-10/data/',
                    'http://data.insideairbnb.com/united-states/fl/broward-county/2023-12-25/data/',
                    'http://data.insideairbnb.com/united-states/ma/cambridge/2023-12-26/data/',
                    'http://data.insideairbnb.com/united-states/il/chicago/2023-12-18/data/',
                    'http://data.insideairbnb.com/united-states/nv/clark-county-nv/2023-12-20/data/',
                    'http://data.insideairbnb.com/united-states/oh/columbus/2023-12-25/data/',
                    'http://data.insideairbnb.com/united-states/tx/dallas/2023-12-18/data/',
                    'http://data.insideairbnb.com/united-states/co/denver/2023-12-29/data/',
                    'http://data.insideairbnb.com/united-states/tx/fort-worth/2023-12-15/data/',
                    'http://data.insideairbnb.com/united-states/hi/hawaii/2023-12-15/data/',
                    'http://data.insideairbnb.com/united-states/nj/jersey-city/2023-12-22/data/',
                    'http://data.insideairbnb.com/united-states/ca/los-angeles/2023-12-03/data/',
                    'http://data.insideairbnb.com/united-states/tn/nashville/2023-12-18/data/',
                    'http://data.insideairbnb.com/united-states/la/new-orleans/2023-12-03/data/',
                    'http://data.insideairbnb.com/united-states/ny/new-york-city/2024-02-06/data/',
                    'http://data.insideairbnb.com/united-states/nj/newark/2023-12-30/data/',
                    'http://data.insideairbnb.com/united-states/ca/oakland/2023-12-20/data/',
                    'http://data.insideairbnb.com/united-states/ca/pacific-grove/2023-12-31/data/',
                    'http://data.insideairbnb.com/united-states/or/portland/2023-12-20/data/',
                    'http://data.insideairbnb.com/united-states/ri/rhode-island/2023-12-30/data/',
                    'http://data.insideairbnb.com/united-states/ny/rochester/2023-12-20/data/',
                    'http://data.insideairbnb.com/united-states/or/salem-or/2023-12-22/data/',
                    'http://data.insideairbnb.com/united-states/ca/san-diego/2023-12-04/data/',
                    'http://data.insideairbnb.com/united-states/ca/san-francisco/2023-12-04/data/',
                    'http://data.insideairbnb.com/united-states/ca/san-mateo-county/2023-12-22/data/',
                    'http://data.insideairbnb.com/united-states/ca/santa-clara-county/2023-12-23/data/',
                    'http://data.insideairbnb.com/united-states/ca/santa-cruz-county/2023-12-30/data/',
                    'http://data.insideairbnb.com/united-states/wa/seattle/2023-12-20/data/',
                    'http://data.insideairbnb.com/united-states/mn/twin-cities-msa/2023-12-20/data/',
                    'http://data.insideairbnb.com/united-states/dc/washington-dc/2023-12-18/data/']

In [8]:
# Set up logging configuration
logging.basicConfig(filename='GetData_Summary.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

for file_type in file_ls:
    us_data = None
    
    for loc_url in location_url_ls:
        state = loc_url.split('/')[-5]
        city = loc_url.split('/')[-4]
        data_date = loc_url.split('/')[-3]
        
        # crawler
        url = loc_url+file_type
        csv_file = pd.read_csv(url)
        if csv_file is not None:
            # Load the CSV file into a Pandas DataFrame
            df = csv_file
            
            # data process
            df['state'] = state
            df['city'] = city
            df['data_date'] = data_date
            if us_data is None:
                us_data = df.copy()
            else:
                us_data = pd.concat([us_data, df], ignore_index=True)
            
            logging.info(url+f'; {df.shape[0]} rows')
            print('Success: ',f'{df.shape[0]} rows, ',url)
        else:
            print('Failure: ',url)
            logging.error(url)
    us_data.to_csv('data/SummaryData/'+url.split('/')[-1],index=False)

Success:  410 rows,  http://data.insideairbnb.com/united-states/ny/albany/2024-01-06/data/listings.csv
Success:  3329 rows,  http://data.insideairbnb.com/united-states/nc/asheville/2023-12-18/data/listings.csv
Success:  15419 rows,  http://data.insideairbnb.com/united-states/tx/austin/2023-12-15/data/listings.csv
Success:  4204 rows,  http://data.insideairbnb.com/united-states/ma/boston/2023-12-18/data/listings.csv
Success:  594 rows,  http://data.insideairbnb.com/united-states/mt/bozeman/2023-12-10/data/listings.csv


  csv_file = pd.read_csv(url)


Success:  18230 rows,  http://data.insideairbnb.com/united-states/fl/broward-county/2023-12-25/data/listings.csv
Success:  1130 rows,  http://data.insideairbnb.com/united-states/ma/cambridge/2023-12-26/data/listings.csv
Success:  8949 rows,  http://data.insideairbnb.com/united-states/il/chicago/2023-12-18/data/listings.csv
Success:  17414 rows,  http://data.insideairbnb.com/united-states/nv/clark-county-nv/2023-12-20/data/listings.csv
Success:  2668 rows,  http://data.insideairbnb.com/united-states/oh/columbus/2023-12-25/data/listings.csv
Success:  5642 rows,  http://data.insideairbnb.com/united-states/tx/dallas/2023-12-18/data/listings.csv
Success:  4971 rows,  http://data.insideairbnb.com/united-states/co/denver/2023-12-29/data/listings.csv
Success:  1871 rows,  http://data.insideairbnb.com/united-states/tx/fort-worth/2023-12-15/data/listings.csv
Success:  34040 rows,  http://data.insideairbnb.com/united-states/hi/hawaii/2023-12-15/data/listings.csv
Success:  1549 rows,  http://data.

HTTPError: HTTP Error 404: Not Found

# Test

In [3]:
# test data
file_ls = ['listings.csv.gz','calendar.csv.gz']
location_url_ls = ['http://data.insideairbnb.com/united-states/ny/albany/2024-01-06/data/',
                    'http://data.insideairbnb.com/united-states/nc/asheville/2023-12-18/data/']

In [5]:
# Set up logging configuration
logging.basicConfig(filename='TestGetData.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

for file_type in file_ls:
    us_data = None
    
    for loc_url in location_url_ls:
        state = loc_url.split('/')[-5]
        city = loc_url.split('/')[-4]
        data_date = loc_url.split('/')[-3]
        
        # crawler
        url = loc_url+file_type
        csv_file = download_and_extract(url)
        if csv_file:
            # Load the CSV file into a Pandas DataFrame
            df = pd.read_csv(BytesIO(csv_file))
            
            # data process
            df['state'] = state
            df['city'] = city
            df['data_date'] = data_date
            if us_data is None:
                us_data = df.copy()
            else:
                us_data = pd.concat([us_data, df], ignore_index=True)
            
            logging.info(url+f'; {df.shape[0]} rows')
            print('Success: ',f'{df.shape[0]} rows, ',url)
        else:
            print('Failure: ',url)
            logging.error(url)
    us_data.to_csv('Testdata/'+url.split('/')[-1].replace('.gz', ''),index=False)

Success:  410 rows,  http://data.insideairbnb.com/united-states/ny/albany/2024-01-06/data/listings.csv.gz
Success:  3329 rows,  http://data.insideairbnb.com/united-states/nc/asheville/2023-12-18/data/listings.csv.gz
Success:  149650 rows,  http://data.insideairbnb.com/united-states/ny/albany/2024-01-06/data/calendar.csv.gz


  df = pd.read_csv(BytesIO(csv_file))


Success:  1215271 rows,  http://data.insideairbnb.com/united-states/nc/asheville/2023-12-18/data/calendar.csv.gz


In [7]:
test_df1 = pd.read_csv('TestData/listings.csv')
test_df1.shape

(3739, 78)

In [9]:
test_df1.head(5)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,state,city,data_date
0,47828453,https://www.airbnb.com/rooms/47828453,20240106043908,2024-01-06,city scrape,Rental unit in Albany · ★4.40 · 2 bedrooms · 3...,,,https://a0.muscache.com/pictures/47b30ae3-42bc...,299638115,...,,f,3,3,0,0,0.42,ny,albany,2024-01-06
1,13711253,https://www.airbnb.com/rooms/13711253,20240106043908,2024-01-06,city scrape,Home in Albany · ★4.77 · 1 bedroom · 1 bed · 1...,,2 block away from Honest Weight Co Op organic ...,https://a0.muscache.com/pictures/0dfbddb7-b6eb...,61700428,...,,f,3,0,3,0,2.45,ny,albany,2024-01-06
2,684100173908872919,https://www.airbnb.com/rooms/684100173908872919,20240106043908,2024-01-06,city scrape,Condo in Albany · ★4.40 · 1 bedroom · 2 beds ·...,,,https://a0.muscache.com/pictures/dad915ce-0619...,1176120,...,,f,3,3,0,0,4.2,ny,albany,2024-01-06
3,48124042,https://www.airbnb.com/rooms/48124042,20240106043908,2024-01-06,city scrape,Rental unit in Albany · ★4.89 · 2 bedrooms · 3...,,Downtown Albany - Corner of North Pearl and Ma...,https://a0.muscache.com/pictures/miso/Hosting-...,385664127,...,,f,2,2,0,0,5.54,ny,albany,2024-01-06
4,588298411730694861,https://www.airbnb.com/rooms/588298411730694861,20240106043908,2024-01-06,city scrape,Rental unit in Albany · ★5.0 · 1 bedroom · 1 b...,,,https://a0.muscache.com/pictures/b93dc43c-6bd4...,353498132,...,,f,5,5,0,0,0.48,ny,albany,2024-01-06


In [10]:
test_df2 = pd.read_csv('TestData/calendar.csv')
test_df2.shape

  test_df2 = pd.read_csv('TestData/calendar.csv')


(1364921, 10)

In [11]:
test_df2.head(5)

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights,state,city,data_date
0,1489424,2024-01-06,f,$50.00,,1,1125,ny,albany,2024-01-06
1,1489424,2024-01-07,f,$50.00,,1,1125,ny,albany,2024-01-06
2,1489424,2024-01-08,f,$50.00,,1,1125,ny,albany,2024-01-06
3,1489424,2024-01-09,f,$50.00,,1,1125,ny,albany,2024-01-06
4,1489424,2024-01-10,f,$50.00,,1,1125,ny,albany,2024-01-06
