# NYC Airbnb Price Prediction - Data Exploration

Use the dataset published on Kaggle - https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data - to train a simple deep learning model to predict prices for Airbnb properties.

This notebook contains the code to explore the dataset.


In [1]:
# names of the seven days of the week, Monday first
valid_days = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# import seaborn as sns
import datetime
import os


In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import datetime as dt
# common imports
import zipfile
import time
# import datetime, timedelta
import datetime
from datetime import datetime, timedelta
from datetime import date
from dateutil import relativedelta
from io import StringIO
import pandas as pd
import pickle
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from io import StringIO
import requests
import json
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline 
import os
import math
from subprocess import check_output
from IPython.display import display
import logging
import yaml
from collections import Counter
import re
import os


# Load and Save Data
- read the notebook parameters from the YAML config file
- load the input CSV file into a dataframe (or reload a previously pickled copy)
- pickle dataframe for future runs

In [4]:
def get_config(config_file):
    ''' read the YAML file holding the parameters for this notebook

    The file is looked up relative to the current working directory, and
    both the working directory and the resolved file path are printed.

    Args:
        config_file: name of the YAML config file

    Returns:
        config: Python object produced by parsing the YAML file (a dictionary
        for a mapping-style config), or None when the file cannot be opened
        or parsed - in that case the error is printed rather than raised

    '''
    working_dir = os.getcwd()
    print("current directory is: " + working_dir)

    yaml_location = os.path.join(working_dir, config_file)
    print("path_to_yaml " + yaml_location)
    try:
        with open(yaml_location, 'r') as handle:
            return yaml.safe_load(handle)
    except Exception as problem:
        # best effort: report the failure and fall through to the None result
        print('Error reading the config file ' + str(problem))
    return None

In [5]:
def get_path():
    ''' build the absolute path of the directory holding the data files

    Returns:
        path: absolute path of the "data" directory that sits next to the
        current working directory

    '''
    # the working directory (where the notebook runs) and the "data" directory
    # are siblings: step up one level, then down into "data"
    notebook_dir = os.getcwd()
    data_dir = os.path.join(notebook_dir, os.pardir, 'data')
    return os.path.abspath(data_dir)


In [6]:
# define categories for input columns
def define_feature_categories(df, textcols=None, continuouscols=None, timecols=None, collist=None):
    ''' group the columns of a dataframe into categories

    The columns named in continuouscols are cast to float in the dataframe
    that is passed in (df is modified in place). Each category can be
    supplied by the caller; a category left as None falls back to the
    built-in default list.

    NOTE(review): the default names ('Incident', 'Min Delay', ...) do not
    appear among the Airbnb columns shown in this notebook's outputs, so they
    look like leftovers from another dataset - pass explicit lists when
    calling this on the Airbnb dataframe, and confirm the right grouping.

    Args:
        df: dataframe whose columns are being categorized
        textcols: names of text columns (default ['Incident','Location'])
        continuouscols: names of columns to deal with as continuous values -
            no embeddings (default ['Min Delay','Min Gap'])
        timecols: names of date/time columns (default ['Report Date','Time'])
        collist: names of the remaining columns - presumably the categorical
            ones, TODO confirm (default ['Day','Vehicle','Route','Direction'])

    Returns:
        (allcols, textcols, continuouscols, timecols, collist): tuple of
        lists, where allcols holds every column label of df

    Raises:
        KeyError: when a name in continuouscols is not a column of df

    '''
    allcols = list(df)
    print("all cols",allcols)
    # copy caller-supplied sequences so the returned lists are independent
    textcols = ['Incident','Location'] if textcols is None else list(textcols)
    continuouscols = ['Min Delay','Min Gap'] if continuouscols is None else list(continuouscols)
    timecols = ['Report Date','Time'] if timecols is None else list(timecols)
    collist = ['Day','Vehicle','Route','Direction'] if collist is None else list(collist)
    # continuous columns are used as plain numeric values, so force float dtype
    for col in continuouscols:
        df[col] = df[col].astype(float)
    print('texcols: ',textcols)
    print('continuouscols: ',continuouscols)
    print('timecols: ',timecols)
    print('collist: ',collist)
    return(allcols,textcols,continuouscols,timecols,collist)

# Load dataframe
- load the dataframe from the input CSV, or from its pickled copy when `load_from_scratch` is false
- show info about the dataset


In [7]:
def ingest_data(path,input_csv,pickled_input_dataframe,load_from_scratch):
    ''' load the dataset into a dataframe, from CSV or from a pickled copy
    Args:
        path: directory containing the input files
        input_csv: file name of the CSV version of the dataset
        pickled_input_dataframe: file name of the pickled version of the dataset
        load_from_scratch: when true read the CSV file, otherwise read the
            pickled dataframe

    Returns:
        dataframe with the contents of the selected file
    '''
    if not load_from_scratch:
        # NOTE: unpickling can execute arbitrary code - only load pickle
        # files that come from a trusted source
        pickle_location = os.path.join(path, pickled_input_dataframe)
        loaded_df = pd.read_pickle(pickle_location)
        logging.debug("reloader done")
        return loaded_df
    csv_location = os.path.join(path, input_csv)
    return pd.read_csv(csv_location)

# Main Block

In [8]:
# read the notebook parameters, locate the data directory, then load the
# dataset using the file names and load mode given in the config
config = get_config('data_airbnb_preparation_config.yml')
path = get_path()
df = ingest_data(
    path,
    config['file_names']['input_csv'],
    config['file_names']['pickled_input_dataframe'],
    config['general']['load_from_scratch'],
)


current directory is: /home/ab/dev/deepLearning/manning/airbnb_price_prediction/notebooks
path_to_yaml /home/ab/dev/deepLearning/manning/airbnb_price_prediction/notebooks/data_airbnb_preparation_config.yml


# General explorations


In [9]:
# get summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns; describe() excludes NaN values, which is why a column with
# missing entries (reviews_per_month below) reports a lower count than the rest
df.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,38843.0,48895.0,48895.0
mean,19017140.0,67620010.0,40.728949,-73.95217,152.720687,7.029962,23.274466,1.373221,7.143982,112.781327
std,10983110.0,78610970.0,0.05453,0.046157,240.15417,20.51055,44.550582,1.680442,32.952519,131.622289
min,2539.0,2438.0,40.49979,-74.24442,0.0,1.0,0.0,0.01,1.0,0.0
25%,9471945.0,7822033.0,40.6901,-73.98307,69.0,1.0,1.0,0.19,1.0,0.0
50%,19677280.0,30793820.0,40.72307,-73.95568,106.0,3.0,5.0,0.72,1.0,45.0
75%,29152180.0,107434400.0,40.763115,-73.936275,175.0,5.0,24.0,2.02,2.0,227.0
max,36487240.0,274321300.0,40.91306,-73.71299,10000.0,1250.0,629.0,58.5,327.0,365.0


In [10]:
# get a sample of the raw data; the fixed random_state makes the same ten rows
# come back on every run, so the displayed sample is reproducible
df.sample(10, random_state=42)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
28347,22028251,Harlem Master bed and bath,135944525,Tiffany,Manhattan,Harlem,40.81876,-73.9536,Private room,45,1,0,,,1,0
129,29455,ACCOMMODATIONS GALORE #1,126607,Laurine,Manhattan,Harlem,40.81618,-73.94894,Entire home/apt,120,3,155,2019-06-20,1.42,3,213
22972,18610314,2BR/2BATH Luxury-Sweeping Skylines,118124127,Lana,Brooklyn,Downtown Brooklyn,40.69156,-73.98686,Entire home/apt,179,2,5,2019-07-03,0.19,1,14
28782,22231328,Spacious Brooklyn loft in Clinton Hill,4642506,Devin,Brooklyn,Bedford-Stuyvesant,40.69216,-73.96011,Private room,275,2,17,2019-06-25,0.92,1,72
40746,31624862,Upper East Side Studio on 68th street,10371008,Anna,Manhattan,Upper East Side,40.76455,-73.95646,Entire home/apt,135,4,1,2019-06-13,1.0,2,217
24903,19964070,Artsy 1BR/1BA high wood ceilings steps from water,2036797,Catinca,Brooklyn,Williamsburg,40.71236,-73.96668,Entire home/apt,180,3,20,2019-05-08,0.91,1,0
46207,35154571,Beautiful 2BR APT in Hell's Kitchen!,147810492,Harold Huengue,Manhattan,Hell's Kitchen,40.76183,-73.9916,Entire home/apt,250,2,3,2019-06-23,3.0,1,212
26307,20970437,★Luminous and vivid room in NYC★,119828457,Marlice,Brooklyn,Bushwick,40.6871,-73.90546,Private room,110,3,26,2019-06-23,1.21,4,347
41492,32244865,LARGE Trendy Studio!!!,53931758,Lc,Brooklyn,Flatbush,40.64464,-73.96453,Entire home/apt,100,3,3,2019-06-03,0.97,1,73
33974,26898028,"ONE Room ￫￫￫20mins to TimesSQ ☆彡 COZY, COZY, ...",19303369,Hiroki,Queens,Woodside,40.74296,-73.90328,Private room,39,30,3,2019-07-01,0.29,37,32
