# Setup

In [29]:
import pandas as pd
import numpy as np

In [27]:
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

# Read data

In [33]:
locations = [
    {
        'city': 'los_angeles',
        'state': 'california',
        'country': 'united_states'
    },
    {
        'city': 'san_diego',
        'state': 'california',
        'country': 'united_states'
    },
    {
        'city': 'chicago',
        'state': 'illinois',
        'country': 'united_states'
    },
    {
        'city': 'austin',
        'state': 'texas',
        'country': 'united_states'
    },
    {
        'city': 'dallas',
        'state': 'texas',
        'country': 'united_states'
    }
]

raw_df = pd.DataFrame()
for location in locations:
    aux_df = pd.read_csv(f"../data/raw/listings-{location['country']}-{location['state']}-{location['city']}.csv.gz",
                         compression='gzip')
    
    aux_df['country'] = location['country']
    aux_df['state'] = location['state']
    aux_df['city'] = location['city']

    raw_df = pd.concat([aux_df,  raw_df])

# Data profiling

In [34]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 86295 entries, 0 to 45532
Data columns (total 78 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            86295 non-null  int64  
 1   listing_url                                   86295 non-null  object 
 2   scrape_id                                     86295 non-null  int64  
 3   last_scraped                                  86295 non-null  object 
 4   source                                        86295 non-null  object 
 5   name                                          86294 non-null  object 
 6   description                                   83816 non-null  object 
 7   neighborhood_overview                         48031 non-null  object 
 8   picture_url                                   86294 non-null  object 
 9   host_id                                       86295 non-null  int6

In [35]:
raw_df.dtypes.unique()

array([dtype('int64'), dtype('O'), dtype('float64')], dtype=object)

In [36]:
raw_df.describe(include=['int64','float64'], percentiles=np.arange(0.1,1,0.1)).T

Unnamed: 0,count,mean,std,min,10%,20%,30%,40%,50%,60%,70%,80%,90%,max
id,86295.0,5.189065298842653e+17,4.8863888016661824e+17,6.0,14430280.4,27694820.6,41935631.8,51521660.6,6.426237489052054e+17,7.944577870927692e+17,9.303202384549535e+17,1.0474456356302212e+18,1.1507383305325245e+18,1.2441483965922324e+18
scrape_id,86295.0,20240834316409.414,119151661.3292,20240621025845.0,20240624164233.0,20240624164233.0,20240904164210.0,20240904164210.0,20240904164210.0,20240904164210.0,20240904164210.0,20240904164210.0,20240913025540.0,20240913025540.0
host_id,86295.0,190077911.3002,187002790.2394,23.0,5943134.4,18361158.8,36534353.0,67108209.0,110271788.0,178113872.0,283808397.8,414861542.0,492305565.0,651348256.0
host_listings_count,86291.0,121.5143,634.4123,0.0,1.0,1.0,1.0,2.0,3.0,5.0,10.0,22.0,63.0,4680.0
host_total_listings_count,86291.0,163.7323,783.7427,0.0,1.0,1.0,2.0,3.0,5.0,8.0,15.0,32.0,96.0,9019.0
latitude,86295.0,33.8515,2.9118,30.0784,30.2747,32.7153,32.7923,33.7781,33.9924,34.047,34.0831,34.1276,34.5645,42.0222
longitude,86295.0,-110.393,11.2583,-118.9171,-118.4453,-118.3749,-118.3213,-118.2235,-117.9794,-117.1773,-97.7847,-97.7184,-96.7562,-87.5284
accommodates,86295.0,4.431,3.1185,1.0,2.0,2.0,2.0,3.0,4.0,4.0,6.0,6.0,8.0,16.0
bathrooms,72734.0,1.6351,1.1096,0.0,1.0,1.0,1.0,1.0,1.0,1.5,2.0,2.0,3.0,50.0
bedrooms,82064.0,1.8692,1.3557,0.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,3.0,4.0,50.0


In [37]:
raw_df.describe(include=['object']).T

Unnamed: 0,count,unique,top,freq
listing_url,86295,86295,https://www.airbnb.com/rooms/37853838,1
last_scraped,86295,11,2024-09-05,38312
source,86295,2,city scrape,72755
name,86294,83396,Wyndham Austin Resort|1BR/1BA King Bed w/ Gym&...,41
description,83816,72967,Keep it simple at this peaceful and centrally-...,259
neighborhood_overview,48031,37286,"• CW Austin Resort is in Austin, Texas.",116
picture_url,86294,84095,https://a0.muscache.com/pictures/miso/Hosting-...,22
host_url,86295,43340,https://www.airbnb.com/users/show/107434423,1263
host_name,86291,13365,Blueground,1263
host_since,86291,5283,2016-12-16,1279


# Data cleansing

## Adjust data types

In [None]:
# Numeric features
nm_features = ""