# 1 Data Cleaning

# 0) Import Modules

In [1]:
import os
from zipfile import ZipFile

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder




# 1) Load Data

In [2]:
# Move into the project root so the relative 'data/' paths used below resolve.
# The location can be overridden with the DSGA1001_PROJECT_DIR environment
# variable; when it is unset, the original author's checkout path is used.
PROJECT_DIR = os.environ.get(
    "DSGA1001_PROJECT_DIR",
    "/Users/anhthyngo/Documents/NYU/Fall 2019/DS-GA 1001/project/ds-ga1001/",
)
os.chdir(PROJECT_DIR)

In [3]:
# Locate the zipped listings file inside the 'data' folder of the working directory.
cwd = os.getcwd()
datadir = f"{cwd}/data/"
listings = f"{datadir}listings.csv.zip"

In [4]:
## Read Data
# Lift pandas' display limits so that every row and column is shown when a
# frame is displayed.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# The context managers close both the archive and the member handle once the
# CSV has been parsed (previously the ZipFile was left open).
with ZipFile(listings) as zf, zf.open("listings.csv") as listings_csv:
    data = pd.read_csv(
        listings_csv,
        parse_dates=['host_since', 'first_review', 'last_review'],
        low_memory=False,
    )


In [5]:
# Display the first row to see which columns the raw listings file provides.
data.head(1)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,interaction,house_rules,thumbnail_url,medium_url,picture_url,xl_picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,street,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,city,state,zipcode,market,smart_location,country_code,country,latitude,longitude,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,square_feet,price,weekly_price,monthly_price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,number_of_reviews_ltm,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,3647,https://www.airbnb.com/rooms/3647,20190912153101,2019-09-13,THE VILLAGE OF HARLEM....NEW YORK !,,WELCOME TO OUR INTERNATIONAL URBAN COMMUNITY T...,WELCOME TO OUR INTERNATIONAL URBAN COMMUNITY T...,none,,,,,,Upon arrival please have a legibile copy of yo...,,,https://a0.muscache.com/im/pictures/838341/9b3...,,4632,https://www.airbnb.com/users/show/4632,Elisabeth,2008-11-25,"New York, New York, United States",Make Up Artist National/ (Website hidden by Ai...,a few days or more,0%,,f,https://a0.muscache.com/im/users/4632/profile_...,https://a0.muscache.com/im/users/4632/profile_...,Harlem,1.0,1.0,"['email', 'phone', 'google', 'reviews', 'jumio...",t,t,"New York, NY, United States",Harlem,Harlem,Manhattan,New York,NY,10027,New York,"New York, NY",US,United States,40.80902,-73.9419,t,Apartment,Private room,2,1.0,1.0,1.0,Pull-out Sofa,"{""Cable TV"",Internet,Wifi,""Air conditioning"",K...",,$150.00,,,$200.00,$75.00,2,$20.00,3,7,3,3,7,7,3.0,7.0,37 months ago,t,30,60,90,365,2019-09-13,0,0,NaT,NaT,,,,,,,,f,,,f,f,strict_14_with_grace_period,t,t,1,0,1,0,


# 2) Cleaning Data

### 2.1 Keep only relevant columns

In [6]:
# Columns retained for the rest of the cleaning; everything else is discarded.
relevant_columns = [
    # location
    'neighbourhood_cleansed', 'city', 'zipcode', 'market', 'latitude', 'longitude',
    # property description
    'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds',
    'bed_type',
    # pricing
    'price', 'weekly_price', 'monthly_price', 'guests_included', 'extra_people',
    # booking rules and calendar
    'minimum_nights', 'maximum_nights', 'calendar_updated',
    'availability_30', 'availability_60', 'availability_90', 'availability_365',
    # reviews
    'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location', 'review_scores_value',
]
data = data[relevant_columns]

In [7]:
# Report the current size of the frame.
n_rows, n_cols = data.shape
print(f"There are {n_rows} rows and {n_cols} columns.")

There are 48377 rows and 33 columns.


### 2.2 Handle Missing Values
We will check the dataset for missing values. If more than 50% of a column's values are missing, we will drop the column.

In [8]:
# Share of missing values per column, in percent.
missing_pct = data.isnull().sum() / len(data) * 100
# Columns with more than half of their values missing are reported, then removed in one go.
mostly_missing = missing_pct[missing_pct > 50]
for col_name, pct in mostly_missing.items():
    print("%0.2f %% observations missing for %s." % (float(pct), col_name))
data = data.drop(columns=list(mostly_missing.index))
        

88.11 % observations missing for weekly_price.
89.66 % observations missing for monthly_price.


In [9]:
# Report the current size of the frame.
row_count, column_count = data.shape
print(f"There are {row_count} rows and {column_count} columns.")

There are 48377 rows and 31 columns.


If only a few observations (fewer than 1%) are missing for a specific feature, we will simply drop the rows with the missing entries.

In [10]:
# List every column that has at least one missing value, with its share of missing rows.
missing_pct = data.isnull().sum() / len(data) * 100
for col_name, pct in missing_pct[missing_pct > 0].items():
    print("%0.2f %% observations missing for %s." % (float(pct), col_name))
    

0.14 % observations missing for city.
0.97 % observations missing for zipcode.
0.21 % observations missing for market.
0.10 % observations missing for bathrooms.
0.08 % observations missing for bedrooms.
0.07 % observations missing for beds.
21.83 % observations missing for review_scores_rating.
21.91 % observations missing for review_scores_accuracy.
21.88 % observations missing for review_scores_cleanliness.
21.94 % observations missing for review_scores_checkin.
21.90 % observations missing for review_scores_communication.
21.95 % observations missing for review_scores_location.
21.95 % observations missing for review_scores_value.


In [11]:
# For columns missing in fewer than 1% of rows, drop the affected rows.
# The share is recomputed for each column on the already-filtered frame, so
# rows removed for one column lower the count seen by the following columns.
for col_name in data.columns.tolist():
    pct_missing = data[col_name].isna().sum() / len(data) * 100
    if 0 < pct_missing < 1:
        print("%0.2f %% observations missing for %s." % (float(pct_missing), col_name))
        data = data.dropna(subset=[col_name])

0.14 % observations missing for city.
0.96 % observations missing for zipcode.
0.21 % observations missing for market.
0.10 % observations missing for bathrooms.
0.08 % observations missing for bedrooms.
0.04 % observations missing for beds.


In [12]:
# Report the current size of the frame.
rows_left, cols_left = data.shape
print(f"There are {rows_left} rows and {cols_left} columns.")

There are 47639 rows and 31 columns.


The rest of these features are numerical and we will impute them later.

In [13]:
# Columns that still have more than 1% of their values missing.
remaining_pct = data.isnull().sum() / len(data) * 100
for col_name, pct in remaining_pct[remaining_pct > 1].items():
    print("%0.2f %% observations missing for %s." % (float(pct), col_name))
    

21.78 % observations missing for review_scores_rating.
21.86 % observations missing for review_scores_accuracy.
21.83 % observations missing for review_scores_cleanliness.
21.90 % observations missing for review_scores_checkin.
21.86 % observations missing for review_scores_communication.
21.91 % observations missing for review_scores_location.
21.90 % observations missing for review_scores_value.


### 2.3 Convert Currency features
Currency features are currently stored as strings, so we will convert them to floats. Currency features with null values will be imputed with the median. We pick the median because `security_deposit` and `cleaning_fee` are extremely left-skewed, and the median is therefore a better choice for dealing with outliers.

In [14]:
# Currency columns arrive as strings such as "$150.00"; convert them to floats.
money_attributes = ['extra_people',  'price']
# Raw strings are used for the patterns: '\$' in a normal string literal is an
# invalid escape sequence that Python warns about.
money_floats = (
    data[money_attributes]
    .replace(r'[\$,)]', '', regex=True)  # strip '$', ',' and ')'
    .replace(r'[(]', '-', regex=True)    # turn '(' into a minus sign
    .astype(float)
)

money_floats.head()

Unnamed: 0,extra_people,price
0,20.0,150.0
1,0.0,89.0
3,100.0,200.0
5,12.0,79.0
6,0.0,79.0


In [15]:
# Replace the string currency columns with their float versions, which end up
# as the last two columns of the frame.
string_currency_cols = ['extra_people', 'price']
data_without_currency = data.drop(columns=string_currency_cols)
data = pd.concat([data_without_currency, money_floats], axis=1)
data.head()


Unnamed: 0,neighbourhood_cleansed,city,zipcode,market,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,guests_included,minimum_nights,maximum_nights,calendar_updated,availability_30,availability_60,availability_90,availability_365,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,extra_people,price
0,Harlem,New York,10027,New York,40.80902,-73.9419,Apartment,Private room,2,1.0,1.0,1.0,Pull-out Sofa,2,3,7,37 months ago,30,60,90,365,0,,,,,,,,20.0,150.0
1,Clinton Hill,Brooklyn,11238,New York,40.68514,-73.95976,Guest suite,Entire home/apt,3,1.0,1.0,4.0,Real Bed,1,1,730,3 days ago,1,3,8,192,279,90.0,9.0,9.0,10.0,9.0,10.0,9.0,0.0,89.0
3,Murray Hill,New York,10016,New York,40.74767,-73.975,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,2,3,21,today,3,3,13,13,75,89.0,10.0,9.0,10.0,10.0,9.0,9.0,100.0,200.0
5,Hell's Kitchen,New York,10019,New York,40.76489,-73.98493,Apartment,Private room,2,1.0,1.0,1.0,Real Bed,1,2,14,a week ago,1,5,10,246,443,83.0,8.0,7.0,9.0,9.0,10.0,8.0,12.0,79.0
6,Upper West Side,New York,10025,New York,40.80178,-73.96723,Apartment,Private room,1,1.0,1.0,1.0,Real Bed,1,2,14,26 months ago,0,0,0,0,118,98.0,10.0,10.0,10.0,10.0,10.0,10.0,0.0,79.0


In [16]:
# Normalise zip codes: remove any "NY " text (literal match, not a regex) and
# keep only the first five characters.
zip5 = data['zipcode'].str.replace('NY ', '', regex=False).str[:5]
# Rows whose zip code is a single blank are discarded before the integer cast.
has_zip = zip5 != ' '
# .loc + assign builds a new frame, so the column is never written onto a
# filtered slice (which raised SettingWithCopyWarning).
data = data.loc[has_zip].assign(zipcode=zip5[has_zip].astype(int))

### 2.4 Convert Binary Columns

Binary columns are labeled as either "True" or "False". Let's change them to the dummy values 1 and 0.

### 2.6 Dealing with Missing Values pt. 2
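OK, let's look again at the columns that still have missing values. Note that the cells below only list these columns; no imputation is performed here, and the data is later saved as `airbnb_no_impute.csv.gz`.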

In [17]:
# Columns that still contain missing values, with their share of missing rows.
still_missing_pct = data.isnull().sum() / len(data) * 100
for col_name, pct in still_missing_pct[still_missing_pct > 0].items():
    print("%0.2f %% observations missing for %s." % (float(pct), col_name))


21.78 % observations missing for review_scores_rating.
21.86 % observations missing for review_scores_accuracy.
21.83 % observations missing for review_scores_cleanliness.
21.90 % observations missing for review_scores_checkin.
21.86 % observations missing for review_scores_communication.
21.91 % observations missing for review_scores_location.
21.90 % observations missing for review_scores_value.


In [18]:
# Same check as the previous cell: share of missing rows for each incomplete column.
pct_by_column = data.isnull().sum() / len(data) * 100
incomplete = pct_by_column[pct_by_column > 0]
for col_name in incomplete.index:
    print("%0.2f %% observations missing for %s." % (float(incomplete[col_name]), col_name))

21.78 % observations missing for review_scores_rating.
21.86 % observations missing for review_scores_accuracy.
21.83 % observations missing for review_scores_cleanliness.
21.90 % observations missing for review_scores_checkin.
21.86 % observations missing for review_scores_communication.
21.91 % observations missing for review_scores_location.
21.90 % observations missing for review_scores_value.


### 2.7 One Hot Encoding
We will now look at the unique values in each string column.
If there are too many distinct values (i.e., more than 40), we will delete the column to avoid an explosion in the number of columns.
We will make an exception for `neighbourhood_cleansed`, as we believe it has strong predictive power; `property_type` (36 distinct values) stays below the threshold and is kept as well.

In [19]:
# Number of distinct values in each string (object) column.
strColumn = data.select_dtypes(include='object')
for name in strColumn.columns:
    distinct_count = data[name].nunique()
    print(f"{name} {distinct_count}")
    

neighbourhood_cleansed 224
city 309
market 16
property_type 36
room_type 4
bed_type 5
calendar_updated 91


In [20]:
# String columns with more than 40 distinct values are dropped, except
# 'neighbourhood_cleansed', which is always kept.
strColumn = data.select_dtypes(include='object')

columnDrop = [
    name
    for name in strColumn
    if name != 'neighbourhood_cleansed' and data[name].nunique() > 40
]
print("We will drop these columns to avoid column explosion",columnDrop)
data = data.drop(columns=columnDrop)
print(data.shape)


We will drop these columns to avoid column explosion ['city', 'calendar_updated']
(47638, 29)


In [21]:
# Distinct-value counts of the string columns that remain.
strColumn = data.select_dtypes(include='object')
for name in strColumn.columns:
    print(f"{name} {data[name].nunique()}")
    

neighbourhood_cleansed 224
market 16
property_type 36
room_type 4
bed_type 5


`host_response_time` is a tricky one. We will impute the `NaN` values with "not provided" and then create a dummy.

In [22]:
# Display the first rows of the frame as it stands before one-hot encoding.
data.head()

Unnamed: 0,neighbourhood_cleansed,zipcode,market,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,guests_included,minimum_nights,maximum_nights,availability_30,availability_60,availability_90,availability_365,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,extra_people,price
0,Harlem,10027,New York,40.80902,-73.9419,Apartment,Private room,2,1.0,1.0,1.0,Pull-out Sofa,2,3,7,30,60,90,365,0,,,,,,,,20.0,150.0
1,Clinton Hill,11238,New York,40.68514,-73.95976,Guest suite,Entire home/apt,3,1.0,1.0,4.0,Real Bed,1,1,730,1,3,8,192,279,90.0,9.0,9.0,10.0,9.0,10.0,9.0,0.0,89.0
3,Murray Hill,10016,New York,40.74767,-73.975,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,2,3,21,3,3,13,13,75,89.0,10.0,9.0,10.0,10.0,9.0,9.0,100.0,200.0
5,Hell's Kitchen,10019,New York,40.76489,-73.98493,Apartment,Private room,2,1.0,1.0,1.0,Real Bed,1,2,14,1,5,10,246,443,83.0,8.0,7.0,9.0,9.0,10.0,8.0,12.0,79.0
6,Upper West Side,10025,New York,40.80178,-73.96723,Apartment,Private room,1,1.0,1.0,1.0,Real Bed,1,2,14,0,0,0,0,118,98.0,10.0,10.0,10.0,10.0,10.0,10.0,0.0,79.0


In [23]:
# One-hot encode the remaining string (object) columns.
strColumn = data.select_dtypes(include='object')

# Columns with a single distinct value are reported with a "1" marker and left
# untouched; every other string column is queued for encoding.
columns_to_encode = []
for name in strColumn.columns:
    if data[name].nunique() == 1:
        print("1", name)
    else:
        print(name)
        columns_to_encode.append(name)

# A single get_dummies call replaces the repeated drop/concat inside the loop.
# Each dummy is named '<column>_<value>' and the dummies are appended after the
# untouched columns, in the order the source columns were listed.
if columns_to_encode:
    data = pd.get_dummies(data, columns=columns_to_encode, prefix_sep='_')
    

neighbourhood_cleansed
market
property_type
room_type
bed_type


In [24]:
# Display the first rows of the frame after the string columns were replaced by dummies.
data.head()

Unnamed: 0,zipcode,latitude,longitude,accommodates,bathrooms,bedrooms,beds,guests_included,minimum_nights,maximum_nights,availability_30,availability_60,availability_90,availability_365,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,extra_people,price,neighbourhood_cleansed_Allerton,neighbourhood_cleansed_Arden Heights,neighbourhood_cleansed_Arrochar,neighbourhood_cleansed_Arverne,neighbourhood_cleansed_Astoria,neighbourhood_cleansed_Bath Beach,neighbourhood_cleansed_Battery Park City,neighbourhood_cleansed_Bay Ridge,neighbourhood_cleansed_Bay Terrace,"neighbourhood_cleansed_Bay Terrace, Staten Island",neighbourhood_cleansed_Baychester,neighbourhood_cleansed_Bayside,neighbourhood_cleansed_Bayswater,neighbourhood_cleansed_Bedford-Stuyvesant,neighbourhood_cleansed_Belle Harbor,neighbourhood_cleansed_Bellerose,neighbourhood_cleansed_Belmont,neighbourhood_cleansed_Bensonhurst,neighbourhood_cleansed_Bergen Beach,neighbourhood_cleansed_Boerum Hill,neighbourhood_cleansed_Borough Park,neighbourhood_cleansed_Breezy Point,neighbourhood_cleansed_Briarwood,neighbourhood_cleansed_Brighton Beach,neighbourhood_cleansed_Bronxdale,neighbourhood_cleansed_Brooklyn Heights,neighbourhood_cleansed_Brownsville,neighbourhood_cleansed_Bull's Head,neighbourhood_cleansed_Bushwick,neighbourhood_cleansed_Cambria Heights,neighbourhood_cleansed_Canarsie,neighbourhood_cleansed_Carroll Gardens,neighbourhood_cleansed_Castle Hill,neighbourhood_cleansed_Castleton Corners,neighbourhood_cleansed_Charleston,neighbourhood_cleansed_Chelsea,neighbourhood_cleansed_Chinatown,neighbourhood_cleansed_City Island,neighbourhood_cleansed_Civic Center,neighbourhood_cleansed_Claremont Village,neighbourhood_cleansed_Clason Point,neighbourhood_cleansed_Clifton,neighbourhood_cleansed_Clinton Hill,neighbourhood_cleansed_Co-op City,neighbourhood_cleansed_Cobble 
Hill,neighbourhood_cleansed_College Point,neighbourhood_cleansed_Columbia St,neighbourhood_cleansed_Concord,neighbourhood_cleansed_Concourse,neighbourhood_cleansed_Concourse Village,neighbourhood_cleansed_Coney Island,neighbourhood_cleansed_Corona,neighbourhood_cleansed_Country Club,neighbourhood_cleansed_Crown Heights,neighbourhood_cleansed_Cypress Hills,neighbourhood_cleansed_DUMBO,neighbourhood_cleansed_Ditmars Steinway,neighbourhood_cleansed_Dongan Hills,neighbourhood_cleansed_Douglaston,neighbourhood_cleansed_Downtown Brooklyn,neighbourhood_cleansed_Dyker Heights,neighbourhood_cleansed_East Elmhurst,neighbourhood_cleansed_East Flatbush,neighbourhood_cleansed_East Harlem,neighbourhood_cleansed_East Morrisania,neighbourhood_cleansed_East New York,neighbourhood_cleansed_East Village,neighbourhood_cleansed_Eastchester,neighbourhood_cleansed_Edenwald,neighbourhood_cleansed_Edgemere,neighbourhood_cleansed_Elmhurst,neighbourhood_cleansed_Eltingville,neighbourhood_cleansed_Emerson Hill,neighbourhood_cleansed_Far Rockaway,neighbourhood_cleansed_Fieldston,neighbourhood_cleansed_Financial District,neighbourhood_cleansed_Flatbush,neighbourhood_cleansed_Flatiron District,neighbourhood_cleansed_Flatlands,neighbourhood_cleansed_Flushing,neighbourhood_cleansed_Fordham,neighbourhood_cleansed_Forest Hills,neighbourhood_cleansed_Fort Greene,neighbourhood_cleansed_Fort Hamilton,neighbourhood_cleansed_Fort Wadsworth,neighbourhood_cleansed_Fresh Meadows,neighbourhood_cleansed_Gerritsen Beach,neighbourhood_cleansed_Glendale,neighbourhood_cleansed_Gowanus,neighbourhood_cleansed_Gramercy,neighbourhood_cleansed_Graniteville,neighbourhood_cleansed_Grant City,neighbourhood_cleansed_Gravesend,neighbourhood_cleansed_Great Kills,neighbourhood_cleansed_Greenpoint,neighbourhood_cleansed_Greenwich Village,neighbourhood_cleansed_Grymes Hill,neighbourhood_cleansed_Harlem,neighbourhood_cleansed_Hell's 
Kitchen,neighbourhood_cleansed_Highbridge,neighbourhood_cleansed_Hollis,neighbourhood_cleansed_Holliswood,neighbourhood_cleansed_Howard Beach,neighbourhood_cleansed_Howland Hook,neighbourhood_cleansed_Huguenot,neighbourhood_cleansed_Hunts Point,neighbourhood_cleansed_Inwood,neighbourhood_cleansed_Jackson Heights,neighbourhood_cleansed_Jamaica,neighbourhood_cleansed_Jamaica Estates,neighbourhood_cleansed_Jamaica Hills,neighbourhood_cleansed_Kensington,neighbourhood_cleansed_Kew Gardens,neighbourhood_cleansed_Kew Gardens Hills,neighbourhood_cleansed_Kingsbridge,neighbourhood_cleansed_Kips Bay,neighbourhood_cleansed_Laurelton,neighbourhood_cleansed_Lighthouse Hill,neighbourhood_cleansed_Little Italy,neighbourhood_cleansed_Little Neck,neighbourhood_cleansed_Long Island City,neighbourhood_cleansed_Longwood,neighbourhood_cleansed_Lower East Side,neighbourhood_cleansed_Manhattan Beach,neighbourhood_cleansed_Marble Hill,neighbourhood_cleansed_Mariners Harbor,neighbourhood_cleansed_Maspeth,neighbourhood_cleansed_Melrose,neighbourhood_cleansed_Middle Village,neighbourhood_cleansed_Midland Beach,neighbourhood_cleansed_Midtown,neighbourhood_cleansed_Midwood,neighbourhood_cleansed_Mill Basin,neighbourhood_cleansed_Morningside Heights,neighbourhood_cleansed_Morris Heights,neighbourhood_cleansed_Morris Park,neighbourhood_cleansed_Morrisania,neighbourhood_cleansed_Mott Haven,neighbourhood_cleansed_Mount Eden,neighbourhood_cleansed_Mount Hope,neighbourhood_cleansed_Murray Hill,neighbourhood_cleansed_Navy Yard,neighbourhood_cleansed_Neponsit,neighbourhood_cleansed_New Brighton,neighbourhood_cleansed_New Dorp,neighbourhood_cleansed_New Dorp Beach,neighbourhood_cleansed_New Springville,neighbourhood_cleansed_NoHo,neighbourhood_cleansed_Nolita,neighbourhood_cleansed_North Riverdale,neighbourhood_cleansed_Norwood,neighbourhood_cleansed_Oakwood,neighbourhood_cleansed_Olinville,neighbourhood_cleansed_Ozone Park,neighbourhood_cleansed_Park 
Slope,neighbourhood_cleansed_Parkchester,neighbourhood_cleansed_Pelham Bay,neighbourhood_cleansed_Pelham Gardens,neighbourhood_cleansed_Port Morris,neighbourhood_cleansed_Port Richmond,neighbourhood_cleansed_Prince's Bay,neighbourhood_cleansed_Prospect Heights,neighbourhood_cleansed_Prospect-Lefferts Gardens,neighbourhood_cleansed_Queens Village,neighbourhood_cleansed_Randall Manor,neighbourhood_cleansed_Red Hook,neighbourhood_cleansed_Rego Park,neighbourhood_cleansed_Richmond Hill,neighbourhood_cleansed_Richmondtown,neighbourhood_cleansed_Ridgewood,neighbourhood_cleansed_Riverdale,neighbourhood_cleansed_Rockaway Beach,neighbourhood_cleansed_Roosevelt Island,neighbourhood_cleansed_Rosebank,neighbourhood_cleansed_Rosedale,neighbourhood_cleansed_Rossville,neighbourhood_cleansed_Schuylerville,neighbourhood_cleansed_Sea Gate,neighbourhood_cleansed_Sheepshead Bay,neighbourhood_cleansed_Shore Acres,neighbourhood_cleansed_Silver Lake,neighbourhood_cleansed_SoHo,neighbourhood_cleansed_Soundview,neighbourhood_cleansed_South Beach,neighbourhood_cleansed_South Ozone Park,neighbourhood_cleansed_South Slope,neighbourhood_cleansed_Springfield Gardens,neighbourhood_cleansed_Spuyten Duyvil,neighbourhood_cleansed_St. Albans,neighbourhood_cleansed_St. 
George,neighbourhood_cleansed_Stapleton,neighbourhood_cleansed_Stuyvesant Town,neighbourhood_cleansed_Sunnyside,neighbourhood_cleansed_Sunset Park,neighbourhood_cleansed_Theater District,neighbourhood_cleansed_Throgs Neck,neighbourhood_cleansed_Todt Hill,neighbourhood_cleansed_Tompkinsville,neighbourhood_cleansed_Tottenville,neighbourhood_cleansed_Tremont,neighbourhood_cleansed_Tribeca,neighbourhood_cleansed_Two Bridges,neighbourhood_cleansed_Unionport,neighbourhood_cleansed_University Heights,neighbourhood_cleansed_Upper East Side,neighbourhood_cleansed_Upper West Side,neighbourhood_cleansed_Van Nest,neighbourhood_cleansed_Vinegar Hill,neighbourhood_cleansed_Wakefield,neighbourhood_cleansed_Washington Heights,neighbourhood_cleansed_West Brighton,neighbourhood_cleansed_West Farms,neighbourhood_cleansed_West Village,neighbourhood_cleansed_Westchester Square,neighbourhood_cleansed_Westerleigh,neighbourhood_cleansed_Whitestone,neighbourhood_cleansed_Williamsbridge,neighbourhood_cleansed_Williamsburg,neighbourhood_cleansed_Willowbrook,neighbourhood_cleansed_Windsor Terrace,neighbourhood_cleansed_Woodhaven,neighbourhood_cleansed_Woodlawn,neighbourhood_cleansed_Woodrow,neighbourhood_cleansed_Woodside,market_Adirondacks,market_Agra,market_Atlanta,market_Boston,market_Catskills and Hudson Valley,market_Cuba,market_D.C.,market_Jamaica South Coast,market_Kyoto,"market_Lagos, NG",market_Los Angeles,market_New Orleans,market_New York,market_Other (Domestic),market_Paris,market_San Francisco,property_type_Aparthotel,property_type_Apartment,property_type_Barn,property_type_Bed and breakfast,property_type_Boat,property_type_Boutique hotel,property_type_Bungalow,property_type_Bus,property_type_Cabin,property_type_Camper/RV,property_type_Casa particular (Cuba),property_type_Castle,property_type_Cave,property_type_Condominium,property_type_Cottage,property_type_Dome house,property_type_Earth house,property_type_Farm stay,property_type_Guest 
suite,property_type_Guesthouse,property_type_Hostel,property_type_Hotel,property_type_House,property_type_Houseboat,property_type_Island,property_type_Loft,property_type_Nature lodge,property_type_Other,property_type_Resort,property_type_Serviced apartment,property_type_Tent,property_type_Timeshare,property_type_Tiny house,property_type_Townhouse,property_type_Villa,property_type_Yurt,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room,bed_type_Airbed,bed_type_Couch,bed_type_Futon,bed_type_Pull-out Sofa,bed_type_Real Bed
0,10027,40.80902,-73.9419,2,1.0,1.0,1.0,2,3,7,30,60,90,365,0,,,,,,,,20.0,150.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
1,11238,40.68514,-73.95976,3,1.0,1.0,4.0,1,1,730,1,3,8,192,279,90.0,9.0,9.0,10.0,9.0,10.0,9.0,0.0,89.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
3,10016,40.74767,-73.975,2,1.0,1.0,1.0,2,3,21,3,3,13,13,75,89.0,10.0,9.0,10.0,10.0,9.0,9.0,100.0,200.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
5,10019,40.76489,-73.98493,2,1.0,1.0,1.0,1,2,14,1,5,10,246,443,83.0,8.0,7.0,9.0,9.0,10.0,8.0,12.0,79.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
6,10025,40.80178,-73.96723,1,1.0,1.0,1.0,1,2,14,0,0,0,0,118,98.0,10.0,10.0,10.0,10.0,10.0,10.0,0.0,79.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1


In [25]:
# Write the cleaned frame, without its index, as a gzip-compressed CSV in the
# 'data' folder of the working directory.
output_path = os.getcwd() + '/data/airbnb_no_impute.csv.gz'
data.to_csv(output_path, index=False, compression='gzip')

In [26]:
# Show the names of all columns in the final frame.
data.columns.tolist()

['zipcode',
 'latitude',
 'longitude',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'guests_included',
 'minimum_nights',
 'maximum_nights',
 'availability_30',
 'availability_60',
 'availability_90',
 'availability_365',
 'number_of_reviews',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'extra_people',
 'price',
 'neighbourhood_cleansed_Allerton',
 'neighbourhood_cleansed_Arden Heights',
 'neighbourhood_cleansed_Arrochar',
 'neighbourhood_cleansed_Arverne',
 'neighbourhood_cleansed_Astoria',
 'neighbourhood_cleansed_Bath Beach',
 'neighbourhood_cleansed_Battery Park City',
 'neighbourhood_cleansed_Bay Ridge',
 'neighbourhood_cleansed_Bay Terrace',
 'neighbourhood_cleansed_Bay Terrace, Staten Island',
 'neighbourhood_cleansed_Baychester',
 'neighbourhood_cleansed_Bayside',
 'neighbourhood_cleansed_Bayswater',
 'neighbourhood_cleanse

### Done!