In [16]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [46]:
# Source CSV, given as a relative path (resolved against the working directory).
DATA_PATH = 'AB_NYC_2019.csv'
df = pd.read_csv(DATA_PATH)

In [47]:
# Preview the first rows of the freshly loaded table.
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [48]:
# Normalise the header: lower-case, with spaces turned into underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Columns stored with the `object` dtype are treated as text columns.
is_object_column = df.dtypes == 'object'
categorical_columns = list(df.dtypes[is_object_column].index)

# Apply the same lower-case / underscore normalisation to every text value.
df[categorical_columns] = df[categorical_columns].apply(
    lambda column: column.str.lower().str.replace(' ', '_')
)


In [49]:
# Columns kept for the rest of the notebook (features plus the `price` target).
base = [
    'neighbourhood_group',
    'room_type',
    'latitude',
    'longitude',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

In [50]:
# Count missing values per selected column.
df[base].isna().sum()

neighbourhood_group                   0
room_type                             0
latitude                              0
longitude                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [51]:
# Keep only the selected columns; copy so later edits do not touch the loaded frame.
df = df.loc[:, base].copy()

In [52]:
# Replace missing review rates with 0.
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)

In [53]:
# Re-check missing values per column after the fill.
df.isna().sum()

neighbourhood_group               0
room_type                         0
latitude                          0
longitude                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

In [54]:
# Number of listings (non-null prices) per neighbourhood group.
df.groupby('neighbourhood_group')['price'].agg(['count'])

Unnamed: 0_level_0,count
neighbourhood_group,Unnamed: 1_level_1
bronx,1091
brooklyn,20104
manhattan,21661
queens,5666
staten_island,373


Setting up the validation framework

In [55]:
from sklearn.model_selection import train_test_split

In [91]:
# 60/20/20 split: hold out 20% for testing, then take 25% of the remaining
# 80% (i.e. 20% of the whole) for validation. Both splits share one seed.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=42,
)

In [92]:
# Row counts of the train / validation / test partitions.
tuple(len(part) for part in (df_train, df_val, df_test))

(29337, 9779, 9779)

In [93]:
# Discard the shuffled original row labels so each partition is indexed 0..n-1.
df_train, df_val, df_test = (
    part.reset_index(drop=True)
    for part in (df_train, df_val, df_test)
)

In [94]:
# Move the target out of each feature frame: `pop` returns the `price` column
# and removes it from the frame in a single step.
y_train = df_train.pop('price').values
y_val = df_val.pop('price').values
y_test = df_test.pop('price').values

In [95]:
# Peek at the combined train+validation frame; it still carries `price`
# and its original (non-reset) row labels.
df_full_train.head()

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
32645,brooklyn,entire_home/apt,40.71577,-73.9553,295,3,11,0.87,1,1
23615,manhattan,private_room,40.84917,-73.94048,70,2,2,0.16,1,0
31183,brooklyn,private_room,40.68993,-73.95947,58,2,0,0.0,2,0
29260,brooklyn,entire_home/apt,40.68427,-73.93118,75,3,87,4.91,1,267
7275,queens,private_room,40.74705,-73.89564,38,5,13,0.25,1,0


Feature importance: Correlation

In [129]:
# Numeric feature columns (the `price` target is deliberately not listed).
numerical = [
    # location
    'latitude', 'longitude',
    # booking / review activity
    'minimum_nights', 'number_of_reviews', 'reviews_per_month',
    # host and availability
    'calculated_host_listings_count', 'availability_365',
]

In [130]:
# Absolute Pearson correlation of every numerical feature with the price.
# `price` has already been removed from df_train, so the target is rebuilt
# from y_train; reusing df_train's index keeps features and target row-aligned
# (corrwith aligns on index labels before correlating).
price_train = pd.Series(y_train, index=df_train.index, name='price')
df_train[numerical].corrwith(price_train).abs()

minimum_nights                    0.030181
number_of_reviews                 0.053513
reviews_per_month                 0.058131
calculated_host_listings_count    0.172137
availability_365                  0.104634
dtype: float64

In [131]:
# Pairwise Pearson correlation matrix of the numerical features.
corrM = df_train.loc[:, numerical].corr(method='pearson')
corrM

Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
minimum_nights,1.0,-0.07602,-0.120703,0.118647,0.138901
number_of_reviews,-0.07602,1.0,0.590374,-0.073167,0.174477
reviews_per_month,-0.120703,0.590374,1.0,-0.048767,0.165376
calculated_host_listings_count,0.118647,-0.073167,-0.048767,1.0,0.225913
availability_365,0.138901,0.174477,0.165376,0.225913,1.0


In [132]:
# Binary target: 1 where the training price is at least 152, otherwise 0.
# NOTE(review): 152 is presumably the (rounded) average price — confirm.
is_above_threshold = y_train >= 152
above_average = is_above_threshold.astype(int)
above_average

array([0, 0, 0, ..., 1, 0, 0])

Feature importance: Mutual information

In [133]:
from sklearn.metrics import mutual_info_score

In [134]:
# Categorical feature columns.
categorical = ['neighbourhood_group', 'room_type']

In [135]:
def mutual_info_price_score(series):
    """Return the mutual information between `series` and the module-level
    `above_average` labels, rounded to two decimal places."""
    score = mutual_info_score(series, above_average)
    return round(score, 2)

In [136]:
# Score each categorical column against the binary price target,
# then list the columns from most to least informative.
mi = pd.Series({
    column: mutual_info_price_score(df_train[column])
    for column in categorical
})
mi.sort_values(ascending=False)

room_type              0.14
neighbourhood_group    0.05
dtype: float64

One-hot encoding

In [137]:
from sklearn.feature_extraction import DictVectorizer

In [138]:
# Both partitions are encoded from the same ordered set of feature columns.
feature_columns = categorical + numerical

# One record (dict) per row, as DictVectorizer expects.
train_dict = df_train[feature_columns].to_dict(orient='records')
val_dict = df_val[feature_columns].to_dict(orient='records')

# Fit the encoder on the training records only; validation is just transformed.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

Logistic regression

In [139]:
from sklearn.linear_model import LogisticRegression

In [140]:
# Logistic regression using the L-BFGS solver with a fixed seed.
model = LogisticRegression(
    solver='lbfgs',
    C=1.0,
    random_state=42,
)

In [None]:
# Fit on the binary `above_average` labels (1 where price >= 152), not on
# y_train: y_train holds the raw prices, which a classifier would treat as a
# separate class for every distinct price value.
model.fit(X_train, above_average)