In [211]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score

In [212]:
df = pd.read_csv("data/AB_NYC_2019.csv")
df = df[['neighbourhood_group','room_type', 'latitude','longitude','price','minimum_nights','number_of_reviews','reviews_per_month','calculated_host_listings_count', 'availability_365']]

In [213]:
df.fillna(0, inplace=True)

### Question 1

In [214]:
df['neighbourhood_group'].value_counts()

Manhattan        21661
Brooklyn         20104
Queens            5666
Bronx             1091
Staten Island      373
Name: neighbourhood_group, dtype: int64

In [215]:
from sklearn.model_selection import train_test_split
X = df.drop(['price'], axis=1)
y = df['price']

df_train, df_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
df_train, df_val, y_train, y_val = train_test_split(df_train, y_train, test_size=0.25, random_state=42)

### Question 2

In [216]:
df_train.corr()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.080301,0.027441,-0.006246,-0.007159,0.019375,-0.005891
longitude,0.080301,1.0,-0.06066,0.055084,0.134642,-0.117041,0.083666
minimum_nights,0.027441,-0.06066,1.0,-0.07602,-0.120703,0.118647,0.138901
number_of_reviews,-0.006246,0.055084,-0.07602,1.0,0.590374,-0.073167,0.174477
reviews_per_month,-0.007159,0.134642,-0.120703,0.590374,1.0,-0.048767,0.165376
calculated_host_listings_count,0.019375,-0.117041,0.118647,-0.073167,-0.048767,1.0,0.225913
availability_365,-0.005891,0.083666,0.138901,0.174477,0.165376,0.225913,1.0


In [217]:
# y = (y >=152)
y_train = (y_train >= 152)
y_test = (y_test >=152)
y_val = (y_val >=152)
df_train['above_average'] = y_train.astype(int)

### Question 3

In [219]:
def calculate_mi(series):
    return mutual_info_score(series, df_train.above_average)

In [220]:
categorical = ['neighbourhood_group','room_type']

df_mi = df_train[categorical].apply(calculate_mi)

In [221]:
df_mi

neighbourhood_group    0.046506
room_type              0.143226
dtype: float64

### Question 4

In [222]:
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer

In [223]:
numerical = ['latitude','longitude','minimum_nights','number_of_reviews','reviews_per_month','calculated_host_listings_count', 'availability_365']
train_dict = df_train[categorical + numerical].to_dict(orient='records')
train_dict[0]

{'neighbourhood_group': 'Brooklyn',
 'room_type': 'Entire home/apt',
 'latitude': 40.7276,
 'longitude': -73.94495,
 'minimum_nights': 3,
 'number_of_reviews': 29,
 'reviews_per_month': 0.7,
 'calculated_host_listings_count': 13,
 'availability_365': 50}

In [224]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

DictVectorizer(sparse=False)

In [225]:
model = LogisticRegression(solver='liblinear', C=1.0, random_state=42)

In [226]:
y_train = df_train.above_average
y_train

13575    0
48476    0
44499    0
17382    0
14638    0
        ..
13198    0
14583    0
6168     1
12248    0
20523    0
Name: above_average, Length: 29337, dtype: int64

In [227]:
X_train = dv.transform(train_dict)

In [228]:
dv.get_feature_names()

['availability_365',
 'calculated_host_listings_count',
 'latitude',
 'longitude',
 'minimum_nights',
 'neighbourhood_group=Bronx',
 'neighbourhood_group=Brooklyn',
 'neighbourhood_group=Manhattan',
 'neighbourhood_group=Queens',
 'neighbourhood_group=Staten Island',
 'number_of_reviews',
 'reviews_per_month',
 'room_type=Entire home/apt',
 'room_type=Private room',
 'room_type=Shared room']

In [229]:
X_train


array([[ 50.     ,  13.     ,  40.7276 , ...,   1.     ,   0.     ,
          0.     ],
       [  7.     ,   1.     ,  40.70847, ...,   0.     ,   1.     ,
          0.     ],
       [  0.     ,   1.     ,  40.83149, ...,   1.     ,   0.     ,
          0.     ],
       ...,
       [ 88.     ,   1.     ,  40.79994, ...,   0.     ,   1.     ,
          0.     ],
       [  0.     ,   1.     ,  40.69585, ...,   0.     ,   1.     ,
          0.     ],
       [281.     ,   2.     ,  40.64438, ...,   1.     ,   0.     ,
          0.     ]])

In [230]:
type(y_train)

pandas.core.series.Series

In [231]:
y_train

13575    0
48476    0
44499    0
17382    0
14638    0
        ..
13198    0
14583    0
6168     1
12248    0
20523    0
Name: above_average, Length: 29337, dtype: int64

In [232]:
model.fit(X_train, y_train)

LogisticRegression(random_state=42, solver='liblinear')

In [233]:
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)
model.predict_proba(X_val)

array([[0.97118671, 0.02881329],
       [0.40396767, 0.59603233],
       [0.57352375, 0.42647625],
       ...,
       [0.88636526, 0.11363474],
       [0.96547976, 0.03452024],
       [0.47019858, 0.52980142]])

In [234]:
y_pred = model.predict_proba(X_val)[:, 1]

In [235]:
y_pred

array([0.02881329, 0.59603233, 0.42647625, ..., 0.11363474, 0.03452024,
       0.52980142])

In [236]:
price_above_average =  y_pred > 0.5

In [237]:
accuracy_all_feature = (y_val == price_above_average).mean()
accuracy_all_feature

0.7904693731465385

### Question 5

In [238]:
features = categorical + numerical
features

['neighbourhood_group',
 'room_type',
 'latitude',
 'longitude',
 'minimum_nights',
 'number_of_reviews',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365']

In [240]:
for feature in features:
    features_it = [a for a in features if a!=feature]
    model_it = LogisticRegression(solver='liblinear', C=1.0, random_state=42)
    train_dict_it = df_train[features_it].to_dict(orient='records')
    X_train_it = dv.transform(train_dict_it)
    model_it.fit(X_train_it, y_train)
    
    val_dict = df_val[features_it].to_dict(orient='records')
    X_val_it = dv.transform(val_dict)
    
    y_pred = model.predict_proba(X_val_it)[:, 1]
    price_above_average =  y_pred > 0.5
    accuracy = (y_val == price_above_average).mean()
    diff = abs(accuracy - accuracy_all_feature)
    print(feature, diff)
    
    

neighbourhood_group 0.07792207792207795
room_type 0.08763677267614278
latitude 0.48389405869720836
longitude 0.0970446875958687
minimum_nights 0.0010225994477963685
number_of_reviews 0.0021474588403722628
reviews_per_month 0.0008180795582370948
calculated_host_listings_count 0.0006135596686777101
availability_365 0.004908477349422236


### Question 6

In [267]:
alphas = [0, 0.01, 0.1, 1, 10]

In [268]:
df = pd.read_csv("data/AB_NYC_2019.csv")
df = df[['neighbourhood_group','room_type', 'latitude','longitude','price','minimum_nights','number_of_reviews','reviews_per_month','calculated_host_listings_count', 'availability_365']]
df.fillna(0, inplace=True)

In [273]:
df['price_log'] = np.log1p(1 + df['price'])

In [274]:
df[df.price ==0]

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,price_log
23161,Brooklyn,Private room,40.69023,-73.95428,0,4,1,0.05,4,28,0.693147
25433,Bronx,Private room,40.83296,-73.88668,0,2,55,2.56,4,127,0.693147
25634,Brooklyn,Private room,40.69467,-73.92433,0,2,16,0.71,5,0,0.693147
25753,Brooklyn,Private room,40.72462,-73.94072,0,2,12,0.53,2,0,0.693147
25778,Brooklyn,Entire home/apt,40.70838,-73.94645,0,5,3,0.15,1,73,0.693147
25794,Brooklyn,Private room,40.68173,-73.91342,0,1,93,4.28,6,176,0.693147
25795,Brooklyn,Private room,40.68279,-73.9117,0,1,95,4.37,6,232,0.693147
25796,Brooklyn,Private room,40.68258,-73.91284,0,1,95,4.35,6,222,0.693147
26259,Manhattan,Entire home/apt,40.75091,-73.97597,0,3,0,0.0,1,0,0.693147
26841,Brooklyn,Shared room,40.69211,-73.9067,0,30,2,0.11,6,333,0.693147


In [275]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [277]:

X = df.drop(['price', 'price_log'], axis=1)
y = df['price_log']

column_trans = ColumnTransformer(
    [
        ("onehot_categorical", OneHotEncoder(),
            categorical),
    ],
    remainder="passthrough",
)

# Fit and Transform
X = column_trans.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [279]:

for alpha in alphas:
    model_reg = Ridge(alpha=alpha)
    model_reg.fit(X_train, y_train)
    y_pred = model_reg.predict(X_val)
    
    rmse = np.sqrt(mean_squared_error(y_val,y_pred))
    print(alpha, round(rmse,3))

0 0.493
0.01 0.493
0.1 0.493
1 0.493
10 0.493
