In [1]:
data_link = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv'

In [2]:
!wget $data_link

--2021-09-26 17:49:31--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7077973 (6.8M) [text/plain]
Saving to: ‘AB_NYC_2019.csv.2’


2021-09-26 17:49:32 (10.9 MB/s) - ‘AB_NYC_2019.csv.2’ saved [7077973/7077973]



In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Features

In [4]:
features = ['neighbourhood_group','room_type','latitude','longitude','price','minimum_nights','number_of_reviews','reviews_per_month','calculated_host_listings_count','availability_365']

In [5]:
data = pd.read_csv('AB_NYC_2019.csv')
data = data[features].copy()
data.head(2)

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,Private room,40.64749,-73.97237,149,1,9,0.21,6,365
1,Manhattan,Entire home/apt,40.75362,-73.98377,225,1,45,0.38,2,355


In [6]:
data.isnull().sum()

neighbourhood_group                   0
room_type                             0
latitude                              0
longitude                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [7]:
data['reviews_per_month'] = data.reviews_per_month.fillna(0)

In [8]:
data.isnull().sum()

neighbourhood_group               0
room_type                         0
latitude                          0
longitude                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

## Question 1

In [9]:
# What is the most frequent observation (mode) for the column 'neighbourhood_group'
data.neighbourhood_group.mode()

0    Manhattan
dtype: object

## split the data

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Split your data in train/val/test sets, with 60%/20%/20% distribution.

In [12]:
full_train, X_test, full_train_y, y_test = train_test_split(data, data.price, test_size=0.2, random_state=42)

In [13]:
X_train, X_val, y_train, y_val = train_test_split(full_train, full_train_y, test_size=0.25, random_state=42)

In [14]:
X_train.shape, X_val.shape, X_test.shape

((29337, 10), (9779, 10), (9779, 10))

In [15]:
del X_train['price']
del X_val['price']
del X_test['price']

## Question 2

In [16]:
# Create the correlation matrix for the numerical features of your train dataset

In [17]:
numerical_cols = list(X_train.dtypes[X_train.dtypes != 'object'].index)
numerical_cols

['latitude',
 'longitude',
 'minimum_nights',
 'number_of_reviews',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365']

In [18]:
X_train[numerical_cols].corr()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.080301,0.027441,-0.006246,-0.007159,0.019375,-0.005891
longitude,0.080301,1.0,-0.06066,0.055084,0.134642,-0.117041,0.083666
minimum_nights,0.027441,-0.06066,1.0,-0.07602,-0.120703,0.118647,0.138901
number_of_reviews,-0.006246,0.055084,-0.07602,1.0,0.590374,-0.073167,0.174477
reviews_per_month,-0.007159,0.134642,-0.120703,0.590374,1.0,-0.048767,0.165376
calculated_host_listings_count,0.019375,-0.117041,0.118647,-0.073167,-0.048767,1.0,0.225913
availability_365,-0.005891,0.083666,0.138901,0.174477,0.165376,0.225913,1.0


What are the two features that have the biggest correlation in this dataset?

In [19]:
# 'number_of_reviews', and 'reviews_per_month',

## Make price binary

In [20]:
above_average_train = (y_train >= 152).astype(int)
above_average_test = (y_test >= 152).astype(int)
above_average_val = (y_val >= 152).astype(int)

## Question 3

In [21]:
from sklearn.metrics import mutual_info_score, accuracy_score

In [22]:
categorical_cols = list(X_train.dtypes[X_train.dtypes == 'object'].index)
categorical_cols

['neighbourhood_group', 'room_type']

In [23]:
for c in categorical_cols:
    mi = mutual_info_score(X_train[c].values, above_average_train)
    print(f'MI between {c} and price: {round(mi, 2)}')

MI between neighbourhood_group and price: 0.05
MI between room_type and price: 0.14


## Question 4

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
import warnings
warnings.filterwarnings('ignore')

In [25]:
model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)

In [26]:
train_dicts = X_train[categorical_cols + numerical_cols].to_dict(orient='records')
test_dicts = X_test[categorical_cols + numerical_cols].to_dict(orient='records')
val_dicts = X_val[categorical_cols + numerical_cols].to_dict(orient='records')

In [27]:
dv = DictVectorizer(sparse=False)

In [28]:
dv.fit(train_dicts)

DictVectorizer(sparse=False)

In [29]:
train = dv.fit_transform(train_dicts)
test = dv.transform(test_dicts)
val = dv.transform(val_dicts)

In [30]:
logReg = model.fit(train, above_average_train)

In [31]:
preds = logReg.predict(val)

In [32]:
original_score = accuracy_score(preds, above_average_val)

In [33]:
print(f'Validation accuracy: {round(original_score, 2)}')

Validation accuracy: 0.79


In [34]:
# lr.score(val, above_average_val)

## Question 5

In [35]:
cols =  categorical_cols + numerical_cols
cols

['neighbourhood_group',
 'room_type',
 'latitude',
 'longitude',
 'minimum_nights',
 'number_of_reviews',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365']

In [36]:
scores = {}
for i in range(len(cols)):
    model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
    tmp = cols[:i]+cols[i+1:]
    
    X = dv.fit_transform(X_train[tmp].to_dict(orient='records'))
    Xv = dv.transform(X_val[tmp].to_dict(orient='records'))
    
    lr = model.fit(X, above_average_train)
    score = lr.score(Xv, above_average_val)
    score = round(score, 3)
    
    diff = original_score - score
    
    scores['without_'+cols[i]] = diff

In [37]:
df_scores = pd.DataFrame(data=scores, index=['difference'])
df_scores.T

Unnamed: 0,difference
without_neighbourhood_group,0.035686
without_room_type,0.070686
without_latitude,0.000686
without_longitude,-0.000314
without_minimum_nights,0.000686
without_number_of_reviews,-0.000314
without_reviews_per_month,0.001686
without_calculated_host_listings_count,-0.000314
without_availability_365,0.004686


## Question 6

In [115]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [116]:
y_train_log = np.log1p(y_train.values)
y_test_log = np.log1p(y_test.values)
y_val_log = np.log1p(y_val.values)

In [119]:
alphas = [0, 0.01, 0.1, 1, 10]
rmses = {}
for alpha in alphas:
    linReg = Ridge(alpha=alpha, random_state=42)
    model_lr = linReg.fit(train, y_train_log)
    y_pred = model_lr.predict(val)
    mse = mean_squared_error(y_val_log, np.expm1(y_pred))
    rmse = np.sqrt(mse)
    rmses[alpha] = round(rmse, 3)

In [122]:
pd.DataFrame(data=rmses, index=['rmse']).T

Unnamed: 0,rmse
0.0,135.84
0.01,135.674
0.1,135.666
1.0,135.593
10.0,135.1


In [123]:
# alpha = 10

array([4.60517019, 4.06044301, 4.26267988, ..., 5.70378247, 4.18965474,
       4.53259949])