In [107]:
import numpy as np
import pandas as pd


In [108]:
np.__version__

'1.19.5'

In [109]:
pd.__version__

'1.1.5'

In [110]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv

--2021-09-27 14:36:11--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7077973 (6.8M) [text/plain]
Saving to: ‘AB_NYC_2019.csv.1’


2021-09-27 14:36:11 (78.9 MB/s) - ‘AB_NYC_2019.csv.1’ saved [7077973/7077973]



In [111]:
df = pd.read_csv('AB_NYC_2019.csv')
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


## Data preparation

In [112]:
# Del some columns
base = [
  'neighbourhood_group',
  'room_type',
  'latitude',
  'longitude',
  'price',
  'minimum_nights',
  'number_of_reviews',
  'reviews_per_month',
  'calculated_host_listings_count',
  'availability_365'
]

df = df[base]
df.head()

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,Private room,40.64749,-73.97237,149,1,9,0.21,6,365
1,Manhattan,Entire home/apt,40.75362,-73.98377,225,1,45,0.38,2,355
2,Manhattan,Private room,40.80902,-73.9419,150,3,0,,1,365
3,Brooklyn,Entire home/apt,40.68514,-73.95976,89,1,270,4.64,1,194
4,Manhattan,Entire home/apt,40.79851,-73.94399,80,10,9,0.1,1,0


In [113]:
# Cleaning column names and values
df.columns = df.columns.str.lower().str.replace(' ', '_')

string_columns = list(df.dtypes[df.dtypes == 'object'].index)

for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

df.head().T

Unnamed: 0,0,1,2,3,4
neighbourhood_group,brooklyn,manhattan,manhattan,brooklyn,manhattan
room_type,private_room,entire_home/apt,private_room,entire_home/apt,entire_home/apt
latitude,40.6475,40.7536,40.809,40.6851,40.7985
longitude,-73.9724,-73.9838,-73.9419,-73.9598,-73.944
price,149,225,150,89,80
minimum_nights,1,1,3,1,10
number_of_reviews,9,45,0,270,9
reviews_per_month,0.21,0.38,,4.64,0.1
calculated_host_listings_count,6,2,1,1,1
availability_365,365,355,365,194,0


In [114]:
df.dtypes
df.dtypes == 'object'

neighbourhood_group                True
room_type                          True
latitude                          False
longitude                         False
price                             False
minimum_nights                    False
number_of_reviews                 False
reviews_per_month                 False
calculated_host_listings_count    False
availability_365                  False
dtype: bool

In [115]:
df.isnull().sum()
# reviews_per_month has 10052 missing values

neighbourhood_group                   0
room_type                             0
latitude                              0
longitude                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [116]:
df.reviews_per_month = df.reviews_per_month.fillna(0)
df.isnull().sum()

neighbourhood_group               0
room_type                         0
latitude                          0
longitude                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

## Question 1


In [117]:
df.neighbourhood_group.mode()

0    manhattan
dtype: object

In [118]:
df['above_average'] = (df.price > 152).astype(int)
df.head()

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,above_average
0,brooklyn,private_room,40.64749,-73.97237,149,1,9,0.21,6,365,0
1,manhattan,entire_home/apt,40.75362,-73.98377,225,1,45,0.38,2,355,1
2,manhattan,private_room,40.80902,-73.9419,150,3,0,0.0,1,365,0
3,brooklyn,entire_home/apt,40.68514,-73.95976,89,1,270,4.64,1,194,0
4,manhattan,entire_home/apt,40.79851,-73.94399,80,10,9,0.1,1,0,0


## Split the data

In [119]:
from sklearn.model_selection import train_test_split

In [120]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
len(df_full_train), len(df_test)

(39116, 9779)

In [121]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
len(df_train), len(df_val)

(29337, 9779)

In [122]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [123]:
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

In [124]:
y_train = (y_train > 152).astype(int)
y_val = (y_val > 152).astype(int)
y_test = (y_test > 152).astype(int)

In [125]:
del df_train['price']
del df_val['price']
del df_test['price']

## Question 2

In [126]:
df_train.dtypes

neighbourhood_group                object
room_type                          object
latitude                          float64
longitude                         float64
minimum_nights                      int64
number_of_reviews                   int64
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
above_average                       int64
dtype: object

In [127]:
numerical = [
  'latitude',
  'longitude',
  'minimum_nights',
  'number_of_reviews',
  'reviews_per_month',
  'calculated_host_listings_count',
  'availability_365'
]

In [128]:
categorical = [
    'neighbourhood_group',
    'room_type',
]

In [129]:
corr_matrix = df_train[numerical].corr()
corr_matrix

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.080301,0.027441,-0.006246,-0.007159,0.019375,-0.005891
longitude,0.080301,1.0,-0.06066,0.055084,0.134642,-0.117041,0.083666
minimum_nights,0.027441,-0.06066,1.0,-0.07602,-0.120703,0.118647,0.138901
number_of_reviews,-0.006246,0.055084,-0.07602,1.0,0.590374,-0.073167,0.174477
reviews_per_month,-0.007159,0.134642,-0.120703,0.590374,1.0,-0.048767,0.165376
calculated_host_listings_count,0.019375,-0.117041,0.118647,-0.073167,-0.048767,1.0,0.225913
availability_365,-0.005891,0.083666,0.138901,0.174477,0.165376,0.225913,1.0


In [130]:
corr_matrix[corr_matrix.abs() < 1].unstack().abs().sort_values(ascending = False)

number_of_reviews               reviews_per_month                 0.590374
reviews_per_month               number_of_reviews                 0.590374
calculated_host_listings_count  availability_365                  0.225913
availability_365                calculated_host_listings_count    0.225913
number_of_reviews               availability_365                  0.174477
availability_365                number_of_reviews                 0.174477
reviews_per_month               availability_365                  0.165376
availability_365                reviews_per_month                 0.165376
                                minimum_nights                    0.138901
minimum_nights                  availability_365                  0.138901
reviews_per_month               longitude                         0.134642
longitude                       reviews_per_month                 0.134642
minimum_nights                  reviews_per_month                 0.120703
reviews_per_month        

## Question 3

In [131]:
from sklearn.metrics import mutual_info_score

In [132]:
score_neighbourhood_group = mutual_info_score(df_train.above_average, df_train.neighbourhood_group)
score_neighbourhood_group

0.04651348750524772

In [133]:
score_room_type = mutual_info_score(df_train.above_average, df_train.room_type)
score_room_type

0.14303502293673515

In [134]:
round(score_neighbourhood_group, 2), round(score_room_type, 2)

(0.05, 0.14)

## Question 4

In [135]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [136]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

len(X_train), len(X_val)

(29337, 9779)

In [137]:
model = LogisticRegression(solver='liblinear', C=1.0, random_state=42)
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [138]:
model.intercept_[0]

-0.08390077831143408

In [139]:
model.coef_[0].round(3)

array([ 3.000e-03,  3.000e-03, -5.828e+00, -3.172e+00, -1.200e-02,
       -1.090e-01,  1.420e-01,  1.596e+00, -7.000e-03, -1.706e+00,
       -3.000e-03, -4.200e-02,  1.951e+00, -8.300e-01, -1.206e+00])

In [140]:
y_pred = model.predict_proba(X_val)[:, 1]

In [141]:
price_decision = (y_pred >= 152)
(y_val == price_decision).mean()

0.6941405051641272

In [142]:
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = price_decision.astype(int)
df_pred['actual'] = y_val

In [143]:
df_pred['correct'] = df_pred.prediction == df_pred.actual
mean = df_pred.correct.mean()
mean, round(mean, 2)

(0.6941405051641272, 0.69)

## Question 5

In [144]:
def mutual_info_price_score(series):
  return mutual_info_score(series, df_full_train.price)

In [145]:
mi = df_full_train[categorical + numerical].apply(mutual_info_price_score)
mi.sort_values(ascending=False)

latitude                          3.855891
longitude                         3.564455
reviews_per_month                 0.870668
availability_365                  0.758015
number_of_reviews                 0.432918
room_type                         0.311407
calculated_host_listings_count    0.256906
minimum_nights                    0.163225
neighbourhood_group               0.105645
dtype: float64

In [146]:
def train_model(X, y, Xval):
  m = LogisticRegression(solver='liblinear', C=1.0, random_state=42)
  m.fit(X, y)
  m.intercept_[0]
  m.coef_[0].round(3)
  ypred = m.predict_proba(Xval)[:, 1]
  decision = (ypred >= 152)
  mean = (y_val == decision).mean()
  return mean

In [147]:
def get_subarray(arr, item):
  l = []

  for i in arr:
    if i != item:
      l.append(i)
  
  return l

In [148]:
accuracy = {}
features = numerical + categorical

for v in features:
  feat = features
  feat = get_subarray(features, v)

  dv = DictVectorizer(sparse=False)

  t_dict = df_train[feat].to_dict(orient='records')
  X = dv.fit_transform(t_dict)

  v_dict = df_val[feat].to_dict(orient='records')
  Xval = dv.transform(v_dict)

  accuracy[v] = train_model(X, y_train, Xval)

accuracy

{'availability_365': 0.6941405051641272,
 'calculated_host_listings_count': 0.6941405051641272,
 'latitude': 0.6941405051641272,
 'longitude': 0.6941405051641272,
 'minimum_nights': 0.6941405051641272,
 'neighbourhood_group': 0.6941405051641272,
 'number_of_reviews': 0.6941405051641272,
 'reviews_per_month': 0.6941405051641272,
 'room_type': 0.6941405051641272}