In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv("AB_NYC_2019.csv")

In [3]:
col = ["latitude", "longitude", "price", "minimum_nights", "number_of_reviews", "reviews_per_month", "calculated_host_listings_count", "availability_365", "neighbourhood_group", "room_type"]
df = df[col]
df.head()

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,neighbourhood_group,room_type
0,40.64749,-73.97237,149,1,9,0.21,6,365,Brooklyn,Private room
1,40.75362,-73.98377,225,1,45,0.38,2,355,Manhattan,Entire home/apt
2,40.80902,-73.9419,150,3,0,,1,365,Manhattan,Private room
3,40.68514,-73.95976,89,1,270,4.64,1,194,Brooklyn,Entire home/apt
4,40.79851,-73.94399,80,10,9,0.1,1,0,Manhattan,Entire home/apt


In [4]:
df.isnull().sum()

latitude                              0
longitude                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
neighbourhood_group                   0
room_type                             0
dtype: int64

In [5]:
df = df.fillna(0)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 10 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   latitude                        48895 non-null  float64
 1   longitude                       48895 non-null  float64
 2   price                           48895 non-null  int64  
 3   minimum_nights                  48895 non-null  int64  
 4   number_of_reviews               48895 non-null  int64  
 5   reviews_per_month               48895 non-null  float64
 6   calculated_host_listings_count  48895 non-null  int64  
 7   availability_365                48895 non-null  int64  
 8   neighbourhood_group             48895 non-null  object 
 9   room_type                       48895 non-null  object 
dtypes: float64(3), int64(5), object(2)
memory usage: 3.7+ MB


#### Question 1
What is the most frequent observation (mode) for the column 'neighbourhood_group'?

**Split the data**
- Split your data in train/val/test sets, with 60%/20%/20% distribution.
- Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
- Make sure that the target value ('price') is not in your dataframe.

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
df.neighbourhood_group.mode()

0    Manhattan
dtype: object

In [9]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [10]:
len(df_full_train), len(df_test)

(39116, 9779)

In [11]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [12]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [13]:
y_train = np.log1p(df_train.price)
y_val = np.log1p(df_val.price)
y_test = np.log1p(df_test.price)

In [14]:
del df_train["price"]
del df_val["price"]
del df_test["price"]

#### Question 2
- Create the correlation matrix for the numerical features of your train dataset.
    In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
- What are the two features that have the biggest correlation in this dataset?
- Example of a correlation matrix for the car price dataset:
<img src="correlation-matrix.png">


**Make price binary**
- We need to turn the price variable from numeric into binary.
- Let's create a variable above_average which is 1 if the price is above (or equal to) 152.

In [15]:
df.corr()

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.084788,0.033939,0.024869,-0.015389,-0.018758,0.019517,-0.010983
longitude,0.084788,1.0,-0.150019,-0.062747,0.059094,0.138516,-0.114713,0.082731
price,0.033939,-0.150019,1.0,0.042799,-0.047954,-0.050564,0.057472,0.081829
minimum_nights,0.024869,-0.062747,0.042799,1.0,-0.080116,-0.124905,0.12796,0.144303
number_of_reviews,-0.015389,0.059094,-0.047954,-0.080116,1.0,0.589407,-0.072376,0.172028
reviews_per_month,-0.018758,0.138516,-0.050564,-0.124905,0.589407,1.0,-0.047312,0.163732
calculated_host_listings_count,0.019517,-0.114713,0.057472,0.12796,-0.072376,-0.047312,1.0,0.225701
availability_365,-0.010983,0.082731,0.081829,0.144303,0.172028,0.163732,0.225701,1.0


In [16]:
y_train = y_train.apply(lambda x: 1 if x>y_train.mean() else 0)

In [17]:
y_val = y_val.apply(lambda x: 1 if x>y_val.mean() else 0)

#### Question 3
- Calculate the mutual information score with the (binarized) price for the two categorical variables that we have. Use the training set only.
- Which of these two variables has bigger score?
- Round it to 2 decimal digits using round(score, 2)

In [18]:
from sklearn.metrics import mutual_info_score

In [19]:
mutual_info_score(y_train, df_train.neighbourhood_group).round(2)

0.05

In [20]:
mutual_info_score(y_train, df_train.room_type).round(2)

0.22

#### Question 4
- Now let's train a logistic regression
- Remember that we have two categorical variables in the data. Include them using one-hot encoding.
- Fit the model on the training dataset.
    To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
- Calculate the accuracy on the validation dataset and rount it to 2 decimal digits.

In [21]:
from sklearn.feature_extraction import DictVectorizer

In [22]:
dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient="records")
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient="records")
X_val = dv.transform(val_dict)

In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=42)

In [25]:
model.intercept_[0]

-0.002599363672554846

In [26]:
model.coef_[0].round(3)

array([ 2.000e-03,  4.000e-03, -2.140e-01, -1.000e-01, -8.000e-03,
       -4.550e-01,  1.770e-01,  1.269e+00, -8.210e-01, -1.720e-01,
       -1.000e-03, -9.200e-02,  1.958e+00, -1.150e+00, -8.110e-01])

In [27]:
y_pred = model.predict_proba(X_val)[:,1]

In [28]:
price_decision = (y_pred >= 0.5)

In [29]:
price_decision

array([False,  True,  True, ..., False, False,  True])

In [30]:
original_acc = (np.array(y_val) == price_decision.astype(int)).mean()

In [31]:
original_acc

0.8158298394518867

#### Question 5
- We have 9 features: 7 numerical features and 2 categorical.
- Let's find the least useful one using the feature elimination technique.
- Train a model with all these features (using the same parameters as in Q4).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
- Which of following feature has the smallest difference?
   - neighbourhood_group
   - room_type
   - number_of_reviews
   - reviews_per_month

**note: the difference doesn't have to be positive**

In [37]:
def train_model(df_train, df_val):
    train_dict = df_train.to_dict(orient="records")
    X_train = dv.fit_transform(train_dict)
    
    val_dict = df_val.to_dict(orient="records")
    X_val = dv.transform(val_dict)
    
    model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict_proba(X_val)[:,1]
    price_decision = (y_pred >= 0.5)
    return (np.array(y_val) == price_decision.astype(int)).mean()

In [38]:
col_wo_neighbourhood_group = ["latitude", "longitude", "minimum_nights", "number_of_reviews", "reviews_per_month", "calculated_host_listings_count", "availability_365", "room_type"]
original_acc - train_model(df_train[col_wo_neighbourhood_group], df_val[col_wo_neighbourhood_group])

0.0036813580120667044

In [39]:
col_wo_room_type = ["latitude", "longitude", "minimum_nights", "number_of_reviews", "reviews_per_month", "calculated_host_listings_count", "availability_365", "neighbourhood_group"]
original_acc - train_model(df_train[col_wo_room_type], df_val[col_wo_room_type])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.16290009203395028

In [40]:
col_wo_number_of_reviews = ["latitude", "longitude", "minimum_nights", "reviews_per_month", "calculated_host_listings_count", "availability_365", "neighbourhood_group", "room_type"]
original_acc - train_model(df_train[col_wo_number_of_reviews], df_val[col_wo_number_of_reviews])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.0012271193373555311

In [41]:
col_wo_reviews_per_month = ["latitude", "longitude", "minimum_nights", "number_of_reviews", "calculated_host_listings_count", "availability_365", "neighbourhood_group", "room_type"]
original_acc - train_model(df_train[col_wo_reviews_per_month], df_val[col_wo_reviews_per_month])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.00010225994477963685

#### Question 6
- For this question, we'll see how to use a linear regression model from Scikit-Learn
- We'll need to use the original column 'price'. Apply the logarithmic transformation to this column.
- Fit the Ridge regression model on the training data.
- This model has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10]
- Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.

If there are multiple options, select the smallest alpha.

In [42]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
y_train = np.log1p(df_train.price)
y_val = np.log1p(df_val.price)
y_test = np.log1p(df_test.price)
del df_train["price"]
del df_val["price"]
del df_test["price"]

In [63]:
from sklearn.linear_model import Ridge

In [49]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [76]:
def train_model_linear_regu_reg(df_train, df_val, reg=1.0):
    train_dict = df_train.to_dict(orient="records")
    X_train = dv.fit_transform(train_dict)
    
    val_dict = df_val.to_dict(orient="records")
    X_val = dv.transform(val_dict)
    
    model = Ridge(alpha=reg)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    print(rmse(y_val, y_pred).round(3))

In [77]:
for r in [0, 0.01, 0.1, 1, 10]:
    train_model_linear_regu_reg(df_train, df_val, r)

0.497
0.497
0.497
0.497
0.498
