New York City Airbnb Open Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# URL of the raw CSV file that the shell command below downloads.
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv'

# IPython shell escape: $data expands to the Python variable above; -O saves
# the download under the local name data-homework.csv.
!wget $data -O data-homework.csv


<H2> DATA PREPARATION</H2>

In [None]:
# Load the downloaded CSV into a DataFrame and preview the first rows.
df = pd.read_csv('data-homework.csv')

df.head()

In [None]:
# Column dtypes of the raw frame.
df.dtypes

In [None]:
# String-valued columns kept for the homework.
categorical = ['neighbourhood_group', 'room_type']

# Numeric columns kept for the homework; 'price' (the target) is part of this list.
numerical = [
    'latitude',
    'longitude',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

 

Fill in the missing values with 0.

In [None]:
def prepare_df(df, threshold=152):
    """Select the modelling columns, fill missing values and binarize the price.

    Relies on the notebook-level lists `categorical` and `numerical` to choose
    the columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns in `categorical + numerical`.
    threshold : number, default 152
        Price cut-off; rows with price >= threshold get 1, all others 0.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with NaNs replaced by 0 and
        'price' converted to a 0/1 integer column.
    """
    # Column selection with a list and fillna() both return new objects,
    # so the assignment below never touches the caller's frame.
    df_copy = df[categorical + numerical].fillna(0)
    df_copy['price'] = (df_copy['price'] >= threshold).astype(int)
    return df_copy

In [None]:
# Number of missing values per column in the raw frame.
df.isnull().sum()

In [None]:
# Prepared frame: selected columns, NaNs filled with 0, price binarized.
df_copy = prepare_df(df)


In [None]:
# Check that no missing values remain after preparation.
df_copy.isnull().sum()

<b>Question 1</b> <br>
What is the most frequent observation (mode) for the column 'neighbourhood_group'?



In [None]:
# Most frequent value of 'neighbourhood_group' (first entry of that column's mode).
display(df_copy['neighbourhood_group'].mode()[0]);



### Split the data

* Split your data in train/val/test sets, with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the `train_test_split` function) and set the seed to 42.
* Make sure that the target value ('price') is not in your dataframe.

In [None]:
# Total number of rows before splitting.
len(df_copy)

In [None]:
from sklearn.model_selection import train_test_split

df_full_train , df_test =train_test_split(df_copy,test_size=0.2,random_state=42) # hold out 20% of the rows as the test set, seed 42

# df_full_train = 80% of df_copy (split further into train and validation below)
# df_test = 20% of df_copy

In [None]:
# Row counts of the 80% / 20% split.
len(df_full_train),len(df_test)

In [None]:
df_train , df_val = train_test_split(df_full_train,test_size=0.25,random_state=42) # 25% of the 80% full-train part becomes validation (20% of all rows)

In [None]:
len(df_train) , len(df_val) , len (df_test) 

# Resulting proportions of df_copy:
# df_full_train = 80% of df_copy
# df_test = 20% of df_copy
# df_val = 25% of df_full_train, i.e. 20% of df_copy
# df_train = the remaining 75% of df_full_train, i.e. 60% of df_copy



In [None]:
# Renumber rows from 0 after the shuffle; drop=True discards the old index
# instead of keeping it as a column. Not necessary for the models to work.
df_train= df_train.reset_index(drop=True) # not necessary for models to work
df_val = df_val.reset_index(drop=True) # not necessary for models to work
df_test = df_test.reset_index(drop=True) # not necessary for models to work
df_full_train = df_full_train.reset_index(drop=True)

In [None]:
# Extract the binarized price as NumPy arrays to serve as targets for each split.
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

In [None]:
# Remove the target from the feature frames (in place). df_full_train is left
# untouched and still contains 'price'. Re-running this cell without re-running
# the split raises KeyError because the column is already gone.
del df_train['price']
del df_val['price']
del df_test['price']

In [None]:
# Inspect the training target array (0/1 values).
y_train

### Question 2

* Create the [correlation matrix](https://www.google.com/search?q=correlation+matrix) for the numerical features of your train dataset.
   * In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
* What are the two features that have the biggest correlation in this dataset?


### Make price binary

* We need to turn the price variable from numeric into binary.
* Let's create a variable `above_average` which is `1` if the price is above (or equal to) `152`.


In [None]:
import seaborn as sns

In [None]:
# Binarized price column of the combined train + validation part.
df_full_train.price

In [None]:
# Class balance of the binarized price in the combined train + validation part.
df_full_train.price.value_counts()

In [None]:
# Rank all pairwise correlations of the numerical training features.
# df_train still holds the two string-valued categorical columns, which
# DataFrame.corr() rejects in recent pandas versions, so only numeric
# columns are kept. drop_duplicates() collapses the mirrored (a, b)/(b, a)
# entries and the repeated self-correlations of 1.0.
df_train.select_dtypes(include='number').corr().unstack().sort_values(ascending=False).drop_duplicates()

In [None]:
# Heatmap for better visualization of the correlation values.
# Only numeric columns are passed to corr(): the string-valued categorical
# columns in df_train make DataFrame.corr() fail in recent pandas versions.
sns.heatmap(df_train.select_dtypes(include='number').corr())

### Question 3

* Calculate the mutual information score with the (binarized) price for the two categorical variables that we have. Use the training set only.
* Which of these two variables has bigger score?
* Round it to 2 decimal digits using `round(score, 2)`


In [None]:
from sklearn.metrics import mutual_info_score



In [None]:

# Mutual information between the binarized price and each categorical feature,
# computed on the training split only, as the question requires. The target
# column was removed from df_train, so y_train (row-aligned with df_train)
# supplies it; df_full_train would also include the validation rows.
display(round(mutual_info_score(y_train, df_train.neighbourhood_group), 2))

display(round(mutual_info_score(y_train, df_train.room_type), 2))


In [None]:
# Column dtypes of the combined train + validation frame.
df_full_train.dtypes

### Question 4

* Now let's train a logistic regression
* Remember that we have two categorical variables in the data. Include them using one-hot encoding.
* Fit the model on the training dataset.
   * To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
   * `model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)`
* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.


In [None]:
# Column dtypes of the training features (the target has already been removed).
df_train.dtypes

In [None]:
from sklearn.feature_extraction import DictVectorizer


# Feature columns for the model: all selected columns except the target.
# List concatenation already produces a new list, so the source lists stay intact.
base_features = categorical + numerical
base_features.remove('price')


In [None]:

# One dict per training row ({column: value}), the input format DictVectorizer expects.
train_dict= df_train[ base_features ].to_dict(orient='records') # creating a dictionary from the data

# Show the first record as a sanity check.
train_dict[0]


In [None]:
# Learn the feature mapping from the training records and build the dense
# training matrix in one step (sparse=False returns a NumPy array).
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)

In [None]:
# (rows, encoded feature columns) of the training matrix.
X_train.shape

In [None]:
# Names of the encoded columns, in the order they appear in X_train.
# The fitted attribute is used because get_feature_names() is no longer
# available in recent scikit-learn releases.
dv.feature_names_

<h1> Logistic Regression </h1>

$$\sigma(z) = \frac{1}{1+\exp(-z)}$$

<h2> Training logistic regression with Scikit-Learn </h2>

In [None]:
from sklearn.linear_model import LogisticRegression

# Parameters follow the homework statement; max_iter is set far above
# scikit-learn's default, presumably so lbfgs converges on these unscaled features.
model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42,max_iter=10000)
model.fit(X_train, y_train)


In [None]:
# Encode the validation rows with the vectorizer fitted on the training data
# (transform only, no re-fitting).
val_dict = df_val[base_features].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [None]:
# Fitted coefficients, one per encoded feature column, rounded to 2 decimals.
model.coef_[0].round(2)

In [None]:
# Predicted class probabilities for the validation rows (one column per class).
model.predict_proba(X_val)

In [None]:
# Keep only the second column: the probability of class 1 (price >= 152).
y_pred = model.predict_proba(X_val)[:, 1]

In [None]:
# Hard 0/1 predictions using a 0.5 probability cut-off.
price_predict = (y_pred > 0.5).astype(int)

In [None]:

# Validation accuracy: share of rows where the prediction matches the target.
( y_val == price_predict).mean()




### Question 5

* We have 9 features: 7 numerical features and 2 categorical.
* Let's find the least useful one using the *feature elimination* technique.
* Train a model with all these features (using the same parameters as in Q4).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature. 
* Which of the following features has the smallest difference?
   * `neighbourhood_group`
   * `room_type` 
   * `number_of_reviews`
   * `reviews_per_month`


In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

# The nine features of the full model (the target 'price' is excluded).
base = [
    'neighbourhood_group',
    'room_type',
    'latitude',
    'longitude',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

# One reduced feature list per candidate named in Question 5.
base1 = base.copy()
base1.remove('neighbourhood_group')

base2 = base.copy()
base2.remove('room_type')

# Question 5 lists number_of_reviews as a candidate (latitude is not one of them).
base3 = base.copy()
base3.remove('number_of_reviews')

base4 = base.copy()
base4.remove('reviews_per_month')


def Feature_model(new_base):
    """Train a logistic regression on the given columns and display its validation accuracy.

    Uses the notebook-level df_train / y_train for fitting and
    df_val / y_val for scoring.

    Parameters
    ----------
    new_base : list of str
        Names of the columns to use as features.

    Returns
    -------
    LogisticRegression
        The fitted model. The validation accuracy is shown via display();
        compare it with the full-feature accuracy from Question 4 to get the
        difference asked for in Question 5.
    """
    train_dicts = df_train[new_base].to_dict(orient='records')
    val_dicts = df_val[new_base].to_dict(orient='records')

    # A fresh vectorizer per call so the encoded columns match new_base;
    # it is fitted once, on the training records only.
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dicts)
    X_val = dv.transform(val_dicts)

    model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42, max_iter=10000)
    model.fit(X_train, y_train)

    # Probability of class 1, thresholded at 0.5 to obtain hard predictions.
    y_pred = model.predict_proba(X_val)[:, 1]
    price_predict = (y_pred > 0.5).astype(int)
    accuracy = (y_val == price_predict).mean()

    display(accuracy)

    return model


display("Removed neighbourhood_group:")
Feature_model(base1)

display("Removed room_type:")
Feature_model(base2)

display("Removed number_of_reviews:")
Feature_model(base3)

display("Removed reviews_per_month")
Feature_model(base4)




<i> Answer : reviews_per_month </i>


### Question 6

* For this question, we'll see how to use a linear regression model from Scikit-Learn
* We'll need to use the original column `'price'`. Apply the logarithmic transformation to this column.
* Fit the Ridge regression model on the training data.
* This model has a parameter `alpha`. Let's try the following values: `[0, 0.01, 0.1, 1, 10]`
* Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.

If there are multiple options, select the smallest `alpha`.

In [None]:
def prepare_df_lr(df):
    """Return the modelling columns of `df` with missing values replaced by 0.

    The columns are taken from the notebook-level lists `categorical` and
    `numerical`; 'price' keeps its original numeric values here. The input
    frame is not modified.
    """
    selected_columns = categorical + numerical
    return df[selected_columns].fillna(0)

In [None]:
# Frame for the regression task: same columns, NaNs filled, price left numeric.
df_lr = prepare_df_lr(df)


In [176]:
# Check that no missing values remain in the regression frame.
df_lr.isnull().sum()

neighbourhood_group               0
room_type                         0
latitude                          0
longitude                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64