In [None]:
'''
GOAL OF THIS NOTEBOOK
(1)Explore inherent feature selection methods given by Random Forest. 
(2)Determine semi-arbitrary cutoff of % important, reduce using those features
(3)Begin introductory model fitting, with goal of minimizing MSE as compared to Rsquared. 
'''

In [None]:
'''
NOTE
The only thing you should have to change to be able to
fully execute everything here is the "path" variable in the first block of code below. 
'''

In [None]:
'''
Data Pre-Processing
'''

In [2]:
import os
from zipfile import ZipFile

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [4]:
cwd = os.getcwd()
datadir = '/'.join(cwd.split('/')) + '/data/'
listings = datadir + 'airbnb_no_impute_20191202.zip'

/Users/anhthyngo/Documents/NYU/Fall 2019/DS-GA 1001/project/ds-ga1001/notebooks/data/airbnb_no_impute_20191202.zip


In [107]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
zf = ZipFile(listings) 
data = pd.read_csv(zf.open("listings.csv"), parse_dates=['host_since', 'first_review', 'last_review'], low_memory=False)

path= "/Users/Aren/ds-ga1001/data/airbnb_no_impute2.csv"
data = pd.read_csv(path)
data.head(5)

Unnamed: 0,zipcode,latitude,longitude,accommodates,bathrooms,bedrooms,beds,guests_included,minimum_nights,maximum_nights,...,property_type_Yurt,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room,bed_type_Airbed,bed_type_Couch,bed_type_Futon,bed_type_Pull-out Sofa,bed_type_Real Bed
0,10027,40.80902,-73.9419,2,1.0,1.0,1.0,2,3,7,...,0,0,0,1,0,0,0,0,1,0
1,11238,40.68514,-73.95976,3,1.0,1.0,4.0,1,1,730,...,0,1,0,0,0,0,0,0,0,1
2,10016,40.74767,-73.975,2,1.0,1.0,1.0,2,3,21,...,0,1,0,0,0,0,0,0,0,1
3,10019,40.76489,-73.98493,2,1.0,1.0,1.0,1,2,14,...,0,0,0,1,0,0,0,0,0,1
4,10025,40.80178,-73.96723,1,1.0,1.0,1.0,1,2,14,...,0,0,0,1,0,0,0,0,0,1


In [108]:
data.shape

(47638, 309)

In [109]:
X = data.drop('price',axis=1)
Y = data.pop('price')


In [110]:
nan_cols = [i for i in X.columns if X[i].isnull().any()]
nan_cols

['review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value']

In [None]:
'''
Some columns have NANs. To move forward with feature selection, these will be imputed. 
When we get to the pipeline implementation, we can be more careful about this. 
For now, as in line with the rest of our analysis, we'll impute with median. 

We will separately impute our training and testing split (block below)
'''

In [111]:
'''
PART 1 -- INHERENT FEATURE SELECTION OF RANDOM FOREST
Essentially here we're just creating some notion of how a Regression RF does in terms of mean accuracy with or without 
certain features.
While it's worth nothing there is zero model optimization going on here, there's still information to be gained. 
'''
from sklearn.model_selection import train_test_split
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,Y,test_size=.2,random_state=42)

In [114]:
for col in nan_cols:
    Xtrain[col].fillna(Xtrain[col].median(),inplace=True)
    Xtest[col].fillna(Xtest[col].median(),inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [115]:
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor()
model = regr.fit(Xtrain,Ytrain)



In [None]:
'''
Now that the model has been fit, we can look at what features it deems to be important. 
In addition, it's worth knowing the ratio of important variables to overall features. 
'''

In [116]:
feature_importances = model.feature_importances_
sorted_indices = np.argsort(feature_importances)

In [None]:
'''
The following block will produce a sorted list of the columns with a feature importance > .005,
as well as some additional information (discussed above) that is worth knowing.
'''

In [125]:
total = 0 
reduced_features = []
for i in range(len(sorted_indices)):
    feature , importance = list(X.columns)[sorted_indices[-i]], feature_importances[sorted_indices[-i]]
    if(importance > .005):
        reduced_features.append(feature)
        total+=importance
        print(feature,importance)
print("-------------------------------------------------------------------------")
print(len(feature_importances[feature_importances>=.005]),"variables account for",round(total*100,2),"% of the feature importance.")
print("\n")
print("Can account for",round(total*100,2),"% of the feature importance deemed by the model with",round((len(feature_importances[feature_importances>=.005])/X.shape[1])*100,2),"% \n of the original number of features")
print("-------------------------------------------------------------------------")

neighbourhood_cleansed_Theater District 0.14057858012096883
longitude 0.09298356643570668
bathrooms 0.07662514653516858
minimum_nights 0.06814509941270092
accommodates 0.0476369642724523
latitude 0.04270809100844133
maximum_nights 0.04219363731624001
room_type_Entire home/apt 0.04098633637302451
zipcode 0.03994631885824029
number_of_reviews 0.03018396133468738
availability_365 0.028882605411228966
review_scores_rating 0.026805553581983277
extra_people 0.025545342081168244
review_scores_checkin 0.021741208585875325
beds 0.020418264065019127
bedrooms 0.019150490278983114
availability_90 0.019081402241180545
neighbourhood_cleansed_Astoria 0.019050188531999683
neighbourhood_cleansed_Lower East Side 0.01857310952196271
availability_60 0.01685532631051378
property_type_Boutique hotel 0.013859988235074039
availability_30 0.013780838872807736
neighbourhood_cleansed_Tribeca 0.010237621734653545
review_scores_location 0.009743990055025742
neighbourhood_cleansed_East Harlem 0.008437725912150907
r

In [None]:
'''
CONCLUSION
29 variables account for 92% of feature importance seems to be a good tradeoff between
complexity and feature relevance. We can move forward from here
'''

In [132]:
'''
PART 2a-- CREATING REDUCED DATASET
'''

In [137]:
'''
Re-reading in data so we don't have any carryover from modifications in step1
'''

data = pd.read_csv(path)
reduced_features.append('price')
newdata =  data[[c for c in data.columns if c in reduced_features]]
newdata.head()

Unnamed: 0,zipcode,latitude,longitude,accommodates,bathrooms,bedrooms,beds,guests_included,minimum_nights,maximum_nights,...,price,neighbourhood_cleansed_Astoria,neighbourhood_cleansed_East Harlem,neighbourhood_cleansed_Greenwich Village,neighbourhood_cleansed_Lower East Side,neighbourhood_cleansed_Theater District,neighbourhood_cleansed_Tribeca,property_type_Boutique hotel,property_type_Condominium,room_type_Entire home/apt
0,10027,40.80902,-73.9419,2,1.0,1.0,1.0,2,3,7,...,150.0,0,0,0,0,0,0,0,0,0
1,11238,40.68514,-73.95976,3,1.0,1.0,4.0,1,1,730,...,89.0,0,0,0,0,0,0,0,0,1
2,10016,40.74767,-73.975,2,1.0,1.0,1.0,2,3,21,...,200.0,0,0,0,0,0,0,0,0,1
3,10019,40.76489,-73.98493,2,1.0,1.0,1.0,1,2,14,...,79.0,0,0,0,0,0,0,0,0,0
4,10025,40.80178,-73.96723,1,1.0,1.0,1.0,1,2,14,...,79.0,0,0,0,0,0,0,0,0,0


In [139]:
newdata.shape

(47638, 30)

In [None]:
'''
If you want to read this csv onto your local system and run some models locally, just run this 
block with a specified path.

NOTE: outside of feature reduction, there are no differences between this and the original dataset. 
the 5 columns will still have NANs, there is no train-test splitting going on, etc. 
'''
savePath = "YOUR LOCAL PATH HERE"
newdata.to_csv(savePath)

In [None]:
'''
PART 2b -- MODELING

Now we can start with some modeling to see what can produce the lowest MSE. 
OPTIONS FOR MODELING: 
Linear Regression - OLS (cross validate)
Linear Regression - Lasso (cross validate, parameter grid search)
Linear Regression - Ridge (cross validate, parameter grid search)
Random Forest - (hyperparameter grid search)
XGBoost - (hyperparameter grid search)
'''