# Feature Modeling

This notebook will have an ensemble method composed of lasso linear regression, desicion tree regression, and random forest regression in order to predict Airbnb price listings for rentals.

In [39]:
# Importing libraries
import numpy as np
import pandas as pd
from tabulate import tabulate
import sklearn as skl
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import (
    LinearRegression, Ridge, Lasso, ElasticNet
)
from xgboost import XGBClassifier, XGBRegressor
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.metrics import mean_squared_error, r2_score, classification_report

import matplotlib as mpl
import matplotlib.pyplot as plt

# Make this notebook's output stable across runs
random_seed = np.random.seed(100)

In [40]:
# Reading in the CSV file
df = pd.read_excel('../listings.xlsx')


In [41]:
# Drop uneccesary columns
df.drop(labels = ['ID', 'Name', 'Host_ID', 'Host_Name', 'Neighborhood_Group',
       'Neighbourhood', 'Last_Review',
       'Reviews_per_Month', 'Calculated_Host_Listings_Count',
       'Availability_365', 'Number_of_Reviews_LTM', 'Llicense', 'City'], axis=1, inplace=True)

In [42]:
print(df.columns)
# Get Room Dummies
room = pd.get_dummies(df['Room_Type'], prefix='Room Type')
df = pd.concat([df, room], axis=1).drop('Room_Type', axis=1)

# Get State Dummies
# state = pd.get_dummies(df['State'], prefix='State')
# df = pd.concat([df, state], axis=1).drop('State', axis=1)

# Print the column names
col_data = []
for col in df.columns:
    # assign data
    single_col_data = []
    single_col_data.append(col)
    single_col_data.append(df[col].nunique())
    col_data.append(single_col_data)
 
# create header
head = ["Column", "Count of Unique Values"]
 
# display table
print(tabulate(col_data, headers=head, tablefmt="grid"))

Index(['Latitude', 'Longitude', 'Room_Type', 'Price', 'Minimum_Nights',
       'Number_of_Reviews', 'MedianIncome', 'MedianAge'],
      dtype='object')
+---------------------------+--------------------------+
| Column                    |   Count of Unique Values |
| Latitude                  |                   143933 |
+---------------------------+--------------------------+
| Longitude                 |                   143607 |
+---------------------------+--------------------------+
| Price                     |                     2528 |
+---------------------------+--------------------------+
| Minimum_Nights            |                      177 |
+---------------------------+--------------------------+
| Number_of_Reviews         |                      750 |
+---------------------------+--------------------------+
| MedianIncome              |                       26 |
+---------------------------+--------------------------+
| MedianAge                 |                     

In [43]:
# There are no rows with NaN
NAN = df[df.isna().any(axis=1)]
print(NAN)

Empty DataFrame
Columns: [Latitude, Longitude, Price, Minimum_Nights, Number_of_Reviews, MedianIncome, MedianAge, Room Type_Entire home/apt, Room Type_Hotel room, Room Type_Private room, Room Type_Shared room]
Index: []


### Linear Regression

In [44]:
# Split the data into the inputs and output
y = df['Price']
X = df.drop(labels='Price', axis=1)
X2 = df.drop(labels=['Price', 'Latitude', 'Longitude'], axis=1)

In [45]:
# print(X)
# print(y)
print(X2)

        Minimum_Nights  Number_of_Reviews  MedianIncome  MedianAge  \
0                   30                 89         49809       39.0   
1                    1                347         49809       39.0   
2                    1                 67         49809       39.0   
3                    1                296         49809       39.0   
4                   30                 58         49809       39.0   
...                ...                ...           ...        ...   
216308               1                  0         92266       34.0   
216309               1                  0         92266       34.0   
216310               1                  0         92266       34.0   
216311               3                  0         92266       34.0   
216312              91                  0         92266       34.0   

        Room Type_Entire home/apt  Room Type_Hotel room  \
0                               1                     0   
1                               1        

In [46]:
df.dtypes

Latitude                     float64
Longitude                    float64
Price                          int64
Minimum_Nights                 int64
Number_of_Reviews              int64
MedianIncome                   int64
MedianAge                    float64
Room Type_Entire home/apt      uint8
Room Type_Hotel room           uint8
Room Type_Private room         uint8
Room Type_Shared room          uint8
dtype: object

In [58]:
# Split data into training and test sets, stratify based on State

(X_train, X_test,
 y_train, y_test) = train_test_split(df[['Latitude', 'Longitude', 'Minimum_Nights', 'Number_of_Reviews', 'MedianIncome', 'MedianAge', 'Room Type_Entire home/apt', 'Room Type_Hotel room', 'Room Type_Private room', 'Room Type_Shared room']], df['Price'], test_size=0.75, random_state=random_seed, stratify='State')

print(X_train.head())

# Scale data
# scaler = StandardScaler()
# X_scale = scaler.fit_transform(X)

TypeError: Singleton array array('State', dtype='<U5') cannot be considered a valid collection.

In [48]:
# Scale data for X2
# scaler = StandardScaler()
# X_scale2 = scaler.fit_transform(X2)

# # Split data into training and test sets
# (X_train2, X_test2,
#  y_train2, y_test2) = train_test_split(X_scale2, y, test_size=0.75, random_state=random_seed)

In [49]:
 #Define functions
lin_reg = LinearRegression()
ridge = Ridge(tol=.001)
lass = Lasso(tol=0.05)
elastic_net = ElasticNet(tol=0.05)

#### Linear Regression

In [50]:
# do Linear Regression
lin_reg_fit = lin_reg.fit(X_train2, y_train2)
y_pred = lin_reg_fit.predict(X_test2)
print("\n Linear Regression: " + str(r2_score(y_test2, y_pred)))


 Linear Regression: 0.02945980071210763


#### Ridge Regression

In [51]:
# do param_grid for Ridge
ridge_search = GridSearchCV(ridge, param_grid={'alpha': [0.01, 0.25, 0.30, 0.40, 0.5, 1, 1.5, 2, 3, 4, 5]}, scoring='r2')
ridge_fit = ridge_search.fit(X_train, y_train)
print("\n RIDGE: The best score across ALL searched params:\n", ridge_search.best_score_)
print("\n RIDGE: The best parameters across ALL searched params:\n", ridge_search.best_params_)


 RIDGE: The best score across ALL searched params:
 0.03450636420147011

 RIDGE: The best parameters across ALL searched params:
 {'alpha': 5}


#### Lasso Regression

In [52]:
# do param_grid for Lasso
lasso_search = GridSearchCV(ridge, param_grid={'alpha' : [0.01, 0.25, 0.50, 0.75, 1, 1.5, 2, 3, 4, 5]}, scoring='r2')
lasso_fit = lasso_search.fit(X_train, y_train)
print("\n LASSO: The best score across ALL searched params:\n", lasso_search.best_score_)
print("\n LASSO: The best parameters across ALL searched params:\n", lasso_search.best_params_)


 LASSO: The best score across ALL searched params:
 0.03450636420147011

 LASSO: The best parameters across ALL searched params:
 {'alpha': 5}


#### Elastic Net Regression

In [53]:
# NOT WORKING.  do param_grid for Elastic
# elasticnet_search = GridSearchCV(elastic_net, param_grid={'alpha': [0.01, 0.5, 1, 1.5, 2, 3, 4, 5], 'l1_ratio' : [0.25, 0.30, 0.40, 0.50, 0.75, 1]}, scoring='r2')
# elasticnet_fit = elasticnet_search.fit(X_train, y_train)
# print("\n ELASTICNET: The best score across ALL searched params:\n", elasticnet_search.best_score_)
# print("\n ELASTICNET: The best parameters across ALL searched params:\n", elasticnet_search.best_params_)

### Decision Tree

In [54]:
tree_reg = DecisionTreeRegressor(max_depth=10, random_state=random_seed)
tree_reg.fit(X_train, y_train)

# Evaluate on testing data
y_pred = tree_reg.predict(X_train)
print("Training Dataset Accuracy: ", r2_score(y_train, y_pred))

Training Dataset Accuracy:  0.5806469955949383


In [55]:
# Evaluate on training data
y_test_pred = tree_reg.predict(X_test)
print("Testing Dataset Accuracy: ", r2_score(y_test, y_test_pred))

Testing Dataset Accuracy:  -0.14598121955940901


### Random Forest

In [56]:
# Use cross validation to pick the number of trees (estimators)
# and the max depth of the trees
search_parameters = {
    'n_estimators':range(100, 400, 100),
    'max_depth':range(6, 8)
}

rnd_reg = RandomForestRegressor(n_jobs=3, random_state=random_seed) 
grid_reg = GridSearchCV(rnd_reg, search_parameters)
grid_reg.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
print(f'R2 Score: {grid_reg.best_score_:.3f}')
grid_reg.best_params_

Accuracy: 0.122


{'max_depth': 7, 'n_estimators': 100}

In [None]:
# Get the best regressor from GridSearchCV
best_reg = grid_reg.best_estimator_

# Get accuracy on the train data
y_pred = best_reg.predict(X_train)
print("Training Dataset Accuracy: ",r2_score(y_train, y_pred))

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [None]:
# Get accuracy on the test data
y_pred = best_reg.predict(X_test)
print("Training Dataset Accuracy: ",r2_score(y_test, y_pred))

### XGB Boost Regression

In [None]:
# Create an XGBoost classifier and fit to training data
xgbreg = XGBRegressor(use_label_encoder=False,
                       eval_metric='mlogloss')

xgbreg.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             eval_metric='mlogloss', gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             use_label_encoder=False, validate_parameters=1, verbosity=None)

In [None]:
# Evaluate on training data
y_pred = xgbreg.predict(X_train)
print("Training Dataset Accuracy: ", r2_score(y_train, y_pred))

Training Dataset Accuracy:  0.6925364515575965


In [None]:
# Evaluate on testing data
y_test_pred = xgbreg.predict(X_test)
print("Testing Dataset Accuracy: ", r2_score(y_test, y_test_pred))

Testing Dataset Accuracy:  0.08995725832207058


## Voting Regressor

In [None]:
voting_reg = VotingRegressor(
    estimators=[('tree', tree_reg),
                ('xbg', xgbreg)])

In [None]:
# Evaluate voting regressor on the training data
voting_reg.fit(X_train, y_train)
y_pred = voting_reg.predict(X_train)
print("Training Dataset Accuracy: ", r2_score(y_train, y_pred,))

Training Dataset Accuracy:  0.6669984786376152


In [None]:
for reg in voting_reg.estimators_:
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    print('{:<24} {:.3f}'.format(reg.__class__.__name__,
                                 r2_score(y_test, y_pred)))

DecisionTreeRegressor    -0.140
XGBRegressor             0.090


In [None]:
# Evaluate voting regressor on testing data
y_test_pred = voting_reg.predict(X_test)
print("Testing Dataset Accuracy: ", r2_score(y_test, y_test_pred,))

Testing Dataset Accuracy:  0.06622620463801698


### Adding a 5% Increase AND Rounding

In [None]:
# Fitting a 5% increase to the price.

# Add a 5% increase to all the predicted values
y_pred_inflated = y_pred * 1.05

# Validating if that worked
print(y_pred[0])
print(y_pred_inflated[0])

# Rounding to the nearest whole number
pred_round_to_whole_num = [round(num, 1) for num in y_pred_inflated]

# Validating if rounding worked
pred_round_to_whole_num[0]

144.75829
151.99619


152.0