### Load Libraries

In [2]:
# Load libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import seaborn as seabornInstance 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.metrics import r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor

%matplotlib inline

### Read in cleaned dataframe

In [7]:
nyc_copy = pd.read_csv('nyc_copy.csv')
nyc_copy.shape

(38511, 16)

### Perform Regression

### Gather variables of interest

In [8]:
# Trim to data of interest
vars_interest = pd.DataFrame(nyc_copy, columns = ['price','minimum_nights', 'number_of_reviews',
                                  'reviews_per_month', 'calculated_host_listings_count', 'availability_365', 
                                 'room_type', 'neighbourhood_group', 'neighbourhood'] )

# Convert to dummy variables 
# Get dummy variables for categorical data:
# neighbourhood may get removed since it's 200+ variables
vars_final = pd.get_dummies(vars_interest, columns= ['room_type','neighbourhood_group', 'neighbourhood'], drop_first = True)

In [9]:
df = nyc_copy._get_numeric_data() # drop non-numeric columns

# For each X, calculate VIF and save in dataframe
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
vif["features"] = df.columns
vif.round(1)

Unnamed: 0,VIF Factor,features
0,5.4,id
1,2.8,host_id
2,0.8,latitude
3,424206.6,longitude
4,1.0,price
5,1.2,minimum_nights
6,8.9,number_of_reviews
7,10.2,reviews_per_month
8,1.2,calculated_host_listings_count
9,1.6,availability_365


In [11]:
# Prepare for regression
# Gather predictors 
X = vars_final.drop('price', axis=1)
X

Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,room_type_Private room,room_type_Shared room,neighbourhood_group_Brooklyn,neighbourhood_group_Manhattan,neighbourhood_group_Queens,...,neighbourhood_Westchester Square,neighbourhood_Westerleigh,neighbourhood_Whitestone,neighbourhood_Williamsbridge,neighbourhood_Williamsburg,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodside
0,4.342945e-10,9.542425e-01,-6.777807e-01,7.781513e-01,2.562293e+00,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4.342945e-10,1.653213e+00,-4.202164e-01,3.010300e-01,2.550228e+00,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,4.342945e-10,2.431364e+00,6.665180e-01,4.342945e-10,2.287802e+00,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.000000e+00,9.542425e-01,-1.000000e+00,4.342945e-10,-9.000000e+00,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,4.771213e-01,1.869232e+00,-2.291480e-01,4.342945e-10,2.110590e+00,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38506,4.342945e-10,4.342945e-10,4.342945e-10,4.342945e-10,2.167317e+00,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
38507,4.342945e-10,4.342945e-10,4.342945e-10,7.781513e-01,2.530200e+00,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
38508,4.342945e-10,4.342945e-10,4.342945e-10,4.342945e-10,1.939519e+00,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38509,4.342945e-10,3.010300e-01,3.010300e-01,4.342945e-10,1.602060e+00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Gather predicted variable (price) and log transform it
y = pd.DataFrame(nyc_copy, columns = ['price'])
y = np.log10(y['price'])

### Perform Regression

In [13]:
# Perform regression, get r square
regressor = LinearRegression()  
regressor.fit(X, y)

r_sq = regressor.score(X, y)
print('coefficient of determination:', r_sq)

coefficient of determination: 0.5797907112586753


In [83]:
# To retrieve the intercept:
# print(regressor.intercept_)

# For retrieving the slope:
# print(regressor.coef_)

In [15]:
# Predict
y_pred = regressor.predict(X)

In [14]:
# View Coefficients
regressor.coef_
coeff_df = pd.DataFrame(X.columns, regressor.coef_)  
coeff_df = coeff_df.reset_index()
coeff_df.rename(columns={'index':'Coefficient',0:'Variable'}, inplace=True)
coeff_df

Unnamed: 0,Coefficient,Variable
0,-8.063223e-02,minimum_nights
1,-1.880370e-02,number_of_reviews
2,-1.412537e-02,reviews_per_month
3,-8.880048e-03,calculated_host_listings_count
4,9.435152e-03,availability_365
...,...,...
223,-2.634048e+07,neighbourhood_Willowbrook
224,-1.280529e+09,neighbourhood_Windsor Terrace
225,1.401843e+08,neighbourhood_Woodhaven
226,-9.522915e-02,neighbourhood_Woodlawn


In [15]:
df = pd.DataFrame({'Actual': np.round(10 ** y, 0), 
                   'Predicted': np.round(10 ** y_pred, 0)})
df.head(10)

NameError: name 'y_pred' is not defined

In [None]:
# Plot actual vs predicted values on subset of data
# Can modify df1 size to view more towers
df1 = df.head(20)
df1.plot(kind='bar',figsize=(10,5))
plt.grid(which='major', linestyle='-', linewidth='0.5', color='black')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()

In [19]:
# Print out model metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y, y_pred)))
print('Mean Average Percent Error:', np.mean(np.abs(y - y_pred)/np.abs(y))) 

Mean Absolute Error: 0.13685907600536099
Mean Squared Error: 0.03483118217337602
Root Mean Squared Error: 0.18663113934543726
Mean Average Percent Error: 0.0667814952832297


### Split into train and test sets

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [21]:
regressor = LinearRegression()  
regressor.fit(X_train, y_train) # training the algorithm

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [22]:
coeff_df = pd.DataFrame(regressor.coef_, X.columns, columns=['Coefficient'])  
coeff_df

Unnamed: 0,Coefficient
minimum_nights,-8.000620e-02
number_of_reviews,-1.716102e-02
reviews_per_month,-1.356379e-02
calculated_host_listings_count,-6.696373e-03
availability_365,9.072179e-03
...,...
neighbourhood_Willowbrook,-1.387779e-17
neighbourhood_Windsor Terrace,2.504533e+09
neighbourhood_Woodhaven,3.873341e+09
neighbourhood_Woodlawn,-8.803785e-02


In [23]:
y_pred = regressor.predict(X_test)

In [24]:
df = pd.DataFrame({'Actual': np.round(10 ** y_test, 0), 
                   'Predicted': np.round(10 ** y_pred, 0)})
df.head(10)

Unnamed: 0,Actual,Predicted
28784,65.0,89.0
18639,175.0,151.0
28850,125.0,188.0
9104,125.0,174.0
8164,120.0,178.0
21176,1000.0,144.0
19122,799.0,136.0
35456,60.0,129.0
23439,176.0,258.0
35500,50.0,79.0


In [25]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 8957.913365491799
Mean Squared Error: 618102096325.1731
Root Mean Squared Error: 786194.6936511166


In [26]:
print('Price mean:', np.round(np.mean(y), 2))  
print('Price std:', np.round(np.std(y), 2))
print('RMSE:', np.round(np.sqrt(metrics.mean_squared_error(y_test, regressor.predict(X_test))), 2))
print('R2 score train:', np.round(r2_score(y_train, regressor.predict(X_train), multioutput='variance_weighted'), 2))
print('R2 score test:', np.round(r2_score(y_test, regressor.predict(X_test), multioutput='variance_weighted'), 2))

Price mean: 2.04
Price std: 0.29
RMSE: 786194.69
R2 score train: 0.58
R2 score test: -7530374427932.16


### Lets split the data into a low and high price point and predict price on those two separate datasets

In [None]:
"""
From our exploratory analysis we saw that most of the listings are actually < $300 a night, however the few
that are much higher contribute to large variance and likely lower the predicition accuracy of the model. For this
reason I would like to try to run the model on two data sets - the > $300 and the < $300 one. 
"""

In [16]:
# Filter the dataset for prices between 50 and $175
nyc_low = nyc_copy.loc[(nyc_copy['price'] <= 300)]
nyc_high = nyc_copy.loc[(nyc_copy['price'] > 300)]

(2163, 16)

In [17]:
print(nyc_high.shape)

(2163, 16)


In [18]:
print(nyc_low.shape)

(36348, 16)


In [26]:
nyc_high['price'].describe()

count     2163.000000
mean       561.722607
std        586.487126
min        303.000000
25%        350.000000
50%        415.000000
75%        550.000000
max      10000.000000
Name: price, dtype: float64

### Lets do the low price dataset first

In [38]:
# Trim to data of interest
vars_interest_low = pd.DataFrame(nyc_low, columns = ['price','minimum_nights', 'number_of_reviews',
                                  'reviews_per_month', 'calculated_host_listings_count', 'availability_365', 
                                 'room_type', 'neighbourhood_group', 'neighbourhood'] )

# Convert to dummy variables 
# Get dummy variables for categorical data:
# neighbourhood may get removed since it's 200+ variables
vars_final_low = pd.get_dummies(vars_interest_low, columns= ['room_type','neighbourhood_group', 'neighbourhood'], drop_first = True)

In [39]:
# Prepare for regression
# Gather predictors 
X_low = vars_final_low.drop('price', axis=1)
X_low

Unnamed: 0,minimum_nights,reviews_per_month,calculated_host_listings_count,availability_365,room_type_Private room,room_type_Shared room,neighbourhood_group_Brooklyn,neighbourhood_group_Manhattan,neighbourhood_group_Queens,neighbourhood_group_Staten Island,...,neighbourhood_Westchester Square,neighbourhood_Westerleigh,neighbourhood_Whitestone,neighbourhood_Williamsbridge,neighbourhood_Williamsburg,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodside
0,4.342945e-10,-6.777807e-01,7.781513e-01,2.562293e+00,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4.342945e-10,-4.202164e-01,3.010300e-01,2.550228e+00,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4.342945e-10,6.665180e-01,4.342945e-10,2.287802e+00,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1.000000e+00,-1.000000e+00,4.342945e-10,-9.000000e+00,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,4.771213e-01,-2.291480e-01,4.342945e-10,2.110590e+00,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48782,4.342945e-10,4.342945e-10,4.342945e-10,2.167317e+00,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
48790,4.342945e-10,4.342945e-10,7.781513e-01,2.530200e+00,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
48799,4.342945e-10,4.342945e-10,4.342945e-10,1.939519e+00,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
48805,4.342945e-10,3.010300e-01,4.342945e-10,1.602060e+00,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
# Gather predicted variable (price) and log transform it
y_low = pd.DataFrame(nyc_low, columns = ['price'])
y_low = np.log10(y_low['price'])

In [41]:
# Perform regression, get r square
regressor = LinearRegression()  
regressor.fit(X_low, y_low)

r_sq = regressor.score(X_low, y_low)
print('coefficient of determination:', r_sq)

coefficient of determination: 0.6156169244427214


In [42]:
# Predict
y_pred_low = regressor.predict(X_low)

In [43]:
# Print out model metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_low, y_pred_low))  
print('Mean Squared Error:', metrics.mean_squared_error(y_low, y_pred_low))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_low, y_pred_low)))
print('Mean Average Percent Error:', np.mean(np.abs(y_low - y_pred_low)/np.abs(y_low))) 

Mean Absolute Error: 0.11714841117788516
Mean Squared Error: 0.02286982526963669
Root Mean Squared Error: 0.1512277265240627
Mean Average Percent Error: 0.059655005399768625


In [44]:
X_train_low, X_test_low, y_train_low, y_test_low = train_test_split(X_low, y_low, test_size=0.2, random_state=0)

In [45]:
regressor = LinearRegression()  
regressor.fit(X_train_low, y_train_low) # training the algorithm

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [46]:
y_pred = regressor.predict(X_test_low)

In [47]:
print('Price mean:', np.round(np.mean(y_low), 2))  
print('Price std:', np.round(np.std(y_low), 2))
print('RMSE:', np.round(np.sqrt(metrics.mean_squared_error(y_test_low, regressor.predict(X_test_low))), 2))
print('R2 score train:', np.round(r2_score(y_train_low, regressor.predict(X_train_low), multioutput='variance_weighted'), 2))
print('R2 score test:', np.round(r2_score(y_test_low, regressor.predict(X_test_low), multioutput='variance_weighted'), 2))

Price mean: 2.0
Price std: 0.24
RMSE: 0.15
R2 score train: 0.61
R2 score test: 0.62


### Now for the high price dataset

In [48]:
# Trim to data of interest
vars_interest_high = pd.DataFrame(nyc_high, columns = ['price','minimum_nights', 'number_of_reviews',
                                  'reviews_per_month', 'calculated_host_listings_count', 'availability_365', 
                                 'room_type', 'neighbourhood_group', 'neighbourhood'] )

# Convert to dummy variables 
# Get dummy variables for categorical data:
# neighbourhood may get removed since it's 200+ variables
vars_final_high = pd.get_dummies(vars_interest_high, columns= ['room_type','neighbourhood_group', 'neighbourhood'], drop_first = True)

In [49]:
# Prepare for regression
# Gather predictors 
X_high = vars_final_high.drop('price', axis=1)
X_high

Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,room_type_Private room,room_type_Shared room,neighbourhood_group_Brooklyn,neighbourhood_group_Manhattan,neighbourhood_group_Queens,...,neighbourhood_Vinegar Hill,neighbourhood_Wakefield,neighbourhood_Washington Heights,neighbourhood_West Village,neighbourhood_Westchester Square,neighbourhood_Whitestone,neighbourhood_Williamsbridge,neighbourhood_Williamsburg,neighbourhood_Windsor Terrace,neighbourhood_Woodside
85,4.342945e-10,1.397940e+00,-6.197888e-01,4.342945e-10,0.845098,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
103,6.020600e-01,1.662758e+00,-2.596373e-01,3.010300e-01,2.385606,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
114,3.010300e-01,8.450980e-01,-1.221849e+00,6.020600e-01,2.474216,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
116,4.771213e-01,2.510545e+00,4.785665e-01,4.342945e-10,2.029384,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
121,3.010300e-01,1.204120e+00,-6.197888e-01,3.010300e-01,2.334454,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47774,6.989700e-01,3.010300e-01,3.010300e-01,4.342945e-10,2.396199,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
47927,3.010300e-01,4.342945e-10,4.342945e-10,4.342945e-10,2.539076,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
47950,4.771213e-01,4.342945e-10,4.342945e-10,4.342945e-10,2.537819,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
48118,4.771213e-01,4.771213e-01,4.771213e-01,4.342945e-10,2.537819,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
# Gather predicted variable (price) and log transform it
y_high = pd.DataFrame(nyc_high, columns = ['price'])
y_high = np.log10(y_high['price'])

In [6]:
# Perform regression, get r square
regressor = LinearRegression()  
regressor.fit(X_high, y_high)

r_sq = regressor.score(X_high, y_high)
print('coefficient of determination:', r_sq)

NameError: name 'X_high' is not defined

In [52]:
# Predict
y_pred_high = regressor.predict(X_high)

In [56]:
# Print out model metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_high, y_pred_high))  
print('Mean Squared Error:', metrics.mean_squared_error(y_high, y_pred_high))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_high, y_pred_high)))
print('Mean Average Percent Error:', np.mean(np.abs(y_high- y_pred_high)/np.abs(y_high)))

Mean Absolute Error: 0.12800798511522216
Mean Squared Error: 0.03340091318519576
Root Mean Squared Error: 0.1827591671714329
Mean Average Percent Error: 0.046055799177576386


In [57]:
X_train_high, X_test_high, y_train_high, y_test_high = train_test_split(X_high, y_high, test_size=0.2, random_state=0)

In [58]:
regressor = LinearRegression()  
regressor.fit(X_train_high, y_train_high) # training the algorithm

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [59]:
y_pred = regressor.predict(X_test_high)

In [60]:
print('Price mean:', np.round(np.mean(y_high), 2))  
print('Price std:', np.round(np.std(y_high), 2))
print('RMSE:', np.round(np.sqrt(metrics.mean_squared_error(y_test_high, regressor.predict(X_test_high))), 2))
print('R2 score train:', np.round(r2_score(y_train_high, regressor.predict(X_train_high), multioutput='variance_weighted'), 2))
print('R2 score test:', np.round(r2_score(y_test_high, regressor.predict(X_test_high), multioutput='variance_weighted'), 2))

Price mean: 2.68
Price std: 0.19
RMSE: 1249822155.52
R2 score train: 0.1
R2 score test: -5.03852950698074e+19
