In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns
%matplotlib inline

In [16]:
#read in data, and figure out the feathers affect review score rating
dflb = pd.read_csv('list_boston.csv')

In [17]:
dflb = dflb[['id','cancellation_policy','guests_included','host_response_time','host_response_rate','host_acceptance_rate','host_has_profile_pic','host_identity_verified','room_type','accommodates','bathrooms','bedrooms','beds','bed_type','price','number_of_reviews','review_scores_rating','instant_bookable','reviews_per_month','review_scores_value','review_scores_location','review_scores_communication','review_scores_checkin','review_scores_cleanliness','review_scores_accuracy']]

In [18]:
# update data become float
dflb['price'] = dflb['price'].str.replace('$','')
dflb['price'] = dflb['price'].str.replace(',','')
dflb['price'] = dflb.price.astype(float)
dflb['host_response_rate'] = dflb['host_response_rate'].str.replace('%','')
dflb['host_response_rate'] = dflb.host_response_rate.astype(float)
dflb['host_acceptance_rate'] = dflb['host_acceptance_rate'].str.replace('%','')
dflb['host_acceptance_rate'] = dflb.host_acceptance_rate.astype(float)
dflb = dflb.drop(['id'],axis=1)

In [65]:
dflb_a = dflb

In [66]:
# we need to predict the review score rating, so it should has value
dflb_a = dflb_a.dropna(subset=['review_scores_rating'])

In [67]:
dflb_a.shape[0]

2772

In [68]:
# use mean value to fill nan for int and float data
num_vars = dflb_a.select_dtypes(include=['float', 'int']).columns
for col in num_vars:
    dflb_a[col].fillna((dflb_a[col].mean()), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [69]:
Xb = dflb_a.drop(['review_scores_rating'],axis=1)
yb=dflb_a['review_scores_rating']

In [70]:
#get dummy
cat_vars = Xb.select_dtypes(include=['object']).copy().columns
for var in  cat_vars:
    Xb = pd.concat([Xb.drop(var, axis=1), pd.get_dummies(Xb[var], prefix=var, prefix_sep='_', drop_first=True)], axis=1)
    


In [71]:
Xb.sum()

guests_included                            4112.000000
host_response_rate                       265070.327586
host_acceptance_rate                     235973.017241
accommodates                               8485.000000
bathrooms                                  3367.578261
bedrooms                                   3467.756962
beds                                       4504.750542
price                                    462049.000000
number_of_reviews                         68210.000000
reviews_per_month                          5547.720000
review_scores_value                       25417.524430
review_scores_location                    26095.138306
review_scores_communication               26741.882863
review_scores_checkin                     26739.169320
review_scores_cleanliness                 25662.546638
review_scores_accuracy                    26143.745020
cancellation_policy_moderate                786.000000
cancellation_policy_strict                 1279.000000
cancellati

In [72]:
reduce_Xb = Xb.iloc[:, np.where((Xb.sum() > 50) == True)[0]]
X_train, X_test, y_train, y_test = train_test_split(reduce_Xb, yb, test_size = .30, random_state=42) 
lm_model = LinearRegression(normalize=True) # Instantiate
lm_model.fit(X_train, y_train) #Fit
        
#Predict and score the model
y_test_preds = lm_model.predict(X_test) 
"The r-squared score for the model using only quantitative variables was {} on {} values.".format(r2_score(y_test, y_test_preds), len(y_test))

'The r-squared score for the model using only quantitative variables was 0.7158221019128446 on 832 values.'

In [74]:
#see the top feathers affect review
def coef_weights(coefficients, X_train):
    '''
    INPUT:
    coefficients - the coefficients of the linear model 
    X_train - the training data, so the column names can be used
    OUTPUT:
    coefs_df - a dataframe holding the coefficient, estimate, and abs(estimate)
    
    Provides a dataframe that can be used to understand the most influential coefficients
    in a linear model by providing the coefficient estimates along with the name of the 
    variable attached to the coefficient.
    '''
    coefs_df = pd.DataFrame()
    coefs_df['est_int'] = X_train.columns
    coefs_df['coefs'] = lm_model.coef_
    coefs_df['abs_coefs'] = np.abs(lm_model.coef_)
    coefs_df = coefs_df.sort_values('abs_coefs', ascending=False)
    return coefs_df

#Use the function
coef_df = coef_weights(lm_model.coef_, X_train)

#A quick look at the top results
coef_df.head(20)

Unnamed: 0,est_int,coefs,abs_coefs
10,review_scores_value,2.967596,2.967596
14,review_scores_cleanliness,2.669329,2.669329
13,review_scores_checkin,1.641381,1.641381
12,review_scores_communication,1.620325,1.620325
15,review_scores_accuracy,1.43103,1.43103
19,host_response_time_within a day,0.906295,0.906295
22,host_has_profile_pic_t,0.749227,0.749227
11,review_scores_location,0.649955,0.649955
20,host_response_time_within a few hours,0.627914,0.627914
21,host_response_time_within an hour,0.570289,0.570289


In [88]:
#do the same thing for seattle data
dfls = pd.read_csv('list_seattle.csv')

In [89]:
dfls = dfls[['id','cancellation_policy','guests_included','host_response_time','host_response_rate','host_acceptance_rate','host_has_profile_pic','host_identity_verified','room_type','accommodates','bathrooms','bedrooms','beds','bed_type','price','number_of_reviews','review_scores_rating','instant_bookable','reviews_per_month','review_scores_value','review_scores_location','review_scores_communication','review_scores_checkin','review_scores_cleanliness','review_scores_accuracy']]

In [90]:
dfls['price'] = dfls['price'].str.replace('$','')
dfls['price'] = dfls['price'].str.replace(',','')
dfls['price'] = dfls.price.astype(float)
dfls['host_response_rate'] = dfls['host_response_rate'].str.replace('%','')
dfls['host_response_rate'] = dfls.host_response_rate.astype(float)
dfls['host_acceptance_rate'] = dfls['host_acceptance_rate'].str.replace('%','')
dfls['host_acceptance_rate'] = dfls.host_acceptance_rate.astype(float)
dfls = dfls.drop(['id'],axis=1)

In [99]:
dfls_a = dfls

In [100]:
num_vars = dfls_a.select_dtypes(include=['float', 'int']).columns
for col in num_vars:
    dfls_a[col].fillna((dfls_a[col].mean()), inplace=True)

In [101]:
Xs = dfls_a.drop(['review_scores_rating'],axis=1)
ys=dfls_a['review_scores_rating']

In [102]:
cat_vars = Xs.select_dtypes(include=['object']).copy().columns
for var in  cat_vars:
    Xs = pd.concat([Xs.drop(var, axis=1), pd.get_dummies(Xs[var], prefix=var, prefix_sep='_', drop_first=True)], axis=1)
    



In [107]:
reduce_Xs = Xs.iloc[:, np.where((Xs.sum() > 20) == True)[0]]
X_train, X_test, y_train, y_test = train_test_split(reduce_Xs, ys, test_size = .30, random_state=42) 
lm_model = LinearRegression(normalize=True) # Instantiate
lm_model.fit(X_train, y_train) #Fit
        
#Predict and score the model
y_test_preds = lm_model.predict(X_test) 
"The r-squared score for the model using only quantitative variables was {} on {} values.".format(r2_score(y_test, y_test_preds), len(y_test))

'The r-squared score for the model using only quantitative variables was 0.6188254195503238 on 1146 values.'

In [108]:
coef_df = coef_weights(lm_model.coef_, X_train)

#A quick look at the top results
coef_df.head(20)

Unnamed: 0,est_int,coefs,abs_coefs
10,review_scores_value,3.000368,3.000368
14,review_scores_cleanliness,1.947019,1.947019
15,review_scores_accuracy,1.91118,1.91118
21,host_has_profile_pic_t,-1.703535,1.703535
25,bed_type_Futon,-1.334128,1.334128
12,review_scores_communication,1.31515,1.31515
26,bed_type_Pull-out Sofa,-1.068875,1.068875
13,review_scores_checkin,0.880702,0.880702
22,host_identity_verified_t,0.694309,0.694309
11,review_scores_location,0.683894,0.683894
