My outline for this in depth analysis was to try out different models, see which model has the best CV score, tune the hyperparameters of the top performing models, and see the final result.

However, I see that the CV scores I'm getting are currently really low (some even have negative cv scores which I read only happens when you have a really bad model :///////). I tried it on two different metrics (my target/dependent variable): plainly predicting the number of cancellations and predicting the number of cancellations divided by the number of reviews.

In [199]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
import scipy.stats as stats

In [200]:
#data cleaning and wrangling

reviews=pd.read_csv('/Users/anna/Downloads/reviews.csv.gz',compression='gzip').dropna()
listings=pd.read_csv('/Users/anna/Downloads/listings (1).csv',error_bad_lines=False)
neighborhoods=pd.read_csv('/Users/anna/Downloads/neighbourhoods.csv',error_bad_lines=False)
crime=pd.read_csv('/Users/anna/Downloads/crime rates.csv',error_bad_lines=False).dropna()

inactive=listings.loc[listings.availability_365==365]
inactive.set_index('id',inplace=True)
listings.set_index('id',inplace=True)
new_listings=listings.drop(inactive.index,axis=0)

autopost_1=reviews[reviews.comments.str.contains('The host canceled my reservation')]
autopost_2=reviews[reviews.comments.str.contains('This is an automated posting')]
autopost=pd.concat([autopost_1,autopost_2], axis=0).reset_index()


def days_count(text):
    if 'the day before' in text:
        return 1
    else:
        return int(re.findall('\d+', text)[0])
    
total_days=pd.DataFrame(autopost.comments.apply(days_count))

auto=pd.merge(autopost,total_days,how='left',left_index=True,right_index=True)

avg_days=pd.DataFrame(auto[['listing_id','comments_y']].groupby('listing_id')['comments_y'].apply(np.mean).apply(round))


canceled=autopost.listing_id.value_counts()
num_cancel=canceled.to_frame(name='cancellations')
new_listings['num_cancellations']=num_cancel

booked=365-new_listings.availability_365
booked=booked.to_frame('days booked')
df=new_listings.merge(booked, how='left',left_index=True,right_index=True)
df['days_cancelled_avg']=avg_days
df=df[['neighbourhood','room_type','price','minimum_nights','number_of_reviews'\
       ,'calculated_host_listings_count','days booked','num_cancellations','days_cancelled_avg']]
df['num_cancellations']=df.num_cancellations.fillna(0)
df['days_cancelled_avg']=df.days_cancelled_avg.fillna(0)


conv = lambda x: float(x.replace(',',''))
crime['Number_of_offences']=crime.Number_of_offences.apply(conv)
all_crimes=crime[crime['Offences']=='All recorded offences']

all_crimes=all_crimes[all_crimes['Borough']!='Inner London']
all_crimes=all_crimes[all_crimes['Borough']!='England and Wales']
all_crimes=all_crimes[all_crimes['Borough']!='Met Police Area']
all_crimes=all_crimes[all_crimes['Borough']!='Outer London']
all_crimes=all_crimes[all_crimes['Borough']!='Heathrow']

conv_fl = lambda x: int(x.replace('-',''))
year = all_crimes.Year.apply(conv_fl)
all_crimes['Year']=year
latest=all_crimes[(all_crimes['Year'])>=201112]
df_main=pd.merge(df,latest.groupby('Borough').mean(),left_on='neighbourhood',right_on='Borough',how='left').drop(columns=['Year','Number_of_offences'])


In [201]:
df_metrics=df_main
df_metrics['cancel_review']=round((df.num_cancellations/df.number_of_reviews),4)
df_metrics['cancel_review']=df_metrics.cancel_review.fillna(0)

In [202]:
#create dummies

categories=['price','neighbourhood','room_type','minimum_nights','number_of_reviews', 'days booked', 'days_cancelled_avg', 'Rate']

for category in categories:
    df_main[category]=df_main[category].astype('category')
    
new_df=pd.get_dummies(df_main,drop_first=True)

X=new_df.drop(columns=['calculated_host_listings_count', 'num_cancellations'])
take_log = lambda x: np.log(x) if x>0 else 0
y=new_df.num_cancellations.apply(take_log)

## Linear Regression

In [203]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

reg = LinearRegression()
reg.fit(X_train, y_train)

cv_scores= cross_val_score(reg, X_train, y_train, cv=5,scoring='r2')


print('Linear Regression Cross-Validation Scores:\n',cv_scores)
print("Average Score on 5-Folds: %.2f%%" % cv_scores.mean())


Linear Regression Cross-Validation Scores:
 [-6.57263634e+19 -6.48071599e+22 -2.91565127e+16 -1.55813758e+20
 -1.96186019e+20]
Average Score on 5-Folds: -13044983047799491788800.00%


## Ridge Regression

In [204]:
print('scores:\n',cross_val_scores)

scores:
 [ -2.27916476  -5.85067312  -0.88889237 -11.94086785  -8.99423917]


In [205]:
from sklearn.linear_model import Ridge

ridge = Ridge(random_state=42)
ridge.fit(X_train, y_train)

cv_scores = cross_val_score(ridge, X_train, y_train, cv=5, scoring="r2")*100

print('Ridge Regression Cross-Validation Scores:\n',cv_scores)
print("Average Score on 5-Folds: %.2f%%" % cv_scores.mean())


Ridge Regression Cross-Validation Scores:
 [20.62296048 21.83250404 19.71058208 21.38504286 21.93898812]
Average Score on 5-Folds: 21.10%


## Lasso Regression

In [206]:
from sklearn.linear_model import Lasso

lasso = Lasso(random_state=42)
lasso.fit(X_train, y_train)

cv_scores = cross_val_score(lasso, X_train, y_train, cv=5, scoring="r2")*100



print('Lasso Regression Cross-Validation Scores:\n',cv_scores)
print("Average Score on 5-Folds: %.2f%%" % cv_scores.mean())

Lasso Regression Cross-Validation Scores:
 [-0.00920381 -0.00270682 -0.01337011 -0.00737041 -0.00225126]
Average Score on 5-Folds: -0.01%


## Decision Trees

In [207]:
from sklearn import tree

trees = tree.DecisionTreeRegressor(max_depth=6, random_state=42)
trees.fit(X_train, y_train)

from sklearn.ensemble import BaggingRegressor

bag = BaggingRegressor(base_estimator=trees, random_state=42)
bag.fit(X_train, np.ravel(y_train))

cv_scores = cross_val_score(bag, X_train, np.ravel(y_train), cv=5, scoring="r2")*100

print('Decision Trees Cross-Validation Scores:\n',cv_scores)
print("Average Score on 5-Folds: %.2f%%" % cv_scores.mean())

Decision Trees Cross-Validation Scores:
 [4.13520527 5.8091225  3.94495879 6.02381468 3.8016622 ]
Average Score on 5-Folds: 4.74%


## Random Forest

In [239]:
from sklearn.ensemble import RandomForestRegressor

forest = RandomForestRegressor(random_state=42)
forest.fit(X_train, np.ravel(y_train))

cross_val_scores = cross_val_score(forest, X_train, np.ravel(y_train), cv=5, scoring="r2")*100


print('Random Forest Cross-Validation Scores:\n',cv_scores)
print("Average Score on 5-Folds: %.2f%%" % cv_scores.mean())



Random Forest Cross-Validation Scores:
 [-15.73347843  -4.53792521  -2.74621846 -45.21973406 -23.41130515]
Average Score on 5-Folds: -18.33%


## Gradient Boost 

In [208]:

from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_train, np.ravel(y_train))

cv_scores = cross_val_score(gbr, X_train, np.ravel(y_train),cv=5, scoring="r2")*100

print('Gradient Boost Cross-Validation Scores:\n',cv_scores)
print("Average Score on 5-Folds: %.2f%%" % cv_scores.mean())

Gradient Boost Cross-Validation Scores:
 [13.05191473 13.67639528 12.67413229 14.18105751 12.40067699]
Average Score on 5-Folds: 13.20%


# Second Metric: (number of cancellations)/(number of reviews)

In [219]:
#create dummies

categories2=['price','neighbourhood','room_type','minimum_nights', 'days booked', 'days_cancelled_avg', 'Rate']

for category in categories2:
    df_metrics[category]=df_metrics[category].astype('category')
    
new_df2=pd.get_dummies(df_metrics,drop_first=True)

In [None]:
#create dummies

categories2=['price','neighbourhood','room_type','minimum_nights', 'days booked', 'days_cancelled_avg', 'Rate']

for category in categories2:
    df_metrics[category]=df_metrics[category].astype('category')
    
new_df2=pd.get_dummies(df_metrics,drop_first=True)

X2=new_df2.drop(columns=['calculated_host_listings_count', 'num_cancellations','cancel_review'])
y2=new_df2.cancel_review.apply(take_log)

## Linear Regression

In [230]:

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3, random_state=42)

reg2 = LinearRegression()
reg2.fit(X2_train, y2_train)

cv_scores= cross_val_score(reg2, X2_train, y2_train, cv=5,scoring='r2')


print('Linear Regression Cross-Validation Scores:\n',cv_scores)
print("Average Score on 10-Folds: %.2f%%" % cv_scores.mean())

Linear Regression Cross-Validation Scores:
 [-3.86344577e+18 -6.14419061e+19 -7.71345512e+19 -1.49236522e+20
 -4.46327860e+20]
Average Score on 10-Folds: -147600856945668063232.00%


## Ridge Regression

In [225]:

ridge2 = Ridge(random_state=42)
ridge2.fit(X_train, y_train)

cv_scores = cross_val_score(ridge2, X2_train, y2_train, \
                                   cv=5, scoring="r2")*100

print('Ridge Regression Cross-Validation Scores:\n',cv_scores)
print("Average Score on 5-Folds: %.2f%%" % cv_scores.mean())

Ridge Regression Cross-Validation Scores:
 [-2.66137636 -1.01772156 -1.77384085 -2.41347602 -2.95032144]
Average Score on 5-Folds: -2.16%


## Lasso Regression

In [226]:
from sklearn.linear_model import Lasso

lasso2 = Lasso(random_state=42)
lasso2.fit(X2_train, y2_train)

cv_scores = cross_val_score(lasso2, X2_train, y2_train, cv=5, scoring="r2")*100

print('Lasso Regression Cross-Validation Scores:\n',cv_scores)
print("Average Score on 5-Folds: %.2f%%" % cv_scores.mean())

Lasso Regression Cross-Validation Scores:
 [-0.01051867 -0.00491009 -0.00147286 -0.00111656 -0.00044051]
Average Score on 5-Folds: -0.00%


## Decision Trees

In [235]:
from sklearn import tree

trees2 = tree.DecisionTreeRegressor(max_depth=6, random_state=42)
trees2.fit(X_train, y_train)

from sklearn.ensemble import BaggingRegressor

bag2 = BaggingRegressor(base_estimator=trees, \
                       random_state=42)
bag2.fit(X_train, np.ravel(y_train))

cv_scores = cross_val_score(bag2, X2_train, np.ravel(y2_train),
                                   cv=5, scoring="r2")*100

print('Decision Trees Cross-Validation Scores:\n',cv_scores)
print("Average Score on 5-Folds: %.2f%%" % cv_scores.mean())

Decision Trees Cross-Validation Scores:
 [-4.47654316 -1.69383502 -1.1900655  -5.30567949 -6.22343193]
Average Score on 5-Folds: -3.78%


## Random Forest 

In [237]:
from sklearn.ensemble import RandomForestRegressor

forest2 = RandomForestRegressor(random_state=42)
forest2.fit(X2_train, np.ravel(y_train))

cv_scores = cross_val_score(forest2, X2_train, np.ravel(y2_train), cv=5, scoring="r2")*100

print('Random Forest Cross-Validation Scores:\n',cv_scores)
print("Average Score on 5-Folds: %.2f%%" % cv_scores.mean())



Random Forest Cross-Validation Scores:
 [-15.73347843  -4.53792521  -2.74621846 -45.21973406 -23.41130515]
Average Score on 5-Folds: -18.33%


## Gradient Boost

In [228]:
from sklearn.ensemble import GradientBoostingRegressor

gbr2 = GradientBoostingRegressor(random_state=42)
gbr2.fit(X2_train, np.ravel(y_train))

cross_val_scores = cross_val_score(gbr, X2_train, np.ravel(y2_train), cv=5, scoring="r2")*100

print('Gradient Boost Cross-Validation Scores:\n',cv_scores)
print("Average Score on 5-Folds: %.2f%%" % cv_scores.mean())

Gradient Boost Cross-Validation Scores:
 [-0.01051867 -0.00491009 -0.00147286 -0.00111656 -0.00044051]
Average Score on 5-Folds: -0.00%
