In [39]:
import numpy as np
import pandas as pd
import datetime as dt

#visualization libraries
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn import preprocessing

In [40]:
import matplotlib.pyplot as plt
import seaborn as sns
 
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

In [41]:
def read_listings():
    base = './data/listings_'
    file_type = '.csv'
    lx = pd.read_csv(base + str(0) + file_type)
    return lx

In [42]:
def assign_zipcodes(listings_df):
    ### Zipcode ###
    listings_df['zipcode'] = listings_df['zipcode'].map(lambda x: str(x).replace(' ','') if x != np.nan and len(str(x)) >= 5 else '')
    listings_df['zipcode'] = listings_df['zipcode'].map(lambda y: y if y == '' or y.isdigit() else '')
    
    zipcodes_df = listings_df[['id','zipcode','longitude','latitude']]
    nazipcodes_df = zipcodes_df[zipcodes_df.zipcode == '']
    czipcodes_df = zipcodes_df[zipcodes_df.zipcode != ''].copy()
    
    #assign zipcodes from nearest geographical neighbours 
    for i ,z in nazipcodes_df.iterrows():
        lg = z.longitude
        lt = z.latitude
        czipcodes_df = czipcodes_df.assign(diff=lambda x: np.sqrt((x['longitude'] - lg)**2 + (x['latitude'] - lt)**2))
        temp = czipcodes_df.sort_values(by=['diff'], ascending=True)[:6]
        zipc = temp.zipcode.value_counts().index[0]    
        listings_df.loc[listings_df.id == z.id,['zipcode']] = zipc
    return listings_df

In [43]:
def assign_neighbourhood_from_zipcode(listings_df):
    
    listings_df.loc[:,'neighbourhood'] =  listings_df.neighbourhood_cleansed
    zipcode_df = listings_df.loc[:,['zipcode', 'neighbourhood']]
    zipcode_df['count'] = 1
    zipcode_lookup = zipcode_df.groupby(['zipcode', 'neighbourhood']).agg({'count':'sum'})
    zipcode_lookup = pd.DataFrame(zipcode_lookup.to_records())
    
    for i, r in zipcode_lookup.iterrows():
        mv = zipcode_lookup[zipcode_lookup['zipcode'] == r.zipcode]['count'].max()
        zipcode_lookup.loc[((zipcode_lookup['count'] != mv) & (zipcode_lookup['zipcode'] == r.zipcode)),'count'] = np.nan
        
    zipcode_lookup = zipcode_lookup.dropna()

    for i, r in zipcode_lookup.iterrows():
        listings_df.loc[listings_df.zipcode==r.zipcode, 'neighbourhood'] \
            = zipcode_lookup.loc[zipcode_lookup.zipcode == r.zipcode].neighbourhood.values[0]
    
    return listings_df

In [44]:
def clean_listings_df(listings_df_x):
    listings_df = listings_df_x.copy()
    listings_df = listings_df.dropna(how='all', axis = 1)
    non_unique = listings_df.apply(pd.Series.nunique)
    non_unique_cols = non_unique[non_unique == 1].index
    listings_df.drop(non_unique_cols, axis=1, inplace = True)

    listings_df['price'] = listings_df['price'].map(lambda x: float(x.lstrip('$').replace(',','')))
    
    naidx = listings_df.weekly_price.isna()
    listings_df[naidx] = listings_df.loc[naidx].assign(weekly_price = lambda x: x.price * 7)
    idx = np.logical_not(naidx)
    a = listings_df.loc[idx]
    listings_df.loc[idx, ['weekly_price']] = a['weekly_price'].map(lambda x: float(x.lstrip('$').replace(',','')))
    
    naidx = listings_df.monthly_price.isna()
    listings_df[naidx] = listings_df.loc[naidx].assign(monthly_price = lambda x: x.price * 30)
    idx = np.logical_not(naidx)
    a = listings_df.loc[idx]
    listings_df.loc[idx, ['monthly_price']] = a['monthly_price'].map(lambda x: float(x.lstrip('$').replace(',','')))
    
    listings_df.loc[:,'is_location_exact']  = listings_df.is_location_exact.apply(lambda x: 1 if str(x).strip() == 't' else 0)
    listings_df.loc[:,'name_len'] = listings_df.name.apply(lambda x: 0 if np.isnan(len(str(x).split())) else len(str(x).split()))
    listings_df.loc[:,'description_len'] = listings_df.description.apply(lambda x: 0 if np.isnan(len(str(x).split())) else len(str(x).split()))
    listings_df.loc[:,'space_len'] = listings_df.space.apply(lambda x: 0 if np.isnan(len(str(x).split())) else len(str(x).split()))
    listings_df.loc[:,'summary_len'] = listings_df.summary.apply(lambda x: 0 if np.isnan(len(str(x).split())) else len(str(x).split()))
    listings_df.drop(['name', 'summary', 'space', 'description'],axis=1, inplace=True)

    #creating new columns
    listings_df['has_neighborhood_overview'] = ~listings_df['neighborhood_overview'].isna()
    listings_df['has_notes'] = ~listings_df['notes'].isna()
    listings_df['has_transit'] = ~listings_df['transit'].isna()
    listings_df['has_access'] = ~listings_df['access'].isna()
    listings_df['has_interaction'] = ~listings_df['interaction'].isna()
    listings_df['has_house_rules'] = ~listings_df['house_rules'].isna()
    listings_df.loc[:,'extra_people'] = listings_df['extra_people'].map(lambda x: float(str(x).lstrip('$').replace(',','')))
    listings_df.loc[:,'instant_bookable']  = listings_df.instant_bookable.apply(lambda x: 1 if str(x).strip() == 't' else 0)
    listings_df.loc[:,'require_guest_phone_verification']  = listings_df.require_guest_phone_verification.apply(lambda x: 1 if str(x).strip() == 't' else 0)
    listings_df.loc[:,'require_guest_profile_picture']  = listings_df.require_guest_profile_picture.apply(lambda x: 1 if str(x).strip() == 't' else 0)
    listings_df.drop(['neighborhood_overview','notes','transit','access','interaction','house_rules'], axis = 1,inplace = True)

    listings_df['has_host_about'] = ~listings_df['host_about'].isna()
    listings_df['host_about_len'] = ~listings_df.host_about.apply(lambda x: 0 if np.isnan(len(str(x).split())) else len(str(x).split()))
    listings_df['has_host_location'] = ~listings_df['host_location'].isna()
    listings_df['has_host_response_rate'] = ~listings_df['host_response_rate'].isna()
    listings_df['host_since'] = listings_df.host_since.map(lambda x: x if x is None or str(x) == 'nan' else (dt.datetime.strptime(str(x), '%Y-%m-%d') - dt.datetime.today()).days)
    listings_df.loc[:,'host_is_superhost']  = listings_df.host_is_superhost.apply(lambda x: 1 if str(x).strip() == 't' else 0)
    listings_df.loc[:,'host_has_profile_pic']  = listings_df.host_has_profile_pic.apply(lambda x: 1 if str(x).strip() == 't' else 0)
    listings_df.loc[:,'host_identity_verified']  = listings_df.host_identity_verified.apply(lambda x: 1 if str(x).strip() == 't' else 0)
    listings_df.loc[:,'host_verification_count']  = listings_df.host_verifications.apply(lambda x: 0 if str(x).strip('[]') == '' else len(str(x).strip('[]').split(',')))
    listings_df.drop(['host_about', 'host_location', 'host_verifications'],axis=1, inplace=True)
    listings_df.drop(['host_listings_count', 'host_total_listings_count', 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms'],axis=1, inplace=True)
    listings_df.dropna(subset =['host_name'], inplace = True)
    listings_df.drop(['host_url','host_name','host_thumbnail_url','host_picture_url','host_neighbourhood'],axis=1, inplace=True)
    
    #response rate
    listings_df['host_response_rate'].fillna('0%', inplace = True)
    listings_df['host_response_rate'] = listings_df['host_response_rate'].map(lambda x: int(x.rstrip('%')))
    
    ### Bedrooms
    listings_df['bathrooms'].fillna(listings_df['bathrooms'].mean(), inplace=True)
    listings_df['bedrooms'].fillna(listings_df['bedrooms'].mean(), inplace=True)
    listings_df['beds'].fillna(listings_df['beds'].mean(), inplace=True)
    
    
    listings_df['first_review'] = listings_df.first_review.map(lambda x: x if x is None or str(x) == 'nan' else (dt.datetime.strptime(str(x), '%Y-%m-%d') - dt.datetime.today()).days)
    listings_df['last_review'] = listings_df.last_review.map(lambda x: x if x is None or str(x) == 'nan' else (dt.datetime.strptime(str(x), '%Y-%m-%d') - dt.datetime.today()).days)

    listings_df['first_review'].fillna(300,inplace=True)
    listings_df['last_review'].fillna(300,inplace=True)
    
    listings_df['has_reviews'] = ~listings_df['number_of_reviews'].isna()
    listings_df['reviews_per_month'].fillna(0,inplace=True)
    listings_df['minimum_nights_avg_ntm'].fillna(0,inplace=True)
    listings_df['maximum_nights_avg_ntm'].fillna(0,inplace=True)
    listings_df['number_of_reviews_ltm'].fillna(0,inplace=True)
    
    listings_df['review_scores_rating'].fillna(0,inplace=True)
    listings_df['review_scores_accuracy'].fillna(0,inplace=True)
    listings_df['review_scores_cleanliness'].fillna(0,inplace=True)
    listings_df['review_scores_communication'].fillna(0,inplace=True)
    listings_df['review_scores_checkin'].fillna(0,inplace=True)
    listings_df['review_scores_location'].fillna(0,inplace=True)
    listings_df['review_scores_value'].fillna(0,inplace=True)
    
    listings_df['security_deposit'] = listings_df.security_deposit.map(lambda x: x if x is None else float(str(x).strip("', /\n$").replace(',','')))
    listings_df['cleaning_fee'] = listings_df.cleaning_fee.map(lambda x: x if x is None else float(str(x).strip("', /\n$").replace(',','')))
    listings_df['security_deposit'].fillna(0, inplace = True)
    listings_df['cleaning_fee'].fillna(0, inplace = True)
    listings_df.loc[:,'amenities_no'] =listings_df.amenities.apply(lambda x: 0 if str(x).strip('[]') == '' else len(str(x).strip('[]').split(',')))
    listings_df.drop(['amenities'],axis=1, inplace=True)
    
    #dropping columns
    #all listings are in same city
    listings_df.drop(['market', 'state', 'city', 'neighbourhood_cleansed','zipcode'], axis=1, inplace=True)
    #irrelevant system information
    listings_df.drop(['listing_url', 'host_id'], axis=1, inplace=True)
    listings_df.drop(['picture_url'], axis=1, inplace=True)
    #in this project zipcode will be the minimum granulity
    listings_df.drop(['street', 'smart_location'], axis=1, inplace=True)
    #cleaning future information
    listings_df.drop([ 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights'],axis=1, inplace=True)
    listings_df.drop(['calendar_updated', 'availability_30', 'availability_60', 'availability_90', 'availability_365'],axis=1, inplace=True)
    #listings_df = populate_cols(listings_df, 'zipcode')
    listings_df.rename(columns={'id':'listing_id'},inplace = True)
    #more than %90 is Nan
    listings_df.drop('square_feet',axis=1, inplace=True)
    return listings_df

In [45]:
def coef_weights_model(model, coefficients, X_train):
    '''
    INPUT:
    coefficients - the coefficients of the linear model 
    X_train - the training data, so the column names can be used
    OUTPUT:
    coefs_df - a dataframe holding the coefficient, estimate, and abs(estimate)
    
    Provides a dataframe that can be used to understand the most influential coefficients
    in a linear model by providing the coefficient estimates along with the name of the 
    variable attached to the coefficient.
    '''
    coefs_df = pd.DataFrame()
    coefs_df['est_int'] = X_train.columns
    coefs_df['coefs'] = model.coef_
    coefs_df['abs_coefs'] = np.abs(model.coef_)
    coefs_df = coefs_df.sort_values('abs_coefs', ascending=False)
    return coefs_df

In [46]:
def prepare_x_y(df, col_to_predict, cols_to_drop=[]):
    y = df.loc[:,col_to_predict]
    X = df.drop([col_to_predict],axis=1)
    X.drop(cols_to_drop ,axis=1, inplace=True)
    cat_cols = X.select_dtypes(include=['object']).columns
    X_cat = pd.get_dummies(X[cat_cols], prefix=cat_cols, prefix_sep='_', drop_first=True)
    X.drop(cat_cols, axis=1, inplace=True)
    X = pd.concat([X, X_cat], axis=1)
    return X, y

In [47]:
listings_df = read_listings()
listings_df = assign_zipcodes(listings_df)
listings_df = assign_neighbourhood_from_zipcode(listings_df)
listings_df = clean_listings_df(listings_df)

In [48]:
q_low = listings_df["price"].quantile(0.03)
q_hi  = listings_df["price"].quantile(0.97)

listings_df = listings_df[(listings_df["price"] < q_hi) & (listings_df["price"] > q_low)]

In [49]:
result2 = listings_df.copy()
result2.drop(['listing_id'],axis=1, inplace=True)

In [50]:
cols_to_drop = ['weekly_price', 'monthly_price', 'longitude', 'latitude']
col_to_predict = 'price'

In [51]:
X , y = prepare_x_y(result2, 'price', cols_to_drop)

In [52]:
X = X.applymap(lambda x: np.uint8(1) if x == True else (np.uint8(0) if x == False else x))

In [53]:
X.describe()

Unnamed: 0,host_since,host_response_rate,host_is_superhost,host_has_profile_pic,host_identity_verified,is_location_exact,accommodates,bathrooms,bedrooms,beds,...,room_type_Hotel room,room_type_Private room,room_type_Shared room,bed_type_Couch,bed_type_Futon,bed_type_Pull-out Sofa,bed_type_Real Bed,cancellation_policy_moderate,cancellation_policy_strict_14_with_grace_period,cancellation_policy_super_strict_60
count,7185.0,7185.0,7185.0,7185.0,7185.0,7185.0,7185.0,7185.0,7185.0,7185.0,...,7185.0,7185.0,7185.0,7185.0,7185.0,7185.0,7185.0,7185.0,7185.0,7185.0
mean,-1691.353653,45.143633,0.113013,0.997077,0.367293,0.718024,3.153793,1.159944,1.383056,1.990679,...,0.007098,0.138344,0.00501,0.00167,0.00167,0.004732,0.991093,0.263326,0.255393,0.000696
std,698.8179,48.006075,0.316631,0.053987,0.482101,0.449993,1.589439,0.370296,0.990353,1.439015,...,0.083957,0.345284,0.070612,0.040836,0.040836,0.068632,0.093965,0.440469,0.436112,0.026372
min,-4052.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-2155.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,-1737.0,0.0,0.0,1.0,0.0,1.0,3.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,-1247.0,100.0,0.0,1.0,1.0,1.0,4.0,1.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
max,-200.0,100.0,1.0,1.0,1.0,1.0,16.0,5.0,10.0,21.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [54]:
X_back = X.copy()

In [55]:
X = X_back.copy()

In [56]:
X_s = X.copy()

In [57]:
from sklearn.preprocessing import StandardScaler
sc_result = StandardScaler()
X_s[:] = sc_result.fit_transform(X)

In [58]:
# remove constant features
constant_cols = [
    col for col in X_s.columns if X_s[col].std() == 0
]
 
X.drop(labels=constant_cols, axis=1, inplace=True)
X_s.drop(labels=constant_cols, axis=1, inplace=True)

X.shape

(7185, 100)

In [59]:
# remove quasi-constant features
sel = VarianceThreshold(
    threshold=0.01)  # 0.1 indicates 99% of observations approximately
 
sel.fit(X_s)  # fit finds the features with low variance

sum(sel.get_support()) # how many not quasi-constant?

100

In [60]:
cols = X.columns[sel.get_support()]
X = sel.transform(X)
X = pd.DataFrame(X)
X.columns = cols

In [61]:
X.head()

Unnamed: 0,host_since,host_response_rate,host_is_superhost,host_has_profile_pic,host_identity_verified,is_location_exact,accommodates,bathrooms,bedrooms,beds,...,room_type_Hotel room,room_type_Private room,room_type_Shared room,bed_type_Couch,bed_type_Futon,bed_type_Pull-out Sofa,bed_type_Real Bed,cancellation_policy_moderate,cancellation_policy_strict_14_with_grace_period,cancellation_policy_super_strict_60
0,-3226.0,0.0,1.0,1.0,1.0,0.0,2.0,1.0,1.0,2.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,-3215.0,63.0,0.0,1.0,0.0,0.0,3.0,1.0,2.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,-3205.0,100.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-3198.0,100.0,0.0,1.0,0.0,1.0,4.0,1.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,-3154.0,50.0,0.0,1.0,0.0,1.0,2.0,1.0,1.0,2.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [62]:
# find and remove correlated features
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                print(str(i) + ' ' + str(j) + ' ' + colname + ' ' + corr_matrix.columns[j])
                col_corr.add(colname)
    return col_corr
 
corr_features = correlation(X, 0.95)
print('correlated features: ', len(set(corr_features)) )

16 14 minimum_nights_avg_ntm minimum_nights
17 15 maximum_nights_avg_ntm maximum_nights
23 22 review_scores_accuracy review_scores_rating
24 22 review_scores_cleanliness review_scores_rating
24 23 review_scores_cleanliness review_scores_accuracy
25 22 review_scores_checkin review_scores_rating
25 23 review_scores_checkin review_scores_accuracy
25 24 review_scores_checkin review_scores_cleanliness
26 22 review_scores_communication review_scores_rating
26 23 review_scores_communication review_scores_accuracy
26 24 review_scores_communication review_scores_cleanliness
26 25 review_scores_communication review_scores_checkin
27 22 review_scores_location review_scores_rating
27 23 review_scores_location review_scores_accuracy
27 24 review_scores_location review_scores_cleanliness
27 25 review_scores_location review_scores_checkin
27 26 review_scores_location review_scores_communication
28 22 review_scores_value review_scores_rating
28 23 review_scores_value review_scores_accuracy
28 24 revie

In [63]:
X.drop(labels=corr_features, axis=1, inplace=True)

In [64]:
X.shape

(7185, 91)

In [65]:
X_back2 = X.copy()

In [66]:
X = X_back2.copy()

In [67]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [68]:
from sklearn.preprocessing import StandardScaler
sc_result = StandardScaler()

In [69]:
X_train[:] = sc_result.fit_transform(X_train)
X_test[:] = sc_result.fit_transform(X_test)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.loc._setitem_with_indexer(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._setitem_slice(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.o

In [70]:
y_train[:] = sc_result.fit_transform(y_train.values.reshape(-1,1))[:,0]
y_test[:] = sc_result.fit_transform(y_test.values.reshape(-1,1))[:,0]


In [71]:
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from math import sqrt

In [72]:
#Elastic Net
model_enet = ElasticNet(alpha = 0.002, max_iter=10000, tol=0.0001)
model_enet.fit(X_train, y_train) 
pred_train_enet= model_enet.predict(X_train)
print(np.sqrt(mean_squared_error(y_train,pred_train_enet)))
print(r2_score(y_train, pred_train_enet))

pred_test_enet= model_enet.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,pred_test_enet)))
print(r2_score(y_test, pred_test_enet))

0.7380948723160664
0.4552159594607297
0.7325358953590735
0.4633911620104806


In [73]:
coef_df = coef_weights_model(model_enet, model_enet.coef_, X_train)

In [74]:
coef_df.head(50)

Unnamed: 0,est_int,coefs,abs_coefs
54,neighbourhood_Södermalms,0.329729,0.329729
49,neighbourhood_Norrmalms,0.266659,0.266659
8,bedrooms,0.256079,0.256079
6,accommodates,0.236376,0.236376
56,neighbourhood_Östermalms,0.223443,0.223443
48,neighbourhood_Kungsholmens,0.173914,0.173914
82,room_type_Private room,-0.166206,0.166206
1,host_response_rate,0.089375,0.089375
24,calculated_host_listings_count,0.080239,0.080239
7,bathrooms,0.079955,0.079955


In [75]:
from sklearn import model_selection
seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)

scoring = 'r2'
results = model_selection.cross_val_score(model_enet, X_train, y_train, cv=kfold, scoring=scoring)
print("train data r2 mean score: " + str(results.mean()) + "std: " + str(results.std()))
results = model_selection.cross_val_score(model_enet, X_test, y_test, cv=kfold, scoring=scoring)
print("test r2 mean score: " + str(results.mean()) + "std: " + str(results.std()))



train data r2 mean score: 0.43117217250600853std: 0.03137614781886902
test r2 mean score: 0.44355446657932107std: 0.06461177588532493
