In [124]:
from collections import Counter
import re
import pandas as pd 
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from category_encoders import OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import math


In [126]:
df = pd.read_csv('./data/la_listings_edited.csv')
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)


In [128]:
# Create a list of filtering conditions to apply them all at the end
cond_list=[]

### Filter listings to the most common property types:
```
['Entire apartment',
 'Entire house',
 'Private room in house',
 'Private room in apartment',
 'Entire guesthouse',
 'Entire condominium',
 'Entire guest suite',
 'Entire serviced apartment',
 'Entire bungalow',
 'Private room in condominium',
 'Shared room in house',
 'Private room in townhouse',
 'Entire townhouse',
 'Entire villa',
 'Entire loft']
 ```
 



In [130]:
# Create condition to filter out unnessesary property_types
pr_type_filter = list(pd.crosstab(df['property_type'], 'count').sort_values('count', ascending=False).head(15).index)
pr_type_condition = df['property_type'].apply(lambda x: True if x in pr_type_filter else False)
cond_list.append(pr_type_condition)

### Filter listing to the City of Los Angeles area


In [132]:
# Create condition to filter out locations beyond City of Los Angeles
location_condition = df['neighborhood'] == 'City of Los Angeles'
cond_list.append(location_condition)

### Apply conditions to the data

In [134]:
df = df[pr_type_condition][location_condition].dropna()

### Make TV and FM

In [136]:
X=df[['latitude', 'longitude','property_type','accommodates','bathrooms_text','bedrooms','beds']]
y=df['price']

### Make train/test split

In [137]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

### Establish baseline estimation

In [138]:
baseline = y.mean()

### Fit Random Forest Regressor model

In [139]:
model = make_pipeline(
    OrdinalEncoder(),
    RandomForestRegressor(n_estimators=82, min_samples_split=8, max_features='log2',n_jobs=-1, random_state=42)
)
model.fit(X_train,y_train)

Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['property_type', 'bathrooms_text'],
                                mapping=[{'col': 'property_type',
                                          'data_type': dtype('O'),
                                          'mapping': Entire house                    1
Entire loft                     2
Entire apartment                3
Private room in apartment       4
Private room in house           5
Shared room in house            6
Entire guesthouse               7
Entire guest suite              8
Entire condominium              9
Entire bungalow                10
Entire townhouse               11
Private room i...
5 shared baths       23
8.5 baths            24
7 baths              25
6 baths              26
7.5 baths            27
Half-bath            28
0 shared baths       29
8.5 shared baths     30
8 baths              31
9.5 baths            32
10 baths             33
4.5 shared baths     34
0 baths              35
6

In [140]:
#Baseline MAE
print(f'Baseline MAE: {mean_absolute_error(y_test, [baseline]*len(y_test))}')
#MAE of the model
print(f'Model MAE: {mean_absolute_error(y_test, model.predict(X_test))}')

Baseline MAE: 143.04131711024746
Model MAE: 72.64348539014685


### Optimize the model

In [141]:
# dir(model.named_steps['randomforestregressor'])
# params={'randomforestregressor__n_estimators': range(20,150),
#         'randomforestregressor__min_samples_split': range(5,20),
#         'randomforestregressor__max_features': ["auto", "sqrt", "log2"]}

# search=RandomizedSearchCV(
#     estimator=model,
#     param_distributions=params,
#     n_iter=100,
#     n_jobs=-1,
#     verbose=1
# ).fit(X,y)


### search.best_params_

{'randomforestregressor__n_estimators': 82,<br>
 'randomforestregressor__min_samples_split': 8,<br>
 'randomforestregressor__max_features': 'log2'}

### Build model that utilizes vector similarity to make estimations

In [142]:
# Ordinal encoding X
encoder = OrdinalEncoder().fit(X)
X_enc = encoder.transform(X)

In [143]:
def vector_sim_predict(case, X, y_true, n_closest=5):
    closest = []
    result = []
    for i in range(len(X)):
        vec1, vec2 = X[i], case
        distance =  math.sqrt(sum((vec1-vec2)**2))
        closest.append((i,distance))
    closest = sorted(closest, key=lambda item: item[1])[:n_closest-1]
    for item in closest:
        result.append(y_true[item[0]])
    
    return sum(result)/len(result)

In [144]:
X_enc

Unnamed: 0,latitude,longitude,property_type,accommodates,bathrooms_text,bedrooms,beds
1,34.09768,-118.34602,1,1,1,1.0,1.0
6,33.98750,-118.43200,2,4,2,2.0,2.0
7,34.09521,-118.34801,1,1,1,1.0,2.0
9,34.11543,-118.26090,3,4,2,1.0,2.0
11,33.99638,-118.47734,1,1,3,1.0,1.0
...,...,...,...,...,...,...,...
30959,34.14580,-118.40546,5,6,4,2.0,3.0
30964,34.23760,-118.43913,2,3,2,2.0,2.0
30987,34.10187,-118.36438,2,6,4,3.0,3.0
31089,34.10659,-118.33817,5,2,2,1.0,1.0


In [156]:
vector_sim_predict(np.array(X_enc)[3], np.array(X_enc), np.array(y), n_closest=4)

118.33333333333333

In [146]:
y.head(10)

1      71.0
6     125.0
7      73.0
9      89.0
11     85.0
13     74.0
15    195.0
17    155.0
19    289.0
20    425.0
Name: price, dtype: float64