In [1]:
from collections import Counter
import re
import pandas as pd 
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from category_encoders import OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import math


In [2]:
df = pd.read_csv('./data/la_listings_edited.csv')
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)


In [3]:
# Create a list of filtering conditions to apply them all at the end
cond_list=[]

### Filter listings to the most common property types:
```
['Entire apartment',
 'Entire house',
 'Private room in house',
 'Private room in apartment',
 'Entire guesthouse',
 'Entire condominium',
 'Entire guest suite',
 'Entire serviced apartment',
 'Entire bungalow',
 'Private room in condominium',
 'Shared room in house',
 'Private room in townhouse',
 'Entire townhouse',
 'Entire villa',
 'Entire loft']
 ```
 



In [4]:
# Create condition to filter out unnessesary property_types
pr_type_filter = list(pd.crosstab(df['property_type'], 'count').sort_values('count', ascending=False).head(15).index)
pr_type_condition = df['property_type'].apply(lambda x: True if x in pr_type_filter else False)
cond_list.append(pr_type_condition)

### Filter listing to the City of Los Angeles area


In [5]:
# Create condition to filter out locations beyond City of Los Angeles
location_condition = df['neighborhood'] == 'City of Los Angeles'
cond_list.append(location_condition)

### Apply conditions to the data

In [6]:
df = df[pr_type_condition][location_condition].dropna()

### Make TV and FM

In [7]:
X=df[['latitude', 'longitude','property_type','accommodates','bathrooms_text','bedrooms','beds']]
y=df['price']

### Make train/test split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

### Establish baseline estimation

In [9]:
baseline = y.mean()

### Fit Random Forest Regressor model

In [10]:
model = make_pipeline(
    OrdinalEncoder(),
    RandomForestRegressor(n_estimators=139, min_samples_split=10, max_features='log2',n_jobs=-1, random_state=42)
)
model.fit(X_train,y_train)

Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['property_type', 'bathrooms_text'],
                                mapping=[{'col': 'property_type',
                                          'data_type': dtype('O'),
                                          'mapping': Entire house                    1
Entire loft                     2
Entire apartment                3
Private room in apartment       4
Private room in house           5
Shared room in house            6
Entire guesthouse               7
Entire guest suite              8
Entire condominium              9
Entire bungalow                10
Entire townhouse               11
Private room i...
5 shared baths       23
8.5 baths            24
7 baths              25
6 baths              26
7.5 baths            27
Half-bath            28
0 shared baths       29
8.5 shared baths     30
8 baths              31
9.5 baths            32
10 baths             33
4.5 shared baths     34
0 baths              35
6

In [11]:
#Baseline MAE
print(f'Baseline MAE: {mean_absolute_error(y_test, [baseline]*len(y_test))}')
#MAE of the model
print(f'Model MAE: {mean_absolute_error(y_test, model.predict(X_test))}')

Baseline MAE: 143.04131711024746
Model MAE: 72.90024079786362


### Optimize the model

In [12]:
# dir(model.named_steps['randomforestregressor'])
# params={'randomforestregressor__n_estimators': range(20,150),
#         'randomforestregressor__min_samples_split': range(5,20),
#         'randomforestregressor__max_features': ["auto", "sqrt", "log2"]}

# search=RandomizedSearchCV(
#     estimator=model,
#     param_distributions=params,
#     n_iter=100,
#     n_jobs=-1,
#     verbose=1
# ).fit(X,y)


### search.best_params_

{'randomforestregressor__n_estimators': 139,<br>
 'randomforestregressor__min_samples_split': 10,<br>
 'randomforestregressor__max_features': 'log2'}

### Build model that utilizes vector similarity to make estimations

In [13]:
# Ordinal encoding X
encoder = OrdinalEncoder().fit(X_train)
X_train_enc = encoder.transform(X_train)
X_test_enc = encoder.transform(X_test)

In [14]:
class VectorSimilarity:
    def __init__(self):
        return
    
    # Pass training data and convert it to the instance of numpy.ndarray
    def fit(self, X, y):
        if (isinstance(X, np.ndarray)):
            self.X = X
        else:
            self.X = np.array(X)
        
        if (isinstance(y, np.ndarray)):
            self.y = y
        else:
            self.y = np.array(y)

    # Makes single prediction
    def predict(self, case, n_closest=5):
        """Returns average value of corrensponding target values of `n_closest` vectorized instances in training data"""
        
        if (not isinstance(case, np.ndarray)):
            case = np.array(case)
        # Sanity check
        assert len(case.shape)==1, '`case` parameter must be one dimensional array'
        # Init. empty containers
        distances = []
        closest = []
        # Compare `case` vector to all vectors in training data
        for i in range(len(self.X)):
            vec1, vec2 = self.X[i], case
            distance =  math.sqrt(sum((vec1-vec2)**2))
            distances.append((i,distance))
        closest = sorted(distances, key=lambda item: item[1])[:n_closest-1]
        # Get target values for `n_closest` vectorized instances in training data
        result = [self.y[item[0]] for item in closest]
        # Return average value of `result`
        return sum(result)/len(result)


    def predict_all(self, cases, n_closest=5):
        """Makes predictions for series of vectors"""
        n = n_closest
        if (not isinstance(cases, np.ndarray)):
            cases = np.array(cases)
        return np.array([self.predict(case, n_closest=n) for case in cases])



In [15]:
obj = VectorSimilarity()
obj.fit(X_train_enc, y_train)

In [16]:
predictions = obj.predict_all(X_test_enc, n_closest=30)

In [17]:
print(f'VectorSimilarity MAE: {mean_absolute_error(y_test, predictions)}')

VectorSimilarity MAE: 83.22098831122021
