This notebook models the data using a gradient boosting machine (LightGBM).

In [None]:
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import lightgbm as lgb
import sklearn
from sklearn import metrics
from data_utils import load_dataset, core_metrics,map_model

# Make 'plotly_dark' the default template for plotly figures created after
# this point.
plotly.io.templates.default = 'plotly_dark'

In [None]:
# load_dataset is imported from data_utils; its four return values are
# unpacked into the names the later cells use as train/test features and
# targets (presumably already split — confirm in data_utils).
X_train, X_test, y_train, y_test = load_dataset()

In [None]:
# Train a booster through the native LightGBM API, evaluating MAE on both the
# training and the test split while boosting.
# NOTE(review): the verbose_eval / evals_result keyword arguments of
# lgb.train were deprecated in LightGBM 3.3 and removed in 4.0 in favour of
# callbacks — confirm the installed LightGBM version before upgrading.

# free_raw_data=False asks LightGBM to keep the raw training data attached
train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False)
test_data = lgb.Dataset(X_test, label=y_test)

params = {
    'objective': 'regression',
    'metric': 'mae',
    'learning_rate': 0.05,
    'num_leaves': 301,
    'max_depth': 30,
    'lambda_l2': 0.03,
}

# lgb.train records the metric history of each validation set in here
evals_result = {}

model = lgb.train(
    params,
    train_data,
    num_boost_round=200,
    valid_names=['Train', 'Test'],
    valid_sets=[train_data, test_data],
    evals_result=evals_result,
    verbose_eval=20,
)

# learning curves for both validation sets
lgb.plot_metric(evals_result)
y_pred = model.predict(X_test)

In [None]:
# Same model through the scikit-learn wrapper, with a randomized search over
# the tree-shape and L2-regularisation hyperparameters.
from sklearn.model_selection import RandomizedSearchCV

lg = lgb.LGBMRegressor(num_leaves=192,learning_rate=0.05,n_estimators=200,objective='l2',silent=False)

# candidate values sampled by the search (6 * 4 * 4 = 96 combinations)
param_dist = {'num_leaves': [31,51,101,151,201,301],
              'max_depth': [10,20,30,-1],
              'lambda_l2': [0.01,0.03,0.1,0.3]
             }

# If scoring is not provided, the estimator's own score method is used.
# NOTE(review): the search optimises *median* absolute error, while the value
# reported at the end of this cell is the *mean* absolute error — confirm the
# mismatch is intended.
cv = RandomizedSearchCV(lg,param_dist,n_iter=25,scoring='neg_median_absolute_error')

search = cv.fit(X_train,y_train)

best_params = search.best_params_

# RandomizedSearchCV defaults to refit=True, so the search has already trained
# a clone of `lg` on all of X_train with best_params. Reuse that estimator
# instead of setting the parameters and fitting a second time.
lg = search.best_estimator_

y_pred = lg.predict(X_test)

print("Best Params: {}".format(best_params))
# last expression of the cell: the notebook displays the test-set MAE
metrics.mean_absolute_error(y_test,y_pred)

In [None]:
# core_metrics is already imported in the first cell; this repeated import is
# redundant but harmless.
from data_utils import core_metrics

# Evaluate the most recent predictions with the data_utils helper (its exact
# output is defined in data_utils, not in this notebook).
core_metrics(y_test,y_pred)

In [None]:
from keys import mapbox_access_token
import plotly.graph_objects as go

# Module-level cache of the populated grid coordinates. It starts empty and is
# filled by remove_empty_coords on its first call; re-running this cell clears
# the cache and forces the next call to recompute.

populated_coordinates = np.array([])

In [None]:
def remove_empty_coords(X_test, coordinates, lat_step, long_step):
    """Return the grid cells that contain at least one row of ``X_test``.

    A cell is the rectangle ``[lat, lat + lat_step] x [long, long + long_step]``
    (both bounds inclusive) anchored at a ``(lat, long)`` pair taken from
    ``coordinates``.

    The result is cached in the module-level ``populated_coordinates``. Once a
    non-empty result has been computed, later calls return a copy of the cache
    and ignore their arguments; reset ``populated_coordinates`` to an empty
    array to force a recomputation (e.g. after changing the data or the grid).

    Args:
        X_test: DataFrame with ``latitude`` and ``longitude`` columns.
        coordinates: iterable of ``(lat, long)`` pairs, one per grid cell.
        lat_step: extent of a cell along the latitude axis.
        long_step: extent of a cell along the longitude axis.

    Returns:
        Array of shape ``(k, 2)`` with the ``[lat, long]`` anchors of the
        populated cells, in the order they appear in ``coordinates``. ``k`` is
        0 when no cell contains data.
    """
    global populated_coordinates

    # Decide by size, not by .any(): .any() asks whether some coordinate is
    # non-zero, and does not exist at all if the cache is not an ndarray.
    cached = np.asarray(populated_coordinates)
    if cached.size:
        print('Using cached coordinates')
        return np.array(cached)

    # Extract the columns once; the loop below runs once per grid cell.
    lat_values = X_test['latitude'].to_numpy()
    long_values = X_test['longitude'].to_numpy()

    # Collect into a local list so an interrupted scan never leaves a
    # half-built list behind in the global cache.
    kept = []
    for lat, long in coordinates:
        in_cell = (
            (lat_values >= lat) & (lat_values <= lat + lat_step)
            & (long_values >= long) & (long_values <= long + long_step)
        )
        if in_cell.any():
            kept.append([lat, long])

    # reshape keeps the two-column layout even when no cell is populated
    use_coordinates = np.array(kept).reshape(-1, 2)
    populated_coordinates = use_coordinates  # set cache
    return use_coordinates
    

def map_model(X_test, model):
    """Map the model's predicted price for a "typical" apartment over the city.

    A 300 x 300 latitude/longitude grid spanning the extent of ``X_test`` is
    built, cells without data are dropped via ``remove_empty_coords``, and
    every remaining cell is given the same typical-apartment features (median
    ``date`` and ``area``, most frequent value for the other features) before
    being scored by ``model``.

    Note: this definition shadows the ``map_model`` imported from
    ``data_utils`` in the first cell.

    Args:
        X_test: DataFrame with the columns date, latitude, longitude, area,
            bedrooms, pets, furnished and unit_type.
        model: object whose ``predict`` accepts a DataFrame holding those
            columns in that order.

    Returns:
        The figure returned by ``px.scatter_mapbox``, coloured by the
        predicted price.
    """
    # feature values describing the typical apartment
    typical = {
        'date': X_test['date'].median(),
        'area': X_test['area'].median(),
        'bedrooms': X_test['bedrooms'].mode()[0],
        'pets': X_test['pets'].mode()[0],
        'furnished': X_test['furnished'].mode()[0],
        'unit_type': X_test['unit_type'].mode()[0],
    }

    # regular grid covering the bounding box of the data
    lat_axis, lat_step = np.linspace(
        X_test['latitude'].min(), X_test['latitude'].max(), num=300, retstep=True
    )
    long_axis, long_step = np.linspace(
        X_test['longitude'].min(), X_test['longitude'].max(), num=300, retstep=True
    )
    # one (lat, long) row per grid node
    grid_points = np.array(np.meshgrid(lat_axis, long_axis)).T.reshape(-1, 2)

    # keep only the grid cells that actually contain observations
    populated = remove_empty_coords(X_test, grid_points, lat_step, long_step)

    grid = pd.DataFrame(populated, columns=['latitude', 'longitude'])
    for column, value in typical.items():
        grid.loc[:, column] = value
    grid['unit_type'] = pd.Categorical(grid['unit_type'])

    # column order handed to the model
    feature_order = [
        'date', 'latitude', 'longitude', 'area',
        'bedrooms', 'pets', 'furnished', 'unit_type',
    ]
    grid = grid[feature_order]

    grid['price'] = model.predict(grid)

    # TODO: the initial view and the marker size/shape still need tuning
    # (square, semi-transparent markers and an explicit mapbox token were
    # tried earlier).
    return px.scatter_mapbox(
        grid,
        lat='latitude',
        lon='longitude',
        color='price',
        width=1000,
        height=800,
    )
    
# Draw the price map for `model`, the booster trained with the native
# LightGBM API (not `lg`, the scikit-learn wrapper), and display it.
fig = map_model(X_test,model)
fig.show()