# Location Prediction for Airbnb Berlin Data

## Import libraries

In [None]:
import LocationPrediction.source_data
import LocationPrediction.preprocess_data
import LocationPrediction.predictor
import sys
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import pickle
import numpy as np
from  IPython.display import display
% matplotlib inline

## Configure pandas options and the Google Maps API key

In [None]:
# customize pandas print output
# pd.set_option('display.height', 1000)
# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 15)
# pd.set_option('display.width', 250)

import os

# Google Maps API key that is later passed to gmaps.configure().
# SECURITY: a literal key is committed in this notebook. Supply the key through
# the GMAPS_API_KEY environment variable instead; the literal is kept only as a
# fallback so the notebook keeps running unchanged, and it should be rotated
# and removed from version control.
gmapsAPIKey = (os.environ.get('GMAPS_API_KEY')
               or 'AIzaSyA2mnU4nhuF7vzR2pvLDb-LtFJcW_Vhx3w')

## Data downloading and cleaning

In [None]:
# Helper object from LocationPrediction.source_data that provides the data access
# methods used below.
sd = LocationPrediction.source_data.source_dataClass()
# download and clean data
# df_clean_csv = sd.download_and_clean() # takes time because it reads individual source csv files, combines them, cleans them
# read the cleaned csv into memory
# NOTE(review): presumably this reads the file written by an earlier
# download_and_clean() run - confirm that it exists before running this cell.
df_clean_csv = sd.read_clean_csv()

## Preprocess data

In [None]:
# Settings for building the training data. The names stay at notebook level so
# later cells can still refer to them.
min_date = '2017-02-01'
# NOTE(review): max_date is defined but never passed on below - confirm whether
# create_training_data() should receive it.
max_date = None
n_cuts = -1
N_positive_samples_to_draw = -1
Neg_Multiplier = 1

# Wrap the cleaned data frame in the preprocessing object.
prpdata_train = LocationPrediction.preprocess_data.preprocess_dataClass(
    df=df_clean_csv,
)

# Build the training data from the settings above (the original notes say this
# restricts the observations to a date range and creates distinct indices).
training_settings = {
    'min_date': min_date,
    'n_cuts': n_cuts,
    'N_positive_samples_to_draw': N_positive_samples_to_draw,
    'Neg_Multiplier': Neg_Multiplier,
}
prpdata_train.create_training_data(**training_settings)

# Summary of the preprocessing object, then a preview of the categorical
# training frame.
print(prpdata_train)
display(prpdata_train.df_training_categorical.head())

## Prepare training dataset and fit (or load a previously fitted) random forest

In [64]:
# The training code below is kept for reference; the active code at the bottom of
# this cell loads a previously fitted forest from disk instead of refitting it.

# # create a random forest and train it with the training data
# rf = RandomForestClassifier(n_estimators=500,
#                                      random_state=42,
#                                      class_weight='balanced',
#                                      n_jobs=-1)

# df_train = prpdata_train.df_training_one_hot_vector.copy()
# X_df_train = df_train.drop(columns=[prpdata_train.label_column_name])
# X_train = X_df_train.values

# y_df_train = df_train.drop(columns=df_train.columns.difference([prpdata_train.label_column_name]))
# y_train = y_df_train.values

# rf.fit(X_train,y_train.ravel())

filename = 'my_rf.pkl'
# To persist a freshly trained forest:
# with open(filename, 'wb') as model_file:
#     pickle.dump(rf, model_file)

# Load the stored forest. The with-block closes the file handle deterministically
# instead of leaving it to the garbage collector.
# NOTE: unpickling executes code embedded in the file - only load pickles that
# were produced by this notebook or another trusted source.
with open(filename, 'rb') as model_file:
    rf = pickle.load(model_file)

## Use the random forest for a prediction

In [65]:
# Use the trained random forest to score every distinct training location for
# the features of a single training row.
predictor = LocationPrediction.predictor.predictorClass(
    model=rf,
    preprocessed_data_for_training=prpdata_train,
)

df_predictions = None
row_for_prediction = 10

# Features: the label-based slice row:row yields a DataFrame (not a Series)
# holding the rows labelled row_for_prediction.
df_features_for_prediction = prpdata_train.df_training_categorical.loc[
    row_for_prediction:row_for_prediction,
    prpdata_train.feature_column_names,
]
# Candidate locations: every distinct combination of the location columns.
df_locations_for_prediction = (
    prpdata_train.df_training_categorical[prpdata_train.location_column_names]
    .drop_duplicates()
)
predictor.set_features_and_locations(
    df_features=df_features_for_prediction,
    df_locations=df_locations_for_prediction,
)

# Pass the one-hot frame to predict through fix_columns() together with the
# training columns (label column excluded).
column_names_in_training_data = (
    prpdata_train.df_training_one_hot_vector.columns
    .drop(prpdata_train.label_column_name, errors='ignore')
)
predictor.df_features_and_locations_one_hot_vector_to_predict = (
    LocationPrediction.preprocess_data.fix_columns(
        df=predictor.df_features_and_locations_one_hot_vector_to_predict,
        column_names=column_names_in_training_data,
    )
)

# Result frame: location columns plus the predicted probability per location.
df_predictions = (
    predictor.df_features_and_locations_to_predict[prpdata_train.location_column_names]
    .copy()
)
df_predictions['probability'] = predictor.predict(
    beta=prpdata_train.beta_for_training_data,
)
# df_predictions['probability'] = predictor.predict(beta=1)
# Constant yearmonth column; AirbnbExplorer filters its rows on this column.
df_predictions['yearmonth'] = 201706
print(df_predictions)

        latitude  longitude  probability  yearmonth
0      52.494585  13.338790     0.000406     201706
1      52.551038  13.417609     0.000704     201706
2      52.508034  13.416990     0.000083     201706
3      52.486319  13.373899     0.000109     201706
4      52.553509  13.388626     0.000779     201706
5      52.487389  13.375153     0.000107     201706
6      52.473925  13.423973     0.000443     201706
7      52.507275  13.455811     0.000177     201706
8      52.517033  13.318576     0.001304     201706
9      52.527818  13.392644     0.000039     201706
10     52.576945  13.281537     0.043023     201706
11     52.471238  13.442608     0.001202     201706
12     52.471427  13.432339     0.000814     201706
13     52.473694  13.449379     0.000607     201706
14     52.473589  13.447518     0.000897     201706
15     52.533341  13.340620     0.000913     201706
16     52.540174  13.407556     0.000198     201706
17     52.471827  13.452512     0.000381     201706
18     52.53

In [None]:
# Show the 20 locations with the highest predicted probability.
df_predictions.sort_values('probability', ascending=False).head(20)

In [67]:
# berlin data
# display is already imported in the first cell; the repeated import is harmless.
from IPython.display import display
import ipywidgets as widgets

import gmaps
# Hand the Google Maps API key defined earlier in the notebook to gmaps.
gmaps.configure(api_key=gmapsAPIKey)

class AirbnbExplorer(object):
    """
    Jupyter widget for exploring the Airbnb Berlin dataset.

    Shows a gmaps heatmap of the rows of one ``yearmonth``, weighted by their
    ``probability`` value. One slider selects the yearmonth (by index into the
    sorted distinct yearmonths), a second slider sets the heatmap point radius.

    The data frame passed in must provide the columns ``yearmonth``,
    ``latitude``, ``longitude`` and ``probability``.
    """

    def __init__(self, df):
        """Build the map, both slider rows and the enclosing container for ``df``."""
        self._df = df
        self._heatmap = None
        self._fig = None
        self._sliderYearmonth = None
        self._sliderRadius = None
        initial_yearmonth = min(self._df['yearmonth'])
        self._yearmonthsDistinctSorted = np.sort(self._df['yearmonth'].unique())
        len_yearmonth = len(self._yearmonthsDistinctSorted)
        initial_radius = 15

        title_widget = widgets.HTML(
            '<h3>Airbnb places in Berlin, by yearmonth last modified</h3>'
            '<h4>Data from <a href="http://tomslee.net/airbnb-data">Tom Slees collection of Airbnb data</a></h4>'
        )

        map_figure = self._render_map(initial_yearmonth, initial_radius)
        self._fig = map_figure
        controlsYearmonth = self._render_controlsYearmonth(initial_yearmonth, len_yearmonth)
        controlsRadius = self._render_controlsRadius(initial_radius)
        self._container = widgets.VBox([title_widget, controlsYearmonth, controlsRadius, map_figure])

    def render(self):
        """Display the assembled widget container in the notebook."""
        display(self._container)

    def _on_yearmonth_change(self, change):
        """Slider callback: show the heatmap data of the newly selected yearmonth."""
        # The slider holds an index into the sorted distinct yearmonths.
        yearmonthIndex = self._sliderYearmonth.value
        yearmonth = self._yearmonthsDistinctSorted[yearmonthIndex]
        self._heatmap.locations = self._locations_for_yearmonth(yearmonth)
        self._heatmap.weights = self._weights_for_yearmonth(yearmonth)
        # NOTE(review): only the heatmap is refreshed here; the symbol layer
        # created in _render_map keeps the points of the initial yearmonth.
        # Confirm whether the hover texts should follow the slider as well.
        self._total_boxYearmonth.value = self._total_last_modified_text_in_yearmonth(yearmonth)

        return self._container

    def _on_radius_change(self, change):
        """Slider callback: apply the newly selected heatmap point radius."""
        radius = self._sliderRadius.value
        self._heatmap.point_radius = radius
        self._total_boxRadius.value = self._total_radius_text(radius)
        return self._container

    def _render_map(self, initial_yearmonth, initial_radius):
        """
        Create the gmaps figure with a heatmap layer and a symbol layer.

        The heatmap is stored in ``self._heatmap`` and the symbol layer in
        ``self._symbolLayer``; the figure itself is returned.
        """
        # gmaps.figure(map_type='HYBRID') or map_type='ROADMAP' are alternatives.
        fig = gmaps.figure()
        locations = self._locations_for_yearmonth(initial_yearmonth)
        weights = self._weights_for_yearmonth(initial_yearmonth)
        self._heatmap = gmaps.heatmap_layer(
            locations,
            weights=weights,
            max_intensity=1,
            opacity=0.8,
            point_radius=initial_radius,
            dissipating=True,
        )
        fig.add_layer(self._heatmap)

        # Every point gets a symbol, ordered by ascending weight. Earlier
        # revisions computed several candidate counts for "top N" markers, but
        # the effective count was always len(weights), i.e. all points.
        indicesForMarkers = np.argsort(weights)
        weightsMarked = weights[indicesForMarkers]
        locationsMarked = locations[indicesForMarkers]

        # The weight of each point is shown as hover text and info box content.
        textForMarked = ['{}'.format(a) for a in weightsMarked]

        # stroke_opacity / fill_opacity did not have any effect, so the symbols
        # are made invisible through fully transparent rgba colours instead.
        # (gmaps.marker_layer with the same texts is the visible alternative.)
        self._symbolLayer = gmaps.symbol_layer(
            locationsMarked,
            hover_text=textForMarked,
            info_box_content=textForMarked,
            stroke_color="rgba(255, 0, 0, 0.0)",
            fill_color="rgba(255, 0, 0, 0.0)",
            scale=3,
        )
        fig.add_layer(self._symbolLayer)

        return fig

    def _render_controlsYearmonth(self, initial_yearmonth, len_yearmonth):
        """Create the yearmonth slider and its label; return them in an HBox."""
        self._sliderYearmonth = widgets.IntSlider(
            # Position of the initial yearmonth within the sorted distinct values.
            value=np.nonzero(self._yearmonthsDistinctSorted == initial_yearmonth)[0][0],
            min=0,
            max=len_yearmonth - 1,
            description='Yearmonth Index',
            continuous_update=False
        )
        self._total_boxYearmonth = widgets.Label(
            value=self._total_last_modified_text_in_yearmonth(initial_yearmonth)
        )
        self._sliderYearmonth.observe(self._on_yearmonth_change, names='value')
        controls = widgets.HBox(
            [self._sliderYearmonth, self._total_boxYearmonth],
            layout={'justify_content': 'space-between'}
        )
        return controls

    def _render_controlsRadius(self, initial_radius):
        """Create the radius slider (1 to 30) and its label; return them in an HBox."""
        self._sliderRadius = widgets.IntSlider(
            value=initial_radius,
            min=1,
            max=30,
            description='Radius',
            continuous_update=False
        )
        self._total_boxRadius = widgets.Label(
            value=self._total_radius_text(initial_radius)
        )
        self._sliderRadius.observe(self._on_radius_change, names='value')
        controls = widgets.HBox(
            [self._sliderRadius, self._total_boxRadius],
            layout={'justify_content': 'space-between'}
        )
        return controls

    def _rows_for_yearmonth(self, yearmonth):
        """Return the rows of the data frame whose ``yearmonth`` equals the argument."""
        return self._df[self._df['yearmonth'] == yearmonth]

    def _locations_for_yearmonth(self, yearmonth):
        """Return the (latitude, longitude) pairs of a yearmonth as a 2-column array."""
        return self._rows_for_yearmonth(yearmonth)[['latitude', 'longitude']].values

    def _weights_for_yearmonth(self, yearmonth):
        """Return the ``probability`` values of a yearmonth as an array."""
        return self._rows_for_yearmonth(yearmonth)['probability'].values

    def _total_last_modified_in_yearmonth(self, yearmonth):
        """Return the number of rows of a yearmonth as a plain int."""
        return int(self._rows_for_yearmonth(yearmonth)['yearmonth'].count())

    def _total_last_modified_text_in_yearmonth(self, yearmonth):
        """Return the label text for the yearmonth slider."""
        return '# of places last modified in {}: {}'.format(yearmonth, self._total_last_modified_in_yearmonth(yearmonth))

    def _total_radius_text(self, radius):
        """Return the label text for the radius slider."""
        return 'radius: {}'.format(radius)

In [None]:
# Build the explorer for the predicted probabilities and show it in the notebook.
explorer = AirbnbExplorer(df_predictions)
explorer.render()