# Demand Prediction

We use geohash2 to convert geohash6 to latitude, longitude value.

In [1]:
import geohash2
import pandas as pd
from sklearn import linear_model
import sklearn.metrics as sm

## Data Set
### https://www.aiforsea.com/traffic-management
This dataset has 4 attributes:
 - geohash6 : latitude longitude geocoding (level 6)
 - day : day value in the sequential order, not particular day of the month
 - timestamp : time a day in 15 minutes interval (hour:minutes)
 - demand : aggregated deman normalised to be in range [0, 1]

In [2]:
data = pd.read_csv('training.csv')
data.head()

Unnamed: 0,geohash6,day,timestamp,demand
0,qp03wc,18,20:0,0.020072
1,qp03pn,10,14:30,0.024721
2,qp09sw,9,6:15,0.102821
3,qp0991,32,5:0,0.088755
4,qp090q,15,4:0,0.074468


## Feature Engineering
### Split 'timestamp' data to single data for hour data as 'jam' and minutes data as 'menit'

In [3]:
df = data
df[['timestamp', 'menit']] = df['timestamp'].str.split(':', expand=True)
df = df.rename(columns={'timestamp': 'jam'})
df

Unnamed: 0,geohash6,day,jam,demand,menit
0,qp03wc,18,20,0.020072,0
1,qp03pn,10,14,0.024721,30
2,qp09sw,9,6,0.102821,15
3,qp0991,32,5,0.088755,0
4,qp090q,15,4,0.074468,0
5,qp03tu,1,12,0.023843,15
6,qp096d,25,3,0.007460,30
7,qp03nr,51,20,0.000293,45
8,qp093r,48,6,0.054170,15
9,qp03r2,4,22,0.123463,15


## geohash6 convert to latitude and longitude

In [4]:
df['geohash6'] = df['geohash6'].apply(lambda x: geohash2.decode(x))
df

Unnamed: 0,geohash6,day,jam,demand,menit
0,"(-5.35, 90.7)",18,20,0.020072,0
1,"(-5.41, 90.7)",10,14,0.024721,30
2,"(-5.33, 90.9)",9,6,0.102821,15
3,"(-5.35, 90.8)",32,5,0.088755,0
4,"(-5.41, 90.7)",15,4,0.074468,0
5,"(-5.34, 90.6)",1,12,0.023843,15
6,"(-5.39, 90.8)",25,3,0.007460,30
7,"(-5.41, 90.6)",51,20,0.000293,45
8,"(-5.36, 90.8)",48,6,0.054170,15
9,"(-5.4, 90.7)",4,22,0.123463,15


In [5]:
type(df['geohash6'].loc[0])

tuple

## Separate converted geohash6 data to latitude and longitude

In [6]:
df['geohash6'].tolist()
df[['latitude', 'longitude']] = pd.DataFrame(df['geohash6'].tolist(), index=df.index)
df

Unnamed: 0,geohash6,day,jam,demand,menit,latitude,longitude
0,"(-5.35, 90.7)",18,20,0.020072,0,-5.35,90.7
1,"(-5.41, 90.7)",10,14,0.024721,30,-5.41,90.7
2,"(-5.33, 90.9)",9,6,0.102821,15,-5.33,90.9
3,"(-5.35, 90.8)",32,5,0.088755,0,-5.35,90.8
4,"(-5.41, 90.7)",15,4,0.074468,0,-5.41,90.7
5,"(-5.34, 90.6)",1,12,0.023843,15,-5.34,90.6
6,"(-5.39, 90.8)",25,3,0.007460,30,-5.39,90.8
7,"(-5.41, 90.6)",51,20,0.000293,45,-5.41,90.6
8,"(-5.36, 90.8)",48,6,0.054170,15,-5.36,90.8
9,"(-5.4, 90.7)",4,22,0.123463,15,-5.4,90.7


## Drop geohash6

In [7]:
df = df.drop("geohash6", axis=1)
df

Unnamed: 0,day,jam,demand,menit,latitude,longitude
0,18,20,0.020072,0,-5.35,90.7
1,10,14,0.024721,30,-5.41,90.7
2,9,6,0.102821,15,-5.33,90.9
3,32,5,0.088755,0,-5.35,90.8
4,15,4,0.074468,0,-5.41,90.7
5,1,12,0.023843,15,-5.34,90.6
6,25,3,0.007460,30,-5.39,90.8
7,51,20,0.000293,45,-5.41,90.6
8,48,6,0.054170,15,-5.36,90.8
9,4,22,0.123463,15,-5.4,90.7


## Set demand from dataset 'demand' and drop from dataset

In [8]:
demand = df['demand']
df = df.drop("demand", axis=1)
#df[['demand']] = pd.DataFrame(demand, index=df.index)
df

Unnamed: 0,day,jam,menit,latitude,longitude
0,18,20,0,-5.35,90.7
1,10,14,30,-5.41,90.7
2,9,6,15,-5.33,90.9
3,32,5,0,-5.35,90.8
4,15,4,0,-5.41,90.7
5,1,12,15,-5.34,90.6
6,25,3,30,-5.39,90.8
7,51,20,45,-5.41,90.6
8,48,6,15,-5.36,90.8
9,4,22,15,-5.4,90.7


## Set dataset as array X

In [9]:
X = df.values
X

array([[18, '20', '0', '-5.35', '90.7'],
       [10, '14', '30', '-5.41', '90.7'],
       [9, '6', '15', '-5.33', '90.9'],
       ...,
       [32, '12', '15', '-5.39', '90.6'],
       [42, '5', '15', '-5.46', '90.7'],
       [15, '4', '0', '-5.29', '90.7']], dtype=object)

## Set demand as array y

In [10]:
y = demand.values
y

array([0.02007179, 0.02472097, 0.10282096, ..., 0.12326031, 0.12009976,
       0.04265642])

## Split data to training and test data
### In this part we use 80% data as training data and 20% as test data

In [11]:
num_training = int(0.8 * len(X))
num_test = len(X) - num_training

X_train, y_train = X[:num_training], y[:num_training]
X_test, y_test = X[num_training:], y[num_training:]
y_test

array([0.04139951, 0.06871307, 0.0097046 , ..., 0.12326031, 0.12009976,
       0.04265642])

## Use Linear Reggression to train the model

In [12]:
linear_regressor = linear_model.LinearRegression()
linear_regressor.fit(X_train, y_train)
y_test_pred = linear_regressor.predict(X_test)
y_test_pred

array([0.07953054, 0.11116914, 0.08401165, ..., 0.11204258, 0.12628058,
       0.11600796])

## Function for SMAPE score

In [23]:
#Value 0% - 200%
import numpy as np

def smape(A, F):
    return 100/len(A) * np.sum(2 * np.abs(F - A) / (np.abs(A) + np.abs(F)))

## Model Performance
## Calculate the MAE, RMSE, and SMAPE

In [24]:
from math import sqrt

print("Linear Regressor performance:") 
print("Mean absolute error =", round(sm.mean_absolute_error(y_test, y_test_pred), 2)) 
print("Root Mean Square Error =", round(sqrt(sm.mean_squared_error(y_test, y_test_pred)), 2))
print("Symetric MAPE =", round(smape(y_test,y_test_pred), 5))

Linear Regressor performance:
Mean absolute error = 0.1
Root Mean Square Error = 0.16
Symetric MAPE = 95.10489
