# Exploratory Data Analysis

In [5]:
import pandas as pd
from utils import geohash

In [17]:
%matplotlib inline
import matplotlib.pyplot as plt
import gmplot

In [7]:
# Root directory of the datasets used in this notebook.
DATA_PATH = 'data'
# The path template needs a '{}' placeholder for DATA_PATH to take effect;
# without it .format() returns the literal string and the constant is ignored.
train = pd.read_csv('{}/traffic-management/training.csv'.format(DATA_PATH))

## Data Fields Description

| Field     | Description                                                                                                                                               |
|-----------|-----------------------------------------------------------------------------------------------------------------------------------------------------------|
| geohash6  | geohash level 6                                                                                                                                           |
| day       | day, where the value indicates the sequential order and not a particular day of the month                                                                 |
| timestamp | start time of 15-minute intervals, in the format `<hour>:<minute>`, where hour ranges from 0 to 23 and minute is one of (0, 15, 30, 45)                   |
| demand    | aggregated demand normalised to be in the range [0,1]                                                                                                     |


In [8]:
# Preview the first rows of the training frame right after loading.
train.head()


Unnamed: 0,geohash6,day,timestamp,demand
0,qp03wc,18,20:0,0.020072
1,qp03pn,10,14:30,0.024721
2,qp09sw,9,6:15,0.102821
3,qp0991,32,5:0,0.088755
4,qp090q,15,4:0,0.074468


In [9]:
# (rows, columns) of the training frame.
train.shape


(4206321, 4)

In [27]:
%%time
train['lat_long'] = train['geohash6'].apply(lambda x: geohash.decode(x))
train['lat'] = train['lat_long'].apply(lambda x: x[0])
train['long'] = train['lat_long'].apply(lambda x: x[1])
train = train.drop(['lat_long'], axis=1)

Wall time: 27.1 s


In [10]:
# Preview the first rows of the training frame.
# NOTE(review): the saved output of this cell (In [10]) predates the lat/long
# cell (In [27]), so it does not show the new columns — re-run to refresh.
train.head()

Unnamed: 0,geohash6,day,timestamp,demand
0,qp03wc,18,20:0,0.020072
1,qp03pn,10,14:30,0.024721
2,qp09sw,9,6:15,0.102821
3,qp0991,32,5:0,0.088755
4,qp090q,15,4:0,0.074468


In [11]:
# Number of training rows per geohash cell.
# groupby().size() counts the rows of each group directly, so there is no need
# to copy the whole frame and add a dummy column of ones first.
train_agg = train.groupby('geohash6').size().reset_index(name='count')

In [12]:
# Replace the geohash key with its decoded coordinates.
# Each cell is decoded once and the resulting pair is reused for both columns.
cell_coords = train_agg['geohash6'].apply(geohash.decode)
train_agg = (
    train_agg
    .assign(
        lat=cell_coords.apply(lambda pair: pair[0]),
        long=cell_coords.apply(lambda pair: pair[1]),
    )
    .drop(columns=['geohash6'])
)

In [16]:
# Preview the per-cell counts together with their decoded coordinates.
train_agg.head()

Unnamed: 0,count,lat,long
0,577,-5.484924,90.653687
1,89,-5.479431,90.653687
2,2,-5.468445,90.653687
3,7,-5.462952,90.653687
4,106,-5.457458,90.653687


In [18]:
# Pull the two coordinate columns out as plain Python lists for plotting.
latitude_list, longitude_list = (
    list(train_agg[axis].values) for axis in ('lat', 'long')
)

In [21]:
# Centre the map on the coordinates of the first aggregated cell and pick a
# zoom level for the initial view.
center_lat, center_lng, zoom_level = -5.484924, 90.653687, 13
gmap3 = gmplot.GoogleMapPlotter(center_lat, center_lng, zoom_level)

In [25]:
# Mark every aggregated cell with a red dot.
# The colour must be a valid hex code: no space is allowed after the '#'.
gmap3.scatter(latitude_list, longitude_list, '#FF0000',
              size=40, marker=False)

# Overlay a heatmap layer built from the same coordinates.
# NOTE(review): only lat/long are passed, so every cell contributes equally
# regardless of its `count` — confirm this is intended.
gmap3.heatmap(latitude_list, longitude_list)

In [26]:
# Write the map to an HTML file.
map_output_file = 'map_plot.html'
gmap3.draw(map_output_file)

In [29]:
# Load the safety labels part file, building the path from DATA_PATH (defined
# in the data-loading cell near the top) instead of repeating the root folder.
safety_labels = pd.read_csv(
    '{}/safety/safety/labels/part-00000-e9445087-aa0a-433b-a7f6-7f4c19d78ad6-c000.csv'.format(DATA_PATH)
)

In [30]:
# Show a bounded preview rather than dumping the whole frame into the output.
safety_labels.head(10)

Unnamed: 0,bookingID,label
0,111669149733,0
1,335007449205,1
2,171798691856,0
3,1520418422900,0
4,798863917116,0
5,283467841567,0
6,231928234141,0
7,188978561143,0
8,1194000908346,0
9,274877906944,1


In [32]:
# Load the safety features part file, building the path from DATA_PATH (defined
# in the data-loading cell near the top) instead of repeating the root folder.
feats = pd.read_csv(
    '{}/safety/safety/features/part-00000-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv'.format(DATA_PATH)
)

In [33]:
# Preview the first rows of the features frame.
feats.head()

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed
0,1202590843006,3.0,353.0,1.228867,8.9001,3.986968,0.008221,0.002269,-0.009966,1362.0,0.0
1,274877907034,9.293,17.0,0.032775,8.659933,4.7373,0.024629,0.004028,-0.010858,257.0,0.19
2,884763263056,3.0,189.0,1.139675,9.545974,1.951334,-0.006899,-0.01508,0.001122,973.0,0.667059
3,1073741824054,3.9,126.0,3.871543,10.386364,-0.136474,0.001344,-0.339601,-0.017956,902.0,7.913285
4,1056561954943,3.9,50.0,-0.112882,10.55096,-1.56011,0.130568,-0.061697,0.16153,820.0,20.419409
