In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

In [2]:
# Load the ten feature shards (part-00000 ... part-00009) and stack them once.
# Collecting the parts in a list and concatenating a single time avoids
# re-copying the accumulated frame on every iteration, and does not rely on
# DataFrame.append (removed in pandas 2.0).
feature_parts = []
for i in range(10):
    filename = "./safety/features/part-0000"+str(i)+"-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv"
    feature_parts.append(pd.read_csv(filename, header=0))
# Each shard keeps its own row index (no ignore_index), exactly as the
# append-based version did.
data_features = pd.concat(feature_parts)

In [12]:
# Load the single labels shard; the first CSV row is the header.
filename = "./safety/labels/part-00000-e9445087-aa0a-433b-a7f6-7f4c19d78ad6-c000.csv"
data_labels = pd.read_csv(filename,header=0)

In [3]:
# (rows, columns) of the concatenated feature shards.
data_features.shape

(16135561, 11)

In [5]:
# Per-column dtypes and overall memory footprint of the feature frame.
data_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16135561 entries, 0 to 1613561
Data columns (total 11 columns):
bookingID         int64
Accuracy          float64
Bearing           float64
acceleration_x    float64
acceleration_y    float64
acceleration_z    float64
gyro_x            float64
gyro_y            float64
gyro_z            float64
second            float64
Speed             float64
dtypes: float64(10), int64(1)
memory usage: 1.4 GB


# Check for Duplicates

In [59]:
def features_remove_duplicates(df):
    """Return ``df`` without rows that are exact duplicates of an earlier row.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature records; rows are compared across all columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the first occurrence of each distinct row kept.
        The input frame is not modified.
    """
    # drop_duplicates() is not in-place: it returns the de-duplicated frame,
    # so that result (not the untouched input) is what must be returned.
    return df.drop_duplicates()

def labels_remove_duplicates(df):
    """Return ``df`` with at most one row per ``bookingID``.

    Parameters
    ----------
    df : pandas.DataFrame
        Label records; must contain a ``bookingID`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame in which, for every repeated ``bookingID``, only the row
        that appears last in ``df`` is kept. The input frame is not modified.
    """
    # drop_duplicates() returns a new frame; return it so the caller actually
    # receives the de-duplicated labels instead of None.
    return df.drop_duplicates("bookingID", keep="last")

## Check for Null/NA

In [6]:
# Summary statistics per column; the count row gives the non-null count and
# min/max expose out-of-range values.
data_features.describe()

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed
count,16135560.0,16135560.0,16135560.0,16135560.0,16135560.0,16135560.0,16135560.0,16135560.0,16135560.0,16135560.0,16135560.0
mean,818481900000.0,11.61035,168.9684,0.06914529,4.468976,0.8942974,-0.001710847,0.0002800733,-0.0002569688,3803.484,9.006543
std,495247600000.0,86.91672,107.2966,1.424161,8.130664,3.251549,0.1445619,0.340063,0.1480824,1436686.0,8.106415
min,0.0,0.75,0.0,-78.41969,-72.99412,-78.44842,-48.45575,-74.88861,-53.55445,0.0,-2.0
25%,377957100000.0,3.9,78.0,-0.5081238,-2.022476,-0.9316101,-0.02678888,-0.02993851,-0.018765,241.0,1.02
50%,807453900000.0,4.25,168.87,0.06137085,9.081485,0.775745,-0.0006432411,0.0002645046,-3.542004e-05,520.0,7.53
75%,1254130000000.0,8.0,262.984,0.635062,9.709778,2.750938,0.02330639,0.03143085,0.01823425,863.0,15.48
max,1709397000000.0,6070.101,359.9995,66.87346,75.05589,78.05576,39.83975,80.31496,66.30078,1495797000.0,148.0186


# Anomalies in data
* GPS Accuracy Max = 6070m
* Max Duration ≈ 1.5e9 seconds
* Max Speed = 148 m/s ≈ 533 km/h


In [88]:
def clean_features(df, max_accuracy=20, max_second=1e6, max_speed=60):
    """Return a cleaned copy of the raw feature records.

    Steps: drop rows containing NA, drop exact duplicate rows, rename the
    columns to a uniform lower-case convention, then keep only rows whose
    accuracy, second and speed values fall inside plausible ranges.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw features with exactly 11 columns in the order bookingID,
        Accuracy, Bearing, acceleration_x/y/z, gyro_x/y/z, second, Speed.
        Columns are renamed by position. The frame is not modified.
    max_accuracy : float, default 20
        Exclusive upper bound on ``accuracy``.
    max_second : float, default 1e6
        Exclusive upper bound on ``second``.
    max_speed : float, default 60
        Exclusive upper bound on ``speed``. Rows must also have ``speed > 0``,
        so non-positive speeds (including exactly 0) are dropped.

    Returns
    -------
    pandas.DataFrame
        New frame with the renamed columns and only the rows that pass every
        filter.

    Raises
    ------
    ValueError
        If ``df`` does not have exactly 11 columns.
    """
    # dropna()/drop_duplicates() each return a new frame, so assigning column
    # names afterwards leaves the caller's DataFrame (and its column names)
    # untouched.
    cleaned = df.dropna().drop_duplicates()
    #rename to more uniform convention
    cleaned.columns = ['bookingID', 'accuracy', 'bearing',
                       'acceleration_x', 'acceleration_y', 'acceleration_z',
                       'gyro_x', 'gyro_y', 'gyro_z',
                       'second', 'speed']
    in_range = (
        (cleaned["accuracy"] < max_accuracy) &
        (cleaned["second"] < max_second) &
        (cleaned["speed"] > 0) &
        (cleaned["speed"] < max_speed)
    )
    return cleaned.loc[in_range]

def clean_labels(df):
    """Return the label records with NA rows removed and one row per bookingID.

    When a ``bookingID`` occurs more than once, only its last row (in input
    order) is retained. The input frame is not modified.
    """
    # NOTE(review): the original intent was to mark bookings with conflicting
    # labels as dangerous. keep="last" only achieves that if the dangerous row
    # happens to come last for each bookingID - confirm against the data.
    without_na = df.dropna()
    return without_na.drop_duplicates("bookingID", keep="last")

def process_data(features, labels):
    """Clean both inputs, join them on bookingID and order the result.

    Parameters
    ----------
    features : pandas.DataFrame
        Raw feature records, passed through ``clean_features``.
    labels : pandas.DataFrame
        Raw label records, passed through ``clean_labels``.

    Returns
    -------
    pandas.DataFrame
        Inner join of the cleaned features and labels on ``bookingID``,
        sorted by ``bookingID`` and then ``second``.
    """
    cleaned_features = clean_features(features)
    cleaned_labels = clean_labels(labels)
    # Inner join: bookings present in only one of the two inputs are dropped.
    merged = pd.merge(cleaned_features, cleaned_labels, how="inner", on="bookingID")
    return merged.sort_values(["bookingID", "second"])

# Build the cleaned, merged and (bookingID, second)-sorted dataset from the raw frames.
data_combined = process_data(data_features, data_labels)

In [87]:
# Persist the combined dataset. index=False is not passed, so the DataFrame
# index is written as the first (unnamed) CSV column.
data_combined.to_csv(r'./data_combined.csv')