# Car-by-car examination of time-dependent data

## The examples use speed and fuel level, but the approach can be adapted to other quantities

## Usual Library Imports

In [None]:
import numpy as np
import datetime
from dateutil import tz
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd

%matplotlib inline

## Subroutine to load in from data set 1

**David's ideas**

**To do: do the same for data set 2**

In [None]:
def car1_loc_data(contents, path='~/data/5-Sanandaji/case_study_dt1.csv',
                  chunk_rows=100000, n_chunks=10):
    """Load the selected columns from the leading rows of a CSV file.

    The file is streamed once, in consecutive chunks of ``chunk_rows`` data
    rows, and at most ``n_chunks`` chunks are kept.  Consecutive chunks
    neither overlap nor leave gaps, so each data row in the requested range
    is loaded exactly once.

    Parameters
    ----------
    contents : list of str
        Column names to keep, in the order they should appear in the result.
    path : str, optional
        CSV file to read.  Defaults to the data-set-1 file.
    chunk_rows : int, optional
        Number of data rows per chunk (must be at least 1).
    n_chunks : int, optional
        Maximum number of chunks to load.

    Returns
    -------
    pandas.DataFrame
        The requested columns for up to ``chunk_rows * n_chunks`` rows.
        The row index is the running row number assigned by the CSV reader.
        The frame is empty (but has the requested columns) if nothing was
        read.
    """
    frames = []
    reader = pd.read_csv(path, chunksize=chunk_rows)
    try:
        # zip() asks range() for its next value first, so once n_chunks
        # chunks have been taken it stops without parsing another chunk.
        for n, data_raw in zip(range(n_chunks), reader):
            print('n=', n, ' of', n_chunks)
            frames.append(data_raw[contents])
    finally:
        reader.close()

    if not frames:
        return pd.DataFrame(columns=contents)
    # One concat over real chunks only: no empty seed frame is involved,
    # so the column dtypes come from the data itself.
    return pd.concat(frames)

## Function to compute seconds after data start from a time stamp string

In [None]:
# Routine takes time string from the data and gives back:
# 1. flag = 0: for success,
#           1: if the input is not a string,
#           2: if it is not long enough,
#           3: if the date/time fields cannot be parsed
# 2. seconds since June 9, 2016 (0 whenever the flag is not 0)
#
def get_time(dxt):
    """Convert a time stamp string to seconds elapsed since June 9, 2016.

    The fields are read from fixed character positions of ``dxt``:
    year [0:4], month [5:7], day [8:10], hour [11:13], minute [14:16]
    and second [17:19].  The separator characters in between are ignored,
    and so is anything after position 19.

    Parameters
    ----------
    dxt : object
        Time stamp taken from the data; missing values arrive as
        non-string objects (e.g. NaN).

    Returns
    -------
    (int, float)
        Error flag (see the table above) and the elapsed seconds.  The
        seconds value is 0 whenever the flag is nonzero.
    """
    dt0 = datetime.datetime(2016,6,9) # Measure time since June 9, 2016

    # isinstance() also accepts str subclasses such as numpy.str_
    if not isinstance(dxt, str):
        return 1, 0
    if len(dxt) < 19:
        return 2, 0

    try:
        dt1 = datetime.datetime(
            int(dxt[0:4]), int(dxt[5:7]), int(dxt[8:10]),
            hour=int(dxt[11:13]), minute=int(dxt[14:16]),
            second=int(dxt[17:19]))
    except ValueError:
        # Non-numeric fields or out-of-range values (e.g. month 13):
        # report through the flag instead of raising.
        return 3, 0

    return 0, (dt1 - dt0).total_seconds()

## Function to extract local time hour and day of the week

In [None]:
# Routine to get the hour and day of the week of a UTC time stamp
# after converting it to the time zone named by `state`. Returns:
# 1. hour
# 2. day of the week 0-6 M-Su
# Note: no error checking of the time stamp, assumes it has passed the
# get_time() checks above
def get_day(dxt,state):
    """Return (hour, weekday) of a UTC time stamp in another time zone.

    Parameters
    ----------
    dxt : str
        Time stamp with the fields at fixed positions: year [0:4],
        month [5:7], day [8:10], hour [11:13], minute [14:16] and
        second [17:19].  It is interpreted as UTC.
    state : str or NaN
        Name handed to ``tz.gettz`` to look up the target zone.  A NaN
        (missing value) is treated as "UTC".

    Returns
    -------
    (int, int)
        Hour of the day and day of the week (0 = Monday ... 6 = Sunday)
        in the target zone.
    """
    # NaN never compares equal to anything, itself included, so a missing
    # value has to be detected with isnan() rather than `== np.nan`.
    if isinstance(state, float) and np.isnan(state):
        state = "UTC"

    from_zone = tz.gettz('UTC')
    # NOTE(review): gettz() is documented to return None for a name it
    # cannot resolve, and astimezone(None) converts to the system local
    # zone -- confirm the values in the state column are names gettz knows.
    to_zone = tz.gettz(state)

    dt_utc = datetime.datetime(
        int(dxt[0:4]), int(dxt[5:7]), int(dxt[8:10]),
        hour=int(dxt[11:13]), minute=int(dxt[14:16]),
        second=int(dxt[17:19]), tzinfo=from_zone)
    dt_local = dt_utc.astimezone(to_zone)
    return dt_local.hour, dt_local.weekday()

## Load in the data from set 1 from the columns you want 

**Ten lots of size 100,000**

**To do: append the data from set #2**

*Note that the data set has missing values; they show up as NaN even in string columns*

In [None]:
# Columns used to try to identify when vehicles are refueled
cols_gas = [
    "source_id",
    "source_Vehicle_Location_Timestamp",
    "source_Vehicle_Speed_Unit",
    "source_Vehicle_Speed_Value",
    "source_Vehicle_FuelLevel_Value",
    "source_Vehicle_Location_Address_State",
]
# Load only those columns from data set 1
data_gas = car1_loc_data(contents=cols_gas)
# Last expression of the cell: displays the names of the loaded columns
list(data_gas)

## Read in the list of cars from set #1

**To do: add the cars from set #2**

In [None]:
# Read the car list file and keep its "Car IDs" column as an array
car_id_table = pd.read_csv("~/bcdata-mojio/brian/test1_sources.txt")
sources = car_id_table["Car IDs"].values
print('Number of cars:',sources.size)

**Pick a car!**

Get all the records for that car

In [None]:
car_number = 2
# Boolean mask: rows whose source_id equals the chosen entry of the car list
is_chosen_car = data_gas["source_id"] == sources[car_number]
car_info = data_gas[is_chosen_car]
Ncar = len(car_info) # number of records for this car

**Get the states the car has visited. Fill in nan values**

In [None]:
# Replace missing states by "UTC".  fillna() recognises every NaN (an
# `is np.nan` identity test only matches the one np.nan object) and works
# on a filled copy of the column instead of writing into the array that
# `.values` hands back.
state = car_info["source_Vehicle_Location_Address_State"].fillna("UTC").values
print(np.unique(state))

**See if the speeds are mph**

In [None]:
# Distinct entries of the speed-unit column for this car
speed_units = car_info["source_Vehicle_Speed_Unit"]
distinct_units = np.unique(speed_units.values)
print(distinct_units)

**Compute the times of the records and plot the order in the data**

*Note that the data is not sorted by time*

In [None]:
time_string = car_info["source_Vehicle_Location_Timestamp"].values
time_seconds = np.zeros(Ncar)
# Convert each time stamp; a failed conversion is reported and contributes
# the value get_time() returns alongside its error flag.
for idx, stamp in enumerate(time_string):
    status, elapsed = get_time(stamp)
    if status != 0:
        print('Error',status,' in get_time')
    time_seconds[idx] = elapsed

# Elapsed time against position in the data, to show the record order
record_number = np.arange(Ncar)
plt.plot(record_number,time_seconds)
plt.xlabel('Data number')
plt.ylabel('Seconds since June 9, 2016')
plt.show()

## Sort the car data by time

In [None]:
# insert() raises ValueError if the column already exists, so drop any
# "seconds" column left over from an earlier run first; this makes the
# cell safe to execute more than once.
if "seconds" in car_info.columns:
    car_info = car_info.drop(columns="seconds")
car_info.insert(1,"seconds",time_seconds)

In [None]:
# Time-ordered copy of the car's records and the matching sorted times
car_info2 = car_info.sort_values(by="seconds")
time_sorted = car_info2.loc[:, "seconds"].values

## Now look at the time history for this car

**Speed versus time**

*Car #2 is a fast driver*

In [None]:
speed = car_info2["source_Vehicle_Speed_Value"].values
# Elapsed seconds converted to days for the time axis
days_elapsed = time_sorted/24/3600
plt.plot(days_elapsed,speed)
plt.xlabel("Time in days past June 9, 2016")
plt.ylabel("Speed mph")
plt.show()

**Now fuel % versus time**

*You can clearly see where they refueled*

In [None]:
fuel = car_info2["source_Vehicle_FuelLevel_Value"].values
# Elapsed seconds converted to days for the time axis
days_elapsed = time_sorted/24/3600
plt.plot(days_elapsed,fuel)
plt.xlabel("Time in days past June 9, 2016")
plt.ylabel("Fuel %")
plt.show()

### Tried to get a sense of when they refueled during the day

*Looks like driver #2 refuels in the mornings*

In [None]:
time_string = car_info2["source_Vehicle_Location_Timestamp"].values
# Take the states from the time-sorted frame as well, so that every time
# stamp is paired with the state of the same record; missing states fall
# back to "UTC".
state_sorted = car_info2["source_Vehicle_Location_Address_State"].fillna("UTC").values
time_hour = np.zeros(Ncar)
time_dayofweek = np.zeros(Ncar)
for j, (stamp, zone) in enumerate(zip(time_string, state_sorted)):
    time_hour[j], time_dayofweek[j] = get_day(stamp, zone)
# Hour of day against day of week, coloured by fuel level
plt.xlabel('Day of the week (Monday to Sunday)')
plt.ylabel('Hour of the day (local time)')
plt.scatter(time_dayofweek, time_hour, c=fuel, alpha=1, cmap="inferno")
plt.colorbar()
plt.show()

## How about a speed scatter plot?

**Seems to be dominated by the times the car is not moving**

In [None]:
# Hour of day against day of week, coloured by speed
plt.scatter(time_dayofweek, time_hour, c=speed, alpha=1, cmap="inferno")
plt.xlabel('Day of the week (Monday to Sunday)')
plt.ylabel('Hour of the day (local time)')
plt.colorbar()
plt.show()