In [1]:
from __future__ import print_function, division
import time, os
import numpy as np
import matplotlib.pyplot as plt
import sys
import networkx as nx
import pandas

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
# Default image rendering: nearest-neighbour interpolation and a gray colormap.
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [2]:
# Load the raw field snapshots and keep only the columns listed below.
# 'Veh Ref ID' and 'Event DateTime' are the vehicle-id and timestamp columns
# that later cells use as keys (see s_veh_key / s_time_key).
snapshots_df_full = pandas.read_csv('/home/cs231n/data/Field_Snaps_With_warranty.txt')
selected_snapshots_full = snapshots_df_full[['Veh Ref ID',
 'Event DateTime',
 'Event Type Description',
 'Acc Pedal Position',
 'Ambient Air Temp',
 'Barometric Press',
 'Brake Switch',
 'Bus Utilization',
 'Cat Intake Gas Temp',
 'Cat Outlet Gas Temp',
 'Clutch Switch',
 'Cmd Eng Fuel Press',
 'Cruise Status',
 'Dpf Regen Inhibit Sw',
 'Dpf Thermal Mngmnt',
 'Drvr Demand Torque',
 'Eng Air Flow Rate',
 'Eng Avg Fuel Econ',
 'Eng Coolant Level',
 'Eng Coolant Temp',
 'Eng Demand Torque',
 'Eng DPF Intake Press',
 'Eng Egr Valve Pos',
 'Eng Exhaust Gas Temp',
 'Eng Fuel Del Press',
 'EngFuelTemp1',
 'Engine Speed',
 'Eng Man Abs Pressure',
 'Eng Oil Pressure',
 'EngInjRail1Press',
 'EngIntakeMan1Temp',
 'EngOilTemp1',
 'Eng Percent Torque',
 'EngTurbo1Boost',
 'EngTurbo1Pos',
 'EngTurbo1Speed',
 'Event - All Lamps On Time Hr',
 'Event - Amber Lamp Time Hr',
 'Event - Mil Lamp Time Hr',
 'Event - Red Lamp Time Hr',
 'Exhaust Tank Level',
 'Exhaust Tank Temp',
 'Fan Speed',
 'Keyswitch Bat Pot',
 'Part Trap Diff Press',
 'Part Trap Out Temp',
 'Scr Intake Gas Temp',
 'Scr Outlet Gas Temp',
 'Vehicle Speed',
 'Population',
 'DTCID',
 'Trip Distance',
 'Trip Idle Time',
 'Trip Run Time',
 'Altitude',
 'Engine Start Ambient',
 'Engine Start Coolant',
 'Latitude',
 'Longitude',
 'Lifetime Idle Hours',
 'Lifetime Idle Fuel',
 'Lifetime Fuel',
 'Lifetime Distance',
 'Lifetime Engine Hours']]
# Load the repair records and keep only the columns listed below.
# The vehicle-id column header contains embedded newlines, hence the '\n'
# escapes in 'Chassis\nReference\nNumber'.
selected_repairs_full = pandas.read_csv('/home/cs231n/data/repairs.csv')[[
 'Chassis\nReference\nNumber',
 'Model Vehicle',
 'Build_Dt',
 'Dlvry_Dt',
 'In Service Date',
 'Miles',
 'Rpr_Dt',
 'ATA3',
 'ATA3Desc',
 'ATA6',
 'ATA6Desc',
 'ATA9',
 'ATA9Desc',
 'Fail Type',
 'Repair Cost']]
# Drop repair rows that have no vehicle reference number.
selected_repairs_full = selected_repairs_full[selected_repairs_full['Chassis\nReference\nNumber'].notnull()]

In [4]:
# Leave out repairs made before delivery: keep only rows with positive mileage.
after_delivery = selected_repairs_full['Miles'] > 0
selected_repairs = selected_repairs_full[after_delivery]

In [5]:
# Optional repair-cost filter, currently disabled:
#selected_repairs = selected_repairs_full[selected_repairs_full['Repair Cost'].isin(['medium', 'high ', 'very high'])]

# Vehicles that have at least one retained repair record ...
veh_ids = selected_repairs['Chassis\nReference\nNumber'].unique()
# ... and the snapshots that belong to those vehicles.
has_repair_record = selected_snapshots_full['Veh Ref ID'].isin(veh_ids)
selected_snapshots = selected_snapshots_full[has_repair_record]

In [10]:
# Back-fill missing values: each gap takes the next valid value further down
# the same column. `.bfill()` is the direct equivalent of the deprecated
# `fillna(method='bfill')` spelling.
# NOTE(review): the fill runs over the whole frame, so a gap can be filled from
# a row that belongs to a different vehicle -- confirm this is intended.
selected_snapshots = selected_snapshots.bfill()
selected_repairs = selected_repairs.bfill()

In [11]:
# Persist the filtered, back-filled frames.
# The row index is written as well (no index=False), which is why the reloaded
# frames carry an 'Unnamed: 0' column that a later cell drops.
selected_snapshots.to_csv('/home/cs231n/data/snapshots_full_cleaned.csv')
selected_repairs.to_csv('/home/cs231n/data/repairs_full_cleaned.csv')

In [12]:
def s_to_date(x):
    """Parse a snapshot timestamp string such as '6/14/2016 20:17:07'."""
    return pandas.to_datetime(x, format='%m/%d/%Y %H:%M:%S')


# Reload the cleaned snapshots, parsing 'Event DateTime' into timestamps.
snapshots_df = pandas.read_csv(
    '/home/cs231n/data/snapshots_full_cleaned.csv',
    converters={'Event DateTime': s_to_date})

In [14]:
def r_to_date(x):
    """Parse a repair date string such as '2/3/2016'."""
    return pandas.to_datetime(x, format='%m/%d/%Y')


# Reload the cleaned repairs, parsing 'Rpr_Dt' into timestamps.
repairs_df = pandas.read_csv(
    '/home/cs231n/data/repairs_full_cleaned.csv',
    converters={'Rpr_Dt': r_to_date})

In [28]:
# Drop the index column that the CSV round-trip added. `axis` is passed by
# keyword because the positional form is no longer accepted by current pandas,
# and errors='ignore' keeps the cell safe to re-run once the column is gone.
snapshots_df = snapshots_df.drop(['Unnamed: 0'], axis=1, errors='ignore')

In [30]:
# Drop the index column that the CSV round-trip added. `axis` is passed by
# keyword because the positional form is no longer accepted by current pandas,
# and errors='ignore' keeps the cell safe to re-run once the column is gone.
repairs_df = repairs_df.drop(['Unnamed: 0'], axis=1, errors='ignore')

In [31]:
# Persist the date-parsed frames.
# NOTE(review): the row index is written again here (no index=False), so anyone
# reading these files back will see an extra unnamed first column.
snapshots_df.to_csv('/home/cs231n/data/snapshots_cleaned_formatted.csv')
repairs_df.to_csv('/home/cs231n/data/repairs_cleaned_formatted.csv')

In [23]:
def get_k_dates(k, spacing, end_date):
    """Return ``k + 1`` dates stepping backwards from ``end_date``.

    k        -- number of intervals
    spacing  -- distance between consecutive dates, in days
    end_date -- when the repair occurred, which ends the sequence

    The first element is ``end_date`` itself (as a pandas timestamp); each
    following element is ``spacing`` days earlier than the previous one.
    """
    anchor = pandas.to_datetime(end_date)
    dates = []
    # k + 1 points, so that both ends of the k intervals are included
    for step in range(k + 1):
        dates.append(anchor + pandas.DateOffset(days=-step * spacing))
    return dates

In [33]:
# Column names used to join snapshots ("s_") with repairs ("r_").
# Vehicle-id column of each frame:
s_veh_key, r_veh_key = 'Veh Ref ID', 'Chassis\nReference\nNumber'
# Timestamp column of each frame:
s_time_key, r_time_key = 'Event DateTime', 'Rpr_Dt'

In [53]:
# Preview the first three snapshot rows.
snapshots_df.head(3)

Unnamed: 0,Veh Ref ID,Event DateTime,Event Type Description,Acc Pedal Position,Ambient Air Temp,Barometric Press,Brake Switch,Bus Utilization,Cat Intake Gas Temp,Cat Outlet Gas Temp,...,Altitude,Engine Start Ambient,Engine Start Coolant,Latitude,Longitude,Lifetime Idle Hours,Lifetime Idle Fuel,Lifetime Fuel,Lifetime Distance,Lifetime Engine Hours
0,254.0,2016-04-03 18:00:44,trip_start,0.0,38.59,101.5,0.0,49.0,211.78,214.28,...,48.0,37.77,86.66,37.5,-121.01,43.35,210.56,267.87,138.59,55.75
1,254.0,2016-04-03 17:59:00,trip_end,0.0,38.37,101.5,0.0,48.0,206.09,211.68,...,50.0,10.55,25.55,37.5,-121.01,43.3,210.56,267.87,138.59,55.75
2,254.0,2016-04-25 17:58:26,trip_periodic,0.0,38.0,101.5,0.0,49.0,312.68,312.37,...,49.0,23.88,92.77,37.5,-121.01,215.45,1132.35,1335.88,388.09,264.5


In [74]:
def get_slices(k, spacing, num_slices, snapshots):
    """Cut up to ``k`` windows of ``spacing`` rows, walking back from ``num_slices``.

    Window ``i`` covers positions
    ``[num_slices - (i + 1) * spacing, num_slices - i * spacing)`` of
    ``snapshots``. For the final window (``i == k - 1``) the lower bound is
    reset to 0, so that window reaches back to the first row. The walk stops
    at the first window whose lower or upper bound is negative.

    Returns a list of ``(i, snapshots[lower:upper])`` tuples in increasing ``i``.
    """
    windows = []
    final_idx = k - 1
    for idx in range(k):
        lower = num_slices - (idx + 1) * spacing
        upper = num_slices - idx * spacing

        # NOTE(review): ``num_slices - lower`` equals ``(idx + 1) * spacing``,
        # so for a positive spacing the second test is always true.
        if idx == final_idx and num_slices - lower > 0:
            lower = 0

        if lower < 0 or upper < 0:
            break
        windows.append((idx, snapshots[lower:upper]))

    return windows

In [82]:
def get_repair_slices_map(veh_ids, snapshots, repairs, k=10, spacing=10, code='ATA9'):
    """Collect, per vehicle and repair type, the snapshot windows preceding each repair.

    veh_ids   -- iterable of vehicle ids to process
    snapshots -- snapshot frame with the ``s_veh_key`` and ``s_time_key`` columns
    repairs   -- repair frame with the ``r_veh_key``, ``r_time_key`` and ``code`` columns
    k         -- maximum number of windows per repair (passed to ``get_slices``)
    spacing   -- number of snapshots per window (passed to ``get_slices``)
    code      -- repair column used to group repairs into repair types

    Returns a nested dict ``{veh_id: {repair_type: {i: [slice, ...]}}}`` where
    ``i`` is the window index from ``get_slices`` (0 is the window closest to
    the repair) and every ``slice`` is a non-empty frame of snapshots.

    For each repair type the repairs are walked in date order. The snapshots
    considered for one repair are those between the previous repair of that
    type (or 1/1/2000 for the first) and the repair itself, both bounds
    inclusive.
    """
    repair_slices = {}
    for veh_id in veh_ids:
        v_snapshots = snapshots[snapshots[s_veh_key] == veh_id].sort_values(by=s_time_key)
        v_repairs = repairs[repairs[r_veh_key] == veh_id].sort_values(by=r_time_key)

        ## in the past so the first snapshot is captured
        start_date = pandas.to_datetime('1/1/2000')

        veh_slices = {}

        ## Best indicator of repair type is the ATA9 code.
        ## Group by the bare column name (not a one-element list) so that
        ## repair_type stays a scalar key on every pandas version.
        for repair_type, repair_group in v_repairs.groupby(code):
            start = start_date

            ## for each repair type, grab slices of snapshots
            veh_slices_repair = {}
            for end in repair_group[r_time_key]:
                range_mask = (v_snapshots[s_time_key] >= start) & (v_snapshots[s_time_key] <= end)
                ## Slice the masked window itself: positions handed to
                ## get_slices are relative to this window, not to the
                ## vehicle's whole history.
                window = v_snapshots[range_mask]

                for i, window_slice in get_slices(k, spacing, len(window), window):
                    if len(window_slice) > 0:
                        veh_slices_repair.setdefault(i, []).append(window_slice)

                ## reset start to end for next iteration
                start = end

            veh_slices[repair_type] = veh_slices_repair

        repair_slices[veh_id] = veh_slices

    return repair_slices

In [83]:
def get_train_val_test(repair_slices_map, train_ids, val_ids, test_ids):
    """Split ``repair_slices_map`` into train, validation and test parts.

    repair_slices_map -- mapping keyed by vehicle id
    train_ids, val_ids, test_ids -- either a single vehicle id or a
        collection of vehicle ids (list, tuple, set, array, ...)

    Returns a 3-tuple ``(train, val, test)``. For a single id the element is
    ``repair_slices_map[id]``. For a collection of ids it is a dict
    ``{veh_id: repair_slices_map[veh_id]}`` restricted to those ids.

    Raises ``KeyError`` when a requested id is not in ``repair_slices_map``.
    """
    def _select(ids):
        try:
            # A single hashable id is looked up directly.
            return repair_slices_map[ids]
        except TypeError:
            # Unhashable argument (list, set, array, ...): a collection of ids.
            pass
        except KeyError:
            # A tuple/frozenset that is not itself a key is taken to be a
            # collection of ids; any other missing key is a genuine error.
            if not isinstance(ids, (tuple, frozenset)):
                raise
        return {veh_id: repair_slices_map[veh_id] for veh_id in ids}

    return (_select(train_ids), _select(val_ids), _select(test_ids))

In [35]:
## TODO: remove the veh ids
## TODO: split into train, val and test sets

In [78]:
# Build the vehicle -> repair type -> window index -> [snapshot slices] map
# for every vehicle that has repair records.
repair_slices_all = get_repair_slices_map(
    veh_ids=veh_ids,
    snapshots=snapshots_df,
    repairs=repairs_df)

In [81]:
# Spot check: number of snapshot slices collected for vehicle 259,
# repair type 44004001, window index 2.
len(repair_slices_all[259][44004001][2])

1

In [None]:
# NOTE(review): this cell has not been executed. `repair_slices_map`,
# `train_ids`, `val_ids` and `test_ids` are not defined in the visible cells
# (the map built above is named `repair_slices_all`), and the returned tuple
# is not assigned to anything -- confirm the intended inputs before running.
get_train_val_test(repair_slices_map, train_ids, val_ids, test_ids)