In [1]:
import pandas as pd
import numpy as np
#from sklearn.preprocessing import OneHotEncoder, LabelEncoder
#from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [2]:
faults = pd.read_csv('../data/J1939Faults.csv',low_memory=False)
faults.head()

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,actionDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,ecuSource,spn,fmi,active,activeTransitionCount,faultValue,EquipmentID,MCTNumber,Latitude,Longitude,LocationTimeStamp
0,1,990349,2015-02-21 10:47:13.000,Low (Severity Low) Engine Coolant Level,,unknown,unknown,unknown,unknown,0,111,17,True,2,,1439,105354361,38.857638,-84.626851,2015-02-21 11:34:25.000
1,2,990360,2015-02-21 11:34:34.000,,,unknown,unknown,unknown,unknown,11,629,12,True,127,,1439,105354361,38.857638,-84.626851,2015-02-21 11:35:10.000
2,3,990364,2015-02-21 11:35:31.000,Incorrect Data Steering Wheel Angle,,unknown,unknown,unknown,unknown,11,1807,2,False,127,,1369,105336226,41.42125,-87.767361,2015-02-21 11:35:26.000
3,4,990370,2015-02-21 11:35:33.000,Incorrect Data Steering Wheel Angle,,unknown,unknown,unknown,unknown,11,1807,2,True,127,,1369,105336226,41.421018,-87.767361,2015-02-21 11:36:08.000
4,5,990416,2015-02-21 11:39:41.000,,,22281684P01*22357957P01*22362082P01*,13063430,0USA13_13_0415_2238A,VOLVO,0,4364,17,False,2,,1674,105427130,38.416481,-89.442638,2015-02-21 11:39:37.000


In [3]:
columns_to_drop = ['ESS_Id', 
                   'actionDescription', 
                   'ecuSoftwareVersion', 
                   'ecuSerialNumber', 
                   'ecuModel', 
                   'ecuMake', 
                   'ecuSource', 
                   'faultValue', 
                   'MCTNumber']

faults_a = faults.drop(columns=columns_to_drop)
faults_a.head()

Unnamed: 0,RecordID,EventTimeStamp,eventDescription,spn,fmi,active,activeTransitionCount,EquipmentID,Latitude,Longitude,LocationTimeStamp
0,1,2015-02-21 10:47:13.000,Low (Severity Low) Engine Coolant Level,111,17,True,2,1439,38.857638,-84.626851,2015-02-21 11:34:25.000
1,2,2015-02-21 11:34:34.000,,629,12,True,127,1439,38.857638,-84.626851,2015-02-21 11:35:10.000
2,3,2015-02-21 11:35:31.000,Incorrect Data Steering Wheel Angle,1807,2,False,127,1369,41.42125,-87.767361,2015-02-21 11:35:26.000
3,4,2015-02-21 11:35:33.000,Incorrect Data Steering Wheel Angle,1807,2,True,127,1369,41.421018,-87.767361,2015-02-21 11:36:08.000
4,5,2015-02-21 11:39:41.000,,4364,17,False,2,1674,38.416481,-89.442638,2015-02-21 11:39:37.000


In [4]:
faults_a['EventTimeStamp'] = pd.to_datetime(faults_a['EventTimeStamp'])
faults_a['LocationTimeStamp'] = pd.to_datetime(faults_a['LocationTimeStamp'])

# Function to split time of day
def categorize_time_of_day(hour):
    if 5 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 18:
        return 'Afternoon'
    else:
        return 'Evening'

# Apply the function to create a new column for time of day
faults_a['time_of_day'] = faults_a['EventTimeStamp'].dt.hour.apply(categorize_time_of_day)

faults_a['Month'] = faults_a['EventTimeStamp'].dt.month
faults_a['Year'] = faults_a['EventTimeStamp'].dt.year

In [5]:
equipment_list = faults_a['EquipmentID'].unique().tolist()

In [6]:
diagnostics = pd.read_csv("../data/VehicleDiagnosticOnboardData.csv")
diagnostics.head()

Unnamed: 0,Id,Name,Value,FaultId
0,1,IgnStatus,False,1
1,2,EngineOilPressure,0,1
2,3,EngineOilTemperature,96.74375,1
3,4,TurboBoostPressure,0,1
4,5,EngineLoad,11,1


In [7]:
diagnostics_pivoted = diagnostics.pivot(index='FaultId', columns='Name', values='Value')
diagnostic_name_list = diagnostics_pivoted.columns.tolist()
print(diagnostic_name_list)
diagnostics_pivoted.head()

['AcceleratorPedal', 'BarometricPressure', 'CruiseControlActive', 'CruiseControlSetSpeed', 'DistanceLtd', 'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure', 'EngineOilTemperature', 'EngineRpm', 'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 'FuelRate', 'FuelTemperature', 'IgnStatus', 'IntakeManifoldTemperature', 'LampStatus', 'ParkingBrake', 'ServiceDistance', 'Speed', 'SwitchedBatteryVoltage', 'Throttle', 'TurboBoostPressure']


Name,AcceleratorPedal,BarometricPressure,CruiseControlActive,CruiseControlSetSpeed,DistanceLtd,EngineCoolantTemperature,EngineLoad,EngineOilPressure,EngineOilTemperature,EngineRpm,...,FuelTemperature,IgnStatus,IntakeManifoldTemperature,LampStatus,ParkingBrake,ServiceDistance,Speed,SwitchedBatteryVoltage,Throttle,TurboBoostPressure
FaultId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,14.21,False,66.48672,423178.7,100.4,11.0,0.0,96.74375,0.0,...,,False,78.8,1023,True,,0.0,3276.75,,0.0
2,,,,,,,,,,,...,,True,,1279,,,,,,
3,,,,,,,,,,,...,,,,1279,,,,,,
4,,,,,,,,,,,...,,True,,1279,,,,,,
5,,,,,,,,,,,...,,,,16639,,,,,,


In [8]:
faults_diagnostics = pd.merge(faults_a, diagnostics_pivoted, 
                        left_on='RecordID',right_on= 'FaultId',how='left')

faults_diagnostics.head()

Unnamed: 0,RecordID,EventTimeStamp,eventDescription,spn,fmi,active,activeTransitionCount,EquipmentID,Latitude,Longitude,...,FuelTemperature,IgnStatus,IntakeManifoldTemperature,LampStatus,ParkingBrake,ServiceDistance,Speed,SwitchedBatteryVoltage,Throttle,TurboBoostPressure
0,1,2015-02-21 10:47:13,Low (Severity Low) Engine Coolant Level,111,17,True,2,1439,38.857638,-84.626851,...,,False,78.8,1023,True,,0.0,3276.75,,0.0
1,2,2015-02-21 11:34:34,,629,12,True,127,1439,38.857638,-84.626851,...,,True,,1279,,,,,,
2,3,2015-02-21 11:35:31,Incorrect Data Steering Wheel Angle,1807,2,False,127,1369,41.42125,-87.767361,...,,,,1279,,,,,,
3,4,2015-02-21 11:35:33,Incorrect Data Steering Wheel Angle,1807,2,True,127,1369,41.421018,-87.767361,...,,True,,1279,,,,,,
4,5,2015-02-21 11:39:41,,4364,17,False,2,1674,38.416481,-89.442638,...,,,,16639,,,,,,


In [9]:
#Switched to get_dummies approach later, so this is commented out now!

# faults_diagnostics= faults_diagnostics[faults_diagnostics['active']==True]
# #Full Derate
# faults_diagnostics['IsSpn5246'] = np.where(faults_diagnostics['spn']==5246,True,False)
# #Low Engine Coolant Level
# faults_diagnostics['IsSpn111'] = np.where(faults_diagnostics['spn']==111,True,False)
# #Condition Exists Engine Protection Torque Derate
# faults_diagnostics['IsSpn1569'] = np.where(faults_diagnostics['spn']==1569,True,False)
# #Low Voltage (Aftertreatment 1 Outlet NOx)
# faults_diagnostics['IsSpn3226'] = np.where(faults_diagnostics['spn']==3226,True,False)

# #print(faults_diagnostics[faults_diagnostics['IsSpn5246']==True].shape)
# #faults_diagnostics[faults_diagnostics['EquipmentID']=='1395'].tail(50)
# faults_diagnostics.head()

In [10]:
#In (lat, lon) format

#Mt Juliet Location: (36.0666667, -86.4347222)
#(36.2115942, -86.4347222) #10 miles North
#(35.9217392, -86.4347222) #10 miles South
#(36.0666667, -86.305976) #10 miles East
#(36.0666667, -86.563468) #10 miles West

#Shelbyville location: (35.588333, -86.443888)
#North: (35.6382021, -86.443888)
#South: (35.5384639, -86.443888)
#East: (35.588333, -86.3391842)
#West: (35.588333, -86.5485918)

#Russellville location: (36.1950, -83.174722)
#North: (36.2374703, -83.174722)
#South: (36.1525297, -83.174722)
#East: (36.195, -83.0693852)
#West: (36.195, -83.2800588)

faults_diagnostics_noservice = faults_diagnostics[~(
                    ((faults_diagnostics['Latitude'] > 35.9217392) & (faults_diagnostics['Latitude'] < 36.2115942) & 
                    (faults_diagnostics['Longitude'] > -86.5485918) & (faults_diagnostics['Longitude'] < -86.305976)) |
                    
                    ((faults_diagnostics['Latitude'] > 35.5384639) & (faults_diagnostics['Latitude'] < 35.6382021) & 
                    (faults_diagnostics['Longitude'] > -86.5485918) & (faults_diagnostics['Longitude'] < -86.3391842)) |
                    
                    ((faults_diagnostics['Latitude'] > 36.1525297) & (faults_diagnostics['Latitude'] < 36.2374703) & 
                    (faults_diagnostics['Longitude'] > -83.2800588) & (faults_diagnostics['Longitude'] < -83.0693852))
                                                   )]
print(faults_diagnostics.shape)
print(faults_diagnostics_noservice.shape)
faults_diagnostics_noservice.head()

(1187335, 38)
(1029682, 38)


Unnamed: 0,RecordID,EventTimeStamp,eventDescription,spn,fmi,active,activeTransitionCount,EquipmentID,Latitude,Longitude,...,FuelTemperature,IgnStatus,IntakeManifoldTemperature,LampStatus,ParkingBrake,ServiceDistance,Speed,SwitchedBatteryVoltage,Throttle,TurboBoostPressure
0,1,2015-02-21 10:47:13,Low (Severity Low) Engine Coolant Level,111,17,True,2,1439,38.857638,-84.626851,...,,False,78.8,1023,True,,0.0,3276.75,,0.0
1,2,2015-02-21 11:34:34,,629,12,True,127,1439,38.857638,-84.626851,...,,True,,1279,,,,,,
2,3,2015-02-21 11:35:31,Incorrect Data Steering Wheel Angle,1807,2,False,127,1369,41.42125,-87.767361,...,,,,1279,,,,,,
3,4,2015-02-21 11:35:33,Incorrect Data Steering Wheel Angle,1807,2,True,127,1369,41.421018,-87.767361,...,,True,,1279,,,,,,
4,5,2015-02-21 11:39:41,,4364,17,False,2,1674,38.416481,-89.442638,...,,,,16639,,,,,,


In [11]:
faults_diagnostics_noservice.sort_values(by = ['EquipmentID', 'EventTimeStamp'], ascending = True).head(30)

Unnamed: 0,RecordID,EventTimeStamp,eventDescription,spn,fmi,active,activeTransitionCount,EquipmentID,Latitude,Longitude,...,FuelTemperature,IgnStatus,IntakeManifoldTemperature,LampStatus,ParkingBrake,ServiceDistance,Speed,SwitchedBatteryVoltage,Throttle,TurboBoostPressure
1001106,1038243,2018-07-20 09:31:33,High (Severity Medium) J1939 Network #2,1231,16,True,2,105406655,36.139351,-85.629722,...,,True,138.2,255,False,,5.203984,,100.0,5.8
1001107,1038244,2018-07-20 09:42:30,High (Severity Medium) J1939 Network #2,1231,16,False,2,105406655,36.138981,-85.629675,...,,,,255,,,,,,
358800,366301,2016-01-31 07:12:25,,629,12,True,127,105301976,41.987175,-87.73199,...,,True,,1279,,,,,,
927313,955890,2018-01-26 18:58:08,Low (Severity Medium) Transmission Air Tank Pr...,37,18,True,1,105338729,38.348564,-85.70912,...,,True,120.2,50175,False,,0.0,,100.0,0.29
927314,955891,2018-01-26 18:59:36,Low (Severity Medium) Transmission Air Tank Pr...,37,18,False,1,105338729,38.348611,-85.70912,...,,,,50175,,,,,,
936445,965022,2018-02-12 20:29:39,,5939,16,True,3,105338729,39.953425,-81.935462,...,,True,114.8,50175,False,,0.0,,100.0,1.16
936449,965026,2018-02-12 20:32:28,,5939,16,False,4,105338729,39.953287,-81.935462,...,,,,50175,,,,,,
936451,965028,2018-02-12 20:33:23,,5939,0,True,1,105338729,39.953379,-81.935416,...,,True,109.4,50175,False,,0.2815588,,100.0,0.29
936610,965187,2018-02-13 06:58:56,,5939,0,False,1,105338729,39.953287,-81.93537,...,,,,50175,,,,,,
938572,967149,2018-02-15 21:21:55,Low (Severity Medium) Transmission Air Tank Pr...,37,18,True,4,105338729,29.806388,-95.275648,...,,True,125.6,50175,False,,0.0,,100.0,1.16


In [12]:
#faults_diagnostics.to_csv('../data/faults_diagnostics.csv')
#faults_diagnostics_noservice.to_csv('../data/faults_diagnostics_no_service_stations.csv')

In [13]:
#faults_diagnostics_noservice.dtypes
#faults_diagnostics_noservice.set_index('EventTimeStamp').rolling(5, on='EquipmentID', min_periods=0).sum().reset_index()
#faults_diagnostics_noservice.rolling('5D', on='spn')
faults_diagnostics_noservice.sort_values(['EventTimeStamp']).groupby(['EquipmentID']).rolling('5D', on='EventTimeStamp', min_periods=0).sum().head()

       'DistanceLtd', 'EngineCoolantTemperature', 'EngineOilPressure',
       'EngineOilTemperature', 'EngineRpm', 'EngineTimeLtd', 'FuelLevel',
       'FuelLtd', 'FuelRate', 'FuelTemperature', 'IgnStatus',
       'IntakeManifoldTemperature', 'LocationTimeStamp', 'ParkingBrake',
       'Speed', 'SwitchedBatteryVoltage', 'Throttle', 'TurboBoostPressure',
       'eventDescription', 'time_of_day'],
      dtype='object')
  faults_diagnostics_noservice.sort_values(['EventTimeStamp']).groupby(['EquipmentID']).rolling('5D', on='EventTimeStamp', min_periods=0).sum().head()


Unnamed: 0_level_0,Unnamed: 1_level_0,CruiseControlSetSpeed,EventTimeStamp,EngineLoad,LampStatus,Latitude,Longitude,Month,RecordID,ServiceDistance,Year,active,activeTransitionCount,fmi,spn
EquipmentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
105406655,1001106,0.0,2018-07-20 09:31:33,12.0,255.0,36.139351,-85.629722,7.0,1038243.0,0.0,2018.0,1.0,2.0,16.0,1231.0
105406655,1001107,0.0,2018-07-20 09:42:30,12.0,510.0,72.278332,-171.259397,14.0,2076487.0,0.0,4036.0,1.0,4.0,32.0,2462.0
105301976,358800,0.0,2016-01-31 07:12:25,0.0,1279.0,41.987175,-87.73199,1.0,366301.0,0.0,2016.0,1.0,127.0,12.0,629.0
105338729,927313,0.0,2018-01-26 18:58:08,16.0,50175.0,38.348564,-85.70912,1.0,955890.0,0.0,2018.0,1.0,1.0,18.0,37.0
105338729,927314,0.0,2018-01-26 18:59:36,16.0,100350.0,76.697175,-171.41824,2.0,1911781.0,0.0,4036.0,1.0,2.0,36.0,74.0


In [14]:
#Would a onehotencoder or pd.get_dummies approach with 418 new columns make sense?
spn_list = faults_diagnostics_noservice['spn'].unique().tolist()
#spn_list_string = list(map(str, spn_list))
print(len(spn_list))

420


In [15]:
faults_diagnostics_noservice_spndummies = pd.concat((faults_diagnostics_noservice,pd.get_dummies(faults_diagnostics_noservice['spn'])),1)
faults_diagnostics_noservice_spndummies.head()

  faults_diagnostics_noservice_spndummies = pd.concat((faults_diagnostics_noservice,pd.get_dummies(faults_diagnostics_noservice['spn'])),1)


Unnamed: 0,RecordID,EventTimeStamp,eventDescription,spn,fmi,active,activeTransitionCount,EquipmentID,Latitude,Longitude,...,520413,520953,521032,523530,523531,523543,524033,524037,524071,524287
0,1,2015-02-21 10:47:13,Low (Severity Low) Engine Coolant Level,111,17,True,2,1439,38.857638,-84.626851,...,0,0,0,0,0,0,0,0,0,0
1,2,2015-02-21 11:34:34,,629,12,True,127,1439,38.857638,-84.626851,...,0,0,0,0,0,0,0,0,0,0
2,3,2015-02-21 11:35:31,Incorrect Data Steering Wheel Angle,1807,2,False,127,1369,41.42125,-87.767361,...,0,0,0,0,0,0,0,0,0,0
3,4,2015-02-21 11:35:33,Incorrect Data Steering Wheel Angle,1807,2,True,127,1369,41.421018,-87.767361,...,0,0,0,0,0,0,0,0,0,0
4,5,2015-02-21 11:39:41,,4364,17,False,2,1674,38.416481,-89.442638,...,0,0,0,0,0,0,0,0,0,0


In [16]:
final_columns = ['EventTimeStamp', 'EquipmentID', *spn_list]
faults_diagnostics_noservice_spndummies[final_columns].sort_values(['EventTimeStamp']).groupby(['EquipmentID']).rolling('5D', on='EventTimeStamp', min_periods=0).sum().reset_index()
#Not sure what this "level_1" column is, but everything else seems great!

Unnamed: 0,EquipmentID,level_1,EventTimeStamp,0,16,27,33,37,38,51,...,520413,520953,521032,523530,523531,523543,524033,524037,524071,524287
0,0105406655,1001106,2018-07-20 09:31:33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0105406655,1001107,2018-07-20 09:42:30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,105301976,358800,2016-01-31 07:12:25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,105338729,927313,2018-01-26 18:58:08,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,105338729,927314,2018-01-26 18:59:36,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1029677,R1762,4427,2015-02-24 15:31:17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1029678,R1762,4426,2015-02-24 15:31:56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1029679,R1762,4494,2015-02-24 16:24:05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1029680,R1762,6438,2015-02-26 13:12:11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
faults_diagnostics_noservice_spndummies[final_columns].head()

Unnamed: 0,EventTimeStamp,EquipmentID,111,629,1807,4364,97,1067,96,829,...,335040,520200,7323,88121,153931,13600,767,603,577,5953
0,2015-02-21 10:47:13,1439,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2015-02-21 11:34:34,1439,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2015-02-21 11:35:31,1369,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2015-02-21 11:35:33,1369,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2015-02-21 11:39:41,1674,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
faults_diagnostics_noservice_spndummies[faults_diagnostics_noservice_spndummies['EquipmentID']=='0105406655']

Unnamed: 0,RecordID,EventTimeStamp,eventDescription,spn,fmi,active,activeTransitionCount,EquipmentID,Latitude,Longitude,...,520413,520953,521032,523530,523531,523543,524033,524037,524071,524287
1001106,1038243,2018-07-20 09:31:33,High (Severity Medium) J1939 Network #2,1231,16,True,2,105406655,36.139351,-85.629722,...,0,0,0,0,0,0,0,0,0,0
1001107,1038244,2018-07-20 09:42:30,High (Severity Medium) J1939 Network #2,1231,16,False,2,105406655,36.138981,-85.629675,...,0,0,0,0,0,0,0,0,0,0


In [19]:
predictors = spn_list.copy()
predictors.remove(5246)
X = faults_diagnostics_noservice_spndummies[predictors]
y = faults_diagnostics_noservice_spndummies[5246]
print(len(predictors))
print(len(spn_list))

419
420


In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 321, stratify = y)

In [21]:
rf = RandomForestClassifier(max_depth = 2).fit(X_train, y_train)
print(f'Test Set Accuracy: {accuracy_score(y_test, rf.predict(X_test))}\n')
print(classification_report(y_test, rf.predict(X_test)))

Test Set Accuracy: 0.9991026373139721

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257190
           1       0.00      0.00      0.00       231

    accuracy                           1.00    257421
   macro avg       0.50      0.50      0.50    257421
weighted avg       1.00      1.00      1.00    257421



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
importances = pd.DataFrame({
    'variable': predictors,
    'importance': rf.feature_importances_
})

importances.sort_values('importance', ascending = False).head(20)

#probably overfit. Ideas for next steps: 
#1) cut down on predictors
#2) use decision tree model to help visualize and formalize decision maaking for an average user.

Unnamed: 0,variable,importance
6,96,0.086171
4,97,0.062175
10,50353,0.054781
7,829,0.054487
9,929,0.048327
15,791,0.047248
24,639,0.047099
22,789,0.042703
8,596,0.039124
0,111,0.036878
