# Head Injury Criterion Prediction

In [40]:
# Import modules

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

In [2]:
os.listdir()

['.ipynb_checkpoints', 'data', 'data_description.pdf', 'hic-prediction.ipynb']

In [3]:
# set paths
project_path = os.getcwd()
data_path = project_path + '/data'

In [4]:
def std_col_names(df):
    df.columns = df.columns.str.lower().str.strip().str.replace(' ', '_')
    return df

In [5]:
# load data to frame
veh = pd.read_csv(data_path+'/veh.psv', delimiter= '\t')
veh = std_col_names(veh)

In [6]:
veh.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12131 entries, 0 to 12130
Data columns (total 93 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   tstno    12131 non-null  int64  
 1   vehno    12131 non-null  int64  
 2   make     12131 non-null  int64  
 3   maked    12131 non-null  object 
 4   model    12131 non-null  int64  
 5   modeld   12131 non-null  object 
 6   year     11953 non-null  float64
 7   nhtsano  6470 non-null   object 
 8   body     8517 non-null   object 
 9   bodyd    11807 non-null  object 
 10  vin      7824 non-null   object 
 11  engine   11660 non-null  object 
 12  engined  11660 non-null  object 
 13  engdsp   10709 non-null  float64
 14  transm   8195 non-null   object 
 15  transmd  11699 non-null  object 
 16  vehtwt   11797 non-null  float64
 17  curbwt   637 non-null    float64
 18  whlbas   11355 non-null  float64
 19  vehlen   11488 non-null  float64
 20  vehwid   11280 non-null  float64
 21  vehcg    110

In [7]:
num_cols_veh = veh.select_dtypes(include=['int64', 'float64']).columns.to_list()
cat_cols_veh = veh.select_dtypes(include=['object']).columns.to_list()

len(num_cols_veh), len(cat_cols_veh)

(68, 25)

In [8]:
(veh.describe()).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
tstno,12131.0,5210.088946,3307.114848,1.0,2226.5,5048.0,8058.0,11847.0
vehno,12131.0,1.227022,0.418924,1.0,1.0,1.0,1.0,2.0
make,12131.0,27.044926,25.930954,1.0,7.0,23.0,32.0,102.0
model,12131.0,20.729618,25.942436,1.0,2.0,11.0,27.0,99.0
year,11953.0,1450.450431,894.136012,0.0,0.0,1997.0,2008.0,2021.0
...,...,...,...,...,...,...,...,...
ax19,9495.0,6962.071617,23562.217251,0.0,0.0,3216.0,4270.0,999999.0
ax20,9508.0,6922.611695,23551.150903,0.0,0.0,3128.5,4212.0,999999.0
ax21,9286.0,6135.489446,27386.443756,-10.0,0.0,0.0,450.0,999999.0
carang,10228.0,68.312867,242.260414,0.0,0.0,0.0,0.0,999.0


In [9]:
veh.describe(include='object').T

Unnamed: 0,count,unique,top,freq
maked,12131,93,NHTSA,2747
modeld,12131,804,DEFORMABLE IMPACTOR,1850
nhtsano,6470,4819,999,85
body,8517,20,4S,3547
bodyd,11807,23,FOUR DOOR SEDAN,3547
vin,7824,6366,1FTEW1C85GFA99353,76
engine,11660,19,NAPP,3463
engined,11660,20,NOT APPLICABLE,3461
transm,8195,12,AF,3421
transmd,11699,15,NOT APPLICABLE,3503


In [10]:
null_veh = pd.DataFrame(veh.isna().sum()).reset_index()\
                .rename(columns= {'index':'feature', 0:'null_count'}).sort_values(by='null_count', ascending = False)
null_veh

Unnamed: 0,feature,null_count
17,curbwt,11494
57,apleng,9584
53,bmpeng,9065
55,sileng,9006
71,ax3,8154
...,...,...
4,model,0
3,maked,0
2,make,0
1,vehno,0


In [11]:
null_veh.loc[null_veh['feature']=='bx1', ['null_count']]

Unnamed: 0,null_count
29,2561


In [12]:
def exclude_feature(df, null_count):
    """
    Fetch list of numerical columns from dataframe
    where number of null values exceed threshold `null_count`
    """
    return [df.iloc[idx][0] for idx in range(len(df)) if df.iloc[idx][1] > null_count]

excl_veh = exclude_feature(null_veh, 2835)

In [13]:
veh.columns

Index(['tstno', 'vehno', 'make', 'maked', 'model', 'modeld', 'year', 'nhtsano',
       'body', 'bodyd', 'vin', 'engine', 'engined', 'engdsp', 'transm',
       'transmd', 'vehtwt', 'curbwt', 'whlbas', 'vehlen', 'vehwid', 'vehcg',
       'strsep', 'strsepd', 'colmec', 'colmecd', 'modind', 'modindd', 'moddsc',
       'bx1', 'bx2', 'bx3', 'bx4', 'bx5', 'bx6', 'bx7', 'bx8', 'bx9', 'bx10',
       'bx11', 'bx12', 'bx13', 'bx14', 'bx15', 'bx16', 'bx17', 'bx18', 'bx19',
       'bx20', 'bx21', 'vehspd', 'crbang', 'pdof', 'bmpeng', 'bmpengd',
       'sileng', 'silengd', 'apleng', 'aplengd', 'dpd1', 'dpd2', 'dpd3',
       'dpd4', 'dpd5', 'dpd6', 'vdi', 'lencnt', 'damdst', 'crhdst', 'ax1',
       'ax2', 'ax3', 'ax4', 'ax5', 'ax6', 'ax7', 'ax8', 'ax9', 'ax10', 'ax11',
       'ax12', 'ax13', 'ax14', 'ax15', 'ax16', 'ax17', 'ax18', 'ax19', 'ax20',
       'ax21', 'carang', 'vehor', 'vehcom'],
      dtype='object')

In [14]:
feature_cols_veh = ['tstno', 'vehno', 'body', 'engine', 'engdsp', 'transm', 
               'whlbas','vehlen', 'vehwid', 'vehcg', 'modind', 'vehspd',
                'crbang', 'pdof', 'lencnt']

In [15]:
feature = 'vehor'
(veh[feature]).isna().value_counts(normalize= True, sort= True), ((veh[feature])==0).value_counts(normalize=True, sort= False)

(False    0.841893
 True     0.158107
 Name: vehor, dtype: float64,
 True     0.719809
 False    0.280191
 Name: vehor, dtype: float64)

In [16]:
veh[feature].value_counts()
# veh[feature].sample(20)

0.0      8732
999.0     765
90.0      475
15.0      123
9.0        43
99.0       37
7.0        18
75.0       11
23.0        4
26.0        2
45.0        2
60.0        1
Name: vehor, dtype: int64

In [17]:
def dirty_val_ratio(df, feature, vals)
    return len(df[~df[feature].isin([np.nan, 0, 999])])/len(veh)

0.05902233946088534

In [85]:
# load data to frame
occ = pd.read_csv(data_path+'/occ.psv', delimiter= '|')
occ = std_col_names(occ)
occ = occ.rename(columns= {'ctrh2d':'cntrh2d', 'ctrl2':'cntrl2'})


In [19]:
occ.head()

Unnamed: 0,tstno,vehno,occloc,occlocd,occtyp,occtypd,occage,occsex,occsexd,occht,...,t2,clip3m,lfem,rfem,csi,lbelt,sbelt,tti,pelvg,occcom
0,1,1,1,LEFT FRONT SEAT,P5,PART 572 DUMMY,,M,MALE,,...,96.45,60.6,0.0,0.0,60.0,0.0,7099.0,,,NO COMMENTS
1,1,1,2,RIGHT FRONT SEAT,P5,PART 572 DUMMY,,M,MALE,,...,109.28,45.4,0.0,0.0,60.0,0.0,5694.0,,,NO COMMENTS
2,2,1,1,LEFT FRONT SEAT,P5,PART 572 DUMMY,,M,MALE,,...,165.23,0.0,0.0,0.0,51.0,0.0,0.0,,,NO COMMENTS
3,2,1,2,RIGHT FRONT SEAT,P5,PART 572 DUMMY,,M,MALE,,...,164.48,0.0,0.0,0.0,35.0,0.0,0.0,,,NO COMMENTS
4,2,2,1,LEFT FRONT SEAT,SD,NHTSA SIDE IMPACT DUMMY,,M,MALE,,...,127.28,16.2,0.0,0.0,60.0,0.0,0.0,,,NO COMMENTS


In [83]:
occ.columns

Index(['tstno', 'vehno', 'occloc', 'occlocd', 'occtyp', 'occtypd', 'occage',
       'occsex', 'occsexd', 'occht', 'occwt', 'mthcal', 'mthcald', 'dumsiz',
       'dumsizd', 'dumman', 'dummod', 'dumdsc', 'hh', 'hw', 'hr', 'hs', 'cd',
       'cs', 'ad', 'hd', 'kd', 'hb', 'nb', 'cb', 'kb', 'seposn', 'seposnd',
       'cntrh1', 'cntrh1d', 'cntrh2', 'cntrh2d', 'cntrc1', 'cntrc1d', 'cntrc2',
       'cntrc2d', 'cntrl1', 'cntrl1d', 'ctrl2', 'cntrl2d', 'hic', 't1', 't2',
       'clip3m', 'lfem', 'rfem', 'csi', 'lbelt', 'sbelt', 'tti', 'pelvg',
       'occcom'],
      dtype='object')

In [49]:
feature_cols_occ = ['tstno', 'vehno','occloc','occsex', 'seposn',
                    'cntrh1', 'cntrh2', 'cntrc1', 'cntrc2', 'cntrl1',
                    'ctrl2', 't1', 't2','hic'     
]

In [50]:
feature = 'pelvg'
(occ[feature]).isna().value_counts(normalize= True, sort= True), ((occ[feature])==0).value_counts(normalize=True, sort= False)

(False    0.690599
 True     0.309401
 Name: pelvg, dtype: float64,
 False    0.436869
 True     0.563131
 Name: pelvg, dtype: float64)

In [23]:
occ[feature].value_counts()

0.00        8871
9.00         280
9999.99      140
99999.00      88
50.00         17
            ... 
73.40          1
73.60          1
59.90          1
85.60          1
67.60          1
Name: pelvg, Length: 744, dtype: int64

In [93]:
# Feature list from both tables
feature_cols_veh = ['tstno', 'vehno', 'body', 'engine', 'engdsp', 'transm', 
               'whlbas','vehlen', 'vehwid', 'vehcg', 'modind', 'vehspd',
                'crbang', 'pdof', 'lencnt']

feature_cols_occ = ['tstno', 'vehno','occloc','occsex', 'seposn',
                    'cntrh1', 'cntrh2', 'cntrc1', 'cntrc2', 'cntrl1',
                    'cntrl2', 't1', 't2', 'hic'     
]

In [164]:
# Extracting data based on selected features
veh_df = veh[feature_cols_veh]
occ_df = occ[feature_cols_occ]
len(veh_df), len(occ_df)

(12131, 15753)

In [205]:
# Merge DFs on composite keys
merged_df = pd.merge(left= veh_df, right= occ_df, how= 'inner',\
                    on= ['tstno', 'vehno']
                    )
# merged_df['hic'] = merged_df['hic'].dropna(inplace= True)

merged_df = merged_df[(merged_df['hic']>150) & (merged_df['hic'] < 1500)]

In [207]:
# merged_df = merged_df[(merged_df['hic']>150) & (merged_df['hic'] < 1500)]
merged_df#.isna().value_counts(normalize=True)

Unnamed: 0,tstno,vehno,body,engine,engdsp,transm,whlbas,vehlen,vehwid,vehcg,...,seposn,cntrh1,cntrh2,cntrc1,cntrc2,cntrl1,cntrl2,t1,t2,hic
0,1,1,5H,OTHR,2.8,AF,2664.0,4488.0,1730.0,1069.0,...,CN,SW,UN,SW,UN,DP,UN,69.68,96.45,1041.0
1,1,1,5H,OTHR,2.8,AF,2664.0,4488.0,1730.0,1069.0,...,CN,NO,UN,NO,UN,NO,UN,68.55,109.28,1118.0
4,2,2,4S,S6IF,3.7,AR,2858.0,5110.0,1862.0,1240.0,...,CN,SD,UN,OT,UN,NO,UN,73.43,127.28,464.0
5,2,2,4S,S6IF,3.7,AR,2858.0,5110.0,1862.0,1240.0,...,NO,UN,UN,OT,UN,NO,UN,91.73,96.08,552.0
7,3,2,3H,4CTF,1.5,MF,2400.0,3945.0,1610.0,986.0,...,CN,SD,UN,OT,UN,OT,UN,70.95,111.00,1300.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15733,11654,1,4S,NAPP,,,,,,,...,NO,NO,NO,NO,NO,NO,NO,72.60,57.60,272.0
15737,11820,2,UV,4CTF,2.0,M4,2669.0,4463.0,1776.0,1240.0,...,NO,AB,OT,OT,NO,DR,NO,46.55,72.50,303.0
15742,11825,1,UV,4CIF,2.0,A4,2992.0,5025.0,1956.0,1494.0,...,FW,AB,OT,NO,NO,DR,NO,43.50,73.30,268.0
15743,11827,1,OT,OTHR,0.0,,0.0,0.0,1800.0,0.0,...,,AB,AB,AB,UN,DP,SC,0.00,0.00,310.0


In [208]:
# Exclude the first two columns because they are primary keys
# Exclude last column and store as target
df = merged_df[merged_df.columns.to_list()[2:-1]]
target = merged_df['hic']
df.head()

Unnamed: 0,body,engine,engdsp,transm,whlbas,vehlen,vehwid,vehcg,modind,vehspd,...,occsex,seposn,cntrh1,cntrh2,cntrc1,cntrc2,cntrl1,cntrl2,t1,t2
0,5H,OTHR,2.8,AF,2664.0,4488.0,1730.0,1069.0,P,64.2,...,M,CN,SW,UN,SW,UN,DP,UN,69.68,96.45
1,5H,OTHR,2.8,AF,2664.0,4488.0,1730.0,1069.0,P,64.2,...,M,CN,NO,UN,NO,UN,NO,UN,68.55,109.28
4,4S,S6IF,3.7,AR,2858.0,5110.0,1862.0,1240.0,M,0.0,...,M,CN,SD,UN,OT,UN,NO,UN,73.43,127.28
5,4S,S6IF,3.7,AR,2858.0,5110.0,1862.0,1240.0,M,0.0,...,M,NO,UN,UN,OT,UN,NO,UN,91.73,96.08
7,3H,4CTF,1.5,MF,2400.0,3945.0,1610.0,986.0,M,0.0,...,M,CN,SD,UN,OT,UN,OT,UN,70.95,111.0


In [257]:
num_cols = df.select_dtypes(include=['int64', 'float64']).columns.to_list()
cat_cols = df.select_dtypes(include=['object']).columns.to_list()

cat_cols

['body',
 'engine',
 'transm',
 'modind',
 'occloc',
 'occsex',
 'seposn',
 'cntrh1',
 'cntrh2',
 'cntrc1',
 'cntrc2',
 'cntrl1',
 'cntrl2']

In [281]:
map1={feature: list(zip(veh[feature].unique(), veh[feature+'d'].unique())) for feature in cat_cols[:4]}
map2 = {feature: list(zip(occ[feature].unique(), occ[feature+'d'].unique())) for feature in cat_cols[4:]}
feature_map = {**map1, **map2}

feature_map['cntrl2'], occ['cntrl2'].unique(),occ['cntrl2d'].unique()

([('UN', 'UNKNOWN'),
  ('NO', 'NONE'),
  ('DP', 'DASHPANEL'),
  ('OT', 'OTHER'),
  ('SB', 'SEAT BACK'),
  ('SC', 'STEERING COLUMN'),
  ('DR', 'DOOR'),
  (nan, 'DOOR PANEL')],
 array(['UN', 'NO', 'DP', 'OT', 'SB', 'SC', 'DR', nan], dtype=object),
 array(['UNKNOWN', 'NONE', 'DASHPANEL', 'OTHER', 'SEAT BACK',
        'STEERING COLUMN', 'DOOR', 'DOOR PANEL', nan], dtype=object))

In [284]:
feature_map

{'body': [('5H', 'FIVE DOOR HATCHBACK'),
  ('4S', 'FOUR DOOR SEDAN'),
  ('3H', 'THREE DOOR HATCHBACK'),
  (nan, 'NOT APPLICABLE'),
  ('VN', 'VAN'),
  ('2S', 'TWO DOOR SEDAN'),
  ('2C', 'TWO DOOR COUPE'),
  ('OT', 'OTHER'),
  ('PU', 'PICKUP TRUCK'),
  ('TR', 'TRUCK'),
  ('SW', 'STATION WAGON'),
  ('CV', 'CONVERTIBLE'),
  ('UV', 'UTILITY VEHICLE'),
  ('MH', 'MOTOR HOME'),
  ('BU', 'C-10'),
  ('LM', 'BUS'),
  ('MV', 'LIMOUSINE'),
  ('EX', 'SILVERADO'),
  ('4P', 'MINIVAN'),
  ('UN', 'EXTENDED CAB PICKUP'),
  ('3C', nan)],
 'engine': [('OTHR', 'OTHER'),
  ('V8IF', 'V8 INLINE FRONT'),
  ('S6IF', 'STRAIGHT 6 INLINE FRONT'),
  ('4CTF', '4 CYLINDER TRANSVERSE FRONT'),
  ('S6TF', 'STRAIGHT 6 TRANSVERSE FRONT'),
  ('NAPP', 'NOT APPLICABLE'),
  ('4CIF', '4 CYLINDER INLINE FRONT'),
  ('V6IF', 'V6 INLINE FRONT'),
  ('ELEC', 'ELECTRIC MOTOR'),
  ('ROTR', 'ROTARY'),
  ('4CLR', '4 CYLINDER REAR'),
  ('S5IF', 'STRAIGHT 5 INLINE FRONT'),
  ('4CLM', '4 CYLINDER MID'),
  ('3CTF', '3 CYLINDER TRANSVERSE FRO

In [283]:
occ.loc[occ['cntrl2']=='DR',['cntrl2d']]

Unnamed: 0,cntrl2d
4100,DOOR
4101,DOOR PANEL
5564,DOOR PANEL
5720,DOOR PANEL
7307,DOOR
7308,DOOR
7672,DOOR
11147,DOOR
11394,DOOR
11554,DOOR


In [156]:
merged_df['time'] = abs(merged_df['t1']-merged_df['t2'])

In [157]:
corr_matrix = merged_df.corr()
abs(corr_matrix['hic']).sort_values(ascending=False)

hic       1.000000
vehlen    0.105932
t2        0.101504
whlbas    0.094760
t1        0.092671
tstno     0.077424
time      0.074768
vehwid    0.044345
lencnt    0.031490
vehcg     0.026001
vehspd    0.014842
engdsp    0.009401
crbang    0.005048
pdof      0.004009
vehno     0.003300
Name: hic, dtype: float64

In [229]:
len(df)

10362

In [247]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df, target, shuffle= True, random_state=42, test_size=0.05)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, shuffle= True, random_state= 42, test_size=.055)

In [248]:
print(
f'Train size: {len(x_train), len(y_train)}\n'
f'Val size: {len(x_val), len(y_val)}\n'
f'Test size: {len(x_test), len(y_test)}'
)

Train size: (9301, 9301)
Val size: (542, 542)
Test size: (519, 519)


In [264]:
df[num_cols].isna().sum()

engdsp    491
whlbas    421
vehlen    386
vehwid    642
vehcg     510
vehspd     66
crbang    191
pdof      136
lencnt    817
t1         21
t2         34
dtype: int64