In [22]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.metrics import recall_score
import mlflow

##### What will happen to you if you were in a car accident?

In [2]:
# import chardet

# with open('./data/accident.CSV', 'rb') as filedata:
#     result = chardet.detect(filedata.read(10000000))
# result

In [3]:
df = pd.read_csv('./data/Person.CSV', encoding='Windows-1252')
df.head()

  df = pd.read_csv('./data/Person.CSV', encoding='Windows-1252')


Unnamed: 0,STATE,STATENAME,ST_CASE,VE_FORMS,VEH_NO,PER_NO,STR_VEH,COUNTY,DAY,DAYNAME,...,WORK_INJ,WORK_INJNAME,HISPANIC,HISPANICNAME,LOCATION,LOCATIONNAME,HELM_USE,HELM_USENAME,HELM_MIS,HELM_MISNAME
0,1,Alabama,10001,2,1,1,0,81,7,7,...,8,Not Applicable (not a fatality),0,Not A Fatality (not Applicable),0,Occupant of a Motor Vehicle,20,Not Applicable,7,None Used/Not Applicable
1,1,Alabama,10001,2,1,2,0,81,7,7,...,0,No,7,Non-Hispanic,0,Occupant of a Motor Vehicle,20,Not Applicable,7,None Used/Not Applicable
2,1,Alabama,10001,2,2,1,0,81,7,7,...,8,Not Applicable (not a fatality),0,Not A Fatality (not Applicable),0,Occupant of a Motor Vehicle,20,Not Applicable,7,None Used/Not Applicable
3,1,Alabama,10002,2,1,1,0,55,23,23,...,0,No,7,Non-Hispanic,0,Occupant of a Motor Vehicle,20,Not Applicable,7,None Used/Not Applicable
4,1,Alabama,10002,2,2,1,0,55,23,23,...,8,Not Applicable (not a fatality),0,Not A Fatality (not Applicable),0,Occupant of a Motor Vehicle,20,Not Applicable,7,None Used/Not Applicable


In [4]:
df.columns.values

array(['STATE', 'STATENAME', 'ST_CASE', 'VE_FORMS', 'VEH_NO', 'PER_NO',
       'STR_VEH', 'COUNTY', 'DAY', 'DAYNAME', 'MONTH', 'MONTHNAME',
       'HOUR', 'HOURNAME', 'MINUTE', 'MINUTENAME', 'RUR_URB',
       'RUR_URBNAME', 'FUNC_SYS', 'FUNC_SYSNAME', 'HARM_EV',
       'HARM_EVNAME', 'MAN_COLL', 'MAN_COLLNAME', 'SCH_BUS',
       'SCH_BUSNAME', 'MAKE', 'MAKENAME', 'MAK_MOD', 'BODY_TYP',
       'BODY_TYPNAME', 'MOD_YEAR', 'MOD_YEARNAME', 'TOW_VEH',
       'TOW_VEHNAME', 'SPEC_USE', 'SPEC_USENAME', 'EMER_USE',
       'EMER_USENAME', 'ROLLOVER', 'ROLLOVERNAME', 'IMPACT1',
       'IMPACT1NAME', 'FIRE_EXP', 'FIRE_EXPNAME', 'AGE', 'AGENAME', 'SEX',
       'SEXNAME', 'PER_TYP', 'PER_TYPNAME', 'INJ_SEV', 'INJ_SEVNAME',
       'SEAT_POS', 'SEAT_POSNAME', 'REST_USE', 'REST_USENAME', 'REST_MIS',
       'REST_MISNAME', 'AIR_BAG', 'AIR_BAGNAME', 'EJECTION',
       'EJECTIONNAME', 'EJ_PATH', 'EJ_PATHNAME', 'EXTRICAT',
       'EXTRICATNAME', 'DRINKING', 'DRINKINGNAME', 'ALC_DET',
       'ALC_DETNAME',

In [5]:
MODEL_COLUMNS = [
    'ST_CASE', 
    'STATE',
    'STATENAME',
    'VEH_NO',
    'VE_FORMS',
    'PER_NO',
    'COUNTY',
    'DAY',
    'MONTH',
    'HOUR',
    'AGE',
    'SEX',
    'INJ_SEV',
    'INJ_SEVNAME',
    'DOA',
    'DOANAME',
    'SEAT_POS',
    'REST_USE',
]
TARGET = 'INJ_SEV'

In [6]:
# df = original_df

In [7]:
original_df = df.copy()
df = df[MODEL_COLUMNS]

#### Dropping irrelevant injury categories

The dataset user manual states these are the possible values for the injury severity field (`INJ_SEV`):

- 0 - No Apparent Injury
- 1 - Possible Injury
- 2 - Suspected Minor Injury
- 3 - Suspected Serious Injury
- 4 - Fatal Injury 
- 5 - Injured, Severity Unknown
- 6 - Died Prior to Crash
- 9 - Unknown/Not Reported 

Since we want to teach our model to predict a specific injury severity, we'll only use categories 0-4

In addition, we'll consider death as another type of injury. </br>
The user manual describes the column detailing death (`DOA`) as such:

- 0 Not Applicable 
- 7 Died at Scene
- 8 Died En Route (to a hospital)
- 9 Unknown

Again, we'll ignore cases where death is unknown and focus on categories 0,7 and 8

We'll create a new `crash_result` field that will be our prediction target and will be a combination of both of the described fields.

In [8]:
df = df[(df['INJ_SEV'] <= 4) & (df['DOA'] != 9)]

Since we want to combine the injury severity and death outcome results to one field, it's important to see what's the relationship between them

In [9]:
df[df['DOA'] >= 8].groupby('INJ_SEVNAME').size()

INJ_SEVNAME
Fatal Injury (K)    241
dtype: int64

So when a person dies, he will always be tagged as having a fatal injury. </br>
Does that imply all fatal injuries are tagged as death?

In [10]:
df[df['INJ_SEV'] == 4].groupby('DOANAME').size()

DOANAME
Died En Route       241
Died at Scene     20121
Not Applicable    15925
dtype: int64

It doesn't seem to be the case.</br>
So we'll tag an accident result as a fatal injury only when the person did not die.

In [11]:
df['accident_result'] = df['INJ_SEV']
df['accident_result_name'] = df['INJ_SEVNAME']

df.loc[df['DOA'] > 0, 'accident_result'] = df['DOA']
df.loc[df['DOA'] > 0, 'accident_result_name'] = df['DOANAME']

df.loc[df['accident_result'] == 4, 'accident_result_name'] = 'Fatal Injury without Death'

In [12]:
# Cleaning the accident_result labels
df['accident_result_name'] = df['accident_result_name'].str.extract(r'(?P<accident_result_name>[^()]+)')
df['accident_result_name'] = df['accident_result_name'].str.strip()

#### Dropping the labels
We'll drop the name label columns for now, since they're only really useful when we'll want to present our prediction in a user friendly way

In [13]:
def create_id_label_dict(df, id_column, label_column):
    """
    Creates a dictionary from a pandas DataFrame with IDs as keys and labels as values.
    
    Parameters:
    df (pd.DataFrame): The input DataFrame containing ID and label columns.
    id_column (str): The name of the column containing the IDs.
    label_column (str): The name of the column containing the labels.
    
    Returns:
    dict: A dictionary with IDs as keys and labels as values.
    """
    # Drop duplicates to ensure unique ID-label pairs
    unique_pairs = df[[id_column, label_column]].drop_duplicates()
    
    # Convert the unique pairs to a dictionary
    id_label_dict = dict(zip(unique_pairs[id_column], unique_pairs[label_column]))
    
    return id_label_dict

In [14]:
# Keeping a dictionary of the labels to be used only when displaying results to a user
ACCIDENT_RESULT_NAMES = create_id_label_dict(df, 'accident_result', 'accident_result_name')
ACCIDENT_RESULT_NAMES

{3: 'Suspected Serious Injury',
 7: 'Died at Scene',
 1: 'Possible Injury',
 2: 'Suspected Minor Injury',
 0: 'No Apparent Injury',
 4: 'Fatal Injury without Death',
 8: 'Died En Route'}

In [None]:
df = df.drop(['INJ_SEV', 'INJ_SEVNAME', 'DOA', 'DOANAME', 'accident_result_name'], axis=1)

### Cleaning the data

In [16]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 81521 entries, 0 to 82842
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   ST_CASE          81521 non-null  int64 
 1   STATE            81521 non-null  int64 
 2   STATENAME        81521 non-null  object
 3   VEH_NO           81521 non-null  int64 
 4   VE_FORMS         81521 non-null  int64 
 5   PER_NO           81521 non-null  int64 
 6   COUNTY           81521 non-null  int64 
 7   DAY              81521 non-null  int64 
 8   MONTH            81521 non-null  int64 
 9   HOUR             81521 non-null  int64 
 10  AGE              81521 non-null  int64 
 11  SEX              81521 non-null  int64 
 12  INJ_SEV          81521 non-null  int64 
 13  DOA              81521 non-null  int64 
 14  SEAT_POS         81521 non-null  int64 
 15  REST_USE         81521 non-null  int64 
 16  accident_result  81521 non-null  int64 
dtypes: int64(16), object(1)
memory usage

In [17]:
df.describe()

Unnamed: 0,ST_CASE,STATE,VEH_NO,VE_FORMS,PER_NO,COUNTY,DAY,MONTH,HOUR,AGE,SEX,INJ_SEV,DOA,SEAT_POS,REST_USE,accident_result
count,81521.0,81521.0,81521.0,81521.0,81521.0,81521.0,81521.0,81521.0,81521.0,81521.0,81521.0,81521.0,81521.0,81521.0,81521.0,81521.0
mean,270314.092025,26.953178,1.36885,1.911802,1.514579,92.566688,15.694766,6.709523,13.556397,53.061407,1.40711,2.394181,1.751389,13.000454,23.831148,3.146465
std,164632.985687,16.475802,1.273783,1.950195,1.379468,96.959194,8.829369,3.373724,8.582249,110.555037,0.859091,1.690066,3.035807,11.211551,34.128658,2.641058
min,10001.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
25%,121609.0,12.0,1.0,1.0,1.0,31.0,8.0,4.0,8.0,24.0,1.0,0.0,0.0,11.0,3.0,0.0
50%,260609.0,26.0,1.0,2.0,1.0,71.0,16.0,7.0,14.0,38.0,1.0,3.0,0.0,11.0,3.0,3.0
75%,420469.0,42.0,2.0,2.0,2.0,115.0,23.0,10.0,19.0,57.0,2.0,4.0,0.0,13.0,20.0,4.0
max,560121.0,56.0,58.0,59.0,57.0,997.0,31.0,12.0,99.0,999.0,9.0,4.0,8.0,99.0,99.0,8.0


In [18]:
df[df['VEH_NO'] > 3].shape

(1711, 17)

In [19]:
df[df['VE_FORMS'] > 3].shape

(4547, 17)

In [20]:
percent = len(df[df['VE_FORMS'] > 3]) / len(df) * 100
print(f'Percent of cases with outlier number of cars: {percent:0.2f}%')

Percent of cases with outlier number of cars: 5.58%


# WIP
# Should be changed to a different cat value of > 3

In [21]:
df = df[df['VE_FORMS'] <= 3]

In [31]:
df = df.drop(['STATENAME'], axis=1)

### Running models
For starters we'll try to create models that learn on the data as is.</br>
This makes sense because not having specific information about a person in a motor accident could be valuable information by itself in predicting the person's injury

### Thinking about evaluation

The evaluation metric we'll use for our model will be a weighted average of recall per class.

The more severe an injury gets, the more important it is to decrease the amount of False Negative predictions of it, since the price of an error becomes more severe. 
Therefore, it makes sense to calculate the recall score of each injury class separately and then calculate an overall weighted average that gives higher importance to more severe injuries.

For the sake of this exercise, we'll focus for now only on this metric as the only maximising metric and not take into account other satisfising factors like the predition speed of our model

In [27]:
# Seperating the target column
train_target = df['accident_result']
df = df.drop(['accident_result'], axis=1)

#### Trying out some simple models

In [25]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")

In [26]:
mlflow.set_experiment("NHTSA FARS Injury prediction")

2024/07/21 09:52:36 INFO mlflow.tracking.fluent: Experiment with name 'NHTSA FARS Injury prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='gs://mlops_zoomcamp-mlflow-artifacts/artifacts/1', creation_time=1721555556438, experiment_id='1', last_update_time=1721555556438, lifecycle_stage='active', name='NHTSA FARS Injury prediction', tags={}>

In [None]:
with mlflow.start_run():

    params = {"C": 0.1, "random_state": 42}
    mlflow.log_params(params)

    lr = LogisticRegression(**params).fit(X, y)
    y_pred = lr.predict(X)
    mlflow.log_metric("accuracy", accuracy_score(y, y_pred))

    mlflow.sklearn.log_model(lr, artifact_path="models")