# Preparing Inference Data

In [1]:
import pandas as pd

In [4]:
# Load the cleaned inference dataset; the path is relative to the notebook's working directory.
inf_df = pd.read_csv(filepath_or_buffer='../backend/inference_data_cleaned.csv')

In [3]:
# Feature columns selected for the inference frame. The list order is the column
# order used when the frame is sliced with `['id'] + predictors`.
predictors = [
    # first eight entries: driver / policy columns (grouping inferred from the names)
    'driving_history_score', 'credit_score', 'low_education_ind', 'marital_status',
    'time_driven', 'area', 'agecat_grouped', 'gender',
    # last six entries: vehicle columns (grouping inferred from the names)
    'veh_color', 'max_power', 'engine_type', 'veh_age', 'veh_body_grouped', 'veh_value',
]

In [5]:
# Re-encoding functions for agecat and veh_body
def group_age_category(cat):
    """Collapse an age-category code into one of three labels.

    Args:
        cat: The age-category value to map (compared with ``==`` against 1 and 6).

    Returns:
        ``'Young'`` when ``cat == 1``, ``'Elder'`` when ``cat == 6``, and
        ``'Middle'`` for every other value. Values that equal neither code,
        including ``None`` and NaN, therefore also map to ``'Middle'``.
    """
    # Checked in order; the first matching code wins.
    for code, label in ((1, 'Young'), (6, 'Elder')):
        if cat == code:
            return label
    return 'Middle'

# Body styles that keep their own label; every other style is bucketed as 'OTHER'.
keep_bodies = {'SEDAN', 'STNWG', 'SUV', 'TRUCK', 'UTE', 'COUPE'}

def group_vehicle_body(body_style):
    """Map a vehicle body style onto the reduced set of body groups.

    Args:
        body_style: Raw body-style value; it is converted with ``str()`` and
            upper-cased before the lookup, so matching is case-insensitive.

    Returns:
        The upper-cased style when it is a member of ``keep_bodies``;
        ``'OTHER'`` for missing values (``pd.isna``) and for any other style.
    """
    if not pd.isna(body_style):
        candidate = str(body_style).upper()
        if candidate in keep_bodies:
            return candidate
    # Missing or unlisted styles share a single bucket.
    return 'OTHER'

In [7]:
from sklearn.preprocessing import OrdinalEncoder

# Derive the grouped features from the raw columns (adds two columns to inf_df).
inf_df['veh_body_grouped'] = inf_df['veh_body'].apply(group_vehicle_body)
inf_df['agecat_grouped'] = inf_df['agecat'].apply(group_age_category)

# `id` travels with the features so each output row stays traceable;
# `exposure` is held separately and re-attached after encoding.
X = inf_df[['id'] + predictors].copy()
y = inf_df[['exposure']].copy()

# Detect categorical columns among the predictors only. `id` is an identifier and
# must reach the output unchanged, even if it is ever read in as a string column.
cat_cols = X[predictors].select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Categorical features to encode: {cat_cols}")

# NOTE(review): this encoder is fit on the inference data itself. With the default
# categories='auto', codes are assigned from the values present in the fitted data,
# so they only line up with codes produced at training time if both datasets contain
# exactly the same category values. Confirm that, or load the encoder fitted during
# training and call `.transform` here instead. Because fit and transform see the
# same rows, handle_unknown / unknown_value never take effect in this cell.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

X[cat_cols] = encoder.fit_transform(X[cat_cols])

# X and y are both slices of inf_df, so the index-based join is row-for-row.
inf_df_final = X.join(y)
inf_df_final.head()

Categorical features to encode: ['marital_status', 'time_driven', 'area', 'agecat_grouped', 'gender', 'veh_color', 'engine_type', 'veh_body_grouped']


Unnamed: 0,id,driving_history_score,credit_score,low_education_ind,marital_status,time_driven,area,agecat_grouped,gender,veh_color,max_power,engine_type,veh_age,veh_body_grouped,veh_value,exposure
0,8295,81,644.721808,0,1.0,2.0,0.0,1.0,1.0,6.0,128,2.0,3,4.0,6.36,0.777085
1,17625,94,634.306196,0,1.0,1.0,0.0,2.0,0.0,7.0,178,0.0,1,3.0,2.56,0.528369
2,3802,77,649.245139,0,0.0,3.0,3.0,1.0,0.0,8.0,270,3.0,4,1.0,6.18,0.384591
3,12865,59,647.594655,0,1.0,2.0,3.0,1.0,1.0,3.0,120,3.0,3,4.0,6.49,0.116378
4,6495,81,657.5505,0,1.0,2.0,2.0,1.0,0.0,4.0,94,3.0,2,2.0,6.46,0.688417


In [8]:
# Persist the encoded inference frame; index=False keeps the row index out of the file.
inf_df_final.to_csv(path_or_buf='../backend/inf_df_final.csv', index=False)