<a href="https://colab.research.google.com/github/bagniewski/bagniewski.github.io/blob/master/AutoInsurance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Auto Insurance Test

This notebook trains a model on the Auto Insurance dataset and predicts the target for the test set, based on the [Kaggle competition](https://www.kaggle.com/c/auto-insurance-fall-2017).

The target for this project is TARGET_FLAG.

In [7]:
#@title Import Libraries

#@markdown Please execute this cell by pressing the Play button on the left 

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


In [8]:
#@title Read the CSV files

#@markdown Import train_auto.csv and test_auto.csv datasets

# Both files live in the same remote directory, so the shared prefix is spelled out once.
DATA_URL = 'https://raw.githubusercontent.com/descartes-underwriting/data-scientist-technical-test/main/data/auto-insurance-fall-2017/'

# Download each CSV straight into a DataFrame (requires network access).
data_test = pd.read_csv(f'{DATA_URL}test_auto.csv')
data_train = pd.read_csv(f'{DATA_URL}train_auto.csv')



In [9]:
#@title Encode Data

#@markdown Encode the data using Ordinal Encoding, and One-Hot Encoding

# Columns stored as currency strings (e.g. "$12,345") that must become numeric.
CURRENCY_COLUMNS = ['INCOME', 'HOME_VAL', 'BLUEBOOK', 'OLDCLAIM']

# Ordinal Encoding: raw label -> integer code, one mapping per column.
ORDINAL_CODES = {
    'PARENT1': {"No": 0, "Yes": 1},
    'MSTATUS': {"z_No": 0, "Yes": 1},
    'SEX': {"M": 0, "z_F": 1},
    'EDUCATION': {"<High School": 0, "z_High School": 1, "Bachelors": 2, "Masters": 3, "PhD": 4},
    'CAR_USE': {"Commercial": 0, "Private": 1},
    'RED_CAR': {"no": 0, "yes": 1},
    'REVOKED': {"No": 0, "Yes": 1},
    'URBANICITY': {"z_Highly Rural/ Rural": 0, "Highly Urban/ Urban": 1},
}

# Nominal columns expanded into one indicator column per category.
ONE_HOT_COLUMNS = ['JOB', 'CAR_TYPE']

# Numeric columns whose missing values are replaced by the column mean.
MEAN_IMPUTED_COLUMNS = ['AGE', 'YOJ', 'INCOME', 'HOME_VAL', 'CAR_AGE']


def _currency_to_number(column):
    """Convert strings such as "$12,345" to numbers; missing values stay NaN.

    `regex=False` is passed explicitly: as a regular expression "$" is the
    end-of-string anchor, so relying on the pandas default (which changed
    between major versions) could leave the dollar sign in place.
    """
    stripped = column.str.replace("$", "", regex=False).str.replace(",", "", regex=False)
    return pd.to_numeric(stripped)


def _encode_features(frame):
    """Return an encoded copy of `frame`; the input frame is not modified.

    Currency columns become numeric, the columns in ORDINAL_CODES are mapped
    to integer codes, and ONE_HOT_COLUMNS are expanded with `pd.get_dummies`.

    Raises:
        ValueError: if an ordinal column contains a label that has no code.
    """
    encoded = frame.copy()
    for name in CURRENCY_COLUMNS:
        encoded[name] = _currency_to_number(encoded[name])
    for name, codes in ORDINAL_CODES.items():
        # Fail loudly on unknown labels instead of passing them on to the model.
        unmapped = set(encoded[name].dropna().unique()) - set(codes)
        if unmapped:
            raise ValueError(f"Unexpected labels in {name}: {sorted(unmapped, key=str)}")
        # `map` yields a numeric column regardless of the pandas version.
        encoded[name] = encoded[name].map(codes)
    return pd.get_dummies(data=encoded, columns=ONE_HOT_COLUMNS)


# Apply the exact same transformation to both datasets.
data_train = _encode_features(data_train)
data_test = _encode_features(data_test)

# One-hot encoding each frame separately can yield different dummy columns when
# a category is absent from one of them; align the test frame on the training
# columns (same set, same order), filling indicator columns it lacks with 0.
data_test = data_test.reindex(columns=data_train.columns, fill_value=0)

#@markdown Impute missing values

# Fill in the missing values with the training-set mean for the column.
# The same training statistics are used for the test set so that both frames
# are preprocessed identically.
train_means = data_train[MEAN_IMPUTED_COLUMNS].mean()
data_train[MEAN_IMPUTED_COLUMNS] = data_train[MEAN_IMPUTED_COLUMNS].fillna(train_means)
data_test[MEAN_IMPUTED_COLUMNS] = data_test[MEAN_IMPUTED_COLUMNS].fillna(train_means)


In [10]:
#@title Linear regression model

# Columns that are identifiers or targets and must not be used as features.
NON_FEATURE_COLUMNS = ['INDEX','TARGET_FLAG','TARGET_AMT']

# Define X_train, y_train, and X_test
X_train = data_train.drop(NON_FEATURE_COLUMNS, axis=1)
y_train = data_train['TARGET_FLAG']
X_test = data_test.drop(NON_FEATURE_COLUMNS, axis=1)

# Create the linear regression model and fit it on the full training set
model = LinearRegression()
model.fit(X_train, y_train)

#@markdown Assess the model’s performance

# In-sample fit: these scores are computed on the rows the model was trained
# on, so they are an optimistic estimate of its quality.
y_pred = model.predict(X_train)
print('Root Mean Squared Error: %.2f' % np.sqrt(mean_squared_error(y_train, y_pred)))
print('R²: %.2f' % r2_score(y_train, y_pred))

# Hold-out estimate: fit a separate model on 80% of the training rows and score
# it on the remaining 20%. The seed is fixed so the split is reproducible.
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
y_val_pred = LinearRegression().fit(X_fit, y_fit).predict(X_val)
print('Hold-out Root Mean Squared Error: %.2f' % np.sqrt(mean_squared_error(y_val, y_val_pred)))
print('Hold-out R²: %.2f' % r2_score(y_val, y_val_pred))

#@markdown Predict TARGET_FLAG for test_auto

# Predictions come from the model fitted on all training rows.
# NOTE(review): linear regression outputs are unbounded; if TARGET_FLAG is a
# 0/1 flag (presumably, confirm against the data) the predictions are scores
# that may fall outside [0, 1].
y_test = model.predict(X_test)


Root Mean Squared Error: 0.39
R²: 0.23


In [11]:
#@title Export the data

# Put the test-set predictions in a single-column frame named 'p_target'.
df = pd.DataFrame(y_test, columns=['p_target'])

# Save to the working directory; the row index is left out of the file.
df.to_csv('p_target.csv', index=False)