In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, roc_auc_score

%matplotlib inline

In [2]:
# Read the labelled training rows and the unlabelled test rows (train first, then test).
df, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)
# Preview the first rows of the training frame.
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [3]:
# Size of the training frame as (rows, columns).
df.shape

(381109, 12)

In [4]:
# Size of the test frame as (rows, columns).
df_test.shape

(127037, 11)

In [5]:
def create_dummy(df, cat_cols, dummy_na):
    '''
    One-hot encode the columns named in cat_cols and leave every other column untouched.

    INPUT:
    df - pandas dataframe with categorical variables to dummy
    cat_cols - list-like of strings (list, tuple, pandas Index, ...) with the names of
               the categorical columns; they may have any dtype, not only object
    dummy_na - Bool holding whether you want to dummy NA vals of categorical columns or not

    OUTPUT:
    df - a new dataframe (the input is not modified) that has the following characteristics:
            1. contains all columns that were not specified as categorical, in their
               original order (including object columns that are not in cat_cols)
            2. dummy columns for each of the categorical columns in cat_cols
            3. if dummy_na is True - it also contains dummy columns for the NaN values
            4. Use a prefix of the column name with an underscore (_) for separating

    RAISES:
    KeyError - if a name in cat_cols is not a column of df
    '''
    cat_cols = list(cat_cols)

    if not cat_cols:
        # Nothing to encode: hand back an independent copy so callers can mutate it freely.
        return df.copy()

    # Passing `columns=` makes pandas encode exactly the requested columns (whatever
    # their dtype) and keep the remaining ones as they are. Selecting the passthrough
    # columns by dtype instead would drop unlisted object columns and duplicate
    # numeric columns that were listed as categorical.
    return pd.get_dummies(
        df,
        columns=cat_cols,
        prefix_sep='_',
        drop_first=False,
        dummy_na=dummy_na,
    )

In [6]:
# Count the missing values in each column of the training frame
df.isna().sum()


id                      0
Gender                  0
Age                     0
Driving_License         0
Region_Code             0
Previously_Insured      0
Vehicle_Age             0
Vehicle_Damage          0
Annual_Premium          0
Policy_Sales_Channel    0
Vintage                 0
Response                0
dtype: int64

In [8]:
# Preview the first rows of the test frame.
df_test.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,381110,Male,25,1,11.0,1,< 1 Year,No,35786.0,152.0,53
1,381111,Male,40,1,28.0,0,1-2 Year,Yes,33762.0,7.0,111
2,381112,Male,47,1,28.0,0,1-2 Year,Yes,40050.0,124.0,199
3,381113,Male,24,1,27.0,1,< 1 Year,Yes,37356.0,152.0,187
4,381114,Male,27,1,28.0,1,< 1 Year,No,59097.0,152.0,297


# Data preprocessing

In [None]:
# 1. Remove rows with NaNs in the Response column (here there aren't any)

# df = df.dropna(subset=['Response'], axis=0)

In [None]:
# 2. Create X with all feature columns
X = df.drop(columns=['Response', 'id'])
# Create y with Response values
y = df['Response']
# Drop 'id' column also from df_test.
# df_test is overwritten here, so on a second run of this cell 'id' is already gone;
# errors='ignore' keeps the cell re-runnable instead of raising KeyError.
df_test = df_test.drop(columns=['id'], errors='ignore')

In [None]:
# 3. Convert all categorical variables to dtype=object
# ----------------------------------------------------
# NOTE: We would need this step if there were any NaNs in the numerical variables, because 
# we want to fill the numerical NaNs with the mean(), which in the case of 'Region_Code', 'Policy_Sales_Channel', etc. 
# wouldn't make sense. However, we don't have any NaNs now, so we can keep those variables as they are.
# ----------------------------------------------------
# X = X.astype({'Driving_License': object, 'Region_Code': object, 'Previously_Insured': object, 'Policy_Sales_Channel': object})
# df_test = df_test.astype({'Driving_License': object, 'Region_Code': object, 'Previously_Insured': object, 'Policy_Sales_Channel': object})


In [None]:
# 4. Select categorical variables and
# create dummy columns for all the categorical variables in X and df_test
cat_vars = X.select_dtypes(include=['object']).columns
X = create_dummy(X, cat_vars, dummy_na=False)
df_test = create_dummy(df_test, cat_vars, dummy_na=False)
# -------------
# QUESTION:
# What happens if during this step a value is missing in a categorical variable
# and therefore there is a dummy column less in here?
# ANSWER:
# X and df_test are encoded independently, so a category that appears in only one
# of them produces a dummy column on one side only. The fitted model then receives
# a different set of columns at predict time and fails (or misreads the features).
# Aligning df_test to the training columns fixes this: dummy columns missing from
# df_test are added and filled with 0, columns the model never saw are dropped,
# and the column order matches X.
# -------------
df_test = df_test.reindex(columns=X.columns, fill_value=0)

In [None]:
# 5. Fill numerical NaNs with mean
# --------------------------------
# NOTE: There aren't any numerical NaNs, so we can skip this step
# We apply to X, because any NaNs in the categorical variables have been dummied.
# --------------------------------
# fill_mean = lambda col: col.fillna(col.mean())
# X = X.apply(fill_mean, axis=0)
# df_test = df_test.apply(fill_mean, axis=0)

In [None]:
# 6. Split into train/validation/test
# 80% of the labelled rows are used for fitting and 20% are held out for validation;
# the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# The test features are taken as a copy of df_test.
X_test = df_test.copy()

In [None]:
# 7. Instantiate model
# The `normalize` argument of LinearRegression was deprecated in scikit-learn 1.0 and
# removed in 1.2, so passing it raises TypeError on current releases. The predictions
# of an unpenalised least-squares fit do not depend on feature scaling, so it is
# simply dropped here. If scaled inputs are wanted, put a StandardScaler in front of
# the estimator instead.
lm_model = LinearRegression()

In [None]:
# 8. Fit the model on the training split
lm_model.fit(X_train, y_train)

# Results

In [None]:
# 9. Predict using validation set
# The outputs are continuous regression values, not class labels.
y_val_preds = lm_model.predict(X_val)

In [None]:
# 10. Score model predictions
# ROC AUC depends only on how the predictions rank the rows, so the raw regression
# outputs can be passed in without thresholding.
score = roc_auc_score(y_val, y_val_preds) # Suggested Kaggle metric

In [7]:
# 11. Predict on unknown test data
# The test file has no Response column, so these predictions cannot be scored here.
y_test_preds = lm_model.predict(X_test)

In [None]:
# Display the validation ROC AUC computed in step 10.
score