### This notebook contains the Python code for predicting the retention of an auto insurance policy. Specifically, we predict whether or not a policy will renew for a second term. The task is framed as a supervised classification problem.

#### Import necessary modules

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#### Load and Inspect the dataset

In [None]:
DataSet = pd.read_csv('policydata_05222018.csv',engine='python')

# A notebook cell renders only its last bare expression, so the preview is
# passed to display() explicitly; otherwise its output is silently discarded.
# info() prints its own report, and describe() is rendered as the cell result.
display(DataSet.head())
DataSet.info()
DataSet.describe()

#### Plot the numerical attributes in the dataset to understand their distributions

In [None]:
# Draw one 50-bin histogram per numerical column on a 20x15 inch figure.
histogram_options = dict(bins=50, figsize=(20, 15))
DataSet.hist(**histogram_options)
plt.show()

#### Create Latitude and Longitude columns from zipcode

In [None]:
from functools import lru_cache

from uszipcode import ZipcodeSearchEngine

# Build the search engine once and reuse it. Constructing a new engine for
# every row (and again for the second column) repeats the same setup work
# for no benefit.
zip_search_engine = ZipcodeSearchEngine()


@lru_cache(maxsize=None)
def lookup_zipcode(pcode):
    """Return the uszipcode record for ``pcode``, caching repeated lookups."""
    return zip_search_engine.by_zipcode(pcode)


# Each distinct postal code is resolved once; the cached record then serves
# both the Latitude and the Longitude column.
DataSet['Latitude'] = DataSet['PostalCode'].apply(lambda pcode: lookup_zipcode(pcode)['Latitude'])
DataSet['Longitude'] = DataSet['PostalCode'].apply(lambda pcode: lookup_zipcode(pcode)['Longitude'])

#Replace missing values for Latitude & Longitude with its median
DataSet['Latitude'] = DataSet['Latitude'].fillna(DataSet['Latitude'].median())
DataSet['Longitude'] = DataSet['Longitude'].fillna(DataSet['Longitude'].median())

# The raw postal code is no longer needed once the coordinates exist.
DataSet = DataSet.drop(['PostalCode'], axis=1)

#### Create a category column (here, binned written premium `WP`) to stratify on when splitting the data into training and test sets with StratifiedShuffleSplit

In [None]:
#Creating a category column for WP in order to use for StratifiedShuffleSplit for splitting training & testing datasets
WP_BIN_WIDTH = 750      # width of one written-premium bin
WP_MAX_CATEGORY = 5.0   # every bin at or above this one is merged into it

# Bin WP, then cap the category. The capped result is assigned back to the
# frame instead of calling .where(..., inplace=True) on DataSet["WP_cat"]:
# an in-place call on a column selection does not reliably update the parent
# DataFrame (it is a no-op under pandas Copy-on-Write).
wp_category = np.ceil(DataSet["WP"] / WP_BIN_WIDTH)
DataSet["WP_cat"] = wp_category.where(wp_category < WP_MAX_CATEGORY, WP_MAX_CATEGORY)

# Check the Distribution in the entire dataset
DataSet['WP_cat'].value_counts() / len(DataSet)

#### Split the dataset into training and test

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# A single 80/20 shuffle split, stratified on the WP category.
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)

# split() yields *positional* row indices, so the rows are selected with
# .iloc. Label-based .loc only gives the same rows while the index happens to
# be the default 0..n-1 range.
train_index, test_index = next(split.split(DataSet, DataSet["WP_cat"]))

# Explicit copies: later cells modify these frames independently of DataSet.
strat_train_set = DataSet.iloc[train_index].copy()
strat_test_set = DataSet.iloc[test_index].copy()

#Check the Distribution in the test dataset
strat_test_set['WP_cat'].value_counts() / len(strat_test_set)

In [None]:
#Drop WP_cat column
# Non-in-place drop with errors="ignore" keeps this cell idempotent: running
# it a second time no longer raises KeyError because the column is already gone.
strat_train_set = strat_train_set.drop(columns="WP_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="WP_cat", errors="ignore")

#Make a copy of train dataset
DataSet = strat_train_set.copy()

#### Plot scatter plot of attributes of interest to identify the correlation between the attributes

In [None]:
from pandas.plotting import scatter_matrix
attributes = ["numerical_attribute_1","numerical_attribute_2","numerical_attribute_3","numerical_attribute_4"
              ,"numerical_attribute_5","numerical_attribute_6"]
scatter_matrix(DataSet[attributes], figsize =(16, 12))

#Check correlations of an attribute of interest with other attributes in the dataset
# Restrict to numeric/bool columns first: recent pandas versions no longer
# drop non-numeric columns silently in corr() and raise on string columns.
numeric_columns = DataSet.select_dtypes(include=["number", "bool"])
numeric_columns.corr()["numerical_attribute_1"].sort_values(ascending=False)


#### Split training data into features and labels. Here the label or target is 'RenewalStatus' with value 'Yes' or 'No'

In [None]:
# Target column first (as an independent copy), then every remaining column
# as the feature frame.
DataSet_labels = strat_train_set["RenewalStatus"].copy()
DataSet = strat_train_set.drop(columns=["RenewalStatus"])

#### Check and make sure there are no attributes with missing values

In [None]:
# Names of the columns holding at least one missing value
# (an empty Index means there are none).
DataSet.columns[DataSet.isna().any(axis=0)]

#### Define categorical attributes of the training dataset and convert to datatype 'category'

In [None]:
Cat_columns = [
    "Categorical_attribute_1",
    "Categorical_attribute_2",
    "Categorical_attribute_3",
    "Categorical_attribute_4",
    "Categorical_attribute_5",
    "Categorical_attribute_6",
]

# Cast every listed column to the 'category' dtype in a single astype call.
DataSet = DataSet.astype({column: 'category' for column in Cat_columns})

#### Convert categorical attributes to numerical factors

In [None]:
# Iterate over Cat_columns (defined in the categorical-attributes cell):
# `category_columns` is not created until a later cell, so referencing it
# here raises NameError on a fresh kernel.
for c in Cat_columns:
    # factorize() returns (codes, uniques); only the integer codes are kept.
    # Indexing [0] avoids assigning to `_`, which IPython uses for the last output.
    DataSet[c] = DataSet[c].factorize()[0]

#### Create a subset of data containing only categorical attributes

In [None]:
# Select by Cat_columns: `category_columns` is only defined in a later cell,
# so using it here fails with NameError when the notebook is run top to bottom.
DataSet_cat = DataSet.loc[:, Cat_columns]

#### Create a subset of data containing only numerical attributes

In [None]:
object_columns = list(DataSet.dtypes[DataSet.dtypes == 'object'].index)

# The categorical columns are taken from the explicit Cat_columns list. Once
# they have been factorized into integer codes they no longer carry the
# 'category' dtype, so a dtype-based lookup at this point would come back empty.
category_columns = list(Cat_columns)

# Drop object AND categorical columns in one step. Previously the second
# assignment started again from DataSet and discarded the object-column drop.
non_numeric_columns = set(object_columns) | set(category_columns)
DataSet_num = DataSet[[column for column in DataSet.columns if column not in non_numeric_columns]]

#### Create a transformer function to select just a subset of the Pandas DataFrame columns:

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Column-picking transformer: selects numerical or categorical columns by name
# so a pipeline can start from a pandas DataFrame
# (Scikit-Learn doesn't handle DataFrames yet).
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select a fixed list of DataFrame columns and hand them on as an array."""

    def __init__(self, attribute_names):
        # Column labels to extract in transform().
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the ``.values`` of the configured columns of ``X``."""
        selected_columns = X[self.attribute_names]
        return selected_columns.values

#### Create separate data pipelines to deal with numerical & categorical attributes

In [None]:
# Numerical attributes:
    #Impute (replace) missing numerical attributes with their median
    # Scale the numerical attributes using StandardScaler()

#Categorical attributes:
    #One hot encoding for categorical attributes

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22 in favour of
# sklearn.impute.SimpleImputer. Prefer the new class and fall back to the old
# one on legacy installs; either way the name `Imputer` stays bound.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

num_attribs = list(DataSet_num)
cat_attribs = category_columns

# OneHotEncoder's `sparse` argument was renamed to `sparse_output` in newer
# scikit-learn releases. Try the new spelling first so the encoder returns a
# dense array on both old and new versions.
try:
    cat_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    cat_encoder = OneHotEncoder(sparse=False)

num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', Imputer(strategy="median")),
        ('std_scaler', StandardScaler())
    ])

cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('cat_encoder', cat_encoder)
    ])

#### Join the pipelines into a big pipeline that will process both the numerical and the categorical features:

In [None]:
from sklearn.pipeline import FeatureUnion

# Both sub-pipelines receive the same input frame; FeatureUnion joins their
# outputs into one feature matrix.
pipeline_branches = [
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
]
full_pipeline = FeatureUnion(transformer_list=pipeline_branches)

DataSet_prepared = full_pipeline.fit_transform(DataSet)
DataSet_prepared.shape

### Train following models using the training data and find their average accuracy scores using cross validation
#### RandomForest
#### Support Vector Machine with different kernels
#### Logistic regression
#### Stochastic Gradient Descent

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn import linear_model
from sklearn.model_selection import cross_val_score

# Candidate estimators, all seeded for reproducibility.
forest_cl = RandomForestClassifier(random_state=42)
svm_cl_linear = SVC(kernel="linear",random_state=42)
svm_cl_rbf = SVC(kernel="rbf",random_state=42)
svm_cl_poly = SVC(kernel="poly",random_state=42)
svm_cl_sigmoid = SVC(kernel="sigmoid",random_state=42)
svm_cl_linearSVC = LinearSVC(random_state=42)
logistic = linear_model.LogisticRegression(random_state=42)
SGD = linear_model.SGDClassifier(random_state=42)

# The list is built only after every estimator exists. Building it first
# raised NameError on a fresh kernel.
models = [forest_cl, svm_cl_linear, svm_cl_rbf, svm_cl_poly, svm_cl_sigmoid, svm_cl_linearSVC, logistic, SGD]

for m in models:
    # Fit on the full training set so each estimator object is left trained,
    # then report 10-fold cross-validated accuracy.
    m.fit(DataSet_prepared, DataSet_labels)
    score = cross_val_score(m, DataSet_prepared, DataSet_labels, scoring="accuracy", cv=10,n_jobs=-1)
    print(m,': ', '\n','Mean of Accuracy: ', score.mean(), ', ', 'Standard Deviation of Accuracy: ', score.std(), '\n')

#### From the model scores, it's evident that the most promising models are SVM (kernel='rbf') and logistic regression.

#### Let's try to find the best hyper parameter for Logistic regression model

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate regularisation strengths for logistic regression.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }

# Seven draws from a seven-value list.
n_iter_search = 7

random_search = RandomizedSearchCV(
    logistic,
    param_distributions=param_grid,
    n_iter=n_iter_search,
    cv=5,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(DataSet_prepared,DataSet_labels)

print('Best score : ', '\n',random_search.best_score_)
print('Best parameters : ', '\n',random_search.best_estimator_)

#### Let's try to find the best hyper parameters for SVM(kernel='rbf') model

In [None]:
# Ten log-spaced candidates each: C over 1e-2..1e7, gamma over 1e-6..1e3.
C_range = np.logspace(-2, 7, num=10)
gamma_range = np.logspace(-6, 3, num=10)
param_grid = {"gamma": gamma_range, "C": C_range}

# Randomised search over the grid with 5-fold cross-validation
# (n_iter is left at the library default).
random_search = RandomizedSearchCV(
    svm_cl_rbf,
    param_distributions=param_grid,
    cv=5,
    random_state=42,
    n_jobs=2,
)
random_search.fit(DataSet_prepared,DataSet_labels)

print('Best score : ', '\n',random_search.best_score_)
print('Best parameters : ', '\n',random_search.best_estimator_)

# Keep the best RBF-SVM found by the search as the final model.
Final_Model = random_search.best_estimator_