## Q3: Conduct a customer lifetime value prediction for an insurance company.

* Student ID: p233340 
* Name: Kwok Tsz Yi
* Course: COM6101

In [178]:
# Load the customer records from Q3.csv into a DataFrame and preview the first rows
import pandas as pd
df = pd.read_csv('Q3.csv')
df.head()

Unnamed: 0,id,gender,area,qualification,income,marital_status,vintage,claim_amount,num_policies,policy,type_of_policy,cltv
0,1,Male,Urban,Bachelor,5L-10L,1.0,5.0,5790.0,More than 1,A,Platinum,64308.0
1,2,Male,Rural,High School,5L-10L,0.0,8.0,5080.0,More than 1,A,Platinum,515400.0
2,3,Male,Urban,Bachelor,5L-10L,1.0,8.0,2599.0,More than 1,A,Platinum,64212.0
3,4,Female,Rural,High School,5L-10L,0.0,7.0,0.0,More than 1,A,Platinum,97920.0
4,5,Male,Urban,High School,More than 10L,1.0,6.0,3508.0,More than 1,A,Gold,59736.0


### Exploratory Data Analysis

Find the shape of the data, the data types of the columns, and the first few rows of the data.

In [179]:
# Dimensions of the loaded frame as a (rows, columns) tuple
df.shape

(9954, 12)

In [180]:
# Per-column dtypes; `object` columns hold strings and are not yet numeric
df.dtypes

id                  int64
gender             object
area               object
qualification      object
income             object
marital_status    float64
vintage           float64
claim_amount      float64
num_policies       object
policy             object
type_of_policy     object
cltv              float64
dtype: object

In [181]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the output reports a minimum of -1.0 for `vintage`, which looks
# like an invalid value -- confirm and decide how to handle it before modelling.
df.describe()

Unnamed: 0,id,marital_status,vintage,claim_amount,cltv
count,9954.0,9953.0,9953.0,9953.0,9952.0
mean,4977.5,0.579323,4.641515,4361.435346,98710.847267
std,2873.616624,0.493693,2.269143,3281.160576,91154.356054
min,1.0,0.0,-1.0,0.0,27432.0
25%,2489.25,0.0,3.0,2415.0,53280.0
50%,4977.5,1.0,5.0,4099.0,66108.0
75%,7465.75,1.0,6.0,6100.0,103812.0
max,9954.0,1.0,8.0,31894.0,650964.0


Count the missing values in each column:

In [182]:
# Number of missing (NaN) entries per column
df.isnull().sum()

id                0
gender            0
area              0
qualification     0
income            1
marital_status    1
vintage           1
claim_amount      1
num_policies      1
policy            1
type_of_policy    1
cltv              2
dtype: int64

As the number of missing values is small, we can drop the rows with missing values.

In [183]:
# Discard every row that has at least one missing value, then re-count the
# remaining NaNs per column to confirm none are left.
df = df.dropna(axis=0, how='any')
df.isna().sum()

id                0
gender            0
area              0
qualification     0
income            0
marital_status    0
vintage           0
claim_amount      0
num_policies      0
policy            0
type_of_policy    0
cltv              0
dtype: int64

### Data Preprocessing

In [184]:
# Re-check the dtypes to see which columns are still `object` and need encoding
df.dtypes

id                  int64
gender             object
area               object
qualification      object
income             object
marital_status    float64
vintage           float64
claim_amount      float64
num_policies       object
policy             object
type_of_policy     object
cltv              float64
dtype: object

According to the dtypes, there are 7 `object` columns that need to be converted to a numerical data type. The columns gender, area, policy and qualification are nominal (non-ordinal) categorical data, so we use one-hot encoding to transform them. The columns income, num_policies and type_of_policy are ordinal categorical data, so we use ordinal (label) encoding to transform them.

#### One-hot encoding

Columns to perform one-hot encoding:
* gender
* area
* policy
* qualification

In [185]:
# One-hot encode the nominal columns: append one indicator column per category
# (named after the category value), then remove the original string columns.
nominal_cols = ['gender', 'area', 'policy', 'qualification']
for col in nominal_cols:
    df = df.join(pd.get_dummies(df[col]))
df = df.drop(columns=nominal_cols)
df.head()

Unnamed: 0,id,income,marital_status,vintage,claim_amount,num_policies,type_of_policy,cltv,Female,Male,Rural,Urban,A,B,C,D,Bachelor,High School,Others
0,1,5L-10L,1.0,5.0,5790.0,More than 1,Platinum,64308.0,False,True,False,True,True,False,False,False,True,False,False
1,2,5L-10L,0.0,8.0,5080.0,More than 1,Platinum,515400.0,False,True,True,False,True,False,False,False,False,True,False
2,3,5L-10L,1.0,8.0,2599.0,More than 1,Platinum,64212.0,False,True,False,True,True,False,False,False,True,False,False
3,4,5L-10L,0.0,7.0,0.0,More than 1,Platinum,97920.0,True,False,True,False,True,False,False,False,False,True,False
4,5,More than 10L,1.0,6.0,3508.0,More than 1,Gold,59736.0,False,True,False,True,True,False,False,False,False,True,False


In [186]:
# Convert the boolean indicator columns to 0/1 integers.
# The columns are re-cast with `astype` on the whole frame instead of being
# written back through `df.iloc[:, i] = ...`: assigning ints into an existing
# bool column is an incompatible-dtype setitem, which recent pandas versions
# deprecate. An empty mapping (no bool columns left, e.g. on re-run) is a no-op.
bool_cols = df.select_dtypes(include='bool').columns
df = df.astype({col: int for col in bool_cols})
df.dtypes

id                  int64
income             object
marital_status    float64
vintage           float64
claim_amount      float64
num_policies       object
type_of_policy     object
cltv              float64
Female              int64
Male                int64
Rural               int64
Urban               int64
A                   int64
B                   int64
C                   int64
D                   int64
Bachelor            int64
High School         int64
Others              int64
dtype: object

In [187]:
# Preview the first rows after the boolean-to-integer conversion
df.head()

Unnamed: 0,id,income,marital_status,vintage,claim_amount,num_policies,type_of_policy,cltv,Female,Male,Rural,Urban,A,B,C,D,Bachelor,High School,Others
0,1,5L-10L,1.0,5.0,5790.0,More than 1,Platinum,64308.0,0,1,0,1,1,0,0,0,1,0,0
1,2,5L-10L,0.0,8.0,5080.0,More than 1,Platinum,515400.0,0,1,1,0,1,0,0,0,0,1,0
2,3,5L-10L,1.0,8.0,2599.0,More than 1,Platinum,64212.0,0,1,0,1,1,0,0,0,1,0,0
3,4,5L-10L,0.0,7.0,0.0,More than 1,Platinum,97920.0,1,0,1,0,1,0,0,0,0,1,0
4,5,More than 10L,1.0,6.0,3508.0,More than 1,Gold,59736.0,0,1,0,1,1,0,0,0,0,1,0


#### Ordinal encoding

In [188]:
# Inspect the distinct type_of_policy values and their frequencies
# before mapping them to numeric codes
df['type_of_policy'].value_counts()

type_of_policy
Platinum    5395
Gold        2285
Silver      2272
Name: count, dtype: int64

In [189]:
# Ordinal codes for the policy tier: Silver -> 0, Gold -> 1, Platinum -> 2
policy_tier_codes = {'Silver': 0, 'Gold': 1, 'Platinum': 2}
df['type_of_policy'] = df['type_of_policy'].map(policy_tier_codes)
df['type_of_policy'].value_counts()

type_of_policy
2    5395
1    2285
0    2272
Name: count, dtype: int64

In [190]:
# Inspect the distinct num_policies values and their frequencies
# before mapping them to numeric codes
df['num_policies'].value_counts()

num_policies
More than 1    6719
1              3233
Name: count, dtype: int64

In [191]:
# Binary flag: '1' (a single policy) -> 0, 'More than 1' -> 1
num_policies_codes = {'1': 0, 'More than 1': 1}
df['num_policies'] = df['num_policies'].map(num_policies_codes)
df['num_policies'].value_counts()

num_policies
1    6719
0    3233
Name: count, dtype: int64

In [192]:
# Inspect the distinct income brackets and their frequencies
# before mapping them to numeric codes
df['income'].value_counts()

income
5L-10L           5797
2L-5L            2417
More than 10L    1507
<=2L              231
Name: count, dtype: int64

Encode the df['income'] column as ordinal integers in ascending order of income: '<=2L' to 0, '2L-5L' to 1, '5L-10L' to 2 and 'More than 10L' to 3.

In [193]:
# Ordinal codes for the income brackets, lowest bracket first
income_codes = {'<=2L': 0, '2L-5L': 1, '5L-10L': 2, 'More than 10L': 3}
df['income'] = df['income'].map(income_codes)
df['income'].value_counts()

income
2    5797
1    2417
3    1507
0     231
Name: count, dtype: int64

In [194]:
# Preview the first rows now that income, num_policies and type_of_policy are numeric
df.head()

Unnamed: 0,id,income,marital_status,vintage,claim_amount,num_policies,type_of_policy,cltv,Female,Male,Rural,Urban,A,B,C,D,Bachelor,High School,Others
0,1,2,1.0,5.0,5790.0,1,2,64308.0,0,1,0,1,1,0,0,0,1,0,0
1,2,2,0.0,8.0,5080.0,1,2,515400.0,0,1,1,0,1,0,0,0,0,1,0
2,3,2,1.0,8.0,2599.0,1,2,64212.0,0,1,0,1,1,0,0,0,1,0,0
3,4,2,0.0,7.0,0.0,1,2,97920.0,1,0,1,0,1,0,0,0,0,1,0
4,5,3,1.0,6.0,3508.0,1,1,59736.0,0,1,0,1,1,0,0,0,0,1,0
