In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import boxcox

In [2]:
file1 = pd.read_csv('Data/file1.csv')
file2 = pd.read_csv('Data/file2.csv')
file3 = pd.read_csv('Data/file3.csv')

In [3]:
df1 = file1

In [4]:
df1.columns

Index(['Customer', 'ST', 'GENDER', 'Education', 'Customer Lifetime Value',
       'Income', 'Monthly Premium Auto', 'Number of Open Complaints',
       'Policy Type', 'Vehicle Class', 'Total Claim Amount'],
      dtype='object')

In [5]:
df2 = file2

In [6]:
df2.columns

Index(['Customer', 'ST', 'GENDER', 'Education', 'Customer Lifetime Value',
       'Income', 'Monthly Premium Auto', 'Number of Open Complaints',
       'Total Claim Amount', 'Policy Type', 'Vehicle Class'],
      dtype='object')

In [7]:
df3 = file3

In [8]:
df3.columns

Index(['Customer', 'State', 'Customer Lifetime Value', 'Education', 'Gender',
       'Income', 'Monthly Premium Auto', 'Number of Open Complaints',
       'Policy Type', 'Total Claim Amount', 'Vehicle Class'],
      dtype='object')

In [9]:
df3.rename(columns = {'State': 'ST'}, inplace = True)  
df3.rename(columns = {'Gender': 'GENDER'}, inplace = True)  


In [10]:
df3.columns

Index(['Customer', 'ST', 'Customer Lifetime Value', 'Education', 'GENDER',
       'Income', 'Monthly Premium Auto', 'Number of Open Complaints',
       'Policy Type', 'Total Claim Amount', 'Vehicle Class'],
      dtype='object')

In [11]:
df = pd.concat([df1,df2,df3], axis=0)
df

Unnamed: 0,Customer,ST,GENDER,Education,Customer Lifetime Value,Income,Monthly Premium Auto,Number of Open Complaints,Policy Type,Vehicle Class,Total Claim Amount
0,RB50392,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,QZ44356,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,AI49188,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,WW63253,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,GA49547,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...,...
7065,LA72316,California,M,Bachelor,23405.98798,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
7066,PK87824,California,F,College,3096.511217,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
7067,TD14365,California,M,Bachelor,8163.890428,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
7068,UP19263,California,M,College,7524.442436,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


In [12]:
def lower_case_column_names(df):
    df.columns=[i.lower() for i in df.columns]
    return df

In [13]:
df=lower_case_column_names(df)
df

Unnamed: 0,customer,st,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,RB50392,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,QZ44356,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,AI49188,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,WW63253,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,GA49547,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...,...
7065,LA72316,California,M,Bachelor,23405.98798,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
7066,PK87824,California,F,College,3096.511217,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
7067,TD14365,California,M,Bachelor,8163.890428,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
7068,UP19263,California,M,College,7524.442436,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


In [14]:
print(df.duplicated().sum())

2939


In [15]:
df = df.drop_duplicates()

In [16]:
df

Unnamed: 0,customer,st,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,RB50392,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,QZ44356,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,AI49188,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,WW63253,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,GA49547,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...,...
7065,LA72316,California,M,Bachelor,23405.98798,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
7066,PK87824,California,F,College,3096.511217,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
7067,TD14365,California,M,Bachelor,8163.890428,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
7068,UP19263,California,M,College,7524.442436,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


In [17]:
df = df.drop(columns=['customer'])


In [18]:
df

Unnamed: 0,st,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...
7065,California,M,Bachelor,23405.98798,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
7066,California,F,College,3096.511217,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
7067,California,M,Bachelor,8163.890428,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
7068,California,M,College,7524.442436,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


In [19]:
df.head()

Unnamed: 0,st,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9135 entries, 0 to 7069
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   st                         9134 non-null   object 
 1   gender                     9012 non-null   object 
 2   education                  9134 non-null   object 
 3   customer lifetime value    9127 non-null   object 
 4   income                     9134 non-null   float64
 5   monthly premium auto       9134 non-null   float64
 6   number of open complaints  9134 non-null   object 
 7   policy type                9134 non-null   object 
 8   vehicle class              9134 non-null   object 
 9   total claim amount         9134 non-null   float64
dtypes: float64(3), object(7)
memory usage: 785.0+ KB


In [21]:
df["number of open complaints"]

0       1/0/00
1       1/0/00
2       1/0/00
3       1/0/00
4       1/0/00
         ...  
7065         0
7066         0
7067         3
7068         0
7069         0
Name: number of open complaints, Length: 9135, dtype: object

In [22]:
'''
if df["number of open complaints"].dtype == object:
    df["number of open complaints"] = df["number of open complaints"].str[0]
else:
    pass
'''

df["number of open complaints"] = df["number of open complaints"].apply(lambda x: x[2] if isinstance(x, str) else x)

In [23]:
df["number of open complaints"] = df["number of open complaints"].astype(float)
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 9135 entries, 0 to 7069
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   st                         9134 non-null   object 
 1   gender                     9012 non-null   object 
 2   education                  9134 non-null   object 
 3   customer lifetime value    9127 non-null   object 
 4   income                     9134 non-null   float64
 5   monthly premium auto       9134 non-null   float64
 6   number of open complaints  9134 non-null   float64
 7   policy type                9134 non-null   object 
 8   vehicle class              9134 non-null   object 
 9   total claim amount         9134 non-null   float64
dtypes: float64(4), object(6)
memory usage: 785.0+ KB


In [24]:
#df = df.dropna()

In [25]:
#df["number of open complaints"] = df["number of open complaints"].astype(float)
#df.info()

In [26]:
df["customer lifetime value"]

0               NaN
1        697953.59%
2       1288743.17%
3        764586.18%
4        536307.65%
           ...     
7065    23405.98798
7066    3096.511217
7067    8163.890428
7068    7524.442436
7069    2611.836866
Name: customer lifetime value, Length: 9135, dtype: object

In [27]:
''''
if df["customer lifetime value"].dtype == object :
    df["customer lifetime value"] = df["customer lifetime value"].str.replace('%', '')
else:
    pass
'''''
#df["customer lifetime value"] = df["customer lifetime value"].apply(lambda x: str(x).replace("%", ""))
df["customer lifetime value"] = df["customer lifetime value"].apply(lambda x: x if isinstance(x, (float, int)) else x.replace("%", ""))
#df["customer lifetime value"] = df["customer lifetime value"].str.replace('%', '')
df["customer lifetime value"]

0               NaN
1         697953.59
2        1288743.17
3         764586.18
4         536307.65
           ...     
7065    23405.98798
7066    3096.511217
7067    8163.890428
7068    7524.442436
7069    2611.836866
Name: customer lifetime value, Length: 9135, dtype: object

In [28]:
df["customer lifetime value"] = df["customer lifetime value"].astype(float)
#df = df.dropna()

In [29]:
#df["customer lifetime value"].isna().value_counts()
df["customer lifetime value"]

0                NaN
1       6.979536e+05
2       1.288743e+06
3       7.645862e+05
4       5.363077e+05
            ...     
7065    2.340599e+04
7066    3.096511e+03
7067    8.163890e+03
7068    7.524442e+03
7069    2.611837e+03
Name: customer lifetime value, Length: 9135, dtype: float64

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9135 entries, 0 to 7069
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   st                         9134 non-null   object 
 1   gender                     9012 non-null   object 
 2   education                  9134 non-null   object 
 3   customer lifetime value    9127 non-null   float64
 4   income                     9134 non-null   float64
 5   monthly premium auto       9134 non-null   float64
 6   number of open complaints  9134 non-null   float64
 7   policy type                9134 non-null   object 
 8   vehicle class              9134 non-null   object 
 9   total claim amount         9134 non-null   float64
dtypes: float64(5), object(5)
memory usage: 785.0+ KB


In [31]:
df

Unnamed: 0,st,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,Washington,,Master,,0.0,1000.0,0.0,Personal Auto,Four-Door Car,2.704934
1,Arizona,F,Bachelor,6.979536e+05,0.0,94.0,0.0,Personal Auto,Four-Door Car,1131.464935
2,Nevada,F,Bachelor,1.288743e+06,48767.0,108.0,0.0,Personal Auto,Two-Door Car,566.472247
3,California,M,Bachelor,7.645862e+05,0.0,106.0,0.0,Corporate Auto,SUV,529.881344
4,Washington,M,High School or Below,5.363077e+05,36357.0,68.0,0.0,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...
7065,California,M,Bachelor,2.340599e+04,71941.0,73.0,0.0,Personal Auto,Four-Door Car,198.234764
7066,California,F,College,3.096511e+03,21604.0,79.0,0.0,Corporate Auto,Four-Door Car,379.200000
7067,California,M,Bachelor,8.163890e+03,0.0,85.0,3.0,Corporate Auto,Four-Door Car,790.784983
7068,California,M,College,7.524442e+03,21941.0,96.0,0.0,Personal Auto,Four-Door Car,691.200000


In [32]:
df["st"].value_counts()

California    3030
Oregon        2601
Arizona       1629
Nevada         882
Washington     768
Cali           120
AZ              74
WA              30
Name: st, dtype: int64

In [33]:
df["st"] = df["st"].replace('AZ', 'Arizona')
df["st"] = df["st"].replace('WA', 'Washington')
df["st"] = df["st"].replace('Cali', 'California')
df["st"] = df["st"].replace('Washinton', 'Washington')


In [34]:
df["st"].value_counts()

California    3150
Oregon        2601
Arizona       1703
Nevada         882
Washington     798
Name: st, dtype: int64

In [35]:
df["gender"].value_counts()

F         4557
M         4368
Male        40
female      30
Femal       17
Name: gender, dtype: int64

In [36]:
def clean_gender(x):
    if x in ['M', 'Male']:
        return 'Male'
    elif x in ['F', 'female',"Femal"]:
        return 'Female'
    elif np.nan:  pass
    else:
        return 'U'

In [37]:
df['gender'] = list(map(clean_gender, df['gender'])) 

In [38]:
df["gender"].value_counts()

Female    4604
Male      4408
Name: gender, dtype: int64

In [39]:
#df.isna().value_count()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9135 entries, 0 to 7069
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   st                         9134 non-null   object 
 1   gender                     9012 non-null   object 
 2   education                  9134 non-null   object 
 3   customer lifetime value    9127 non-null   float64
 4   income                     9134 non-null   float64
 5   monthly premium auto       9134 non-null   float64
 6   number of open complaints  9134 non-null   float64
 7   policy type                9134 non-null   object 
 8   vehicle class              9134 non-null   object 
 9   total claim amount         9134 non-null   float64
dtypes: float64(5), object(5)
memory usage: 785.0+ KB


In [40]:
df.columns


Index(['st', 'gender', 'education', 'customer lifetime value', 'income',
       'monthly premium auto', 'number of open complaints', 'policy type',
       'vehicle class', 'total claim amount'],
      dtype='object')

In [41]:
df['customer lifetime value'] = df['customer lifetime value'].fillna(df['customer lifetime value'].mean(), inplace = False)

In [42]:
df['income'] = df['income'].replace(0, df['income'].mean(), inplace = False)


In [43]:
df['number of open complaints'] = df['number of open complaints'].fillna(df['number of open complaints'].mean(), inplace = False)

In [44]:
df['monthly premium auto'] = df['monthly premium auto'].fillna(df['monthly premium auto'].mean(), inplace = False)

In [45]:
df['total claim amount'] = df['total claim amount'].fillna(df['total claim amount'].mean(), inplace = False)

In [46]:
df['customer lifetime value'].value_counts()


182071.517615    8
6689.022728      6
4270.034394      6
5568.947534      6
5246.278375      6
                ..
15700.284360     1
2968.077571      1
5452.171237      1
2558.762931      1
2611.836866      1
Name: customer lifetime value, Length: 8212, dtype: int64

In [47]:
df.isna()

Unnamed: 0,st,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,False,True,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
7065,False,False,False,False,False,False,False,False,False,False
7066,False,False,False,False,False,False,False,False,False,False
7067,False,False,False,False,False,False,False,False,False,False
7068,False,False,False,False,False,False,False,False,False,False


In [48]:
df["customer lifetime value"] = df["customer lifetime value"].astype(int)
df["number of open complaints"] = df["number of open complaints"].astype(int)

In [49]:
#Write a function to replace column "State" to different zones. California as West Region, Oregon as North West, and Washington as East, and Arizona and Nevada as Central


def map_state_to_zone(df):
    df["st"] = df["st"].replace({"California": "West Region", "Oregon": "North West", "Washington": "East", "Arizona": "Central", "Nevada":"Central"})
    return df

In [50]:
map_state_to_zone(df)


Unnamed: 0,st,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,East,,Master,182071,37824.847055,1000.0,0,Personal Auto,Four-Door Car,2.704934
1,Central,Female,Bachelor,697953,37824.847055,94.0,0,Personal Auto,Four-Door Car,1131.464935
2,Central,Female,Bachelor,1288743,48767.000000,108.0,0,Personal Auto,Two-Door Car,566.472247
3,West Region,Male,Bachelor,764586,37824.847055,106.0,0,Corporate Auto,SUV,529.881344
4,East,Male,High School or Below,536307,36357.000000,68.0,0,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...
7065,West Region,Male,Bachelor,23405,71941.000000,73.0,0,Personal Auto,Four-Door Car,198.234764
7066,West Region,Female,College,3096,21604.000000,79.0,0,Corporate Auto,Four-Door Car,379.200000
7067,West Region,Male,Bachelor,8163,37824.847055,85.0,3,Corporate Auto,Four-Door Car,790.784983
7068,West Region,Male,College,7524,21941.000000,96.0,0,Personal Auto,Four-Door Car,691.200000


In [51]:
df["vehicle class"] = df["vehicle class"].replace({"Luxury SUV": "Luxury Vehicle", "Luxury Car": "Luxury Vehicle"})

In [52]:
df["vehicle class"].value_counts()

Four-Door Car     4640
Two-Door Car      1895
SUV               1773
Sports Car         483
Luxury Vehicle     343
Name: vehicle class, dtype: int64

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9135 entries, 0 to 7069
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   st                         9134 non-null   object 
 1   gender                     9012 non-null   object 
 2   education                  9134 non-null   object 
 3   customer lifetime value    9135 non-null   int32  
 4   income                     9134 non-null   float64
 5   monthly premium auto       9135 non-null   float64
 6   number of open complaints  9135 non-null   int32  
 7   policy type                9134 non-null   object 
 8   vehicle class              9134 non-null   object 
 9   total claim amount         9135 non-null   float64
dtypes: float64(3), int32(2), object(5)
memory usage: 713.7+ KB


In [54]:
def remove_outliers(df):
    for col in df.select_dtypes(include=np.number).columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (1.5 * iqr)
        upper_bound = q3 + (1.5 * iqr)
        df.drop(df[(df[col] < lower_bound) | (df[col] > upper_bound)].index, inplace=True)
        

In [55]:
remove_outliers(df)

In [56]:
df

Unnamed: 0,st,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
1072,North West,Male,Doctor,8653,69704.000000,72.0,0,Personal Auto,Two-Door Car,89.921281
1073,North West,Male,Doctor,3776,20101.000000,96.0,0,Corporate Auto,Four-Door Car,291.536748
1075,East,Male,Bachelor,14947,22139.000000,100.0,0,Personal Auto,SUV,480.000000
1076,Central,Male,Bachelor,9011,88708.000000,112.0,0,Personal Auto,Sports Car,499.663591
1077,Central,Female,High School or Below,10083,11828.000000,90.0,0,Personal Auto,Four-Door Car,571.479602
...,...,...,...,...,...,...,...,...,...,...
7064,West Region,Female,College,4100,47761.000000,104.0,0,Personal Auto,Four-Door Car,541.282007
7065,West Region,Male,Bachelor,23405,71941.000000,73.0,0,Personal Auto,Four-Door Car,198.234764
7066,West Region,Female,College,3096,21604.000000,79.0,0,Corporate Auto,Four-Door Car,379.200000
7068,West Region,Male,College,7524,21941.000000,96.0,0,Personal Auto,Four-Door Car,691.200000


In [57]:
def standardize_text(df):
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].str.lower()

In [58]:
standardize_text(df)

In [59]:
df

Unnamed: 0,st,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
1072,north west,male,doctor,8653,69704.000000,72.0,0,personal auto,two-door car,89.921281
1073,north west,male,doctor,3776,20101.000000,96.0,0,corporate auto,four-door car,291.536748
1075,east,male,bachelor,14947,22139.000000,100.0,0,personal auto,suv,480.000000
1076,central,male,bachelor,9011,88708.000000,112.0,0,personal auto,sports car,499.663591
1077,central,female,high school or below,10083,11828.000000,90.0,0,personal auto,four-door car,571.479602
...,...,...,...,...,...,...,...,...,...,...
7064,west region,female,college,4100,47761.000000,104.0,0,personal auto,four-door car,541.282007
7065,west region,male,bachelor,23405,71941.000000,73.0,0,personal auto,four-door car,198.234764
7066,west region,female,college,3096,21604.000000,79.0,0,corporate auto,four-door car,379.200000
7068,west region,male,college,7524,21941.000000,96.0,0,personal auto,four-door car,691.200000
