In [1]:
#!conda install openpyxl
#!conda install xlrd

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import boxcox

pd.options.display.max_rows = 50

# Activites List

### Activity 1

- Aggregate data into one Data Frame using Pandas.
- Standardizing header names
- Deleting and rearranging columns – delete the column customer as it is only a unique identifier for each row of data
- Working with data types – Check the data types of all the columns and fix the incorrect ones (for ex. customer lifetime value and number of complaints )
- Filtering data and Correcting typos – Filter the data in state and gender column to standardize the texts in those columns
- Removing duplicates
- Replacing null values – Replace missing values with means of the column (for numerical columns)

In [34]:
file1 = pd.read_csv('data/file1.csv')

In [35]:
file1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4008 entries, 0 to 4007
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Customer                   1071 non-null   object 
 1   ST                         1071 non-null   object 
 2   GENDER                     954 non-null    object 
 3   Education                  1071 non-null   object 
 4   Customer Lifetime Value    1068 non-null   object 
 5   Income                     1071 non-null   float64
 6   Monthly Premium Auto       1071 non-null   float64
 7   Number of Open Complaints  1071 non-null   object 
 8   Policy Type                1071 non-null   object 
 9   Vehicle Class              1071 non-null   object 
 10  Total Claim Amount         1071 non-null   float64
dtypes: float64(3), object(8)
memory usage: 344.6+ KB


In [36]:
file2 = pd.read_csv('data/file2.csv')

In [37]:
file2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 996 entries, 0 to 995
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Customer                   996 non-null    object 
 1   ST                         996 non-null    object 
 2   GENDER                     991 non-null    object 
 3   Education                  996 non-null    object 
 4   Customer Lifetime Value    992 non-null    object 
 5   Income                     996 non-null    int64  
 6   Monthly Premium Auto       996 non-null    int64  
 7   Number of Open Complaints  996 non-null    object 
 8   Total Claim Amount         996 non-null    float64
 9   Policy Type                996 non-null    object 
 10  Vehicle Class              996 non-null    object 
dtypes: float64(1), int64(2), object(8)
memory usage: 85.7+ KB


In [38]:
file3 = pd.read_csv('data/file3.csv')

In [39]:
file3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7070 entries, 0 to 7069
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Customer                   7070 non-null   object 
 1   State                      7070 non-null   object 
 2   Customer Lifetime Value    7070 non-null   float64
 3   Education                  7070 non-null   object 
 4   Gender                     7070 non-null   object 
 5   Income                     7070 non-null   int64  
 6   Monthly Premium Auto       7070 non-null   int64  
 7   Number of Open Complaints  7070 non-null   int64  
 8   Policy Type                7070 non-null   object 
 9   Total Claim Amount         7070 non-null   float64
 10  Vehicle Class              7070 non-null   object 
dtypes: float64(2), int64(3), object(6)
memory usage: 607.7+ KB


In [40]:
# call the function for file3
def rename_columns(file3):
    file3.rename(columns={'State': 'ST', 'Gender': 'GENDER'}, inplace=True )
    return file3

In [41]:
rename_file3 = rename_columns(file3)

In [42]:
file1.info()
file2.info()
rename_file3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4008 entries, 0 to 4007
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Customer                   1071 non-null   object 
 1   ST                         1071 non-null   object 
 2   GENDER                     954 non-null    object 
 3   Education                  1071 non-null   object 
 4   Customer Lifetime Value    1068 non-null   object 
 5   Income                     1071 non-null   float64
 6   Monthly Premium Auto       1071 non-null   float64
 7   Number of Open Complaints  1071 non-null   object 
 8   Policy Type                1071 non-null   object 
 9   Vehicle Class              1071 non-null   object 
 10  Total Claim Amount         1071 non-null   float64
dtypes: float64(3), object(8)
memory usage: 344.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 996 entries, 0 to 995
Data columns (total 11 columns):
 #   C

In [43]:
def aggregate ():
    file1 = pd.read_csv('data/file1.csv')
    file2 = pd.read_csv('data/file2.csv')
    rename_file3
    return pd.concat([file1, file2, rename_file3], axis=0)


In [44]:
ca_df = aggregate ()

In [45]:
list(ca_df.columns)

['Customer',
 'ST',
 'GENDER',
 'Education',
 'Customer Lifetime Value',
 'Income',
 'Monthly Premium Auto',
 'Number of Open Complaints',
 'Policy Type',
 'Vehicle Class',
 'Total Claim Amount']

In [46]:
def lower_case_column_names(ca_df):
    ca_df.columns=[i.lower() for i in ca_df.columns]
    return ca_df

In [47]:
lower_case_column_names(ca_df)

Unnamed: 0,customer,st,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,RB50392,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,QZ44356,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,AI49188,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,WW63253,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,GA49547,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...,...
7065,LA72316,California,M,Bachelor,23405.98798,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
7066,PK87824,California,F,College,3096.511217,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
7067,TD14365,California,M,Bachelor,8163.890428,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
7068,UP19263,California,M,College,7524.442436,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


In [48]:
# delete the column customer as it is only a unique identifier for each row of data
def drop_columns(ca_df) :
    ca_df.drop(columns=['customer'], inplace=True)
    return ca_df

In [49]:
ca_df = drop_columns(ca_df)

In [50]:
ca_df

Unnamed: 0,st,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...
7065,California,M,Bachelor,23405.98798,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
7066,California,F,College,3096.511217,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
7067,California,M,Bachelor,8163.890428,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
7068,California,M,College,7524.442436,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


In [51]:
#Check the data types of all the columns and 
#fix the incorrect ones (for ex. customer lifetime value and number of complaints)
ca_df.dtypes

st                            object
gender                        object
education                     object
customer lifetime value       object
income                       float64
monthly premium auto         float64
number of open complaints     object
policy type                   object
vehicle class                 object
total claim amount           float64
dtype: object

In [52]:
ca_df['customer lifetime value'] =  pd.to_numeric(ca_df['customer lifetime value'], errors='coerce')
ca_df['number of open complaints'] =  pd.to_numeric(ca_df['number of open complaints'], errors='coerce')

In [53]:
ca_df.dtypes

st                            object
gender                        object
education                     object
customer lifetime value      float64
income                       float64
monthly premium auto         float64
number of open complaints    float64
policy type                   object
vehicle class                 object
total claim amount           float64
dtype: object

In [54]:
#Filter the data in state and gender column to standardize the texts in those columns
ca_df['gender'].unique()

array([nan, 'F', 'M', 'Femal', 'Male', 'female'], dtype=object)

In [55]:
ca_df['st'].unique()

array(['Washington', 'Arizona', 'Nevada', 'California', 'Oregon', 'Cali',
       'AZ', 'WA', nan], dtype=object)

In [58]:
# list(map(lambda x: "F" if ("[fF]" in x) else x, ca_df['gender'])) 
#no working with  float.. why? gender is an object
ca_df['gender'] = ca_df['gender'].astype(str)

In [79]:
ca_df['gender'] = list(map(lambda x: "F" if ("emal" in x) else x, ca_df['gender']))

In [77]:
ca_df['gender'] = list(map(lambda x: "M" if ("Male" in x) else x, ca_df['gender']))

In [74]:
ca_df['gender'] = list(map(lambda x: "other" if ("nan" in x) else x, ca_df['gender']))

In [78]:
ca_df['gender'].unique()

array(['other', 'F', 'M'], dtype=object)

In [81]:
# Removing duplicates
ca_df.drop_duplicates()

Unnamed: 0,st,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,Washington,other,Master,,0.0,1000.0,,Personal Auto,Four-Door Car,2.704934
1,Arizona,F,Bachelor,,0.0,94.0,,Personal Auto,Four-Door Car,1131.464935
2,Nevada,F,Bachelor,,48767.0,108.0,,Personal Auto,Two-Door Car,566.472247
3,California,M,Bachelor,,0.0,106.0,,Corporate Auto,SUV,529.881344
4,Washington,M,High School or Below,,36357.0,68.0,,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...
7065,California,M,Bachelor,23405.987980,71941.0,73.0,0.0,Personal Auto,Four-Door Car,198.234764
7066,California,F,College,3096.511217,21604.0,79.0,0.0,Corporate Auto,Four-Door Car,379.200000
7067,California,M,Bachelor,8163.890428,0.0,85.0,3.0,Corporate Auto,Four-Door Car,790.784983
7068,California,M,College,7524.442436,21941.0,96.0,0.0,Personal Auto,Four-Door Car,691.200000


In [83]:
# Replacing null values – Replace missing values with means of the column (for numerical columns)
ca_df[ca_df['customer lifetime value'].isna()==True]

Unnamed: 0,st,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,Washington,other,Master,,0.0,1000.0,,Personal Auto,Four-Door Car,2.704934
1,Arizona,F,Bachelor,,0.0,94.0,,Personal Auto,Four-Door Car,1131.464935
2,Nevada,F,Bachelor,,48767.0,108.0,,Personal Auto,Two-Door Car,566.472247
3,California,M,Bachelor,,0.0,106.0,,Corporate Auto,SUV,529.881344
4,Washington,M,High School or Below,,36357.0,68.0,,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...
991,Arizona,M,Master,,63513.0,70.0,,Personal Auto,Four-Door Car,185.667213
992,Arizona,F,College,,58161.0,68.0,,Corporate Auto,Four-Door Car,140.747286
993,Nevada,F,College,,83640.0,70.0,,Corporate Auto,Two-Door Car,471.050488
994,California,F,Master,,0.0,96.0,,Personal Auto,Two-Door Car,28.460568


In [86]:
# mean_customer_lifetime_value
mean_customer_lifetime_value = round(np.mean(ca_df['customer lifetime value']), 2)

In [87]:
ca_df['customer lifetime value'] = ca_df['customer lifetime value'].fillna(mean_customer_lifetime_value)

In [90]:
ca_df['customer lifetime value'].unique()

array([8028.8     , 3479.137523, 2502.637401, ..., 8163.890428,
       7524.442436, 2611.836866])

In [93]:
# mean_income
mean_income = round(np.mean(ca_df['income']), 2)

In [94]:
ca_df['income'] = ca_df['income'].fillna(mean_income)

In [95]:
ca_df['income'].unique()

array([    0., 48767., 36357., ..., 66367., 71941., 21941.])

In [99]:
# mean_monthly_premium_auto
mean_monthly_premium_auto = round(np.mean(ca_df['monthly premium auto']), 2)

In [100]:
ca_df['monthly premium auto'] = ca_df['monthly premium auto'].fillna(mean_monthly_premium_auto)

In [101]:
#mean_number_of_open_complaints
mean_number_of_open_complaints = round(np.mean(ca_df['number of open complaints']), 2)

In [102]:
ca_df['number of open complaints'] = ca_df['number of open complaints'].fillna(mean_number_of_open_complaints)

In [103]:
ca_df['number of open complaints'].unique()

array([0.38, 0.  , 2.  , 3.  , 1.  , 5.  , 4.  ])

In [107]:
# mean_total_claim_amount
mean_total_claim_amount = round(np.mean(ca_df['total claim amount']))

In [105]:
ca_df['total claim amount'] = ca_df['total claim amount'].fillna(mean_total_claim_amount)

In [106]:
ca_df['total claim amount'].unique()

array([   2.704934, 1131.464935,  566.472247, ...,  541.282007,
        198.234764,  790.784983])