# Round 5 | Ainara Guerra 

#### But first, libraries

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import matplotlib.ticker as mk
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')
%matplotlib inline

from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# FIRST PART: Get the data

In [None]:
# Load the customer dataset from the CSV file (the path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='marketing_customer_analysis.csv')
# Display the loaded frame as the cell output.
df

# SECOND PART: Dealing with the data

In [None]:
# First, the columns: standardize their names

In [None]:
# Inspect the column names as they come from the CSV file.
df.columns

In [None]:
# Build the standardized names: lower-case every column name and replace spaces with underscores.
cols = [name.lower().replace(' ', '_') for name in df.columns]
df.columns = cols
# 'employmentstatus' is still a single word after the conversion, so split it explicitly.
df = df.rename({'employmentstatus': 'employment_status'}, axis='columns')
# Display the resulting column names.
df.columns

In [None]:
# Check the dtype and the non-null count of each column.
df.info()

In [None]:
# Summary statistics of the numerical columns.
df.describe()
# Author's note: looks good.

In [None]:
# Count the missing values (NaN) in each column.
df.isna().sum()
# Author's note: looks good.

In [None]:
# Store the numerical and the categorical columns in separate frames for later steps.
Z = df.copy()  # work on a copy so later changes to Z never touch df
Z_num = Z.select_dtypes(include=np.number)
# Numerical columns without 'total_claim_amount', which is used later as the regression target.
Z_num_2 = Z_num.drop(['total_claim_amount'], axis=1)
# Use the builtin `object`: the `np.object` alias was deprecated in NumPy 1.20 and removed
# in NumPy 1.24, where accessing it raises AttributeError.
Z_cat = Z.select_dtypes(include=object)

In [None]:
# Explore the categorical columns a little more to see if any of them holds unexpected values.
# A single loop replaces one copy-pasted cell per column, so the list of inspected
# columns lives in one place and every column is handled the same way.
categorical_columns_to_check = [
    'state',
    'response',
    'coverage',
    'education',
    'employment_status',
    'gender',
    'marital_status',
    'policy_type',
    'policy',
    'renew_offer_type',
    'sales_channel',
    'vehicle_class',
    'vehicle_size',
]

for column_name in categorical_columns_to_check:
    # value_counts() lists each distinct value with its frequency; the printed Series
    # footer shows the column name, and the blank line separates consecutive columns.
    print(df[column_name].value_counts(), end='\n\n')

# THIRD PART: EXPLORE THE DATA

In [None]:
# Histogram of the values in the 'response' column.
df['response'].hist()

In [None]:
# Plot of the response rate by sales channel

In [None]:
# Cormac showed us this method, and it is more efficient than what I did for my lab.
x, y = 'response', 'sales_channel'

# Share (in percent) of each sales channel within every response group, as a long-format frame.
df1 = (
    df.groupby(x)[y]
    .value_counts(normalize=True)
    .mul(100)
    .rename('percent')
    .reset_index()
)

g = sns.catplot(data=df1, x=x, y='percent', hue=y, kind='bar')
g.ax.set_ylim(0, 100)

# Write the percentage at the top-left corner of every bar.
for bar in g.ax.patches:
    bar_height = bar.get_height()
    g.ax.text(bar.get_x(), bar_height, str(bar_height.round(2)) + '%')

In [None]:
# A plot of the response rate by the total claim amount.
# This is inspired by Nathi's work, to try another type of exercise.
df2 = df.copy()
bins = [0, 200, 400, 600, 1000, 3000]
labels = ['0-200', '200-400', '400-600', '600-1000', '1000-3000']
# Bin "total_claim_amount" into ranges. include_lowest=True closes the first interval on the
# left, so a value exactly equal to the lowest edge (0) is binned instead of becoming NaN.
df2['binned'] = pd.cut(df2['total_claim_amount'], bins=bins, labels=labels, include_lowest=True)
# Encode the response as 1 for 'Yes' and 0 otherwise. An explicit comparison avoids the
# implicit object -> int downcast of replace(), which is deprecated since pandas 2.2.
df2['response'] = (df2['response'] == 'Yes').astype(int)
# observed=False keeps empty bins in the result and states explicitly the behavior that
# the deprecated implicit default of a categorical groupby used to provide.
grouped = df2.groupby('binned', observed=False).agg({'response': 'sum', 'total_claim_amount': 'count'})
# Response rate per bin = number of 'Yes' responses / number of customers in the bin.
grouped['response_rate'] = grouped['response'] / grouped['total_claim_amount']
plt.bar(grouped.index.astype(str), grouped['response_rate'], color = "pink")
plt.xlabel('Total Claim Amount Range')
plt.ylabel('Response Rate')
plt.title('Response Rate by Total Claim Amount')
plt.show()

In [None]:
# A plot of the response rate by the income.
# This is also inspired by Nathi's work, to try another type of exercise.
df3 = df.copy()
bins2 = [0, 25000, 50000, 75000, 100000]
labels2 = ['0-25000', '25000-50000', '50000-75000', '75000-100000']
# include_lowest=True closes the first interval on the left: without it an income of exactly 0
# falls outside every bin, becomes NaN and silently disappears from the groupby below.
df3['binned'] = pd.cut(df3['income'], bins=bins2, labels=labels2, include_lowest=True)
# Encode the response as 1 for 'Yes' and 0 otherwise. An explicit comparison avoids the
# implicit object -> int downcast of replace(), which is deprecated since pandas 2.2.
df3['response'] = (df3['response'] == 'Yes').astype(int)
# observed=False keeps empty bins in the result and states explicitly the behavior that
# the deprecated implicit default of a categorical groupby used to provide.
grouped1 = df3.groupby('binned', observed=False).agg({'response': 'sum', 'income': 'count'})
# Response rate per bin = number of 'Yes' responses / number of customers in the bin.
grouped1['response_rate'] = grouped1['response'] / grouped1['income']
# Plot grouped1 (the income bins); plotting `grouped` here would redraw the claim-amount
# chart under an income title.
plt.bar(grouped1.index.astype(str), grouped1['response_rate'], color = "purple")
plt.xlabel('Total Income Range')
plt.ylabel('Response Rate')
plt.title('Response Rate by Total Income Amount')
plt.show()

In [None]:
# Check the normality of the numerical variables visually:
# draw one distribution plot (histogram plus kernel density estimate) per numerical column.
for column in Z_num.columns:
    # sns.distplot is deprecated and scheduled for removal from seaborn; histplot with a KDE
    # overlay on a density scale is the documented replacement. cut=3 extends the KDE curve
    # beyond the extreme data points, as distplot did.
    sns.histplot(Z_num[column], kde=True, stat="density", kde_kws={"cut": 3})
    plt.show()

In [None]:

# Use Matplotlib to construct histograms: one subplot per numerical column on a 4x2 grid.
histogram_columns = [
    'customer_lifetime_value',
    'income',
    'monthly_premium_auto',
    'months_since_last_claim',
    'months_since_policy_inception',
    'number_of_open_complaints',
    'number_of_policies',
    'total_claim_amount',
]
fig, axs = plt.subplots(4, 2)
fig.set_size_inches(8, 8)
# axs.flat walks the grid row by row, so the columns fill it left to right, top to bottom.
for axis, column_name in zip(axs.flat, histogram_columns):
    axis.hist(Z_num[column_name])
    # Label each subplot; without a title the eight histograms cannot be told apart.
    axis.set_title(column_name)
# Leave room for the titles so they do not overlap the subplot above.
fig.tight_layout()
plt.show()

# FOURTH PART: Processing Data

#### NORMALIZE

In [None]:
# Lab question: do the distributions of the different numerical variables look like a normal distribution?
# total_claim_amount is left unchanged, which is why Z_num_2 (the numerical columns without it) is used.

# Rescale every feature with a min-max scaler fitted on Z_num_2 itself.
transformer = MinMaxScaler()
Z_minmax = transformer.fit_transform(Z_num_2)
# The scaler returns a plain array, so restore the column names in a new frame.
Z_num_norm = pd.DataFrame(data=Z_minmax, columns=Z_num_2.columns)
Z_num_norm.head()

#### CORRELATION

In [None]:
# Check the multicollinearity between the numerical variables.
# Note that total_claim_amount is used later as the target variable.
sns.pairplot(data=Z_num)
# Pairwise correlation matrix, rounded to two decimals for readability.
data_corr = Z_num.corr().round(2)
data_corr


In [None]:
# Drop one of the two features of every pair that shows a high correlation (greater than 0.9).
# If no pair of features is that highly correlated, no feature is dropped.
# NOTE(review): the columns to drop are derived from the correlation matrix instead of being
# listed by hand, so the selection always follows the threshold stated above.
CORRELATION_THRESHOLD = 0.9
TARGET_COLUMN = 'total_claim_amount'

# Absolute correlations between the features only; the target is never a drop candidate.
feature_corr = Z_num.drop(columns=[TARGET_COLUMN]).corr().abs()
# Keep the strict upper triangle so every pair is inspected exactly once and a feature
# is never compared with itself.
upper_triangle = feature_corr.where(np.triu(np.ones(feature_corr.shape, dtype=bool), k=1))
# For each highly correlated pair, the feature that appears later in the frame is dropped.
highly_correlated = [
    column for column in upper_triangle.columns
    if (upper_triangle[column] > CORRELATION_THRESHOLD).any()
]
print("Features dropped for high correlation:", highly_correlated)

Z_num_3 = Z_num.drop(columns=highly_correlated)
# Correlation matrix of the remaining columns (target included), rounded to two decimals.
data_corr_1 = Z_num_3.corr().round(2)
data_corr_1


In [None]:
# First heatmap: the full annotated correlation matrix, saved to 'heatmap.png'.
sns_plot = sns.heatmap(data_corr_1, annot=True)
figure = sns_plot.get_figure()
figure.savefig('heatmap.png', dpi=400)

# Second heatmap on its own figure: mask the upper triangle (diagonal included),
# which only repeats the values of the lower one.
mask = np.triu(np.ones_like(data_corr_1))
fig, ax = plt.subplots(figsize=(10, 8))
ax = sns.heatmap(data_corr_1, mask=mask, annot=True)
plt.show()

# X-Y SPLIT

In [None]:
# Separate the target (total_claim_amount) from the features.
y = df['total_claim_amount']
X = df.drop(['total_claim_amount'], axis=1)
# Select the numerical features from X, not from df: df still contains total_claim_amount,
# so selecting from it would hand the target to the model as a feature (target leakage).
x_num = X.select_dtypes(include=np.number)

In [None]:
# Fit a linear regression of the target on the numerical features.
lm = LinearRegression()
# fit() returns the fitted estimator, which is shown as the cell output. The stray
# `LinearRegression()` line that followed only built and discarded a second estimator.
lm.fit(x_num, y)

In [None]:
# Evaluate the model on the same data it was fitted on (in-sample metrics).
predictions = lm.predict(x_num)
# RMSE is the square root of the MSE. Computing it explicitly works on every scikit-learn
# version; the `squared=False` argument is deprecated and removed in scikit-learn 1.6.
rmse = np.sqrt(mean_squared_error(y, predictions))
mae = mean_absolute_error(y, predictions)
print("R2_score:", round(lm.score(x_num, y), 2))
print("RMSE:", rmse)
print("MAE:", mae)

In [None]:
# What happens if we use Z_num_3, the frame without the highly correlated features?
# Remove the target first so the model is fitted on features only.
Z_num_3_2 = Z_num_3.drop(['total_claim_amount'], axis=1)
lm1 = LinearRegression()
# fit() returns the fitted estimator, which is shown as the cell output. The stray
# `LinearRegression()` line that followed only built and discarded a second estimator.
lm1.fit(Z_num_3_2, y)

In [None]:
# Evaluate the reduced model on the same data it was fitted on (in-sample metrics).
predictions = lm1.predict(Z_num_3_2)
# RMSE is the square root of the MSE. Computing it explicitly works on every scikit-learn
# version; the `squared=False` argument is deprecated and removed in scikit-learn 1.6.
rmse1 = np.sqrt(mean_squared_error(y, predictions))
mae2 = mean_absolute_error(y, predictions)
print("R2_score:", round(lm1.score(Z_num_3_2, y), 2))
print("RMSE:", rmse1)
print("MAE:", mae2)

# The reduced feature set does not improve the metrics, so we keep the full set of numerical features.