# 0.0. Importações

## 0.1. Bibliotecas

In [6]:
import inflection
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn import metrics as mt
from sklearn.model_selection import train_test_split

## 0.2. Dados

In [7]:
df = pd.read_csv('../data/train.csv')

# x still contains the 'Satisfaction' column (the cells below read it from
# x_train / x_test to check the class balance); drop it from the features
# before fitting any model.
x = df.copy()
y = df['Satisfaction'].copy()

# A fixed random_state makes the split identical on every "Restart & Run All";
# stratify=y keeps the class proportions the same in the train and test parts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42
)

In [8]:
df.shape

(103904, 24)

In [11]:
df["Satisfaction"].value_counts(normalize = True)

Neutral or Dissatisfaction    0.566667
Satisfied                     0.433333
Name: Satisfaction, dtype: float64

In [9]:
x_train.shape

(83123, 24)

In [12]:
x_train["Satisfaction"].value_counts(normalize = True)

Neutral or Dissatisfaction    0.566666
Satisfied                     0.433334
Name: Satisfaction, dtype: float64

In [10]:
x_test.shape

(20781, 24)

In [13]:
x_test["Satisfaction"].value_counts(normalize = True)

Neutral or Dissatisfaction    0.566671
Satisfied                     0.433329
Name: Satisfaction, dtype: float64

## 0.3. Funções Auxiliares

In [14]:
def cross_val( X, y, model, cv, limiar = 0.5, verbose = True, random_state = None):
    """Estimate a model's F1 score over repeated random hold-out splits.

    Each of the ``cv`` rounds draws a fresh 80/20 split with
    ``train_test_split``, refits ``model`` on the 80% part and scores its
    thresholded predictions on the remaining 20% with ``mt.f1_score``.
    Note that this is repeated hold-out, not k-fold cross-validation.

    Parameters
    ----------
    X, y :
        Features and target, passed unchanged to ``train_test_split``.
        NOTE(review): predictions are emitted as the integers 0/1, so ``y``
        presumably has to be encoded as 0/1 already -- confirm before use.
    model :
        Estimator exposing ``fit`` and ``predict_proba``. It is refitted in
        place on every round.
    cv : int
        Number of split/fit/score rounds.
    limiar : float, default 0.5
        Decision threshold: a probability strictly greater than ``limiar``
        becomes class 1, anything else class 0.
    verbose : bool, default True
        When true, print the mean score as a percentage with two decimals.
    random_state : int or None, default None
        Seed for the splits. ``None`` keeps the previous behaviour (splits
        drawn from NumPy's global random state).

    Returns
    -------
    float
        Mean F1 score over the ``cv`` rounds.
    """
    # With a seed, a single RandomState is shared by all rounds: the whole run
    # is reproducible while every round still receives a different split.
    rng = None if random_state is None else np.random.RandomState(random_state)

    scores = []

    # The round index is not needed; `_` also avoids shadowing the `cv` argument.
    for _ in range(cv):

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size = 0.2, random_state = rng
        )

        model.fit(X_train, y_train)

        # Column 1 of predict_proba is taken as the positive-class probability.
        probas = np.asarray(model.predict_proba(X_test))[:, 1]

        # Vectorised thresholding (strictly greater than `limiar` -> 1, else 0).
        predictions = (probas > limiar).astype(int)

        # Model evaluation
        scores.append(mt.f1_score(y_test, predictions))

    mean_score = np.mean(scores)

    if verbose:
        print('Score:', round(100*mean_score, 2))

    return mean_score



def rename_columns(dataframe):
    """Return a copy of ``dataframe`` whose column labels are snake_case.

    Every label goes through the same four steps, in this order:
    ``inflection.titleize`` -> remove spaces -> ``inflection.transliterate``
    -> ``inflection.underscore``. The input frame is left untouched.
    """
    renamed = dataframe.copy()

    new_labels = []
    for label in list(renamed.columns):
        titled = inflection.titleize(label)
        compact = titled.replace(" ", "")
        ascii_label = inflection.transliterate(compact)
        new_labels.append(inflection.underscore(ascii_label))

    renamed.columns = new_labels
    return renamed

# 1.0. Descrição dos dados

In [15]:
df1 = x_train.copy()

In [16]:
df1.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Purchase,Store size,Store distance,InStore wifi,Open/Close time convenient,Easy of online shopping,...,Kids entertainment,Seller service,Showroom,Self-Store,Purchase service,Store Service,Cleanliness,Carrier delay in minutes,Delivery delay in minutes,Satisfaction
81452,35541,Female,Loyal Customer,25,Personal,Medium,1074,4,5,4,...,2,4,3,4,4,4,2,0,2.0,Satisfied
50353,117031,Male,Loyal Customer,41,Gift,Large,2171,3,3,3,...,4,4,4,4,5,4,3,68,61.0,Satisfied
30197,71420,Female,Loyal Customer,43,Gift,Large,867,4,4,4,...,5,5,5,5,5,5,4,7,6.0,Satisfied
65127,64707,Female,disloyal Customer,13,Gift,Large,551,0,0,0,...,3,3,4,5,4,5,3,76,69.0,Satisfied
10686,4947,Male,Loyal Customer,62,Gift,Small,268,5,5,5,...,5,4,1,2,4,1,5,0,3.0,Satisfied


## 1.1 Rename Columns 

In [20]:
df1.columns

Index(['id', 'Gender', 'Customer Type', 'Age', 'Type of Purchase',
       'Store size', 'Store distance', 'InStore wifi',
       'Open/Close time convenient', 'Easy of online shopping',
       'Store location', 'Toilet cleaning', 'Dressing room', 'Waiting room',
       'Kids entertainment', 'Seller service', 'Showroom ', 'Self-Store',
       'Purchase service', 'Store Service', 'Cleanliness',
       'Carrier delay in minutes', 'Delivery delay in minutes',
       'Satisfaction'],
      dtype='object')

In [21]:
df1 = rename_columns(df1)

In [22]:
df1.columns

Index(['id', 'gender', 'customer_type', 'age', 'type_of_purchase',
       'store_size', 'store_distance', 'in_store_wifi',
       'open/close_time_convenient', 'easy_of_online_shopping',
       'store_location', 'toilet_cleaning', 'dressing_room', 'waiting_room',
       'kids_entertainment', 'seller_service', 'showroom', 'self_store',
       'purchase_service', 'store_service', 'cleanliness',
       'carrier_delay_in_minutes', 'delivery_delay_in_minutes',
       'satisfaction'],
      dtype='object')

In [30]:
df1['store_location'].unique()

array([5, 3, 4, 1, 2, 0], dtype=int64)

In [None]:
# Rename columns so the names make more sense during analysis, avoiding
# misinterpretation and keeping the names reasonably short.
# NOTE(review): none of the keys below appear in the df1 columns listed in the
# previous cell outputs, so this rename currently changes nothing
# (DataFrame.rename ignores missing labels by default). It looks like a leftover
# from a different dataset -- confirm, then update the mapping or drop the cell.

# policy_sales_channel -> sales_channel
# previously_insured -> vehicle_insured
# vintage -> days_client_associate
# response -> interested


df1.rename(columns = {'policy_sales_channel':'sales_channel', 'previously_insured' : 'vehicle_insured', 'vintage': 'days_client_associate',
                      'response':'interested'}, inplace = True)

## 1.1 Columns Meanings

In [None]:

- **id:** uma identificação única para cada cliente;


- **gender:** gênero do cliente;


- **customer_type:** Tipo de cliente (fidelizado ou não);
    

- **age:** idade do cliente;


- **type_of_purchase:** tipo de compras (pessoal ou presente);


- **store_size:** tamanho da loja;
    
    
- **store_distance:** distância da loja ao centro da cidade;


- **in_store_wifi:** nível de qualidade do wifi na loja (0 a 5);


- **open/close_time_convenient:** nível de satisfação com o horário de abertura e fechamento (0 a 5);


- **easy_of_online_shopping:** nível de satisfação com a facilidade de compra online (0 a 5);


- **store_location:** nível de satisfação com a localização da loja (0 a 5);


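- **annual_premium:** montante que o cliente necessita pagar como prêmio anualmente (TODO: esta coluna não aparece em `df1.columns`; parece ser um resquício de outro dataset — confirmar e remover);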

## 1.1. Dimensões

In [17]:
print("Este dataset tem {} linhas e {} colunas".format(df1.shape[0], df1.shape[1]))

Este dataset tem 83123 linhas e 24 colunas


## 1.2. Tipos

In [18]:
df1.dtypes

id                              int64
Gender                         object
Customer Type                  object
Age                             int64
Type of Purchase               object
Store size                     object
Store distance                  int64
InStore wifi                    int64
Open/Close time convenient      int64
Easy of online shopping         int64
Store location                  int64
Toilet cleaning                 int64
Dressing room                   int64
Waiting room                    int64
Kids entertainment              int64
Seller service                  int64
Showroom                        int64
Self-Store                      int64
Purchase service                int64
Store Service                   int64
Cleanliness                     int64
Carrier delay in minutes        int64
Delivery delay in minutes     float64
Satisfaction                   object
dtype: object

## 1.4. NaNs

In [13]:
df1.isna().sum()

id                            0
Gender                        0
Customer Type                 0
Age                           0
Type of Purchase              0
Store size                    0
Store distance                0
InStore wifi                  0
Open/Close time convenient    0
Easy of online shopping       0
Store location                0
Toilet cleaning               0
Dressing room                 0
Waiting room                  0
Kids entertainment            0
Seller service                0
Showroom                      0
Self-Store                    0
Purchase service              0
Store Service                 0
Cleanliness                   0
Carrier delay in minutes      0
Delivery delay in minutes     0
Satisfaction                  0
dtype: int64

In [12]:
# Impute missing delivery delays with the column median of df1 (the training split).
# The snake_case label is required here: rename_columns() has already been applied
# to df1 above, so the original 'Delivery delay in minutes' label no longer exists
# and using it raises KeyError on a fresh "Restart & Run All".
df1['delivery_delay_in_minutes'] = df1['delivery_delay_in_minutes'].fillna(df1['delivery_delay_in_minutes'].median())

## 1.5. Estatística Descritiva

In [14]:
var_num = df1.select_dtypes(include=['int64', 'float64'])
var_cat = df1.select_dtypes(exclude=['int64', 'float64'])

In [15]:
var_num.describe()

Unnamed: 0,id,Age,Store distance,InStore wifi,Open/Close time convenient,Easy of online shopping,Store location,Toilet cleaning,Dressing room,Waiting room,Kids entertainment,Seller service,Showroom,Self-Store,Purchase service,Store Service,Cleanliness,Carrier delay in minutes,Delivery delay in minutes
count,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0
mean,64924.210502,39.379706,1189.448375,2.729683,3.060296,2.756901,2.976883,3.202129,3.250375,3.439396,3.358158,3.382363,3.351055,3.631833,3.30429,3.640428,3.286351,14.815618,15.133392
std,37463.812252,15.114964,997.147281,1.327829,1.525075,1.398929,1.277621,1.329533,1.349509,1.319088,1.332991,1.288354,1.315605,1.180903,1.265396,1.175663,1.312273,38.230901,38.649776
min,1.0,7.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,32533.75,27.0,414.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0,3.0,2.0,0.0,0.0
50%,64856.5,40.0,843.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,3.0,0.0,0.0
75%,97368.25,51.0,1743.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,4.0,5.0,4.0,5.0,4.0,12.0,13.0
max,129880.0,85.0,4983.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,1592.0,1584.0


In [16]:
var_cat.describe(include='object')

Unnamed: 0,Gender,Customer Type,Type of Purchase,Store size,Satisfaction
count,103904,103904,103904,103904,103904
unique,2,2,2,3,2
top,Female,Loyal Customer,Gift,Large,Neutral or Dissatisfaction
freq,52727,84923,71655,49665,58879


# 2.0. Feature Engineering

In [17]:
df2 = df1.copy()

# 3.0. Análise Exploratória de Dados

In [18]:
df3 = df2.copy()

## 3.1. Univariada

### 3.1.1. Variável Alvo

In [20]:
# Class distribution of the target variable. px.histogram counts the rows per
# category (px.bar with only `x` would draw one mark per row against the index).
# The column is 'satisfaction' because df3 descends from the renamed df1.
px.histogram(df3, x='satisfaction')

### 3.1.2. Variáveis Numéricas

### 3.1.3. Variáveis Categóricas

## 3.2. Bivariada

### 3.2.1. Variáveis Numéricas

### 3.2.2. Variáveis Categóricas

## 3.3. Multivariada

### 3.3.1. Variáveis Numéricas

### 3.3.2. Variáveis Categóricas

# 4.0. Preparação dos Dados

In [None]:
df4 = df3.copy()

## 4.1. Filtragem

## 4.2. Encoding

## 4.3. Rescaling

# 5.0. Feature Selection

In [None]:
df5 = df4.copy()

# 6.0. Modelos de Machine Learning

In [None]:
df6 = df5.copy()

# 7.0. Hiperparâmetros

In [None]:
df7 = df6.copy()

# 8.0. Interpretação do Erro

In [None]:
df8 = df7.copy()