# 0.0 Imports

In [18]:
import os
from getpass import getpass

import pandas as pd
import matplotlib as plt
import seaborn as sns
import psycopg2 as pg
import sqlalchemy as db

from sqlalchemy import create_engine

from sklearn.model_selection import train_test_split

from IPython.display import display, HTML

 ## 0.1 Helper Functions

In [6]:
def jupyter_settings():
    %matplotlib inline
    
    plt.style.use('bmh')
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    
    display(HTML("<style>.container { width:95% !important; }</style>"))
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option('display.expand_frame_repr', False)
    
    sns.set

In [7]:
# Apply the plotting and display configuration defined in jupyter_settings()
jupyter_settings()

## 0.2 Loading Data

In [8]:
# Database connection settings.
#
# Non-secret values default to the instance this notebook was written against
# and can be overridden through environment variables.

host = os.environ.get('PA004_DB_HOST', 'comunidade-ds-postgres.c50pcakiuwi3.us-east-1.rds.amazonaws.com')
port = int(os.environ.get('PA004_DB_PORT', '5432'))
database = os.environ.get('PA004_DB_NAME', 'comunidadedsdb')
username = os.environ.get('PA004_DB_USER', 'member')

# The password is not stored in the notebook: it is read from the
# PA004_DB_PASSWORD environment variable, or prompted for when that is unset.
# Set the variable before running the notebook non-interactively.
pwd = os.environ.get('PA004_DB_PASSWORD') or getpass('PostgreSQL password: ')

In [9]:
# Build the PostgreSQL connection URL from the credentials, create the
# SQLAlchemy engine and open the connection used by the queries below.

string_connect = f'postgresql://{username}:{pwd}@{host}:{port}/{database}'
engine = create_engine(string_connect)
conn1 = engine.connect()


In [10]:
# Query joining the users, vehicle and insurance tables of schema pa004 on `id`.
#
# USING (id) is used instead of ON (u.id = v.id): PostgreSQL then emits the
# join column once, so SELECT * does not return three columns all named `id`
# (duplicate column names make df['id'] ambiguous in pandas).

query_tables = """
SELECT *
FROM pa004.users u
LEFT JOIN pa004.vehicle v USING (id)
LEFT JOIN pa004.insurance i USING (id)
"""

In [11]:
# Run the query over the open connection and preview the first rows

df = pd.read_sql(sql=query_tables, con=conn1)
df.head(5)

Unnamed: 0,id,gender,age,region_code,policy_sales_channel,driving_license,vehicle_age,vehicle_damage,previously_insured,annual_premium,vintage,response
0,7,Male,23,11.0,152.0,1,< 1 Year,Yes,0,23367.0,249,0
1,13,Female,41,15.0,14.0,1,1-2 Year,No,1,31409.0,221,0
2,18,Female,25,35.0,152.0,1,< 1 Year,No,1,46622.0,299,0
3,31,Female,26,8.0,160.0,1,< 1 Year,No,0,2630.0,136,0
4,39,Male,45,8.0,124.0,1,1-2 Year,Yes,0,42297.0,264,0


In [None]:
# Split the data frame into train, test and validation sets

In [23]:
# Target is the `response` column; every other column is a feature
y = df['response']
X = df.drop(columns=['response'])

In [24]:
# Hold out 20% of the rows as the test set (fixed seed for reproducibility)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [25]:
# Carve a validation set (20%) out of the training rows.
# Note: this overwrites X_train / y_train, so re-running this cell on its own
# shrinks the training set again; re-run the train/test split first.

X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# 1.0 Data Description

In [27]:
# Work on an independent copy so the training features stay untouched
df1 = X_train.copy(deep=True)

## 1.1 Rename Columns

In [28]:
# List the current column names (no renaming is performed in this cell)
df1.columns

Index(['id', 'gender', 'age', 'region_code', 'policy_sales_channel',
       'driving_license', 'vehicle_age', 'vehicle_damage',
       'previously_insured', 'annual_premium', 'vintage'],
      dtype='object')

## 1.2 Data Dimension

In [29]:
# (number of rows, number of columns) of the training copy
df1.shape

(243909, 11)

## 1.3 Data Types

In [30]:
# Per-column dtype and non-null count, plus overall memory usage
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 243909 entries, 217927 to 169494
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    243909 non-null  int64  
 1   gender                243909 non-null  object 
 2   age                   243909 non-null  int64  
 3   region_code           243909 non-null  float64
 4   policy_sales_channel  243909 non-null  float64
 5   driving_license       243909 non-null  int64  
 6   vehicle_age           243909 non-null  object 
 7   vehicle_damage        243909 non-null  object 
 8   previously_insured    243909 non-null  int64  
 9   annual_premium        243909 non-null  float64
 10  vintage               243909 non-null  int64  
dtypes: float64(3), int64(5), object(3)
memory usage: 22.3+ MB


## 1.4 Check NA

In [31]:
# Number of missing values in each column
df1.isna().sum()

id                      0
gender                  0
age                     0
region_code             0
policy_sales_channel    0
driving_license         0
vehicle_age             0
vehicle_damage          0
previously_insured      0
annual_premium          0
vintage                 0
dtype: int64

## 1.5 Change Types

In [32]:
# Column dtypes before the casts applied in the next cell
df1.dtypes

id                        int64
gender                   object
age                       int64
region_code             float64
policy_sales_channel    float64
driving_license           int64
vehicle_age              object
vehicle_damage           object
previously_insured        int64
annual_premium          float64
vintage                   int64
dtype: object

In [33]:
# region_code and policy_sales_channel are stored as floats; cast both to int
for column in ('region_code', 'policy_sales_channel'):
    df1[column] = df1[column].astype(int)

## 1.6 Descriptive Statistics

In [34]:
# Split the columns into numerical and categorical attributes.
# Each dtype must be its own list element; a single shared list is used for
# both calls so the two selections are exact complements of each other.
numeric_dtypes = ['int32', 'int64', 'float']

num_attributes = df1.select_dtypes( include = numeric_dtypes )
cat_attributes = df1.select_dtypes( exclude = numeric_dtypes )

### 1.6.1 Numerical Attributes

In [None]:
# Central tendency metrics

