# ETL

## LOAD DATA
---

### Install the required packages

In [16]:
# Uncomment the line below to install the packages listed in requirements.txt.
# NOTE: `pip install ...` is an IPython command (the `%pip` magic), not Python
# code, so it is expected to work only inside a notebook/IPython session.
#pip install -r requirements.txt

### Import and read the csv file

In [7]:
import pandas as pd

# Load the insurance data set from the working directory into a DataFrame.
raw = pd.read_csv(filepath_or_buffer="insurance.csv")

## TRANSFORM DATA
--- 

### Rename the English column names to Portuguese

In [8]:
# Translate the column names to Portuguese. The renamed frame is assigned back
# to `raw` instead of being mutated with inplace=True, which keeps the step a
# plain expression that can be chained and re-run safely. Columns that are not
# listed in the mapping keep their original names.
raw = raw.rename(
    columns={
        "age": "idade",
        "sex": "genero",
        "bmi": "imc",
        "children": "filhos",
        "smoker": "fumante",
        "region": "regiao",
        "charges": "encargos",
    }
)

### Convert the column values to Portuguese

In [9]:
def region_replace(value):
    """Translate an English region name into Portuguese.

    Args:
        value: The region label to translate.

    Returns:
        The Portuguese label when `value` is one of the four known English
        region names; otherwise `value` itself, unchanged.
    """
    translations = {
        "southwest": "sudoeste",
        "southeast": "sudeste",
        "northwest": "noroeste",
        "northeast": "nordeste",
    }
    if value in translations:
        return translations[value]
    # Unknown labels (including already-translated ones) pass through as-is.
    return value


# Lookup tables for the categorical values. Using explicit mappings with a
# pass-through default (instead of an if/else on a single English value) means:
#   * re-running this cell is harmless: already-translated values are kept,
#     whereas an `... if value == "male" else "feminino"` test would turn every
#     row into "feminino" on the second run;
#   * unexpected or missing values are left visible instead of being silently
#     labelled "feminino" / "não".
GENERO_PT = {"male": "masculino", "female": "feminino"}
FUMANTE_PT = {"yes": "sim", "no": "não"}

raw["regiao"] = raw["regiao"].apply(region_replace)
raw["genero"] = raw["genero"].apply(lambda value: GENERO_PT.get(value, value))
raw["fumante"] = raw["fumante"].apply(lambda value: FUMANTE_PT.get(value, value))

# Display the translated frame as the cell output.
raw

Unnamed: 0,idade,genero,imc,filhos,fumante,regiao,encargos
0,19,feminino,27.900,0,sim,sudoeste,16884.92400
1,18,masculino,33.770,1,não,sudeste,1725.55230
2,28,masculino,33.000,3,não,sudeste,4449.46200
3,33,masculino,22.705,0,não,noroeste,21984.47061
4,32,masculino,28.880,0,não,noroeste,3866.85520
...,...,...,...,...,...,...,...
1333,50,masculino,30.970,3,não,noroeste,10600.54830
1334,18,feminino,31.920,0,não,nordeste,2205.98080
1335,18,feminino,36.850,0,não,sudeste,1629.83350
1336,21,feminino,25.800,0,não,sudoeste,2007.94500


# Data Display
---

We need to show a high-level (macro) view of the data set and its main characteristics, such as column types and values, using statistics and graphs.

### Exploratory Data Analysis (EDA)

#### Dataset Information

In [10]:
# Print the index range, column dtypes, non-null counts and memory usage.
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   idade     1338 non-null   int64  
 1   genero    1338 non-null   object 
 2   imc       1338 non-null   float64
 3   filhos    1338 non-null   int64  
 4   fumante   1338 non-null   object 
 5   regiao    1338 non-null   object 
 6   encargos  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


#### Dataset Descriptive Statistics

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
raw.describe()

Unnamed: 0,idade,imc,filhos,encargos
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


#### Dataset Null Values

In [12]:
# Count the missing values in each column.
raw.isna().sum()

idade       0
genero      0
imc         0
filhos      0
fumante     0
regiao      0
encargos    0
dtype: int64

### Distribuição de idades