In [91]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import chi2_contingency

# Switch seaborn/matplotlib figures to the "whitegrid" theme.
# NOTE(review): every chart in the cells visible here is built with plotly, so this
# setting presumably has no visible effect on them; confirm it is still needed.
sns.set_style("whitegrid")

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import root_mean_squared_error, r2_score

In [22]:
# Load the dataset; the path is relative to the notebook's working directory.
df_costs = pd.read_csv('./datasets/health.csv')

In [19]:
# Column names, dtypes and non-null counts.
df_costs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              1338 non-null   int64  
 1   sex              1338 non-null   object 
 2   bmi              1338 non-null   float64
 3   children         1338 non-null   int64  
 4   smoker           1338 non-null   object 
 5   region           1338 non-null   object 
 6   medical charges  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [5]:
# (rows, columns) of the loaded frame.
df_costs.shape

(1338, 7)

In [6]:
# Preview the first rows of the frame.
df_costs.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,medical charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [7]:
# Number of missing values in each column.
df_costs.isnull().sum()

age                0
sex                0
bmi                0
children           0
smoker             0
region             0
medical charges    0
dtype: int64

In [8]:
# Print the name of every text (object) column that holds exactly one distinct value;
# nothing is printed when no such column exists.
object_cardinality = df_costs.select_dtypes(include=['object']).nunique()
for constant_name in object_cardinality[object_cardinality == 1].index:
    print(constant_name)

In [10]:
# List the distinct values found in each text (object) column.
for name, values in df_costs.select_dtypes(include=['object']).items():
    print(f"A coluna {name} possui os seguintes valores unicos: {values.unique()}")

A coluna sex possui os seguintes valores unicos: ['female' 'male']
A coluna smoker possui os seguintes valores unicos: ['yes' 'no']
A coluna region possui os seguintes valores unicos: ['southwest' 'southeast' 'northwest' 'northeast']


In [11]:
# Print the name of every numeric column that holds exactly one distinct value;
# nothing is printed when no such column exists.
numeric_cardinality = df_costs.select_dtypes(include=['number']).nunique()
for constant_name in numeric_cardinality[numeric_cardinality == 1].index:
    print(constant_name)

In [12]:
# List the distinct values found in each numeric column.
for name, values in df_costs.select_dtypes(include=['number']).items():
    print(f"A coluna {name} possui os seguintes valores unicos: {values.unique()}")

A coluna age possui os seguintes valores unicos: [19 18 28 33 32 31 46 37 60 25 62 23 56 27 52 30 34 59 63 55 22 26 35 24
 41 38 36 21 48 40 58 53 43 64 20 61 44 57 29 45 54 49 47 51 42 50 39]
A coluna bmi possui os seguintes valores unicos: [27.9   33.77  33.    22.705 28.88  25.74  33.44  27.74  29.83  25.84
 26.22  26.29  34.4   39.82  42.13  24.6   30.78  23.845 40.3   35.3
 36.005 32.4   34.1   31.92  28.025 27.72  23.085 32.775 17.385 36.3
 35.6   26.315 28.6   28.31  36.4   20.425 32.965 20.8   36.67  39.9
 26.6   36.63  21.78  30.8   37.05  37.3   38.665 34.77  24.53  35.2
 35.625 33.63  28.    34.43  28.69  36.955 31.825 31.68  22.88  37.335
 27.36  33.66  24.7   25.935 22.42  28.9   39.1   36.19  23.98  24.75
 28.5   28.1   32.01  27.4   34.01  29.59  35.53  39.805 26.885 38.285
 37.62  41.23  34.8   22.895 31.16  27.2   26.98  39.49  24.795 31.3
 38.28  19.95  19.3   31.6   25.46  30.115 29.92  27.5   28.4   30.875
 27.94  35.09  29.7   35.72  32.205 28.595 49.06  27.17  23.

In [13]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df_costs.describe()

Unnamed: 0,age,bmi,children,medical charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [24]:
# Encode the smoker column as 1 (smoker) / 0 (anything else).
# Both the raw label 'yes' and an already-encoded 1 map to 1, so re-running this cell
# is harmless; the former `x == 'yes'` test turned every value into 0 on a second run.
df_costs['smoker'] = df_costs['smoker'].isin(['yes', 1]).astype('int64')

In [27]:
# Frequency of each distinct value in the smoker column.
df_costs['smoker'].value_counts()

smoker
0    1064
1     274
Name: count, dtype: int64

In [28]:
# Histogram of the age column (30 bins).
fig = px.histogram(
    data_frame=df_costs,
    x='age',
    nbins=30,
    title='Distribuição de idade',
)
fig.show()

In [38]:
# Histogram of the medical charges column (30 bins).
fig = px.histogram(
    data_frame=df_costs,
    x='medical charges',
    nbins=30,
    title='Distribuição de gastos com saúde',
)
fig.show()

In [37]:
# Histogram of the children column (plotly picks the bin count).
fig = px.histogram(
    data_frame=df_costs,
    x='children',
    title='Distribuição de quantidade de filhos',
)
fig.show()

In [36]:
# Histogram of the bmi column (30 bins).
fig = px.histogram(
    data_frame=df_costs,
    x='bmi',
    nbins=30,
    title='Distribuição de BMI',
)
fig.show()

In [58]:
# Mean medical charges for every (age, sex) pair.
grouped_data = df_costs.groupby(['age', 'sex'])['medical charges'].mean().reset_index()

# barmode='group' places the bars for each sex side by side. Plotly's default stacks
# them, so the bar height would be the sum of two averages, which has no meaning.
fig = px.bar(grouped_data, x='age', y='medical charges', color='sex', barmode='group')
fig.show()

In [59]:
# Mean medical charges for every (region, sex) pair.
grouped_data = df_costs.groupby(['region', 'sex'])['medical charges'].mean().reset_index()

# barmode='group' places the bars for each sex side by side. Plotly's default stacks
# them, so the bar height would be the sum of two averages, which has no meaning.
fig = px.bar(grouped_data, x='region', y='medical charges', color='sex', barmode='group')
fig.show()

In [62]:
# Mean medical charges per value of the smoker column, drawn as one bar per value.
grouped_data = df_costs.groupby('smoker', as_index=False)['medical charges'].mean()
fig = px.bar(
    data_frame=grouped_data,
    x='smoker',
    y='medical charges',
)
fig.show()

In [63]:
# Spread of medical charges at each age.
fig = px.box(data_frame=df_costs, y='medical charges', x='age')
fig.show()

In [64]:
# Spread of medical charges for each sex.
fig = px.box(data_frame=df_costs, y='medical charges', x='sex')
fig.show()

In [65]:
# Spread of medical charges for each region.
fig = px.box(data_frame=df_costs, y='medical charges', x='region')
fig.show()

In [76]:
# Spread of medical charges for each value of the smoker column.
fig = px.box(data_frame=df_costs, y='medical charges', x='smoker')
fig.show()

In [74]:
# Medical charges (x) against age (y), one colour per sex.
fig = px.scatter(data_frame=df_costs, x="medical charges", y="age", color="sex")
fig.show()

In [68]:
# Medical charges (x) against bmi (y), one colour per sex.
fig = px.scatter(data_frame=df_costs, x="medical charges", y="bmi", color="sex")
fig.show()

In [75]:
# Medical charges (x) against bmi (y), coloured by the smoker column.
# The column is cast to str on a temporary copy so plotly treats it as a category and
# draws one discrete colour per value; a numeric 0/1 column would get a continuous
# colour bar, which is misleading for a binary variable. df_costs itself is unchanged.
fig = px.scatter(
    df_costs.assign(smoker=df_costs["smoker"].astype(str)),
    y="bmi",
    x="medical charges",
    color="smoker",
)

fig.show()

In [72]:
# Medical charges (x) against number of children (y), one colour per sex.
fig = px.scatter(data_frame=df_costs, x="medical charges", y="children", color="sex")
fig.show()

In [137]:
# Medical charges (x) against the smoker column (y).
fig = px.scatter(data_frame=df_costs, x="medical charges", y="smoker")
fig.show()

In [87]:
# Correlation-matrix heatmap of the numeric columns.
corr_matrix = df_costs.select_dtypes(include=['number']).corr()

fig = go.Figure()

fig.add_trace(
  go.Heatmap(
    x = corr_matrix.columns,
    y = corr_matrix.index,
    z = corr_matrix.values,
    text = corr_matrix.values,
    texttemplate='%{text:.2f}',
    colorscale='Blues',
    # Correlation coefficients lie in [-1, 1]. The colour range must match those
    # bounds: the previous zmax=2 left the top of the scale unused and washed out
    # every strong correlation.
    zmin=-1,
    zmax=1
  )
)

fig.show()

In [88]:
# Target is the medical charges column; the features are every remaining column.
y = df_costs['medical charges']
X = df_costs.drop('medical charges', axis=1)

In [89]:
# Split the feature names by dtype so each group gets its own transformation.
numeric_features = X.select_dtypes(include=['number']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Numeric columns are standardised; text columns are one-hot encoded, and categories
# unseen during fit are ignored at transform time instead of raising.
scaling_step = ('num', StandardScaler(), numeric_features)
encoding_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

preprocessor = ColumnTransformer(transformers=[scaling_step, encoding_step])

In [90]:
# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=51,
)

# Learn the preprocessing parameters from the training rows only, then apply the
# fitted preprocessor to both splits. The test set is only transformed, never fitted.
preprocessor.fit(X_train)
X_train_tranformed = preprocessor.transform(X_train)
X_test_tranformed = preprocessor.transform(X_test)

In [144]:
# Bagging ensemble of 100 linear regressions, seeded for reproducibility.
base_learner = LinearRegression()
bagging_model = BaggingRegressor(estimator=base_learner, n_estimators=100, random_state=51)

In [145]:
# Train the ensemble on the preprocessed training features.
bagging_model.fit(X_train_tranformed, y_train)

In [146]:
# Predict charges for the held-out, preprocessed test features.
y_pred = bagging_model.predict(X_test_tranformed)

In [147]:
# Score the test-set predictions: root mean squared error and R².
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [148]:
# Root mean squared error on the test split, in the units of the target column.
rmse

np.float64(6620.278582909106)

In [149]:
# R² score on the test split.
r2

0.7479443215970111