In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# Load the medical insurance CSV from the Kaggle input directory and preview the first rows
df = pd.read_csv('/kaggle/input/medical-insurance-price-prediction/Medical_insurance.csv')
df.head()

In [None]:
# Start of the exploratory analysis: column names, dtypes and non-null counts
df.info()

In [None]:
# Summary statistics for every column (include='all' covers non-numeric columns too)
df.describe(include='all')

In [None]:
# Collect the names of all non-numeric columns and show the distinct
# values found in each one.
cat_features = list(df.select_dtypes(exclude=[np.number]).columns)
for column in cat_features:
    print('\n', 'Modalidades da variável', column, '\n', df[column].unique())

In [None]:
# Histogram of the target variable, drawn on an explicitly created axes
fig, ax = plt.subplots()
ax.set_title('Distribution of Charges Variable')
df['charges'].hist(bins=40, ax=ax)

In [None]:
# Scatter every predictor against `charges` on a 2x3 grid, one panel per
# column, with points coloured by smoker status.
color = df['smoker'].map({'yes': '#084887', 'no': '#F58A07'})

# Select the predictors by name instead of by position, so the target is
# never plotted against itself if the column order changes. Capped at six
# columns because the grid has six panels.
# NOTE(review): assumes `charges` is the only non-predictor column — confirm.
feature_columns = [col for col in df.columns if col != 'charges'][:6]

fig, axs = plt.subplots(2, 3, figsize=(20, 5))
panels = axs.ravel()

# The original guard `not (r == 2 and c == 2)` could never be false
# (r only takes the values 0 and 1), so every panel was always drawn;
# pairing panels with columns directly makes that explicit.
for ax, column in zip(panels, feature_columns):
    ax.scatter(df[column], df['charges'], c=color)
    ax.set_xlabel(column)
    ax.set_ylabel('charges')

# Hide any panel that has no column to show (fewer than six predictors)
for ax in panels[len(feature_columns):]:
    ax.set_visible(False)

# Keep the axis labels of the two rows from overlapping
fig.tight_layout()

In [None]:
# Start of the training pipeline
# Column groups used by the preprocessing steps

numerical_features = ['age', 'bmi', 'children']
categorical_features = ['sex', 'smoker', 'region']

# Predictors (numerical first, then categorical) and the target column
features = numerical_features + categorical_features
target = ['charges']

# Train/test separation: `target` is a list, so y is a one-column DataFrame
X = df[features]
y = df[target]

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Numerical columns: fill missing values with the column mean, then standardise
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then encode with category_encoders' TargetEncoder
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', TargetEncoder()),
])

# Route each column group to its own preprocessing pipeline
preprocessor = ColumnTransformer(transformers=[
    ('numeric', numeric_transformer, numerical_features),
    ('categorical', categorical_transformer, categorical_features),
])

# Full pipeline: preprocessing followed by a decision-tree regressor
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor(random_state=0)),
])

# Give the training rows a fresh 0..n-1 index before fitting
# NOTE(review): presumably needed so X and y stay aligned inside the
# target encoder — confirm against the category_encoders version in use.
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

# Fit the preprocessing steps and the model on the training data
pipeline.fit(X_train, y_train)

In [None]:
# Predict on the held-out test set with the fitted pipeline
y_pred = pipeline.predict(X_test)

In [None]:
# Evaluate the predictions on the test set and print each metric
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f'R2: {r2} --> Quantos % a predição representa o resultado real')
print(f'MAE: {mae} --> Erro médio absoluto')
print(f'MSE: {mse} --> Erro médio quadrático')