#### **IMPORTAR BIBLIOTECAS**

In [1]:
from pyspark.sql.functions import *
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

StatementMeta(, ce2805eb-b690-47be-a836-69f28df96049, 3, Finished, Available, Finished)

#### **LEITURA DOS DADOS**

In [2]:
# Columns kept for modelling: the predictors plus the target
# ('Nivel_Obesidade', last entry).
colunas = [
    'Genero', 'Idade', 'Peso', 'Historico_Familiar', 'FAVC', 'CAEC',
    'SCC', 'FAF', 'CALC', 'MTRANS', 'Nivel_Obesidade',
]

# Query the `obesity` table through Spark, keep only the columns above and
# collect the result into a pandas DataFrame for use with scikit-learn.
# `spark` and `display` are not imported in this notebook; they are expected
# to be supplied by the notebook runtime.
df = spark.sql("SELECT* FROM obesity").select(*colunas).toPandas()

display(df)

StatementMeta(, ce2805eb-b690-47be-a836-69f28df96049, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 6b8061c0-cfe9-456a-b459-332821f6ca55)

In [3]:
# Feature columns treated as categorical; they are one-hot encoded before
# reaching the model. Every other feature column is passed through untouched
# (remainder='passthrough').
cat_cols = ['Genero', 'Historico_Familiar', 'FAVC', 'CAEC', 'SCC', 'CALC', 'MTRANS']

# drop='first' removes one indicator column per feature to avoid redundant
# (perfectly collinear) columns.
# handle_unknown='ignore' keeps predict() from raising ValueError when a row
# carries a category level that was not present in the data the encoder was
# fitted on (the train/test split is stratified on the target only, so a rare
# level of a feature can end up exclusively in the test fold). Such a row is
# encoded as all zeros for that feature, i.e. like the dropped first level.
# NOTE: combining `drop` with handle_unknown='ignore' needs scikit-learn >= 1.0.
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_cols)
], remainder='passthrough')

StatementMeta(, ce2805eb-b690-47be-a836-69f28df96049, 5, Finished, Available, Finished)

#### **SEPARAÇÃO DAS VARIÁVEIS**

In [4]:
# 'Nivel_Obesidade' is the prediction target; all remaining columns of `df`
# are used as features.
y = df['Nivel_Obesidade']
X = df.drop(columns='Nivel_Obesidade')

StatementMeta(, ce2805eb-b690-47be-a836-69f28df96049, 6, Finished, Available, Finished)

### **TREINAMENTO DO MODELO**

In [5]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions equal in both partitions; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

StatementMeta(, ce2805eb-b690-47be-a836-69f28df96049, 7, Finished, Available, Finished)

In [6]:
# Chain the column preprocessing and the classifier so that both are fitted
# on the training partition only and applied consistently at predict time.
etapas = [
    ('preprocessamento', preprocessor),
    ('modelo', RandomForestClassifier(random_state=42)),  # seeded for repeatable results
]
pipeline = Pipeline(etapas)

pipeline.fit(X_train, y_train)

StatementMeta(, ce2805eb-b690-47be-a836-69f28df96049, 8, Finished, Available, Finished)

#### **ACURÁCIA DO MODELO**

In [7]:
# Score the fitted pipeline on the held-out partition.
y_pred = pipeline.predict(X_test)

acuracia = accuracy_score(y_test, y_pred)
relatorio = classification_report(y_test, y_pred)  # per-class precision / recall / f1

print("Acurácia:", acuracia)
print(relatorio)

StatementMeta(, ce2805eb-b690-47be-a836-69f28df96049, 9, Finished, Available, Finished)

Acurácia: 0.9196217494089834
                    precision    recall  f1-score   support

  Obesidade Grau I       0.96      0.96      0.96        70
 Obesidade Grau II       0.98      1.00      0.99        60
Obesidade Grau III       1.00      1.00      1.00        65
 Peso Insuficiente       0.88      0.93      0.90        54
       Peso Normal       0.81      0.81      0.81        58
 Sobrepeso Nível I       0.89      0.86      0.88        58
Sobrepeso Nível II       0.89      0.86      0.88        58

          accuracy                           0.92       423
         macro avg       0.92      0.92      0.92       423
      weighted avg       0.92      0.92      0.92       423



### **TESTES DO MODELO**

In [8]:
# Pick one fixed row of the test partition by position. The double brackets
# keep `amostra` a one-row DataFrame, which is what predict() expects.
posicao = 76
amostra = X_test.iloc[[posicao]]
real = y_test.iloc[posicao]

# Predict the class of that single row.
predito = pipeline.predict(amostra)

print(f"Valor real: {real}")
print(f"Predição:   {predito[0]}")

StatementMeta(, ce2805eb-b690-47be-a836-69f28df96049, 10, Finished, Available, Finished)

Valor real: Obesidade Grau I
Predição:   Obesidade Grau I


In [9]:
# Draw one row from X_test; the seed makes the draw repeatable.
amostra = X_test.sample(n=1, random_state=79)

# Look up the true label through the row's index label, since X_test and
# y_test share the same index.
indice = amostra.index[0]
real = y_test.loc[indice]

# Predict the class of that single row.
predito = pipeline.predict(amostra)

print(f"Valor real: {real}")
print(f"Predição:   {predito[0]}")

StatementMeta(, ce2805eb-b690-47be-a836-69f28df96049, 11, Finished, Available, Finished)

Valor real: Peso Insuficiente
Predição:   Peso Insuficiente


#### **ENCERRAR SESSÃO**

In [None]:
# End the notebook session once the analysis is done.
# `mssparkutils` is not imported in this notebook; it is expected to be
# supplied by the notebook runtime. Keep this as the last cell: presumably
# nothing can run in the same session after this call (verify against the
# runtime docs).
mssparkutils.session.stop()