# Projeto de Estatística II
## Parte II - Regressão Logística e Hipóteses mais complexas

### Alexsander Vieira

Importando as bibliotecas do projeto.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression


In [None]:
# Load the dataset from the local CSV file into a DataFrame.
df = pd.read_csv('diabetes.csv')

# Preview the first rows; as the last expression of the cell it is rendered
# by the notebook.
df.head()

In [None]:
# Print a structural summary of the DataFrame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Show descriptive statistics for the DataFrame's columns.
df.describe()

In [None]:
# Class balance of the target column.
# `discrete=True` draws one unit-wide bar centred on each integer label
# instead of splitting the 0..1 range into continuous bins.
# NOTE(review): assumes 'Outcome' holds integer class labels (0/1) — confirm.
sns.histplot(data=df, x="Outcome", discrete=True)

In [None]:
# Keep only the numeric columns for modelling.
df_model = df.select_dtypes(include=np.number)

# Missing values per column. Printed explicitly: a bare expression in the
# middle of a notebook cell is evaluated but never displayed.
print(df_model.isnull().sum())

# Features: every numeric column except the target.
X = df_model.drop(columns='Outcome')
# Target: the 'Outcome' column.
y = df_model['Outcome']


In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Logistic regression with library-default hyperparameters.
# NOTE(review): the features are fed unscaled and max_iter is left at its
# default — check the output for a convergence warning.
logit = LogisticRegression()

# Fit on the training split; left as the last expression so the notebook
# displays the fitted estimator.
logit.fit(X_train, y_train)

In [None]:
# Fitted intercept of the logistic model.
logit.intercept_

In [None]:
# Feature names in the column order used to fit the model, for reading
# alongside logit.coef_.
X_train.columns.tolist()

In [None]:
# Fitted coefficients of the logistic model.
logit.coef_

In [None]:
# Inspect the held-out test features.
X_test

In [None]:
# Inspect the held-out test labels.
y_test

In [None]:
# Hard class predictions for the test split, using the estimator's built-in
# decision rule (no custom cutoff).
y_predict = logit.predict(X_test)

In [None]:
# First 50 predicted labels, for a quick visual comparison with the true ones.
y_predict[:50]

In [None]:
# First 50 true test labels.
y_test[:50]

In [None]:
# Inspect the fitted parameters.
print("Coeficientes:\n", logit.coef_)
print("\nIntercept:", logit.intercept_)

# Class labels known to the model.
classes = logit.classes_
print("\nClasses:", classes)

# Predicted class probabilities for the test split.
probs = logit.predict_proba(X_test)
print("\nProbabilidades:", probs)

# Probability of belonging to class 1.
# NOTE(review): takes the second column, i.e. assumes classes_ is [0, 1] —
# confirm against the "Classes" printout above.
probs_1 = probs[:, 1]
print("\nProbabilidades Classe 1:", probs_1)

# Model evaluation over a sweep of decision cutoffs.
threshold_list = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]

print("\n###################################################\n")
print("Avaliação de modelos com diferentes valores de cutoff")
print("\n###################################################\n")

for threshold in threshold_list:

    print("\n Cutoff:", threshold)

    # Label a sample as 1 when its class-1 probability reaches the cutoff.
    y_pred = np.where(probs_1 >= threshold, 1, 0)

    print("\nMatriz de confusão do modelo nos dados de teste:")
    print(confusion_matrix(y_test, y_pred))

    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, cmap="viridis")
    plt.show()

    print("\nClassification report do modelo nos dados de teste:")
    # At extreme cutoffs a class may receive no predictions, which leaves its
    # precision undefined. zero_division=0 reports 0 for that case explicitly
    # instead of relying on the warning default.
    print(classification_report(y_test, y_pred, zero_division=0))

    print("\n##########################################################\n")

## PRIMEIRAS RESPOSTAS E CONCLUSÕES:

### 1 - O cutoff de 0.6 é o que retorna a maior acurácia = 0.76;
### 2 - O cutoff de 0.5 é o que retorna o mesmo valor de precision e recall;
### 3 - Os valores são de 0.8 para o outcome 0 e 0.62 para o outcome 1.

In [None]:
# Expand the features into a degree-2 polynomial space (no constant column)
# to give the model a more complex hypothesis, then fit an ordinary linear
# regression on the expanded features.
# NOTE(review): X_train / X_test are rebound to their polynomial expansions,
# so running this cell a second time would expand the already-expanded
# matrices — re-run the train/test split cell first.
# NOTE(review): the target is the class label also used by the logistic model;
# LinearRegression treats it as a continuous value — confirm this is intended.

pf = PolynomialFeatures(degree=2, include_bias=False)

# Learn the expansion on the training split only, then apply it to both splits.
X_train = pf.fit_transform(X_train)
X_test = pf.transform(X_test)

print(f"Número original de features: {pf.n_features_in_}")
print(f"Número de features no espaço transformado: {pf.n_output_features_}\n\n")

# Fit the linear model on the expanded training features.
reg_lin = LinearRegression()
reg_lin.fit(X_train, y_train)

print(f"Intercepto (b0): {reg_lin.intercept_}")
print(f"Demais parâmetros (b1, ..., bn): {reg_lin.coef_}")

# Predictions for both splits.
y_pred_train = reg_lin.predict(X_train)
y_pred_test = reg_lin.predict(X_test)

# Same three regression metrics for each split: train first, then test.
for header, y_true, y_hat in (
    ("\nMétricas de treino:\n", y_train, y_pred_train),
    ("\nMétricas de teste:\n", y_test, y_pred_test),
):
    print(header)
    print(f"R^2: {r2_score(y_true, y_hat):.2f}")
    print(f"MAE: {mean_absolute_error(y_true, y_hat):.2f}")
    print(f"RMSE: {np.sqrt(mean_squared_error(y_true, y_hat)):.2f}")