O desafio é criar 3 modelos diferentes para o mesmo conjunto de dados e organizar, comparar e selecionar os resultados mais apropriados. ("*ensemble* manual")

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.decomposition import PCA

In [None]:
def metricas(y_test, y_pred):
    """Return accuracy, precision and recall for a set of predictions.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, in the same order as ``y_test``.

    Returns:
        A three-element list ``[accuracy, precision, recall]`` computed with
        ``accuracy_score``, ``precision_score`` and ``recall_score`` using
        their default arguments.
    """
    funcoes = (accuracy_score, precision_score, recall_score)
    return [funcao(y_test, y_pred) for funcao in funcoes]

In [None]:
# Load the dataset from its CSV file and preview the first rows.
caminho_csv = '/content/dataR2.csv'
df = pd.read_csv(caminho_csv)
df.head()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
0,48,23.5,70,2.707,0.467409,8.8071,9.7024,7.99585,417.114,1
1,83,20.690495,92,3.115,0.706897,8.8438,5.429285,4.06405,468.786,1
2,82,23.12467,91,4.498,1.009651,17.9393,22.43204,9.27715,554.697,1
3,68,21.367521,77,3.226,0.612725,9.8827,7.16956,12.766,928.22,1
4,86,21.111111,92,3.549,0.805386,6.6994,4.81924,10.57635,773.92,1


In [None]:
# Features are every column except the target.
X = df.drop(columns='Classification')

# Recode the target from {1, 2} to {0, 1}. Original label 2 becomes 1, which
# is the class precision_score / recall_score treat as positive by default.
y = df['Classification'].map({1: 0, 2: 1})

In [None]:
# Split first, then standardise. Some of the models below are distance-based
# and need features on a comparable scale, but the scaler must learn its
# mean/std from the training rows only, otherwise test-set statistics leak
# into training.
# All classifiers are used with their default settings (only the seed is set).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10, test_size=0.3)

scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

# Later cells split X_std again with the same seed and test size, which yields
# the same train/test rows. Scaling every row with the training statistics
# therefore keeps X_std available without using test-set statistics.
X_std = pd.DataFrame(scaler.transform(X), columns=X.columns)

rf = RandomForestClassifier(random_state=10).fit(X_train, y_train)
sv = SVC(random_state=10).fit(X_train, y_train)
knn = KNeighborsClassifier().fit(X_train, y_train)
dt = DecisionTreeClassifier(random_state=10).fit(X_train, y_train)
reg_log = LogisticRegression(random_state=10).fit(X_train, y_train)

# Test-set predictions of each classifier, used for the manual ensemble.
y_rf = rf.predict(X_test)
y_sv = sv.predict(X_test)
y_knn = knn.predict(X_test)
y_dt = dt.predict(X_test)
y_reg_log = reg_log.predict(X_test)

In [None]:
# One column per classifier, one row per test sample.
df_resultados = pd.DataFrame({
    "y_rf": y_rf,
    "y_sv": y_sv,
    "y_knn": y_knn,
    "y_dt": y_dt,
    "y_reg_log": y_reg_log,
})

In [None]:
# Preview the first three rows of the per-model test predictions.
df_resultados.head(3)

Unnamed: 0,y_rf,y_sv,y_knn,y_dt,y_reg_log
0,0,0,0,1,0
1,1,1,1,1,1
2,1,1,1,1,1


In [None]:
# Majority vote across the five classifiers. Only the model columns take part
# in the vote, so re-running this cell never counts an existing 'votação'
# column as an extra voter. DataFrame.mode(axis=1) returns a DataFrame with
# one column per tied mode; column 0 holds the first (smallest) mode of each
# row, so it is selected explicitly instead of relying on the result having a
# single column.
colunas_modelos = ["y_rf", "y_sv", "y_knn", "y_dt", "y_reg_log"]
df_resultados['votação'] = df_resultados[colunas_modelos].mode(axis=1)[0]

In [None]:
# Preview the predictions again, now including the 'votação' (vote) column.
df_resultados.head(3)

Unnamed: 0,y_rf,y_sv,y_knn,y_dt,y_reg_log,votação
0,0,0,0,1,0,0
1,1,1,1,1,1,1
2,1,1,1,1,1,1


In [None]:
# Accuracy / precision / recall of every classifier and of the voted ensemble,
# one column per predictor.
previsoes = {
    "y_rf": y_rf,
    "y_sv": y_sv,
    "y_knn": y_knn,
    "y_dt": y_dt,
    "y_reg_log": y_reg_log,
    "ensemble": df_resultados['votação'],
}
df_metricas = pd.DataFrame(
    {nome: metricas(y_test, previsto) for nome, previsto in previsoes.items()},
    index=['Acurácia', 'Precisão', "Sensitividade"],
)
df_metricas

Unnamed: 0,y_rf,y_sv,y_knn,y_dt,y_reg_log,ensemble
Acurácia,0.742857,0.857143,0.771429,0.485714,0.828571,0.8
Precisão,0.777778,0.85,0.789474,0.52381,0.842105,0.8
Sensitividade,0.736842,0.894737,0.789474,0.578947,0.842105,0.842105


# Seleção de Variáveis pela RandomForest

In [None]:
# Feature importances of the fitted random forest; print the feature
# positions ordered from most to least important.
importances = rf.feature_importances_
ordem_decrescente = np.argsort(importances)[::-1]
print(ordem_decrescente)

[2 4 7 0 1 3 5 6 8]


In [None]:
# Keep the column positions of the five most important features.
ranking = np.argsort(importances)[::-1]
ind_importance = ranking[:5]
print(f"Os indices mais importantes são: {ind_importance}")

Os indices mais importantes são: [2 4 7 0 1]


In [None]:
# Same split and default-configured classifiers as before, but trained only on
# the columns of X_std selected by the random-forest importance ranking.
X_selecionado = X_std.iloc[:, ind_importance]
X_train, X_test, y_train, y_test = train_test_split(
    X_selecionado, y, test_size=0.3, random_state=10
)

# Fit each classifier and predict the test set right away.
rf = RandomForestClassifier(random_state=10)
rf.fit(X_train, y_train)
y_rf = rf.predict(X_test)

sv = SVC(random_state=10)
sv.fit(X_train, y_train)
y_sv = sv.predict(X_test)

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_knn = knn.predict(X_test)

dt = DecisionTreeClassifier(random_state=10)
dt.fit(X_train, y_train)
y_dt = dt.predict(X_test)

reg_log = LogisticRegression(random_state=10)
reg_log.fit(X_train, y_train)
y_reg_log = reg_log.predict(X_test)

In [None]:
# One column per classifier, one row per test sample.
df_resultados = pd.DataFrame({
    "y_rf": y_rf,
    "y_sv": y_sv,
    "y_knn": y_knn,
    "y_dt": y_dt,
    "y_reg_log": y_reg_log,
})

In [None]:
# Preview the first three rows of the per-model test predictions.
df_resultados.head(3)

Unnamed: 0,y_rf,y_sv,y_knn,y_dt,y_reg_log
0,0,0,1,0,0
1,1,1,1,1,1
2,1,1,1,0,1


In [None]:
# Majority vote across the five classifiers. Only the model columns take part
# in the vote, so re-running this cell never counts an existing 'votação'
# column as an extra voter. DataFrame.mode(axis=1) returns a DataFrame with
# one column per tied mode; column 0 holds the first (smallest) mode of each
# row, so it is selected explicitly instead of relying on the result having a
# single column.
colunas_modelos = ["y_rf", "y_sv", "y_knn", "y_dt", "y_reg_log"]
df_resultados['votação'] = df_resultados[colunas_modelos].mode(axis=1)[0]

In [None]:
# Preview the predictions again, now including the 'votação' (vote) column.
df_resultados.head(3)

Unnamed: 0,y_rf,y_sv,y_knn,y_dt,y_reg_log,votação
0,0,0,1,0,0,0
1,1,1,1,1,1,1
2,1,1,1,0,1,1


In [None]:
# Accuracy / precision / recall of every classifier and of the voted ensemble,
# one column per predictor.
previsoes = {
    "y_rf": y_rf,
    "y_sv": y_sv,
    "y_knn": y_knn,
    "y_dt": y_dt,
    "y_reg_log": y_reg_log,
    "ensemble": df_resultados['votação'],
}
df_metricas = pd.DataFrame(
    {nome: metricas(y_test, previsto) for nome, previsto in previsoes.items()},
    index=['Acurácia', 'Precisão', "Sensitividade"],
)
df_metricas

Unnamed: 0,y_rf,y_sv,y_knn,y_dt,y_reg_log,ensemble
Acurácia,0.771429,0.885714,0.714286,0.771429,0.857143,0.857143
Precisão,0.789474,0.894737,0.764706,0.823529,0.85,0.888889
Sensitividade,0.789474,0.894737,0.684211,0.736842,0.894737,0.842105


# Utilizando o PCA

In [None]:
# PCA is applied to the already-standardised X_std. Fit with all 9 components
# and print the cumulative explained-variance ratio for the first 1..9
# components, to decide how many to keep.
# NOTE(review): at this point X_pca is the fitted PCA object, not transformed
# data; the name is reassigned to the projected data in a later cell.
X_pca = PCA(n_components=9)
X_pca.fit(X_std)
for n_componentes in range(1, 10):
    variancia_acumulada = sum(X_pca.explained_variance_ratio_[:n_componentes])
    print(f"{n_componentes}: {variancia_acumulada} \n")

1: 0.33983774273222583 

2: 0.5089712134580022 

3: 0.638698303213312 

4: 0.7615536484409763 

5: 0.8418361314885592 

6: 0.9148671503811057 

7: 0.9639283167853158 

8: 0.9964420866143773 

9: 1.0000000000000002 



In [None]:
# 7 is the smallest number of components whose cumulative explained variance
# exceeds 95% (about 96%, per the ratios printed above), so project onto 7.
# NOTE(review): the PCA is fitted on every row, including the rows that later
# end up in the test split.
pca_sete = PCA(n_components=7)
X_pca = pca_sete.fit_transform(X_std)

In [None]:
# Same split and default-configured classifiers as in the earlier sections,
# now trained on the PCA-projected data.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y, test_size=0.3, random_state=10
)

# Fit each classifier and predict the test set right away.
rf = RandomForestClassifier(random_state=10)
rf.fit(X_train, y_train)
y_rf = rf.predict(X_test)

sv = SVC(random_state=10)
sv.fit(X_train, y_train)
y_sv = sv.predict(X_test)

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_knn = knn.predict(X_test)

dt = DecisionTreeClassifier(random_state=10)
dt.fit(X_train, y_train)
y_dt = dt.predict(X_test)

reg_log = LogisticRegression(random_state=10)
reg_log.fit(X_train, y_train)
y_reg_log = reg_log.predict(X_test)

In [None]:
# One column per classifier, one row per test sample.
df_resultados = pd.DataFrame({
    "y_rf": y_rf,
    "y_sv": y_sv,
    "y_knn": y_knn,
    "y_dt": y_dt,
    "y_reg_log": y_reg_log,
})

In [None]:
# Preview the first three rows of the per-model test predictions.
df_resultados.head(3)

Unnamed: 0,y_rf,y_sv,y_knn,y_dt,y_reg_log
0,0,0,1,0,0
1,1,1,1,1,1
2,1,1,1,1,1


In [None]:
# Majority vote across the five classifiers. Only the model columns take part
# in the vote, so re-running this cell never counts an existing 'votação'
# column as an extra voter. DataFrame.mode(axis=1) returns a DataFrame with
# one column per tied mode; column 0 holds the first (smallest) mode of each
# row, so it is selected explicitly instead of relying on the result having a
# single column.
colunas_modelos = ["y_rf", "y_sv", "y_knn", "y_dt", "y_reg_log"]
df_resultados['votação'] = df_resultados[colunas_modelos].mode(axis=1)[0]

In [None]:
# Preview the predictions again, now including the 'votação' (vote) column.
df_resultados.head(3)

Unnamed: 0,y_rf,y_sv,y_knn,y_dt,y_reg_log,votação
0,0,0,1,0,0,0
1,1,1,1,1,1,1
2,1,1,1,1,1,1


In [None]:
# Accuracy / precision / recall of every classifier and of the voted ensemble,
# one column per predictor.
previsoes = {
    "y_rf": y_rf,
    "y_sv": y_sv,
    "y_knn": y_knn,
    "y_dt": y_dt,
    "y_reg_log": y_reg_log,
    "ensemble": df_resultados['votação'],
}
df_metricas = pd.DataFrame(
    {nome: metricas(y_test, previsto) for nome, previsto in previsoes.items()},
    index=['Acurácia', 'Precisão', "Sensitividade"],
)
df_metricas

Unnamed: 0,y_rf,y_sv,y_knn,y_dt,y_reg_log,ensemble
Acurácia,0.8,0.857143,0.685714,0.685714,0.857143,0.8
Precisão,0.8,0.85,0.7,0.7,0.85,0.8
Sensitividade,0.842105,0.894737,0.736842,0.736842,0.894737,0.842105


# PCA com características mais importantes

In [None]:
# Show the column positions of the five features selected earlier from the
# random-forest importances.
ind_importance

array([2, 4, 7, 0, 1])

In [None]:
# PCA is applied to the already-standardised X_std, restricted to the five
# selected features. Fit with all 5 components and print the cumulative
# explained-variance ratio for the first 1..5 components.
# NOTE(review): at this point X_pca is the fitted PCA object, not transformed
# data; the name is reassigned to the projected data in a later cell.
X_pca = PCA(n_components=5)
X_pca.fit(X_std.iloc[:, ind_importance])
for n_componentes in range(1, 6):
    variancia_acumulada = sum(X_pca.explained_variance_ratio_[:n_componentes])
    print(f"{n_componentes}: {variancia_acumulada} \n")

1: 0.39469560225306516 

2: 0.6107366734538953 

3: 0.7902699716902456 

4: 0.9416765932518152 

5: 0.9999999999999999 



In [None]:
# The first 4 components explain roughly 94% of the variance of the five
# selected features (per the cumulative ratios printed above), so keep 4.
# NOTE(review): the PCA is fitted on every row, including the rows that later
# end up in the test split.
pca_selecionadas = PCA(n_components=4)
X_pca = pca_selecionadas.fit_transform(X_std.iloc[:, ind_importance])

In [None]:
# Same split as in the earlier sections, now on the PCA projection of the
# selected features.
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, random_state = 10, test_size = 0.3)

# Seed the estimators exactly like the other sections (random_state=10) so the
# metrics of this experiment are reproducible and comparable with theirs;
# without a seed the random forest and decision tree change on every run.
rf = RandomForestClassifier(random_state=10).fit(X_train, y_train)
sv = SVC(random_state=10).fit(X_train, y_train)
knn = KNeighborsClassifier().fit(X_train, y_train)
dt = DecisionTreeClassifier(random_state=10).fit(X_train, y_train)
reg_log = LogisticRegression(random_state=10).fit(X_train, y_train)

# Test-set predictions of each classifier, used for the manual ensemble.
y_rf = rf.predict(X_test)
y_sv = sv.predict(X_test)
y_knn = knn.predict(X_test)
y_dt = dt.predict(X_test)
y_reg_log = reg_log.predict(X_test)

In [None]:
# One column per classifier, one row per test sample.
df_resultados = pd.DataFrame({
    "y_rf": y_rf,
    "y_sv": y_sv,
    "y_knn": y_knn,
    "y_dt": y_dt,
    "y_reg_log": y_reg_log,
})

In [None]:
# Preview the first three rows of the per-model test predictions.
df_resultados.head(3)

Unnamed: 0,y_rf,y_sv,y_knn,y_dt,y_reg_log
0,0,0,1,1,0
1,1,1,1,1,1
2,1,1,1,1,1


In [None]:
# Majority vote across the five classifiers. Only the model columns take part
# in the vote, so re-running this cell never counts an existing 'votação'
# column as an extra voter. DataFrame.mode(axis=1) returns a DataFrame with
# one column per tied mode; column 0 holds the first (smallest) mode of each
# row, so it is selected explicitly instead of relying on the result having a
# single column.
colunas_modelos = ["y_rf", "y_sv", "y_knn", "y_dt", "y_reg_log"]
df_resultados['votação'] = df_resultados[colunas_modelos].mode(axis=1)[0]

In [None]:
# Preview the predictions again, now including the 'votação' (vote) column.
df_resultados.head(3)

Unnamed: 0,y_rf,y_sv,y_knn,y_dt,y_reg_log,votação
0,0,0,1,1,0,0
1,1,1,1,1,1,1
2,1,1,1,1,1,1


In [None]:
# Accuracy / precision / recall of every classifier and of the voted ensemble,
# one column per predictor.
previsoes = {
    "y_rf": y_rf,
    "y_sv": y_sv,
    "y_knn": y_knn,
    "y_dt": y_dt,
    "y_reg_log": y_reg_log,
    "ensemble": df_resultados['votação'],
}
df_metricas = pd.DataFrame(
    {nome: metricas(y_test, previsto) for nome, previsto in previsoes.items()},
    index=['Acurácia', 'Precisão', "Sensitividade"],
)
df_metricas

Unnamed: 0,y_rf,y_sv,y_knn,y_dt,y_reg_log,ensemble
Acurácia,0.742857,0.857143,0.771429,0.742857,0.828571,0.828571
Precisão,0.727273,0.85,0.761905,0.777778,0.809524,0.842105
Sensitividade,0.842105,0.894737,0.842105,0.736842,0.894737,0.842105
