In [41]:
import pandas as pd

In [42]:
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

## Carregando o dataset

In [43]:
df = pd.read_csv("/content/drive/MyDrive/data.csv")
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [44]:
df['diagnosis'].value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

In [45]:
X = df.drop(columns=['diagnosis','Unnamed: 32'])
y = df['diagnosis']

## Separando em treino e teste

In [46]:
from sklearn.model_selection import train_test_split

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

## Inicializando os modelos

In [48]:
dt = tree.DecisionTreeClassifier()
rfc = RandomForestClassifier()
abc = AdaBoostClassifier()

## Seleção de hiperparâmetros com GridSearch CV 5

In [49]:
from sklearn.model_selection import GridSearchCV

In [50]:
hyp_rfc = {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7]}

In [51]:
hyp_abc = {'learning_rate': [0.1, 0.05, 0.01 ,0.5], 'n_estimators': [50, 100, 200]}

In [52]:
clf_rfc = GridSearchCV(rfc, hyp_rfc, cv=5)
clf_abc = GridSearchCV(abc, hyp_abc, cv=5)

In [53]:
clf_rfc.fit(X_train,y_train)

In [54]:
clf_abc.fit(X_train,y_train)

In [55]:
dt.fit(X_train,y_train)

## Obter o modelo com melhor combinação de hiperparâmetros

In [56]:
best_rfc = clf_rfc.best_estimator_
best_abc = clf_abc.best_estimator_

In [57]:
best_rfc.fit(X_train, y_train)
best_abc.fit(X_train, y_train)

In [58]:
score_rfc = best_rfc.score(X_test,y_test)
score_abc = best_abc.score(X_test,y_test)
score_dt = dt.score(X_test,y_test)

In [59]:
print(score_rfc)
print(score_abc)
print(score_dt)

0.9736842105263158
0.9649122807017544
0.9122807017543859


In [61]:
y_rfc_pred = best_rfc.predict(X_test)
y_abc_pred = best_abc.predict(X_test)
y_dt_pred = dt.predict(X_test)

## Avaliando os resultados

In [62]:
from sklearn.metrics import classification_report

In [63]:
print(classification_report(y_test, y_rfc_pred, target_names=['B','M']))

              precision    recall  f1-score   support

           B       0.96      1.00      0.98        72
           M       1.00      0.93      0.96        42

    accuracy                           0.97       114
   macro avg       0.98      0.96      0.97       114
weighted avg       0.97      0.97      0.97       114



In [64]:
print(classification_report(y_test, y_abc_pred, target_names=['B','M']))

              precision    recall  f1-score   support

           B       0.95      1.00      0.97        72
           M       1.00      0.90      0.95        42

    accuracy                           0.96       114
   macro avg       0.97      0.95      0.96       114
weighted avg       0.97      0.96      0.96       114



In [65]:
print(classification_report(y_test, y_dt_pred, target_names=['B','M']))

              precision    recall  f1-score   support

           B       0.94      0.92      0.93        72
           M       0.86      0.90      0.88        42

    accuracy                           0.91       114
   macro avg       0.90      0.91      0.91       114
weighted avg       0.91      0.91      0.91       114



### O melhor modelo foi o RandomForest

## Importância das features

In [68]:
import numpy as np

In [67]:
rfc_feature_importances = best_rfc.feature_importances_
abc_feature_importances = best_abc.feature_importances_
dt_feature_importances = dt.feature_importances_

In [71]:
len(X_train.columns)

31

In [82]:
rfc_top_10_indices = np.argsort(rfc_feature_importances)[::-1][:10]
rfc_top_10_columns = X_train.columns[rfc_top_10_indices]
rfc_df_top_10 = X_train[rfc_top_10_columns]

In [84]:
abc_top_10_indices = np.argsort(abc_feature_importances)[::-1][:10]
abc_top_10_columns = X_train.columns[abc_top_10_indices]
abc_df_top_10 = X_train[abc_top_10_columns]

In [85]:
dt_top_10_indices = np.argsort(dt_feature_importances)[::-1][:10]
dt_top_10_columns = X_train.columns[dt_top_10_indices]
dt_df_top_10 = X_train[dt_top_10_columns]

## Retreinando o melhor modelo com as 10 features mais importantes

In [86]:
best_rfc.fit(rfc_df_top_10, y_train)

In [88]:
X_test_top_10_features = X_test[rfc_top_10_columns]
best_rfc.score(X_test_top_10_features,y_test)

0.956140350877193

In [91]:
rfc_y_pred_top_10_features = best_rfc.predict(X_test_top_10_features)

Classification report antes da seleção de features

|     ...     | precision  | recall | f1-score  | support |
|-------------|--------|-----------|---------|------------|
|           B |  0.96  |    1.00   |   0.98  |      72
|           M |  1.00  |    0.93   |   0.96  |      42
|             |        |           |         |
|    accuracy |        |           |     0.97|       114
|   macro avg |  0.98  |    0.96   |   0.97  |     114
| weighted avg|  0.97  |    0.97   |   0.97  |     114

Classification report depois da seleção de features

In [92]:
print(classification_report(y_test, rfc_y_pred_top_10_features, target_names=['B','M']))

              precision    recall  f1-score   support

           B       0.94      1.00      0.97        72
           M       1.00      0.88      0.94        42

    accuracy                           0.96       114
   macro avg       0.97      0.94      0.95       114
weighted avg       0.96      0.96      0.96       114



## Podemos perceber que a seleção de features piorou a classificação