In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:
# Load the thyroid dataset and discard, in the same step, the columns that
# will not be used to train the models.
dfT = pd.read_csv('./data/thyroidDF.csv').drop(
    columns=[
        'patient_id',
        'TSH_measured',
        'T3_measured',
        'TT4_measured',
        'T4U_measured',
        'FTI_measured',
        'TBG',
        'TBG_measured',
        'referral_source',
    ]
)

#### Precisamos fazer grupos que dividam a classe 'target' nos subgrupos de diagnóstico
<br />
1. hyperthyroid conditions:

    A   hyperthyroid
    B   T3 toxic
    C   toxic goitre
    D   secondary toxic
    
<br />
2. hypothyroid conditions:

    E   hypothyroid
    F   primary hypothyroid
    G   compensated hypothyroid
    H   secondary hypothyroid

<br />

##### Nós fizemos isso com esse código no início do entendimento do dataset

<br />

```

diagnoses = {'-': 'negative',
             'A': 'hyperthyroid', 
             'B': 'hyperthyroid', 
             'C': 'hyperthyroid', 
             'D': 'hyperthyroid',
             'E': 'hypothyroid', 
             'F': 'hypothyroid', 
             'G': 'hypothyroid', 
             'H': 'hypothyroid'}

dfT['target'] = dfT['target'].map(diagnoses) # re-mapping
# dropping observations with 'target' null after re-mapping
dfT.dropna(subset=['target'], inplace=True)

```


In [None]:
# Column dtypes and non-null counts of the frame after the column drop.
dfT.info()

In [None]:
# Percentage of missing values per column after the unused columns were removed.
# The mean of the boolean null-mask divides by the frame's actual row count,
# so the result stays correct if the dataset or an earlier filter changes
# (the previous version divided by a hard-coded 9172).
dfT.isnull().mean() * 100

In [None]:
# Collapse the single-letter diagnosis codes into three classes:
# '-' -> negative, A-D -> hyperthyroid, E-H -> hypothyroid.
diagnoses = {
    '-': 'negative',
    **dict.fromkeys('ABCD', 'hyperthyroid'),
    **dict.fromkeys('EFGH', 'hypothyroid'),
}

# Codes that are not keys of the mapping become NaN here ...
dfT['target'] = dfT['target'].map(diagnoses)
# ... and the corresponding rows are removed.
# NOTE: run this cell once per data load. Mapping the already-mapped labels
# again would turn every target into NaN and this dropna would empty the frame.
dfT.dropna(subset=['target'], inplace=True)

In [None]:
# First rows after the target remapping.
dfT.head()

In [None]:
# Last rows after the target remapping.
dfT.tail()

In [None]:
#dfT['age'].max()

In [None]:
# Summary statistics of the frame (count, mean, std, quartiles, min/max).
dfT.describe()


In [None]:
# Correlation heatmap of the numeric attributes.
# numeric_only=True is needed because the frame still contains string columns
# at this point (they are only encoded to 0/1 further down); pandas >= 2.0
# raises on non-numeric columns instead of silently skipping them.
f, ax = plt.subplots(figsize=(10, 8))
# Draw explicitly on the axes created above rather than relying on the
# implicit "current axes".
sns.heatmap(dfT.corr(numeric_only=True), annot=True, cmap="magma", ax=ax);

In [None]:
# Show the frame's dimensions together with its column labels.
dfT.shape, dfT.columns

# positions of the numeric columns
# 0 17 19 21 23 25 27
# NOTE(review): these positions presumably refer to the column layout of the
# original CSV, before any column was dropped — confirm.

In [None]:
# Original note: these averages are distorted by the amount of NaN / empty
# values still present in the dataset.
# numeric_only=True restricts the reduction to the numeric columns; without it
# pandas >= 2.0 raises a TypeError on the string columns still in the frame.
dfT.mean(numeric_only=True)

In [None]:
#sns.histplot(x=dfT['TBG'], y=dfT['T4U'], kde=True, color='darkblue');

In [None]:
# Linear fit of FTI against TT4, with one regression line per sex.
sns.lmplot(x='TT4', y='FTI', hue='sex', data=dfT);

In [None]:
# Linear fit of FTI against TSH over all patients.
sns.lmplot(x='TSH', y='FTI', data=dfT);

In [None]:
# TT4 against T4U, small markers coloured by sex.
sns.scatterplot(
    x='T4U',
    y='TT4',
    hue='sex',
    s=5,
    data=dfT,
)

In [None]:
# TSH against T4U, small markers coloured by sex.
sns.scatterplot(
    x='T4U',
    y='TSH',
    hue='sex',
    s=5,
    data=dfT,
)

In [None]:
# Number of patients per age value.
# The Series is passed as x= explicitly: seaborn >= 0.12 interprets the first
# positional argument as `data`, so the positional form no longer plots the
# ages as the x variable.
sns.countplot(x=dfT['age'], palette='mako');
# Original check: dfT[dfT['age'] <= 0] returned no rows (no negative ages).

In [None]:
# Original observation: the patients listed with an age of 100 or more all
# carry a 'negative' diagnosis.
dfT.loc[dfT['age'].ge(100)]

In [None]:
# changing age of observations with ('age' > 100) to null
# dfT['age'] = np.where((dfT.age > 100), np.nan, dfT.age)
# dfT[dfT['age'] >= 100]

In [None]:
# setting up grid for multiple seaborn plots
fig, axes = plt.subplots(3,2,figsize=(20,16))
fig.suptitle('Numerical Attributes vs. Target', color='white')
sns.set_style('whitegrid');

# TSH vs. 'target'
sns.stripplot(x=dfT.target, y=dfT.TSH, linewidth=0.6, jitter= 0.3, ax=axes[0, 0])

# T3 vs. 'target'
sns.stripplot(x=dfT.target, y=dfT.T3, linewidth=0.6, jitter= 0.3, ax=axes[0, 1])

# TT4 vs. 'target'
sns.stripplot(x=dfT.target, y=dfT.TT4, linewidth=0.6, jitter= 0.3, ax=axes[1, 0])

# T4U vs. 'target'
sns.stripplot(x=dfT.target, y=dfT.T4U, linewidth=0.6, jitter= 0.3, ax=axes[1, 1])

# FTI vs. 'target'
sns.stripplot(x=dfT.target, y=dfT.FTI, linewidth=0.6, jitter= 0.3, ax=axes[2, 0])

# TBG vs. 'target'
# sns.stripplot(x=dfT.target, y=dfT.TBG, linewidth=0.6, jitter= 0.3, ax=axes[2, 1])

In [None]:
# Pairwise scatter plots of the numerical attributes, coloured by diagnosis.
sns.set_style('whitegrid');
numericalDF = dfT.loc[:, ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'target']].copy()
sns.pairplot(data=numericalDF, hue='target', height=3);
plt.show()

In [None]:
# Sanity check: list any rows with a negative age.
dfT[dfT['age'].lt(0)]

In [None]:
# Rows where the T3 measurement is missing.
dfT[dfT['T3'].isna()]

In [None]:
# Keep only the string (object dtype) columns and encode them numerically.
# .copy() makes the selection an independent frame, so the column assignments
# below cannot trigger pandas' SettingWithCopyWarning.
# NOTE: this overwrites dfT (the numeric columns are gone afterwards) and the
# cell cannot be re-run without reloading the data: on a second run 'sex' is
# no longer an object column, so the lookup below raises a KeyError.
dfT = dfT.select_dtypes(include='object').copy()

# NOTE(review): any 'sex' value other than 'F'/'M' (including missing entries)
# becomes NaN here — confirm none remain before fitting estimators that
# reject NaN.
dfT['sex'] = dfT['sex'].map({'F': 0, 'M': 1})

# Names of all remaining columns except sex and target (and referral_source,
# should it still be present); these hold 't'/'f' flags.
cols = dfT.columns[~dfT.columns.isin(['sex', 'target', 'referral_source'])]

for c in cols:
    dfT[c] = dfT[c].map({'f': 0, 't': 1})

In [None]:
# Display the encoded frame.
dfT

In [None]:
# Number of missing values per column after the encoding step.
dfT.isnull().sum()

In [None]:
# Original note: the dataset is heavily imbalanced.
sns.countplot(x=dfT['query_on_thyroxine'])

In [None]:
# Feature matrix: the first 15 columns of the encoded frame, as a NumPy array.
# NOTE(review): relies on column order — presumably these are all the
# predictors and the target sits right after them; confirm.
X = dfT.iloc[:, :15].to_numpy()
X

In [None]:
# Label vector: the column at position 15, as a NumPy array.
# NOTE(review): expected to be 'target' — depends on column order; confirm.
y = dfT.iloc[:, 15].to_numpy()
y

### Treinando os modelos

In [None]:
# Split the data into training and test sets: 20% of the samples are held
# out for testing, with a fixed seed so the split is reproducible.
from sklearn.model_selection import train_test_split

X_treinamento, X_teste, y_treinamento, y_teste = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [None]:
from yellowbrick.classifier import ConfusionMatrix
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [None]:
# Gaussian Naive Bayes: fit on the training split (fit() returns the
# estimator itself), then predict the held-out test split.
naive = GaussianNB().fit(X_treinamento, y_treinamento)
previsoes = naive.predict(X_teste)


In [None]:
# Naive Bayes accuracy on the test split, expressed as a percentage.
accuracy_score(y_teste, previsoes) * 100

In [None]:
# Raw confusion matrix of the Naive Bayes predictions on the test split.
confusion_matrix(y_teste, previsoes)

In [None]:

# Yellowbrick confusion-matrix visualizer around the Naive Bayes model:
# fit on the training split, then score on the test split.
cm = ConfusionMatrix(naive)
cm.fit(X_treinamento, y_treinamento)
cm.score(X_teste, y_teste)

In [None]:
# Per-class precision, recall and F1 for the Naive Bayes predictions.
print(classification_report(y_teste, previsoes))

#### Árvore de Decisão

In [None]:
# Decision tree using entropy (information gain) as the split criterion.
# random_state is fixed because scikit-learn permutes the candidate features
# at every split; without a seed the fitted tree, and every metric derived
# from it, can differ between runs. The train/test split and the random
# forest in this notebook are seeded as well.
arvore = DecisionTreeClassifier(criterion='entropy', random_state=0)
arvore.fit(X_treinamento, y_treinamento)

In [None]:
# Importance of each feature in the fitted tree, in the column order of X.
arvore.feature_importances_

In [None]:
from sklearn import tree

# Names of the 15 predictor columns, listed in the order expected for X.
previsores = [
    'sex',
    'on_thyroxine',
    'query_on_thyroxine',
    'on_antithyroid_meds',
    'sick',
    'pregnant',
    'thyroid_surgery',
    'I131_treatment',
    'query_hypothyroid',
    'query_hyperthyroid',
    'lithium',
    'goitre',
    'tumor',
    'hypopituitary',
    'psych',
]

# A very large canvas is used for the rendering of the fitted tree.
figura, eixos = plt.subplots(nrows=1, ncols=1, figsize=(150, 150))
tree.plot_tree(
    arvore,
    feature_names=previsores,
    class_names=arvore.classes_,
    filled=True,
    ax=eixos,
)

In [None]:
# Decision-tree predictions for the test split.
previsoesArvore = arvore.predict(X_teste)

In [None]:
# Decision-tree accuracy on the test split (as a fraction, not a percentage).
accuracy_score(y_teste, previsoesArvore)

In [None]:
# Yellowbrick confusion-matrix visualizer around the decision tree:
# fit on the training split, then score on the test split.
cm = ConfusionMatrix(arvore)
cm.fit(X_treinamento, y_treinamento)
cm.score(X_teste, y_teste)

In [None]:
# Per-class precision, recall and F1 for the decision-tree predictions.
print(classification_report(y_teste, previsoesArvore))

#### Random Forest

In [None]:
# Random forest of 10 entropy-based trees, seeded for reproducibility.
random_forest = RandomForestClassifier(
    criterion='entropy',
    n_estimators=10,
    random_state=0,
)
random_forest.fit(X_treinamento, y_treinamento)

In [None]:
# Random-forest predictions for the test split.
previsoes_RandomFlorest = random_forest.predict(X_teste)


In [None]:
# Random-forest accuracy on the test split (as a fraction).
accuracy_score(y_teste, previsoes_RandomFlorest)

In [None]:
# Yellowbrick confusion-matrix visualizer around the random forest:
# fit on the training split, then score on the test split.
cm = ConfusionMatrix(random_forest)
cm.fit(X_treinamento, y_treinamento)
cm.score(X_teste, y_teste)

In [None]:
# Per-class precision, recall and F1 for the random-forest predictions.
print(classification_report(y_teste, previsoes_RandomFlorest))

#### kNN

In [None]:
# k-nearest neighbours with k = 5 and the Minkowski metric of order p = 2
# (i.e. Euclidean distance).
knn = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)
knn.fit(X_treinamento, y_treinamento)

In [None]:
# kNN predictions for the test split.
previsoes_knn = knn.predict(X_teste)


In [None]:
# kNN accuracy on the test split (as a fraction).
accuracy_score(y_teste, previsoes_knn)

In [None]:
# Yellowbrick confusion-matrix visualizer around the kNN model:
# fit on the training split, then score on the test split.
cm = ConfusionMatrix(knn)
cm.fit(X_treinamento, y_treinamento)
cm.score(X_teste, y_teste)

In [None]:
# Per-class precision, recall and F1 for the kNN predictions.
print(classification_report(y_teste, previsoes_knn))

#### Logistic Regression

In [None]:
# Logistic regression with a fixed seed, fitted on the training split.
logistic = LogisticRegression(random_state=1)
logistic.fit(X_treinamento, y_treinamento)

In [None]:
# Logistic-regression predictions for the test split.
previsoes_LogisticReg = logistic.predict(X_teste)

In [None]:
# Logistic-regression accuracy on the test split (as a fraction).
accuracy_score(y_teste, previsoes_LogisticReg)

In [None]:
# Yellowbrick confusion-matrix visualizer around the logistic regression:
# fit on the training split, then score on the test split.
cm = ConfusionMatrix(logistic)
cm.fit(X_treinamento, y_treinamento)
cm.score(X_teste, y_teste)

#### SVM

In [None]:
# Support vector classifier with an RBF kernel and regularisation C = 2.0.
# NOTE(review): the trailing "2 -> 4" presumably records C values that were
# tried during tuning — confirm.
svm = SVC(kernel='rbf', random_state=1, C = 2.0) # 2 -> 4
svm.fit(X_treinamento, y_treinamento)

In [None]:
# SVM predictions for the test split.
previsoes_svm = svm.predict(X_teste)

In [None]:
# Yellowbrick confusion-matrix visualizer around the SVM:
# fit on the training split, then score on the test split.
cm = ConfusionMatrix(svm)
cm.fit(X_treinamento, y_treinamento)
cm.score(X_teste, y_teste)

In [None]:
# Per-class precision, recall and F1 for the SVM predictions.
print(classification_report(y_teste, previsoes_svm))