In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline



- _Hemoglobin_: Hemoglobin is a protein in your red blood cells that carries oxygen to your body's organs and tissues and transports carbon dioxide from your organs and tissues back to your lungs.

<br />

- _MCH_: MCH is short for "mean corpuscular hemoglobin." It is the average amount of hemoglobin, the protein that carries oxygen around your body, in each of your red blood cells.

<br />

- _MCHC_: MCHC stands for mean corpuscular hemoglobin concentration. It's a measure of the average concentration of hemoglobin inside a single red blood cell.

<br />

- _MCV_: MCV stands for mean corpuscular volume. An MCV blood test measures the average size of your red blood cells.

<br />

- _Result_: 0 = not anemic, 1 = anemic

In [None]:
# Load the anemia dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='anemia.csv')

In [None]:
# Show how the columns are populated (dtypes, non-null counts), then preview the first rows.
# `df.info()` prints its report and returns None, so it is its own statement;
# leaving `df.head()` as the last expression gives the rich table rendering
# instead of a `(None, <text repr>)` tuple.
df.info()
df.head()

In [None]:
# Male = 0 
# Femlae = 1
# proportion is not so different between groups

sns.countplot(df['Gender']);

In [None]:
# Not anemic = 0
# Anemic = 1

sns.countplot(df['Result']);

In [None]:
# Mean value of each blood-count variable.
# No trailing commas on these assignments: `name = value,` binds a 1-tuple
# rather than the number itself.
Hb_mean = np.mean(df['Hemoglobin'])
MCH_mean = np.mean(df['MCH'])
MCHC_mean = np.mean(df['MCHC'])
MCV_mean = np.mean(df['MCV'])

Hb_mean, MCH_mean, MCHC_mean, MCV_mean

In [None]:
# Median hemoglobin value across all rows.
Hb_median = np.median(df.loc[:, 'Hemoglobin'])
Hb_median

In [None]:
# Pairwise correlation between all columns, annotated with the coefficient values.
# The trailing semicolon hides the Axes repr, consistent with the other plotting cells.
sns.heatmap(df.corr(), annot=True, linewidths=.5);

In [None]:
# Distribution of hemoglobin values with a kernel density estimate overlaid.
sns.histplot(data=df, x='Hemoglobin', kde=True);

In [None]:
# Distribution of MCH values with a kernel density estimate overlaid.
sns.histplot(data=df, x='MCH', kde=True);

In [None]:
# Distribution of MCHC values with a kernel density estimate overlaid.
sns.histplot(data=df, x='MCHC', kde=True);

In [None]:
# Distribution of MCV values with a kernel density estimate overlaid.
sns.histplot(data=df, x='MCV', kde=True);

In [None]:
# Empirical CDF of hemoglobin, one curve per result class: this is the variable
# that differed the most between the positive and the negative cases.
sns.ecdfplot(
    data=df,
    x="Hemoglobin",
    hue='Result',
    palette="colorblind",
);

In [None]:
# Raw values of each blood-count variable plotted against the row index,
# one panel per variable in a 2x2 grid.
fig, ax = plt.subplots(ncols=2, nrows=2, figsize=(15, 10))
# NOTE(review): the white title colors are only legible on a dark figure/notebook
# background — confirm the intended theme.
fig.suptitle("HB VARIATIONS", size=30, color='white')

# (column, panel title) pairs, listed in row-major order to match `ax.flat`.
panels = [
    ('Hemoglobin', "Hemoglobin"),
    ('MCH', "mean corpuscular hemoglobin"),
    ('MCHC', "mean corpuscular hemoglobin concentration"),
    ('MCV', "mean corpuscular volume"),
]

# One loop replaces four copy-pasted stanzas; ending on a loop also keeps the
# cell from echoing the `Text` object returned by the last `set_title` call.
for axis, (column, title) in zip(ax.flat, panels):
    axis.plot(df[column], color="red")
    axis.set_title(title, color='white', fontsize=15)

In [None]:
# Pairwise overview of every column: histograms with a KDE on the diagonal,
# small gray scatter plots in the off-diagonal panels.
g = sns.PairGrid(data=df)
g.map_diag(sns.histplot, color='Green', kde=True)
g.map_offdiag(sns.scatterplot, s=6, color="gray");

In [None]:
# How the variables relate to the hemoglobin concentration.
# There is an interesting relationship between Hemoglobin and MCHC:
# low Hb values are accompanied by low MCHC values.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(15, 5), sharex='col', sharey='row')
plt.subplots_adjust(wspace=0, hspace=0)

# One panel per predictor, all sharing the MCHC y-axis.
for axis, (x_column, line_color) in zip(
    ax, [('Hemoglobin', 'red'), ('MCH', 'green'), ('MCV', 'purple')]
):
    sns.lineplot(data=df, x=x_column, y='MCHC', color=line_color, ax=axis)

In [None]:
# Scatter plot of MCHC against hemoglobin with a fitted regression line,
# colored separately for each result class.
sns.lmplot(
    data=df,
    x="Hemoglobin",
    y="MCHC",
    hue='Result',
    palette='flare',
    scatter=True,
    fit_reg=True,
);

### Treinando o Modelo de dados

In [None]:
# Feature matrix: the first five columns of the frame as a NumPy array.
X = df.iloc[:, :5].to_numpy()
X

In [None]:
# Target vector: the column at position 5 as a NumPy array
# (presumably `Result` — verify against the CSV column order).
y = df.iloc[:, 5].to_numpy()
y

### Realizando os testes 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
# Hold out 20% of the rows for evaluation. The fixed seed keeps the split, and
# therefore every score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit Gaussian Naive Bayes on the training split only, so that the test split
# remains unseen data when the model is evaluated.
naive = GaussianNB()
naive.fit(X_train, y_train)

In [None]:
# Result of the Naive Bayes algorithm: mean accuracy on the test split.
naive.score(X=X_test, y=y_test)

In [None]:
# Predicted class for every row of the test split (Naive Bayes).
predictions_naive = naive.predict(X=X_test)

In [None]:
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the true
# classes and columns the predicted ones. Swapping the arguments transposes the matrix.
confusion_matrix(y_test, predictions_naive)

In [None]:
# Confusion matrix for Naive Bayes: rows = true class, columns = predicted class
# (confusion_matrix expects y_true first). `fmt="d"` prints the counts as plain
# integers instead of the default two-significant-digit format.
sns.heatmap(confusion_matrix(y_test, predictions_naive),
            annot=True, fmt="d", cmap="mako");

In [None]:
# Fit logistic regression on the training split only; fitting on the full X, y
# would let the model see the rows it is later asked to predict.
logistic = LogisticRegression()
logistic.fit(X_train, y_train)

In [None]:
# Mean accuracy on the test split — the same rows the predictions and the
# confusion matrix below are computed on.
logistic.score(X_test, y_test)

In [None]:
# Predicted class for every row of the test split (logistic regression).
predictions_logistica = logistic.predict(X=X_test)

In [None]:
# Confusion matrix for logistic regression: rows = true class, columns = predicted
# class (confusion_matrix expects y_true first). `fmt="d"` prints the counts as
# plain integers instead of the default two-significant-digit format.
sns.heatmap(confusion_matrix(y_test, predictions_logistica),
            annot=True, fmt="d", cmap="mako");

In [None]:
# Fit the random forest on the training split only; fitting on the full X, y
# would let the model see the rows it is later asked to predict.
# The fixed seed makes the randomized tree construction reproducible.
floresta = RandomForestClassifier(random_state=42)
floresta.fit(X_train, y_train)

In [None]:
# Mean accuracy on the test split — the same rows the predictions and the
# confusion matrix below are computed on.
floresta.score(X_test, y_test)

In [None]:
# Predicted class for every row of the test split (random forest).
predictions_florest = floresta.predict(X=X_test)

In [None]:
# Confusion matrix for the random forest: rows = true class, columns = predicted
# class (confusion_matrix expects y_true first). `fmt="d"` prints the counts as
# plain integers instead of the default two-significant-digit format.
sns.heatmap(confusion_matrix(y_test, predictions_florest),
            annot=True, fmt="d", cmap="mako");