In [1]:
# import lib
import os
import pandas as pd
import warnings
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display

%matplotlib inline
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', False)

In [2]:
# Load heart disease dataframe
df_heart = pd.read_csv('maladie_cardiaque.csv', index_col="id").iloc[:, 1:]

In [3]:
df_heart

Unnamed: 0_level_0,age,genre,taille,Poids,pression_systo,pression_diasto,cholesterol,glycemie,fumeur,conso_alco,activite_physique,malade
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,18393,homme,168,62.0,110,80,normal,1,non fumeur,non,1,non
1,20228,femme,156,85.0,140,90,tres eleve,1,non fumeur,non,1,oui
2,18857,femme,165,64.0,130,70,tres eleve,1,non fumeur,non,0,oui
3,17623,homme,169,82.0,150,100,normal,1,non fumeur,non,1,oui
4,17474,femme,156,56.0,100,60,normal,1,non fumeur,non,0,non
...,...,...,...,...,...,...,...,...,...,...,...,...
99993,19240,homme,168,76.0,120,80,normal,1,fumeur,non,1,non
99995,22601,femme,158,126.0,140,90,eleve,2,non fumeur,non,1,oui
99996,19066,homme,183,105.0,180,90,tres eleve,1,non fumeur,oui,0,oui
99998,22431,femme,163,72.0,135,80,normal,2,non fumeur,non,0,oui


In [4]:
duplicated = df_heart.duplicated().sum()

if duplicated : 
    print('Nombre de doublons dans le dataset : {}'.format(duplicated))
else :
    print("Il n'y a pas de doublons")

# 24 doublons sur 70k lignes, ils sont négligeables et nous choisissons de les conserver

Nombre de doublons dans le dataset : 24


In [5]:
df_heart["pression_diasto"] = df_heart["pression_diasto"].abs()
df_heart["pression_systo"] = df_heart["pression_systo"].abs()

In [6]:
df_heart['imc'] = df_heart.apply(lambda x: x["Poids"]/((x["taille"]/100)**2), axis=1)

In [7]:
def pression_arterielle(press_systo, press_diasto):
    if press_systo < 120 and press_diasto < 80:
        return "optimale"
    
    if press_systo < 130 and press_diasto < 85:
        return "normale"
    
    if press_systo < 140 and press_diasto < 90:
        return "noramel élevée"
    
    return "élevée"

df_heart["pression_arterielle"] = df_heart.apply(lambda x: pression_arterielle(x["pression_systo"], x["pression_diasto"]), axis=1)

In [8]:
df_heart['age_annee'] = df_heart.apply(lambda x: int(round(x["age"]/365, 0)), axis=1)

In [9]:
df_heart

Unnamed: 0_level_0,age,genre,taille,Poids,pression_systo,pression_diasto,cholesterol,glycemie,fumeur,conso_alco,activite_physique,malade,imc,pression_arterielle,age_annee
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,18393,homme,168,62.0,110,80,normal,1,non fumeur,non,1,non,21.967120,normale,50
1,20228,femme,156,85.0,140,90,tres eleve,1,non fumeur,non,1,oui,34.927679,élevée,55
2,18857,femme,165,64.0,130,70,tres eleve,1,non fumeur,non,0,oui,23.507805,noramel élevée,52
3,17623,homme,169,82.0,150,100,normal,1,non fumeur,non,1,oui,28.710479,élevée,48
4,17474,femme,156,56.0,100,60,normal,1,non fumeur,non,0,non,23.011177,optimale,48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99993,19240,homme,168,76.0,120,80,normal,1,fumeur,non,1,non,26.927438,normale,53
99995,22601,femme,158,126.0,140,90,eleve,2,non fumeur,non,1,oui,50.472681,élevée,62
99996,19066,homme,183,105.0,180,90,tres eleve,1,non fumeur,oui,0,oui,31.353579,élevée,52
99998,22431,femme,163,72.0,135,80,normal,2,non fumeur,non,0,oui,27.099251,noramel élevée,61


In [33]:
def outlier_detection_box_1(target, df):
    print(target)
    q1 = df_heart[target].quantile(q=0.25)
    q3 = df_heart[target].quantile(q=0.75)
    med = df_heart[target].median()

    iqr = q3 - q1

    upper = q3 + (4 * iqr)
    lower = q1 - (4 * iqr)

    print("\tUpper : %s | shape : %s" % (upper, df_heart[df_heart[target] > upper].shape))
    print("\tLower : %s | shape : %s\n" % (lower, df_heart[df_heart[target] < lower].shape))

    return upper, lower

In [34]:
upper_systo, lower_systo = outlier_detection_box_1("pression_systo", df_heart)
upper_diasto, lower_diasto = outlier_detection_box_1("pression_diasto", df_heart)

pression_systo
	Upper : 220.0 | shape : (48, 15)
	Lower : 40.0 | shape : (181, 15)

pression_diasto
	Upper : 130.0 | shape : (1015, 15)
	Lower : 40.0 | shape : (58, 15)



In [35]:
df_heart = df_heart[df_heart["pression_systo"] < upper_systo]
df_heart = df_heart[df_heart["pression_systo"] > lower_systo]

df_heart = df_heart[df_heart["pression_diasto"] < upper_diasto]
df_heart = df_heart[df_heart["pression_diasto"] > lower_diasto]

In [39]:
df_heart.describe()

Unnamed: 0,age,taille,Poids,pression_systo,pression_diasto,glycemie,activite_physique,imc,age_annee
count,68652.0,68652.0,68652.0,68652.0,68652.0,68652.0,68652.0,68652.0,68652.0
mean,19463.452995,164.361242,74.104797,126.571404,81.297165,1.225572,0.803312,27.516472,53.324026
std,2468.632352,8.183559,14.31518,16.557948,9.35212,0.571507,0.397497,6.046428,6.769125
min,10798.0,55.0,11.0,70.0,45.0,1.0,0.0,3.471784,30.0
25%,17656.0,159.0,65.0,120.0,80.0,1.0,1.0,23.875115,48.0
50%,19700.0,165.0,72.0,120.0,80.0,1.0,1.0,26.346494,54.0
75%,21324.0,170.0,82.0,140.0,90.0,1.0,1.0,30.119376,58.0
max,23713.0,250.0,200.0,215.0,126.0,3.0,1.0,298.666667,65.0
