In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

Le fichier <a href='http://qa.epidemium.cc/data/mortality_dataset/2017-25-10_mortality/mortality.csv'>Mortality.txt</a> contient les informations sur le nombre de décès par type de cancer, sex, tranche d'age et par pays. Les colonnes correspondantes sont :
- Nb_deaths
- Cancer_code
- Sex
- Age_bucket
- Location
- Year

En complément, les descriptions pour les codes sont fournies dans les fichiers :
- tranches d'age : <a href='http://qa.epidemium.cc/data/mortality_dataset/2017-25-10_mortality/age.txt'>age.txt</a>
- codes cancer : <a href='http://qa.epidemium.cc/data/mortality_dataset/2017-25-10_mortality/cancer_codes.csv'>cancer_codes.txt</a>

Le fichier <a href='http://qa.epidemium.cc/data/epidemiology_dataset/world_bank_data/WorldBank_Data.csv'>WorldBank_Data.csv</a>  contient les colonnes : 
- area_code : code du pays / de la region
- area : nom du pays / de la region
- year : année pour laquelle les indicateurs sont calculés

Les autres colonnes de 3 à 875 représentes les différents indicateurs. Leurs libellés complètes, ainsi que les méthodologies de calcul respectives sont détaillés dans un fichier complémentaire, <a href='http://qa.epidemium.cc/data/epidemiology_dataset/world_bank_data/WorldBank_Indicators.csv' target='_blank'>WorldBank_Indicators.csv</a> 

Pour construir notre base de modélisation, on souhaite joindre ces deux bases. Pour cela, certaines retraitements seront nécessaires : 
#### Modifications sur la base Mortality :
1 - modifications sur le découpage par tranches d'age pour harmoniser avec la base WorldBank

2 - modifications sur les noms des pays pour pour harmoniser avec la base WorldBank

#### Modifications sur la base WorldBank :
1 - modifications sur les codes sexe et tranches d'ages pour harmoniser avec la base Mortality

2 - remplacement des valeurs manquantes

<i>Un schéma qui résume les jointures et les transformations se trouve à la fin de ce notebook.</i>

In [2]:
#lecture base de données Mortality brute
df_mort= pd.read_table('mortality.txt',delimiter=';',names=['Cancer_code', 'Sex', 'Age_bucket', 'Location', 'Year', 'Nb_deaths'],low_memory=False)
df_mort.describe()

Unnamed: 0,Sex,Age_bucket,Year,Nb_deaths
count,6359642.0,6359642.0,6359642.0,6359642.0
mean,1.540061,10.0,1992.326,32.30369
std,0.6528709,5.477226,16.09652,264.9799
min,1.0,1.0,1950.0,0.0
25%,1.0,5.0,1981.0,0.0
50%,2.0,10.0,1996.0,0.0
75%,2.0,15.0,2006.0,5.0
max,9.0,19.0,2015.0,34935.0


In [3]:
df_mort.tail()

Unnamed: 0,Cancer_code,Sex,Age_bucket,Location,Year,Nb_deaths
6359637,C32,2,15,Spain,1988,7
6359638,C32,2,16,Spain,1988,5
6359639,C32,2,17,Spain,1988,9
6359640,C32,2,18,Spain,1988,10
6359641,C32,2,19,Spain,1988,0


In [4]:
#lecture du fichier WorldBank brut
df_wb = pd.read_csv('WorldBank_Data.csv', sep=',')

In [5]:
df_wb.head()

Unnamed: 0,area_code,area,year,SE.ADT.1524.LT.FE.ZS,SE.ADT.1524.LT.FM.ZS,SE.ADT.1524.LT.MA.ZS,SE.ADT.1524.LT.ZS,SE.ADT.LITR.FE.ZS,SE.ADT.LITR.MA.ZS,SE.ADT.LITR.ZS,...,SL.UEM.TOTL.FE.ZS,SL.UEM.TOTL.MA.NE.ZS,SL.UEM.TOTL.MA.ZS,SL.UEM.TOTL.NE.ZS,SL.UEM.TOTL.ZS,SM.POP.NETM,SM.POP.REFG,SM.POP.REFG.OR,SM.POP.TOTL,SM.POP.TOTL.ZS
0,ABW,Aruba,1970,,,,,,,,...,,,,,,,,,7466.0,
1,ABW,Aruba,1971,,,,,,,,...,,,,,,,,,,
2,ABW,Aruba,1972,,,,,,,,...,,,,,,-3537.0,,,,
3,ABW,Aruba,1973,,,,,,,,...,,,,,,,,,,
4,ABW,Aruba,1974,,,,,,,,...,,,,,,,,,,


In [6]:
df_wb.describe()

Unnamed: 0,year,SE.ADT.1524.LT.FE.ZS,SE.ADT.1524.LT.FM.ZS,SE.ADT.1524.LT.MA.ZS,SE.ADT.1524.LT.ZS,SE.ADT.LITR.FE.ZS,SE.ADT.LITR.MA.ZS,SE.ADT.LITR.ZS,SE.COM.DURS,SE.ENR.PRIM.FM.ZS,...,SL.UEM.TOTL.FE.ZS,SL.UEM.TOTL.MA.NE.ZS,SL.UEM.TOTL.MA.ZS,SL.UEM.TOTL.NE.ZS,SL.UEM.TOTL.ZS,SM.POP.NETM,SM.POP.REFG,SM.POP.REFG.OR,SM.POP.TOTL,SM.POP.TOTL.ZS
count,11648.0,947.0,947.0,947.0,948.0,960.0,960.0,962.0,3680.0,8498.0,...,5746.0,3882.0,5746.0,4576.0,5746.0,2120.0,5115.0,5790.0,2418.0,1512.0
mean,1992.588341,85.378596,0.934534,89.70475,87.513374,75.049067,83.935035,79.46713,9.119293,0.911953,...,10.768031,7.566557,8.117224,8.129132,8.948894,-301912.8,932064.3,690696.2,5387660.0,9.293558
std,13.300837,20.738517,0.131837,14.318209,17.445111,24.744161,17.348042,20.898257,2.021738,0.139949,...,8.020582,5.343605,5.487767,5.839693,6.032945,2591169.0,2437202.0,2145572.0,17161410.0,14.144178
min,1970.0,6.66406,0.24331,22.37713,14.37752,3.18277,13.51292,8.68515,0.0,0.0,...,0.078,0.02,0.067,0.05,0.1,-22729860.0,0.0,0.0,141.0,0.032596
25%,1981.0,78.18157,0.91423,85.69885,82.23664,59.46883,76.291872,68.053309,8.0,0.861855,...,5.39125,3.98,4.604697,4.254689,5.047,-179704.8,942.5,74.25,37380.25,1.358533
50%,1993.0,96.29856,0.99844,96.53054,96.320067,85.4655,90.093645,87.845605,9.0,0.97125,...,8.146,6.525,6.662949,6.91,7.285,-13013.0,19131.0,2302.5,233501.5,3.520657
75%,2004.0,99.08378,1.00213,99.010885,98.992157,94.37178,96.962153,95.530375,10.0,0.99434,...,14.113,9.5375,10.154,10.3725,11.518595,32103.5,331509.5,116901.8,1765247.0,10.461212
max,2016.0,100.0,1.24141,100.0,100.0,99.99903,99.99947,99.99924,16.0,1.50735,...,61.223,54.599998,36.959,59.5,39.299999,22686640.0,21388030.0,21388030.0,243192700.0,88.404048


In [7]:
#fichier détaillant le mode de calcul des indicateurs WorldBank
df_wb_info = pd.read_csv('WorldBank_Indicators.csv', sep=',')
df_wb_info.head()

Unnamed: 0,Category,Subcategory,Code,Indicator Name,Long definition,Source
0,Economie,National_accounts&Official_development_assista...,DC.DAC.AUSL.CD,"Net bilateral aid flows from DAC donors, Austr...",Net bilateral aid flows from DAC donors are th...,Development Assistance Committee of the Organi...
1,Economie,National_accounts&Official_development_assista...,DC.DAC.AUTL.CD,"Net bilateral aid flows from DAC donors, Austr...",Net bilateral aid flows from DAC donors are th...,Development Assistance Committee of the Organi...
2,Economie,National_accounts&Official_development_assista...,DC.DAC.BELL.CD,"Net bilateral aid flows from DAC donors, Belgi...",Net bilateral aid flows from DAC donors are th...,Development Assistance Committee of the Organi...
3,Economie,National_accounts&Official_development_assista...,DC.DAC.CANL.CD,"Net bilateral aid flows from DAC donors, Canad...",Net bilateral aid flows from DAC donors are th...,Development Assistance Committee of the Organi...
4,Economie,National_accounts&Official_development_assista...,DC.DAC.CECL.CD,"Net bilateral aid flows from DAC donors, Europ...",Net bilateral aid flows from DAC donors are th...,Development Assistance Committee of the Organi...


<u> 1 - Modifications sur le découpage par tranches d'age pour harmoniser avec la base WorldBank</u>


Le découpage par tranches d'age de l'indicateur mortalité dans la base Mortality ne correspond pas avec l'indicateur de la population de la base 'WorldBank'. On fait donc une harmonisation entre ces deux : 
- aggregation de la tranche d'age 16 et suppression des tranches d'age 17,18 et 19

In [8]:
#découpage des ages pour l'indicateur mortalité
print("Classes d'age dans base Mortality : ", df_mort.Age_bucket.unique())
df_age = pd.read_csv('age.csv', header = None , sep=' ')
df_age.columns=['Age_bucket', 'Age']
print("Décodage des tranches d'age dans la base Mortality : ")
df_age

Classes d'age dans base Mortality :  [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
Décodage des tranches d'age dans la base Mortality : 


Unnamed: 0,Age_bucket,Age
0,1,0-4
1,2,5-9
2,3,10-14
3,4,15-19
4,5,20-24
5,6,25-29
6,7,30-34
7,8,35-39
8,9,40-44
9,10,45-49


In [9]:
#découpage des ages pour l'indicateur population
print("Variables d'age dans la base WorldBank : ")
print(df_wb.columns[df_wb.columns.str.startswith('SP.POP')])

Variables d'age dans la base WorldBank : 
Index(['SP.POP.0004.FE.5Y', 'SP.POP.0004.MA.5Y', 'SP.POP.0014.TO.ZS',
       'SP.POP.0509.FE.5Y', 'SP.POP.0509.MA.5Y', 'SP.POP.1014.FE.5Y',
       'SP.POP.1014.MA.5Y', 'SP.POP.1519.FE.5Y', 'SP.POP.1519.MA.5Y',
       'SP.POP.1564.TO.ZS', 'SP.POP.2024.FE.5Y', 'SP.POP.2024.MA.5Y',
       'SP.POP.2529.FE.5Y', 'SP.POP.2529.MA.5Y', 'SP.POP.3034.FE.5Y',
       'SP.POP.3034.MA.5Y', 'SP.POP.3539.FE.5Y', 'SP.POP.3539.MA.5Y',
       'SP.POP.4044.FE.5Y', 'SP.POP.4044.MA.5Y', 'SP.POP.4549.FE.5Y',
       'SP.POP.4549.MA.5Y', 'SP.POP.5054.FE.5Y', 'SP.POP.5054.MA.5Y',
       'SP.POP.5559.FE.5Y', 'SP.POP.5559.MA.5Y', 'SP.POP.6064.FE.5Y',
       'SP.POP.6064.MA.5Y', 'SP.POP.6569.FE.5Y', 'SP.POP.6569.MA.5Y',
       'SP.POP.65UP.TO.ZS', 'SP.POP.7074.FE.5Y', 'SP.POP.7074.MA.5Y',
       'SP.POP.7579.FE.5Y', 'SP.POP.7579.MA.5Y', 'SP.POP.80UP.FE.5Y',
       'SP.POP.80UP.MA.5Y', 'SP.POP.DPND', 'SP.POP.DPND.OL', 'SP.POP.DPND.YG',
       'SP.POP.GROW', 'SP.POP.TOTL', 'S

In [10]:
length=len(df_mort[df_mort['Age_bucket']==16])
index=df_mort[df_mort['Age_bucket']==16]['Nb_deaths'].index
j=list(range(0, length, 1))
t_16=df_mort[df_mort['Age_bucket']==16]['Nb_deaths'].values
t_17=df_mort[df_mort['Age_bucket']==17]['Nb_deaths'].values
t_18=df_mort[df_mort['Age_bucket']==18]['Nb_deaths'].values
t_19=df_mort[df_mort['Age_bucket']==19]['Nb_deaths'].values
df_mort['Nb_deaths'][index]=t_16[j] + t_17[j] + t_18[j] + t_19[j]

index17_18_19=df_mort[(df_mort['Age_bucket']==17) | (df_mort['Age_bucket']==18) | (df_mort['Age_bucket']==19)]['Nb_deaths'].index
df_mort=df_mort.drop(df_mort.index[index17_18_19])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


<u>2 - Modifications sur les noms des pays pour pour harmoniser avec la base WorldBank</u>

La base WorldBank ne dispose pas des codes des pays, seuls les libellés sont présents. On est donc obligées de faire la jointure avec la base Mortality avec les libellés comme clé. Pour cela, on a vérifié les pays et on a modifié manuellement les libellés pour avoir une correspondance entre les pays en commun. 

Ainsi, dans le fichier 'correspondance.txt', on a : 
- area : libellé du pays sous la forme originale trouvée dans WorldBank
- area2 : libellé du pays corrigé manuellement pour trouver la correspondance entre WorldBank et Mortality

Par la suite, la jointure entre les deux bases se fera sur la clé area2 au lieu de area.

On a décidé de faire un inner join entre ces deux base, ce qui élimine 23 pays de la base Mortality absents de la base WorldBank (après avoir vérifié et corrigé les problèmes de mismatch des libellés). Ce sont pour la plupart des iles, des pays qui n'exitent plus, ou encore des zones qui représentent des regroupements de pays.


In [11]:
print('Nombre pays dans base Mortality: ', len(df_mort.Location.unique()))
print('Nombre pays dans base WorldBank: ', len(df_wb.area.unique()))

Nombre pays dans base Mortality:  153
Nombre pays dans base WorldBank:  258


In [12]:
#lecture fichier avec noms des pays corrigés manuellement
df_correspondance=pd.read_table('correspondance.txt',delimiter=';')
df_correspondance.tail(15)

Unnamed: 0,area,area2
243,United Arab Emirates,
244,United Kingdom,United Kingdom
245,United States,United States of America
246,Upper middle income,
247,Uruguay,Uruguay
248,Uzbekistan,Uzbekistan
249,Vanuatu,
250,"Venezuela, RB",Venezuela
251,Vietnam,
252,Virgin Islands (U.S.),Virgin Islands (USA)


In [13]:
# on crée une nouvelle colonne "area2" dane la base worldbank qui correspond au nom du pays dans la base mortality
df_wb = pd.merge(df_wb, df_correspondance, on='area', how='left')
df_wb.head()

Unnamed: 0,area_code,area,year,SE.ADT.1524.LT.FE.ZS,SE.ADT.1524.LT.FM.ZS,SE.ADT.1524.LT.MA.ZS,SE.ADT.1524.LT.ZS,SE.ADT.LITR.FE.ZS,SE.ADT.LITR.MA.ZS,SE.ADT.LITR.ZS,...,SL.UEM.TOTL.MA.NE.ZS,SL.UEM.TOTL.MA.ZS,SL.UEM.TOTL.NE.ZS,SL.UEM.TOTL.ZS,SM.POP.NETM,SM.POP.REFG,SM.POP.REFG.OR,SM.POP.TOTL,SM.POP.TOTL.ZS,area2
0,ABW,Aruba,1970,,,,,,,,...,,,,,,,,7466.0,,Aruba
1,ABW,Aruba,1971,,,,,,,,...,,,,,,,,,,Aruba
2,ABW,Aruba,1972,,,,,,,,...,,,,,-3537.0,,,,,Aruba
3,ABW,Aruba,1973,,,,,,,,...,,,,,,,,,,Aruba
4,ABW,Aruba,1974,,,,,,,,...,,,,,,,,,,Aruba


La variable Mortality de notre base représente un nombre (absolut) de déces; nous voulons la transformer en taux de mortalité pour pouvoir faire une comparaison entre les pays. 

Pour ce faire, il faudra utiliser les variables de population par tranches d'age et par sexe de la base WorldBank, que nous transformerions en variables distinctes.

La mortalité sera donc le nombre de déces divisé par la population. Le taux sera sur 100 000, comme calculé par les organismes de spécialité (<a href='http://www.who.int/mediacentre/factsheets/fs297/en/'>WHO</a>  ,<a href='https://www.cancer.gov/about-cancer/understanding/statistics'> National Cancer Institute USA</a> )

In [14]:
#naming convention = old name without the .5Y at the end and without the SP in the beginning
pop_par_tranche_dage=['POP.0004.','POP.0509.','POP.1014.','POP.1519.','POP.2024.','POP.2529.','POP.3034.','POP.3539.','POP.4044.','POP.4549.','POP.5054.','POP.5559.','POP.6064.','POP.6569.','POP.7074.','POP.7579.','POP.80UP.']
for i in range(0,17):
    df_wb[''+pop_par_tranche_dage[i]+'FE'] =(df_wb['SP.POP.TOTL']*df_wb['SP.POP.TOTL.FE.ZS']*df_wb['SP.'+pop_par_tranche_dage[i]+'FE.5Y'])/10000
    df_wb[''+pop_par_tranche_dage[i]+'MA'] =(df_wb['SP.POP.TOTL']*(100-df_wb['SP.POP.TOTL.FE.ZS'])*df_wb['SP.'+pop_par_tranche_dage[i]+'MA.5Y'])/10000

In [15]:
#extraire les colonnes sur la population de la base WorldBank
df_mini = df_wb[['area','year','POP.0004.FE', 'POP.0509.FE', 'POP.1014.FE', 'POP.1519.FE', 'POP.2024.FE', 'POP.2529.FE', 'POP.3034.FE', 'POP.3539.FE', 'POP.4044.FE', 'POP.4549.FE', 'POP.5054.FE', 'POP.5559.FE', 'POP.6064.FE', 'POP.6569.FE', 'POP.7074.FE', 'POP.7579.FE', 'POP.80UP.FE', 'POP.0004.MA', 'POP.0509.MA', 'POP.1014.MA', 'POP.1519.MA', 'POP.2024.MA', 'POP.2529.MA', 'POP.3034.MA', 'POP.3539.MA', 'POP.4044.MA', 'POP.4549.MA', 'POP.5054.MA', 'POP.5559.MA', 'POP.6064.MA', 'POP.6569.MA', 'POP.7074.MA', 'POP.7579.MA', 'POP.80UP.MA']]
df_mini.head()

Unnamed: 0,area,year,POP.0004.FE,POP.0509.FE,POP.1014.FE,POP.1519.FE,POP.2024.FE,POP.2529.FE,POP.3034.FE,POP.3539.FE,...,POP.3539.MA,POP.4044.MA,POP.4549.MA,POP.5054.MA,POP.5559.MA,POP.6064.MA,POP.6569.MA,POP.7074.MA,POP.7579.MA,POP.80UP.MA
0,Aruba,1970,3434.0,3793.0,3586.0,3365.0,2879.0,2302.0,1946.0,1915.0,...,1544.0,1316.0,1175.0,1056.0,943.0,886.0,524.0,295.0,144.0,76.0
1,Aruba,1971,3218.0,3713.0,3578.0,3365.0,2975.0,2415.0,2003.0,1955.0,...,1602.0,1348.0,1172.0,1051.0,930.0,895.0,579.0,320.0,160.0,85.0
2,Aruba,1972,3109.0,3574.0,3580.0,3343.0,3043.0,2531.0,2076.0,1973.0,...,1655.0,1393.0,1174.0,1044.0,924.0,881.0,640.0,345.0,178.0,95.0
3,Aruba,1973,3075.0,3374.0,3585.0,3311.0,3078.0,2648.0,2160.0,1980.0,...,1704.0,1449.0,1188.0,1033.0,922.0,854.0,700.0,371.0,197.0,107.0
4,Aruba,1974,3045.0,3143.0,3563.0,3289.0,3082.0,2754.0,2250.0,2000.0,...,1755.0,1512.0,1214.0,1025.0,921.0,827.0,743.0,405.0,216.0,121.0


In [16]:
#transformation pour avoir une seule variable Population éclatée par pays, année, age, sex
df_mini.set_index(['area', 'year'], inplace=True)
df_mini.columns = pd.MultiIndex.from_tuples(tuple(df_mini.columns.str.split(".")))

df_pop = df_mini.stack(level = [1,2]).reset_index()
df_pop.columns = ['Country', 'Year', 'Age_group', 'Sex', 'Population']
df_pop.head()

Unnamed: 0,Country,Year,Age_group,Sex,Population
0,Aruba,1970,4,FE,3434.0
1,Aruba,1970,4,MA,3578.0
2,Aruba,1970,509,FE,3793.0
3,Aruba,1970,509,MA,3943.0
4,Aruba,1970,1014,FE,3586.0


In [17]:
#remplacer les codes pour le sex et pour les tranches d'age, pour harmoniser les deux bases
df_pop['Sex'].replace(['MA', 'FE'],[1,2], inplace=True)
df_pop['Age_group'].replace(['0004','0509','1014','1519','2024','2529','3034','3539','4044','4549','5054','5559','6064','6569','7074','7579'],
list(range(1, 17)), inplace=True)

In [18]:
#jointure entre la base Mortality et la base transformée avec la population
df_mort = pd.merge(df_mort, 
              df_pop, 
              how='inner', 
              left_on=['Location', 'Year', 'Age_bucket', 'Sex'], 
              right_on=['Country', 'Year', 'Age_group', 'Sex'])
df_mort.head()

Unnamed: 0,Cancer_code,Sex,Age_bucket,Location,Year,Nb_deaths,Country,Age_group,Population
0,C80,2,1,Jamaica,2000,2,Jamaica,1,139549.999999
1,C56,2,1,Jamaica,2000,0,Jamaica,1,139549.999999
2,C64,2,1,Jamaica,2000,1,Jamaica,1,139549.999999
3,C44,2,1,Jamaica,2000,0,Jamaica,1,139549.999999
4,C84,2,1,Jamaica,2000,0,Jamaica,1,139549.999999


In [19]:
#calculer la mortalité comme taux
df_mort['Mortality_rate'] = df_mort['Nb_deaths'] / df_mort['Population'] * 10**5
df_mort['Mortality_rate'].describe()

count    3.652496e+06
mean     1.098289e+01
std      6.163104e+01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      2.171180e+00
max      8.091666e+03
Name: Mortality_rate, dtype: float64

In [20]:
df_mort.head(3)

Unnamed: 0,Cancer_code,Sex,Age_bucket,Location,Year,Nb_deaths,Country,Age_group,Population,Mortality_rate
0,C80,2,1,Jamaica,2000,2,Jamaica,1,139549.999999,1.433178
1,C56,2,1,Jamaica,2000,0,Jamaica,1,139549.999999,0.0
2,C64,2,1,Jamaica,2000,1,Jamaica,1,139549.999999,0.716589


### Choix des indicateurs socio-économiques
On remarque que beaucoup des indicateurs socio-économiques ont une proportion de valeurs manquantes très importante, donc on souhaite ne conserver que ceux pour lesquels le pourcentage de données renseignées est satisfaisant, par exemple au moins 75% des valeurs remplies. 

In [21]:
cutoff = 0.25 #seuil de pourcentage valeurs manquantes acceptées
nb_na = df_wb.isnull().sum() #calculer nombre de valeurs manquantes par colonne
df_na = pd.DataFrame({'indicator':nb_na.index, 'na_nb':nb_na.values}) 
df_na.sort_values(by=['na_nb'], ascending=False, inplace=True) 
df_na['na_pct'] = df_na['na_nb'] / df_wb.shape[0] #calculer pourcentage de valeurs manquantes

In [22]:
#visualiser indicateurs par ordre croissante de valeurs présentes
df_indic_info = pd.merge(left=df_wb_info, right=df_na, 
                         how = 'left',
                         left_on = 'Code', right_on = 'indicator')
df_indic_info[df_indic_info['na_pct'] < cutoff][['Category','Code', 'Indicator Name', 'Long definition','na_pct']].sort_values(by=['na_pct'], ascending=True)

Unnamed: 0,Category,Code,Indicator Name,Long definition,na_pct
507,Environnement,SP.URB.TOTL.IN.ZS,Urban population (% of total),Urban population refers to people living in ur...,0.004808
504,Environnement,SP.RUR.TOTL.ZS,Rural population (% of total population),Rural population refers to people living in ru...,0.004808
502,Environnement,SP.RUR.TOTL,Rural population,Rural population refers to people living in ru...,0.005065
645,Health,SP.POP.TOTL,"Population, total",Total population is based on the de facto defi...,0.005065
506,Environnement,SP.URB.TOTL,Urban population,Urban population refers to people living in ur...,0.005065
505,Environnement,SP.URB.GROW,Urban population growth (annual %),Urban population refers to people living in ur...,0.005237
644,Health,SP.POP.GROW,Population growth (annual %),Annual population growth rate for year t is th...,0.005409
267,Education,SE.PRM.DURS,"Primary education, duration (years)",Primary duration refers to the number of grade...,0.010903
406,Environnement,AG.SRF.TOTL.K2,Surface area (sq. km),"Surface area is a country's total area, includ...",0.013565
398,Environnement,AG.LND.TOTL.K2,Land area (sq. km),"Land area is a country's total area, excluding...",0.013994


Notre sélection d'indicateurs :

<table><tr><th>Category</th><th>Indicator code</th><th>Indicator Name</th></tr><tr><td>Education</td><td>SE.PRM.DURS</td><td>Primary education, duration (years)</td></tr><tr><td>Education</td><td>SE.PRE.DURS</td><td>Preprimary education, duration (years)</td></tr><tr><td>Education</td><td>SE.SEC.DURS</td><td>Secondary education, duration (years)</td></tr><tr><td>Environnement</td><td>SP.URB.TOTL.IN.ZS</td><td>Urban population (% of total)</td></tr><tr><td>Environnement</td><td>SP.URB.GROW</td><td>Urban population growth (annual %)</td></tr><tr><td>Environnement</td><td>AG.SRF.TOTL.K2</td><td>Surface area (sq. km)</td></tr><tr><td>Environnement</td><td>EN.POP.DNST</td><td>Population density (people per sq. km of land area)</td></tr><tr><td>Environnement</td><td>EN.ATM.NOXE.KT.CE</td><td>Nitrous oxide emissions (thousand metric tons of CO2 equivalent)</td></tr><tr><td>Environnement</td><td>EN.ATM.METH.KT.CE</td><td>Methane emissions (kt of CO2 equivalent)</td></tr><tr><td>Environnement</td><td>AG.LND.AGRI.K2</td><td>Agricultural land (sq. km)</td></tr><tr><td>Environnement</td><td>EN.ATM.GHGO.KT.CE</td><td>Other greenhouse gas emissions, HFC, PFC and SF6 (thousand metric tons of CO2 equivalent)</td></tr><tr><td>Environnement</td><td>AG.LND.AGRI.ZS</td><td>Agricultural land (% of land area)</td></tr><tr><td>Environnement</td><td>AG.PRD.FOOD.XD</td><td>Food production index (2004-2006 = 100)</td></tr><tr><td>Environnement</td><td>AG.PRD.LVSK.XD</td><td>Livestock production index (2004-2006 = 100)</td></tr><tr><td>Environnement</td><td>AG.PRD.CROP.XD</td><td>Crop production index (2004-2006 = 100)</td></tr><tr><td>Environnement</td><td>EN.ATM.GHGT.KT.CE</td><td>Total greenhouse gas emissions (kt of CO2 equivalent)</td></tr><tr><td>Environnement</td><td>EN.ATM.CO2E.GF.ZS</td><td>CO2 emissions from gaseous fuel consumption (% of total)</td></tr><tr><td>Environnement</td><td>EN.ATM.CO2E.LF.ZS</td><td>CO2 emissions from liquid fuel consumption (% of total)</td></tr><tr><td>Environnement</td><td>EN.ATM.CO2E.SF.ZS</td><td>CO2 emissions from solid fuel consumption (% of total)</td></tr><tr><td>Environnement</td><td>NY.GDP.FRST.RT.ZS</td><td>Forest rents (% of GDP)</td></tr><tr><td>Environnement</td><td>NY.GDP.MINR.RT.ZS</td><td>Mineral rents (% of GDP)</td></tr><tr><td>Environnement</td><td>EN.ATM.CO2E.LF.KT</td><td>CO2 emissions from liquid fuel consumption (kt)</td></tr><tr><td>Environnement</td><td>EN.ATM.METH.AG.KT.CE</td><td>Agricultural methane emissions (thousand metric tons of CO2 equivalent)</td></tr><tr><td>Environnement</td><td>EN.ATM.METH.EG.KT.CE</td><td>Methane emissions in energy sector (thousand metric tons of CO2 equivalent)</td></tr><tr><td>Environnement</td><td>EN.ATM.NOXE.AG.KT.CE</td><td>Agricultural nitrous oxide emissions (thousand metric tons of CO2 equivalent)</td></tr><tr><td>Environnement</td><td>EN.ATM.NOXE.EG.KT.CE</td><td>Nitrous oxide emissions in energy sector (thousand metric tons of CO2 equivalent)</td></tr><tr><td>Environnement</td><td>NY.GDP.TOTL.RT.ZS</td><td>Total natural resources rents (% of GDP)</td></tr><tr><td>Environnement</td><td>EN.ATM.CO2E.KT</td><td>CO2 emissions (kt)</td></tr><tr><td>Environnement</td><td>EN.ATM.CO2E.PC</td><td>CO2 emissions (metric tons per capita)</td></tr><tr><td>Health</td><td>SP.POP.GROW</td><td>Population growth (annual %)</td></tr><tr><td>Health</td><td>SP.DYN.CBRT.IN</td><td>Birth rate, crude (per 1,000 people)</td></tr><tr><td>Health</td><td>SP.DYN.CDRT.IN</td><td>Death rate, crude (per 1,000 people)</td></tr><tr><td>Health</td><td>SP.DYN.TFRT.IN</td><td>Fertility rate, total (births per woman)</td></tr><tr><td>Health</td><td>SP.ADO.TFRT</td><td>Adolescent fertility rate (births per 1,000 women ages 15-19)</td></tr><tr><td>Health</td><td>SP.DYN.LE00.FE.IN</td><td>Life expectancy at birth, female (years)</td></tr><tr><td>Health</td><td>SP.DYN.LE00.IN</td><td>Life expectancy at birth, total (years)</td></tr><tr><td>Health</td><td>SP.POP.DPND</td><td>Age dependency ratio (% of working-age population)</td></tr><tr><td>Health</td><td>SP.DYN.AMRT.FE</td><td>Mortality rate, adult, female (per 1,000 female adults)</td></tr><tr><td>Health</td><td>SP.DYN.AMRT.MA</td><td>Mortality rate, adult, male (per 1,000 male adults)</td></tr><tr><td>Infrastructure</td><td>IT.CEL.SETS</td><td>Mobile cellular subscriptions</td></tr><tr><td>Infrastructure</td><td>IT.MLT.MAIN</td><td>Fixed telephone subscriptions</td></tr><tr><td>National_accounts</td><td>NY.ADJ.AEDU.GN.ZS</td><td>Adjusted savings: education expenditure (% of GNI)</td></tr><tr><td>National_accounts</td><td>NY.GDP.MKTP.CD</td><td>GDP (current USD)</td></tr><tr><td>National_accounts</td><td>NY.GDP.PCAP.CD</td><td>GDP per capita (current USD)</td></tr>
</table>

## Remplacement des valeurs manquantes
Pour les valeurs manquantes restantes de nos indicateurs socio-économiques séléctionnés, on les remplacera avec les valeurs des années disponibles.

In [23]:
#séléction indicateurs
indicators = ['area','year','SP.URB.TOTL.IN.ZS',	'SP.URB.GROW',	'SP.POP.GROW',	'SE.PRM.DURS',	'AG.SRF.TOTL.K2',	'SE.PRE.DURS',	'EN.POP.DNST',	'SE.SEC.DURS',	'SP.DYN.CBRT.IN',	'SP.DYN.CDRT.IN',	'NY.ADJ.AEDU.GN.ZS',	'SP.DYN.TFRT.IN',	'SP.ADO.TFRT',	'SP.DYN.LE00.FE.IN',	'SP.DYN.LE00.IN',	'SP.POP.DPND',	'SP.DYN.AMRT.FE',	'SP.DYN.AMRT.MA',	'EN.ATM.NOXE.KT.CE',	'EN.ATM.METH.KT.CE',	'AG.LND.AGRI.K2',	'EN.ATM.GHGO.KT.CE',	'AG.LND.AGRI.ZS',	'AG.PRD.FOOD.XD',	'IT.CEL.SETS',	'AG.PRD.LVSK.XD',	'AG.PRD.CROP.XD',	'IT.MLT.MAIN',	'EN.ATM.GHGT.KT.CE',	'EN.ATM.CO2E.GF.ZS',	'EN.ATM.CO2E.LF.ZS',	'EN.ATM.CO2E.SF.ZS',	'NY.GDP.FRST.RT.ZS',	'NY.GDP.MINR.RT.ZS',	'EN.ATM.CO2E.LF.KT',	'EN.ATM.METH.AG.KT.CE',	'EN.ATM.METH.EG.KT.CE',	'EN.ATM.NOXE.AG.KT.CE',	'EN.ATM.NOXE.EG.KT.CE',	'NY.GDP.TOTL.RT.ZS',	'EN.ATM.CO2E.KT',	'EN.ATM.CO2E.PC',	'NY.GDP.MKTP.CD',	'NY.GDP.PCAP.CD']
df_wb = df_wb.loc[:,indicators]

#remplacement valeurs manquantes
df_wb=df_wb.groupby('area').apply(lambda x: (x.sort_values('year', ascending=True)))
df_wb.fillna(method='ffill',inplace=True)
df_wb.fillna(method='bfill',inplace=True)
df_wb.reset_index(drop=True,inplace=True)
df_wb.isnull().values.sum() #vérifier qu'il n'y a plus de valeurs manquantes

0

Enfin, on fait une jointure entre la base Mortality (qui contient maintenant la mortalité comme taux) et la base WorldBank avec les indicateurs socio-économiques

In [24]:
#jointure base de données complète avec toutes les années, tous les pays, tous les types de cancer
df=pd.merge(df_mort, 
              df_wb, 
              how='inner',
              left_on=['Location','Year'], 
         right_on=['area','year'])

#df.to_csv(path_or_buf = 'data_mort_wb.csv', sep=',', na_rep='')
df.head()

Unnamed: 0,Cancer_code,Sex,Age_bucket,Location,Year,Nb_deaths,Country,Age_group,Population,Mortality_rate,...,EN.ATM.CO2E.LF.KT,EN.ATM.METH.AG.KT.CE,EN.ATM.METH.EG.KT.CE,EN.ATM.NOXE.AG.KT.CE,EN.ATM.NOXE.EG.KT.CE,NY.GDP.TOTL.RT.ZS,EN.ATM.CO2E.KT,EN.ATM.CO2E.PC,NY.GDP.MKTP.CD,NY.GDP.PCAP.CD
0,C80,2,1,Jamaica,2000,2,Jamaica,1,139549.999999,1.433178,...,9856.896,633.005651,267.408083,361.597144,52.552558,1.762288,10307.937,3.964446,8929376000.0,3434.249741
1,C56,2,1,Jamaica,2000,0,Jamaica,1,139549.999999,0.0,...,9856.896,633.005651,267.408083,361.597144,52.552558,1.762288,10307.937,3.964446,8929376000.0,3434.249741
2,C64,2,1,Jamaica,2000,1,Jamaica,1,139549.999999,0.716589,...,9856.896,633.005651,267.408083,361.597144,52.552558,1.762288,10307.937,3.964446,8929376000.0,3434.249741
3,C44,2,1,Jamaica,2000,0,Jamaica,1,139549.999999,0.0,...,9856.896,633.005651,267.408083,361.597144,52.552558,1.762288,10307.937,3.964446,8929376000.0,3434.249741
4,C84,2,1,Jamaica,2000,0,Jamaica,1,139549.999999,0.0,...,9856.896,633.005651,267.408083,361.597144,52.552558,1.762288,10307.937,3.964446,8929376000.0,3434.249741


In [34]:
#vérification
print("Base de données complète : ")
print('- années incluses : ', np.sort(df.Year.unique()), '\n =>nb années :', len(df.Year.unique()))
print()
print('- pays inclus :', np.sort(df.Country.unique()), '\n =>nb pays :', len(df.Country.unique()) )
print()
print('- types de cancer inclus :', np.sort(df.Cancer_code.unique()), '\n =>nb types cancer :', len(df.Cancer_code.unique()) )

Base de données complète : 
- années incluses :  [1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984
 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999
 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
 2015] 
 =>nb années : 46

- pays inclus : ['Albania' 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Aruba' 'Australia'
 'Austria' 'Azerbaijan' 'Bahrain' 'Barbados' 'Belarus' 'Belgium' 'Belize'
 'Bolivia' 'Bosnia and Herzegovina' 'Brazil' 'Brunei Darussalam' 'Bulgaria'
 'Canada' 'Chile' 'Colombia' 'Costa Rica' 'Croatia' 'Cuba' 'Cyprus'
 'Czech Republic' 'Denmark' 'Dominican Republic' 'Ecuador' 'El Salvador'
 'Estonia' 'Fiji' 'Finland' 'France' 'Georgia' 'Germany' 'Greece' 'Grenada'
 'Guatemala' 'Guyana' 'Haiti' 'Honduras' 'Hungary' 'Iceland' 'Iraq'
 'Ireland' 'Israel' 'Italy' 'Jamaica' 'Japan' 'Jordan' 'Kazakhstan'
 'Kiribati' 'Kuwait' 'Latvia' 'Lithuania' 'Luxembourg' 'Malaysia'
 'Maldives' 'Malta' 'Mauritius' 'Mex

Pour nos modélisations, nous travaillerons sur une base restreinte au cancer des poumons à partir de l'an 2000, pour tous les pays.

In [39]:
df_sub = df[(df['Year'] >= 2000) & (df['Cancer_code']=='C33,C34') ].copy()
df_sub.drop(['Cancer_code','Location','Nb_deaths','Age_group','Population','area','year'],inplace=True,axis=1)

#df_sub.to_csv(path_or_buf = 'data_mort_wb_sub.csv', sep=',', na_rep='')
df_sub.head()

Unnamed: 0,Sex,Age_bucket,Year,Country,Mortality_rate,SP.URB.TOTL.IN.ZS,SP.URB.GROW,SP.POP.GROW,SE.PRM.DURS,AG.SRF.TOTL.K2,...,EN.ATM.CO2E.LF.KT,EN.ATM.METH.AG.KT.CE,EN.ATM.METH.EG.KT.CE,EN.ATM.NOXE.AG.KT.CE,EN.ATM.NOXE.EG.KT.CE,NY.GDP.TOTL.RT.ZS,EN.ATM.CO2E.KT,EN.ATM.CO2E.PC,NY.GDP.MKTP.CD,NY.GDP.PCAP.CD
40,2,1,2000,Jamaica,0.0,51.814,1.24118,0.776905,6.0,10990.0,...,9856.896,633.005651,267.408083,361.597144,52.552558,1.762288,10307.937,3.964446,8929376000.0,3434.249741
85,2,2,2000,Jamaica,0.0,51.814,1.24118,0.776905,6.0,10990.0,...,9856.896,633.005651,267.408083,361.597144,52.552558,1.762288,10307.937,3.964446,8929376000.0,3434.249741
130,2,3,2000,Jamaica,0.0,51.814,1.24118,0.776905,6.0,10990.0,...,9856.896,633.005651,267.408083,361.597144,52.552558,1.762288,10307.937,3.964446,8929376000.0,3434.249741
175,2,4,2000,Jamaica,0.0,51.814,1.24118,0.776905,6.0,10990.0,...,9856.896,633.005651,267.408083,361.597144,52.552558,1.762288,10307.937,3.964446,8929376000.0,3434.249741
220,2,5,2000,Jamaica,0.0,51.814,1.24118,0.776905,6.0,10990.0,...,9856.896,633.005651,267.408083,361.597144,52.552558,1.762288,10307.937,3.964446,8929376000.0,3434.249741


In [41]:
#vérification
print("Base de données à modéliser : ")
print('- années incluses : ', np.sort(df_sub.Year.unique()), '\n =>nb années :', len(df_sub.Year.unique()))
print()
print('- pays inclus :', np.sort(df_sub.Country.unique()), '\n =>nb pays :', len(df_sub.Country.unique()) )
print()
print('- 1 seul type de cancer inclus: C33, 34')

Base de données à modéliser : 
- années incluses :  [2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
 2015] 
 =>nb années : 16

- pays inclus : ['Antigua and Barbuda' 'Argentina' 'Armenia' 'Aruba' 'Australia' 'Austria'
 'Azerbaijan' 'Bahrain' 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Bolivia'
 'Bosnia and Herzegovina' 'Brazil' 'Brunei Darussalam' 'Bulgaria' 'Canada'
 'Chile' 'Colombia' 'Costa Rica' 'Croatia' 'Cuba' 'Cyprus' 'Czech Republic'
 'Denmark' 'Dominican Republic' 'Ecuador' 'El Salvador' 'Estonia' 'Fiji'
 'Finland' 'France' 'Georgia' 'Germany' 'Greece' 'Grenada' 'Guatemala'
 'Guyana' 'Haiti' 'Honduras' 'Hungary' 'Iceland' 'Iraq' 'Ireland' 'Israel'
 'Italy' 'Jamaica' 'Japan' 'Jordan' 'Kazakhstan' 'Kiribati' 'Kuwait'
 'Latvia' 'Lithuania' 'Luxembourg' 'Malaysia' 'Maldives' 'Malta'
 'Mauritius' 'Mexico' 'Montenegro' 'Morocco' 'Netherlands' 'New Zealand'
 'Nicaragua' 'Norway' 'Oman' 'Panama' 'Paraguay' 'Peru' 'Philippines'
 'Poland' 'Portugal' 'Puerto Rico

In [36]:
#base de données à prédire : tous les pays de WorldBank, y compris ceux qui ne sont pas présents dans Mortality, pour lesquels on voudrait prédire le taux
df_pred =pd.merge(df_pop[df_pop['Year'] >= 2000],
                  df_wb[df_wb['year'] >= 2000],         
                  how='left',
                  right_on=['area','year'], 
                  left_on=['Country','Year'])
df_pred[df_pred['Age_group'] !='80UP']
#df_pred.to_csv(path_or_buf = 'data_pred_wb.csv', sep=',', na_rep='', encoding='utf-8')
df_pred.tail(20)

Unnamed: 0,Country,Year,Age_group,Sex,Population,area,year,SP.URB.TOTL.IN.ZS,SP.URB.GROW,SP.POP.GROW,...,EN.ATM.CO2E.LF.KT,EN.ATM.METH.AG.KT.CE,EN.ATM.METH.EG.KT.CE,EN.ATM.NOXE.AG.KT.CE,EN.ATM.NOXE.EG.KT.CE,NY.GDP.TOTL.RT.ZS,EN.ATM.CO2E.KT,EN.ATM.CO2E.PC,NY.GDP.MKTP.CD,NY.GDP.PCAP.CD
103816,Zimbabwe,2015,8,2,460546.0,Zimbabwe,2015,32.376,1.928627,2.313958,...,4085.038,5905.300852,1030.77915,3739.774683,222.571847,8.691455,13780.586,0.92499,14419190000.0,924.143819
103817,Zimbabwe,2015,8,1,437012.0,Zimbabwe,2015,32.376,1.928627,2.313958,...,4085.038,5905.300852,1030.77915,3739.774683,222.571847,8.691455,13780.586,0.92499,14419190000.0,924.143819
103818,Zimbabwe,2015,9,2,284830.0,Zimbabwe,2015,32.376,1.928627,2.313958,...,4085.038,5905.300852,1030.77915,3739.774683,222.571847,8.691455,13780.586,0.92499,14419190000.0,924.143819
103819,Zimbabwe,2015,9,1,288738.0,Zimbabwe,2015,32.376,1.928627,2.313958,...,4085.038,5905.300852,1030.77915,3739.774683,222.571847,8.691455,13780.586,0.92499,14419190000.0,924.143819
103820,Zimbabwe,2015,10,2,194583.0,Zimbabwe,2015,32.376,1.928627,2.313958,...,4085.038,5905.300852,1030.77915,3739.774683,222.571847,8.691455,13780.586,0.92499,14419190000.0,924.143819
103821,Zimbabwe,2015,10,1,195122.0,Zimbabwe,2015,32.376,1.928627,2.313958,...,4085.038,5905.300852,1030.77915,3739.774683,222.571847,8.691455,13780.586,0.92499,14419190000.0,924.143819
103822,Zimbabwe,2015,11,2,160338.0,Zimbabwe,2015,32.376,1.928627,2.313958,...,4085.038,5905.300852,1030.77915,3739.774683,222.571847,8.691455,13780.586,0.92499,14419190000.0,924.143819
103823,Zimbabwe,2015,11,1,148814.0,Zimbabwe,2015,32.376,1.928627,2.313958,...,4085.038,5905.300852,1030.77915,3739.774683,222.571847,8.691455,13780.586,0.92499,14419190000.0,924.143819
103824,Zimbabwe,2015,12,2,139829.0,Zimbabwe,2015,32.376,1.928627,2.313958,...,4085.038,5905.300852,1030.77915,3739.774683,222.571847,8.691455,13780.586,0.92499,14419190000.0,924.143819
103825,Zimbabwe,2015,12,1,121455.0,Zimbabwe,2015,32.376,1.928627,2.313958,...,4085.038,5905.300852,1030.77915,3739.774683,222.571847,8.691455,13780.586,0.92499,14419190000.0,924.143819


In [38]:
#vérification
print("Base de données à prédire : ")
print('- années incluses : ', np.sort(df_pred.Year.unique()), '\n =>nb années :', len(df_pred.Year.unique()))
print()
print('- pays inclus :', np.sort(df_pred.Country.unique()), '\n =>nb pays :', len(df_pred.Country.unique()) )
print()
print('- 1 seul type de cancer inclus: C33, 34')

Base de données à prédire : 
- années incluses :  [2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
 2015] 
 =>nb années : 16

- pays inclus : ['Afghanistan' 'Albania' 'Algeria' 'Angola' 'Antigua and Barbuda'
 'Argentina' 'Armenia' 'Aruba' 'Australia' 'Austria' 'Azerbaijan'
 'Bahamas, The' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus' 'Belgium'
 'Belize' 'Benin' 'Bhutan' 'Bolivia' 'Bosnia and Herzegovina' 'Botswana'
 'Brazil' 'Brunei Darussalam' 'Bulgaria' 'Burkina Faso' 'Burundi'
 'Cabo Verde' 'Cambodia' 'Cameroon' 'Canada' 'Central African Republic'
 'Chad' 'Chile' 'China' 'Colombia' 'Comoros' 'Congo, Dem. Rep.'
 'Congo, Rep.' 'Costa Rica' "Cote d'Ivoire" 'Croatia' 'Cuba' 'Curacao'
 'Cyprus' 'Czech Republic' 'Denmark' 'Djibouti' 'Dominican Republic'
 'Ecuador' 'Egypt, Arab Rep.' 'El Salvador' 'Equatorial Guinea' 'Eritrea'
 'Estonia' 'Ethiopia' 'Fiji' 'Finland' 'France' 'French Polynesia' 'Gabon'
 'Gambia, The' 'Georgia' 'Germany' 'Ghana' 'Greece' 'Grenada' 'Gu

In [25]:
#base de données choix cancer
df_sub_choix = df[df['Year'] >= 2000].copy()
df_sub_choix.drop(['Location','Nb_deaths','Age_group','Population','area','year'],inplace=True,axis=1)

df_sub_choix.to_csv(path_or_buf = 'df_sub_choix.csv', sep=',', na_rep='')
df_sub_choix.head()

Unnamed: 0,Cancer_code,Sex,Age_bucket,Year,Country,Mortality_rate,SP.URB.TOTL.IN.ZS,SP.URB.GROW,SP.POP.GROW,SE.PRM.DURS,...,EN.ATM.CO2E.LF.KT,EN.ATM.METH.AG.KT.CE,EN.ATM.METH.EG.KT.CE,EN.ATM.NOXE.AG.KT.CE,EN.ATM.NOXE.EG.KT.CE,NY.GDP.TOTL.RT.ZS,EN.ATM.CO2E.KT,EN.ATM.CO2E.PC,NY.GDP.MKTP.CD,NY.GDP.PCAP.CD
0,C80,2,1,2000,Jamaica,1.433178,51.814,1.24118,0.776905,6.0,...,9856.896,633.005651,267.408083,361.597144,52.552558,1.762288,10307.937,3.964446,8929376000.0,3434.249741
1,C56,2,1,2000,Jamaica,0.0,51.814,1.24118,0.776905,6.0,...,9856.896,633.005651,267.408083,361.597144,52.552558,1.762288,10307.937,3.964446,8929376000.0,3434.249741
2,C64,2,1,2000,Jamaica,0.716589,51.814,1.24118,0.776905,6.0,...,9856.896,633.005651,267.408083,361.597144,52.552558,1.762288,10307.937,3.964446,8929376000.0,3434.249741
3,C44,2,1,2000,Jamaica,0.0,51.814,1.24118,0.776905,6.0,...,9856.896,633.005651,267.408083,361.597144,52.552558,1.762288,10307.937,3.964446,8929376000.0,3434.249741
4,C84,2,1,2000,Jamaica,0.0,51.814,1.24118,0.776905,6.0,...,9856.896,633.005651,267.408083,361.597144,52.552558,1.762288,10307.937,3.964446,8929376000.0,3434.249741


In [27]:
#vérification
print("Base de données choix cancer : ")
print('- années incluses : ', np.sort(df_sub_choix.Year.unique()), '\n =>nb années :', len(df_sub_choix.Year.unique()))
print()
print('- pays inclus :', np.sort(df_sub_choix.Country.unique()), '\n =>nb pays :', len(df_sub_choix.Country.unique()) )
print()
print('- types de cancer inclus :', np.sort(df_sub_choix.Cancer_code.unique()), '\n =>nb types cancer :', len(df_sub_choix.Cancer_code.unique()) )

Base de données choix cancer : 
- années incluses :  [2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
 2015] 
 =>nb années : 16

- pays inclus : ['Albania' 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Aruba' 'Australia'
 'Austria' 'Azerbaijan' 'Bahrain' 'Barbados' 'Belarus' 'Belgium' 'Belize'
 'Bolivia' 'Bosnia and Herzegovina' 'Brazil' 'Brunei Darussalam' 'Bulgaria'
 'Canada' 'Chile' 'Colombia' 'Costa Rica' 'Croatia' 'Cuba' 'Cyprus'
 'Czech Republic' 'Denmark' 'Dominican Republic' 'Ecuador' 'El Salvador'
 'Estonia' 'Fiji' 'Finland' 'France' 'Georgia' 'Germany' 'Greece' 'Grenada'
 'Guatemala' 'Guyana' 'Haiti' 'Honduras' 'Hungary' 'Iceland' 'Iraq'
 'Ireland' 'Israel' 'Italy' 'Jamaica' 'Japan' 'Jordan' 'Kazakhstan'
 'Kiribati' 'Kuwait' 'Latvia' 'Lithuania' 'Luxembourg' 'Malaysia'
 'Maldives' 'Malta' 'Mauritius' 'Mexico' 'Montenegro' 'Morocco'
 'Netherlands' 'New Zealand' 'Nicaragua' 'Norway' 'Oman' 'Panama'
 'Paraguay' 'Peru' 'Philippines' 'Poland' 'Portugal' '

In [None]:
#### pq le fait de restraindre le nombre d'années donne moins de pays ??? si on avait tout bien rempli, on aurait eu toutes les données pour 2015 remplies avec des années précédentes
###=> à revoir remplacement valeurs manquantes !!!

In [32]:
df_mort.head()

Unnamed: 0,Cancer_code,Sex,Age_bucket,Location,Year,Nb_deaths,Country,Age_group,Population,Mortality_rate
0,C80,2,1,Jamaica,2000,2,Jamaica,1,139549.999999,1.433178
1,C56,2,1,Jamaica,2000,0,Jamaica,1,139549.999999,0.0
2,C64,2,1,Jamaica,2000,1,Jamaica,1,139549.999999,0.716589
3,C44,2,1,Jamaica,2000,0,Jamaica,1,139549.999999,0.0
4,C84,2,1,Jamaica,2000,0,Jamaica,1,139549.999999,0.0


In [38]:
#fréquence C14 vs C33,34
print(len(df_mort[df['Cancer_code']=='C14']))
#df_mort.groupby("Cancer_code")['Mortality_rate'].count().sort_values(ascending=False)
df_mort.groupby("Cancer_code")['Nb_deaths'].count().sort_values(ascending=False)


60576


Cancer_code
C16                        102976
C15                        101712
C32                         99216
C19-C21                     97744
C67                         85584
C18                         84656
C22                         82704
C50                         78432
C17                         77008
C25                         74304
C43                         72128
C71                         70112
C80                         67920
C76                         65328
C73                         64768
C26                         63536
C14                         60576
C33, C34                    60224
C91, C92, C93, C94, C95     60128
C69                         58768
C81                         56320
C44                         56176
C40, C41, C50               53728
C40, C41                    53376
C81-C96                     52288
C53                         51952
C61                         51952
C38                         50736
C30,C31                     49680
C5

<h1><center>Diagramme traitements :</center></h1>

![Schéma de ce programme de traitement des bases de données :](Diagramme traitement base.png)

In [40]:
#vérification intégrité base Mortality
df_mort.head()

Unnamed: 0,Cancer_code,Sex,Age_bucket,Location,Year,Nb_deaths,Country,Age_group,Population,Mortality_rate
0,C80,2,1,Jamaica,2000,2,Jamaica,1,139549.999999,1.433178
1,C56,2,1,Jamaica,2000,0,Jamaica,1,139549.999999,0.0
2,C64,2,1,Jamaica,2000,1,Jamaica,1,139549.999999,0.716589
3,C44,2,1,Jamaica,2000,0,Jamaica,1,139549.999999,0.0
4,C84,2,1,Jamaica,2000,0,Jamaica,1,139549.999999,0.0


In [47]:
df_mort.groupby("Country")['Year'].nunique()

Country
Albania                   22
Antigua and Barbuda       36
Argentina                 39
Armenia                   30
Aruba                     19
Australia                 44
Austria                   45
Azerbaijan                23
Bahrain                   21
Barbados                  40
Belarus                   27
Belgium                   45
Belize                    42
Bolivia                    4
Bosnia and Herzegovina     9
Brazil                    38
Brunei Darussalam         19
Bulgaria                  44
Canada                    43
Chile                     45
Colombia                  37
Costa Rica                45
Croatia                   31
Cuba                      45
Cyprus                    13
Czech Republic            30
Denmark                   45
Dominican Republic        42
Ecuador                   44
El Salvador               32
                          ..
Poland                    43
Portugal                  44
Puerto Rico               44
Qatar 

In [48]:
df_mort.groupby("Year")['Country'].nunique()

Year
1970    51
1971    51
1972    51
1973    51
1974    52
1975    54
1976    52
1977    60
1978    55
1979    49
1980    57
1981    71
1982    66
1983    56
1984    59
1985    71
1986    71
1987    77
1988    74
1989    74
1990    76
1991    74
1992    72
1993    70
1994    75
1995    78
1996    76
1997    78
1998    79
1999    84
2000    85
2001    91
2002    89
2003    89
2004    86
2005    83
2006    84
2007    85
2008    88
2009    91
2010    87
2011    84
2012    81
2013    73
2014    63
2015    14
Name: Country, dtype: int64