In [12]:
import numpy as np
import pandas as pd
import warnings

# Ignorer les warnings
warnings.filterwarnings('ignore')

In [13]:
# Load the collected records; index_col=None means no CSV column is used as the
# index, so the frame gets a default integer index.
df = pd.read_csv(
    filepath_or_buffer='energy_antananarivo_collected.csv',
    index_col=None,
)
df

Unnamed: 0,district,nb_dmd,energy,date
0,Andramasina,28,0.000104,1980-01-01
1,Antananarivo-Avaradrano,18,0.000067,1980-01-01
2,Antananarivo-Atsimondrano,4,0.000015,1980-01-01
3,Manjakandriana,-3,-0.000011,1980-01-01
4,Ambohidratrimo,10,0.000037,1980-01-01
...,...,...,...,...
3691,Antananarivo-Atsimondrano,432,0.003384,2023-12-01
3692,Manjakandriana,448,0.003509,2023-12-01
3693,Ambohidratrimo,426,0.003337,2023-12-01
3694,Anjozorobe,443,0.003470,2023-12-01


In [14]:
# Summarise the frame: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3696 entries, 0 to 3695
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   district  3696 non-null   object 
 1   nb_dmd    3696 non-null   int64  
 2   energy    3696 non-null   float64
 3   date      3696 non-null   object 
dtypes: float64(1), int64(1), object(2)
memory usage: 115.6+ KB


Vérifions s'il existe des valeurs manquantes dans le DataFrame.

In [15]:
# Count the missing entries in each column.
df.isna().sum()

district    0
nb_dmd      0
energy      0
date        0
dtype: int64

On voit qu'il n'y a pas de valeurs manquantes.  

Pour obtenir un dataset sous forme de série temporelle, il faut que l'index soit de type Datetime. Étant donné la colonne date, on va définir cette dernière comme index.

In [16]:
# Collapse the rows into one row per date, summing 'nb_dmd' into 'tot_nb_dmd'
# and 'energy' into 'tot_energy'. as_index=False keeps 'date' as a regular column.
df = df.groupby('date', as_index=False).agg(
    tot_nb_dmd=('nb_dmd', 'sum'),
    tot_energy=('energy', 'sum'),
)
df

Unnamed: 0,date,tot_nb_dmd,tot_energy
0,1980-01-01,93,0.000346
1,1980-02-01,141,0.000525
2,1980-03-01,114,0.000425
3,1980-04-01,140,0.000522
4,1980-05-01,31,0.000115
...,...,...,...
523,2023-08-01,2877,0.022536
524,2023-09-01,2836,0.022215
525,2023-10-01,2920,0.022873
526,2023-11-01,2962,0.023202


In [17]:
# Turn the 'date' column into a DatetimeIndex so the frame can be used as a
# time series.
# The guard keeps this cell re-runnable: once 'date' has moved to the index
# there is no column left to convert, and the earlier in-place version raised
# KeyError on a second execution.
if 'date' in df.columns:
    # Convert the column to datetime first, then promote it to the index;
    # both steps return new frames instead of mutating df in place.
    df = df.assign(date=pd.to_datetime(df['date'])).set_index('date')
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 528 entries, 1980-01-01 to 2023-12-01
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   tot_nb_dmd  528 non-null    int64  
 1   tot_energy  528 non-null    float64
dtypes: float64(1), int64(1)
memory usage: 12.4 KB


In [18]:
# Show the first 12 rows of the frame.
df.head(12)

Unnamed: 0_level_0,tot_nb_dmd,tot_energy
date,Unnamed: 1_level_1,Unnamed: 2_level_1
1980-01-01,93,0.000346
1980-02-01,141,0.000525
1980-03-01,114,0.000425
1980-04-01,140,0.000522
1980-05-01,31,0.000115
1980-06-01,-12,-4.5e-05
1980-07-01,-96,-0.000358
1980-08-01,-157,-0.000585
1980-09-01,-113,-0.000421
1980-10-01,-114,-0.000425


In [19]:
# Drop every observation that contains a negative value. Both aggregated
# columns are checked, so a row is kept only when 'tot_nb_dmd' and
# 'tot_energy' are both >= 0 (the earlier filter looked at 'tot_nb_dmd' only).
# .copy() detaches the result so later assignments do not act on a slice.
# NOTE(review): removing rows leaves gaps in the date index; confirm that
# downstream time-series code tolerates the missing dates.
df = df.loc[(df[['tot_nb_dmd', 'tot_energy']] >= 0).all(axis=1)].copy()
df

Unnamed: 0_level_0,tot_nb_dmd,tot_energy
date,Unnamed: 1_level_1,Unnamed: 2_level_1
1980-01-01,93,0.000346
1980-02-01,141,0.000525
1980-03-01,114,0.000425
1980-04-01,140,0.000522
1980-05-01,31,0.000115
...,...,...
2023-08-01,2877,0.022536
2023-09-01,2836,0.022215
2023-10-01,2920,0.022873
2023-11-01,2962,0.023202


In [20]:
# Write the frame to disk; index=True includes the index as the first CSV column.
df.to_csv(path_or_buf='energy_antananarivo_cleaned.csv', index=True)