In [34]:
import numpy as np
import pandas as pd
import random, sys

from datetime import datetime, timedelta, date
from random import randrange
from dateutil.relativedelta import relativedelta

import warnings

# Ignorer les warnings
warnings.filterwarnings('ignore')

# Définition de la colonne Date comme index
# df.set_index('date', inplace=True)

In [37]:
# Load the collected dataset from a CSV file given by a relative path.
df = pd.read_csv('energy_antananarivo_collected.csv')
# Bare last expression: the notebook renders the frame as the cell output.
df

Unnamed: 0,district,nb_dmd,energy,date
0,Andramasina,215,0.011680,2000-01-01
1,Antananarivo-Avaradrano,223,0.012114,2000-01-01
2,Antananarivo-Atsimondrano,208,0.011299,2000-01-01
3,Manjakandriana,218,0.011843,2000-01-01
4,Ambohidratrimo,224,0.012169,2000-01-01
...,...,...,...,...
2011,Antananarivo-Atsimondrano,424,0.081443,2023-12-01
2012,Manjakandriana,445,0.085477,2023-12-01
2013,Ambohidratrimo,427,0.082020,2023-12-01
2014,Anjozorobe,425,0.081635,2023-12-01


In [38]:
# Print the column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2016 entries, 0 to 2015
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   district  2016 non-null   object 
 1   nb_dmd    2016 non-null   int64  
 2   energy    2016 non-null   float64
 3   date      2016 non-null   object 
dtypes: float64(1), int64(1), object(2)
memory usage: 63.1+ KB


Vérifions s'il existe des valeurs manquantes dans le DataFrame.

In [39]:
# Count missing values per column: isna() flags them, sum() totals each column.
df.isna().sum()

district    0
nb_dmd      0
energy      0
date        0
dtype: int64

On voit qu'il n'y a pas de valeurs manquantes.  

Pour obtenir un dataset sous forme de série temporelle, il faut que l'index soit de type Datetime. Puisque l'on dispose de la colonne date, on va définir cette dernière comme index, après avoir agrégé les données par date.

In [40]:
# Collapse the rows to one per date, keeping the sum of 'nb_dmd' and the sum of
# 'energy'; reset_index() turns the group key back into a regular 'date' column.
df = (
    df.groupby(['date'])
    .agg(
        tot_nb_dmd=pd.NamedAgg(column='nb_dmd', aggfunc='sum'),
        tot_energy=pd.NamedAgg(column='energy', aggfunc='sum'),
    )
    .reset_index()
)
df

Unnamed: 0,date,tot_nb_dmd,tot_energy
0,2000-01-01,1527,0.082952
1,2000-02-01,1488,0.072664
2,2000-03-01,1511,0.062455
3,2000-04-01,1550,0.052442
4,2000-05-01,1501,0.042543
...,...,...,...
283,2023-08-01,2872,0.487043
284,2023-09-01,2814,0.498312
285,2023-10-01,2918,0.538614
286,2023-11-01,2903,0.551784


In [41]:
# Convert the 'date' column to datetime and promote it to the index.
# The guard keeps the cell re-runnable: once 'date' has become the index it is no
# longer a column, and repeating the conversion would raise KeyError: 'date'.
# assign()/set_index() return new frames, so no in-place mutation is needed.
if 'date' in df.columns:
    df = df.assign(date=pd.to_datetime(df['date'])).set_index('date')
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 288 entries, 2000-01-01 to 2023-12-01
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   tot_nb_dmd  288 non-null    int64  
 1   tot_energy  288 non-null    float64
dtypes: float64(1), int64(1)
memory usage: 6.8 KB


In [42]:
# Display the aggregated frame, which is now indexed by 'date'.
df

Unnamed: 0_level_0,tot_nb_dmd,tot_energy
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2000-01-01,1527,0.082952
2000-02-01,1488,0.072664
2000-03-01,1511,0.062455
2000-04-01,1550,0.052442
2000-05-01,1501,0.042543
...,...,...
2023-08-01,2872,0.487043
2023-09-01,2814,0.498312
2023-10-01,2918,0.538614
2023-11-01,2903,0.551784


In [43]:
# Persist the result; index=True writes the frame's index as the first CSV column.
df.to_csv('energy_antananarivo_cleaned.csv', index=True)