In [19]:
import numpy as np
import pandas as pd
import random, sys

from datetime import datetime, timedelta, date
from random import randrange
from dateutil.relativedelta import relativedelta

import warnings

# Ignorer les warnings
warnings.filterwarnings('ignore')

In [20]:
# Load the collected per-district records; no column is used as the index,
# so pandas assigns its default integer index.
df = pd.read_csv('energy_antananarivo_collected.csv')
df

Unnamed: 0,district,nb_dmd,energy,date
0,Andramasina,209,0.000864,2000-01-01
1,Antananarivo-Avaradrano,228,0.000942,2000-01-01
2,Antananarivo-Atsimondrano,206,0.000851,2000-01-01
3,Manjakandriana,200,0.000827,2000-01-01
4,Ambohidratrimo,217,0.000897,2000-01-01
...,...,...,...,...
2011,Antananarivo-Atsimondrano,413,0.003235,2023-12-01
2012,Manjakandriana,448,0.003509,2023-12-01
2013,Ambohidratrimo,446,0.003494,2023-12-01
2014,Anjozorobe,427,0.003345,2023-12-01


In [21]:
# Print the column dtypes, non-null counts and memory footprint of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2016 entries, 0 to 2015
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   district  2016 non-null   object 
 1   nb_dmd    2016 non-null   int64  
 2   energy    2016 non-null   float64
 3   date      2016 non-null   object 
dtypes: float64(1), int64(1), object(2)
memory usage: 63.1+ KB


Vérifions s'il existe des valeurs manquantes dans le DataFrame.

In [22]:
# Count the missing values in each column.
df.isna().sum()

district    0
nb_dmd      0
energy      0
date        0
dtype: int64

On voit qu'il n'y a pas de valeurs manquantes.  

Pour obtenir une série temporelle, il faut que l'index du DataFrame soit de type Datetime. Comme le jeu de données contient une colonne `date`, on va d'abord agréger les valeurs par date, puis définir cette colonne comme index.

In [23]:
# Aggregate across districts: one row per date holding the summed
# 'nb_dmd' and 'energy' values; 'date' is restored as a regular column.
df = (
    df.groupby('date')
    .agg(
        tot_nb_dmd=pd.NamedAgg(column='nb_dmd', aggfunc='sum'),
        tot_energy=pd.NamedAgg(column='energy', aggfunc='sum'),
    )
    .reset_index()
)
df

Unnamed: 0,date,tot_nb_dmd,tot_energy
0,2000-01-01,1470,0.006076
1,2000-02-01,1519,0.006279
2,2000-03-01,1516,0.006266
3,2000-04-01,1499,0.006196
4,2000-05-01,1436,0.005935
...,...,...,...
283,2023-08-01,2913,0.022818
284,2023-09-01,2908,0.022779
285,2023-10-01,2824,0.022121
286,2023-11-01,2932,0.022967


In [24]:
# Convert the 'date' column to datetime and promote it to the index in one
# non-mutating chain (no `inplace=True`), then print the resulting schema.
df = df.assign(date=pd.to_datetime(df['date'])).set_index('date')
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 288 entries, 2000-01-01 to 2023-12-01
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   tot_nb_dmd  288 non-null    int64  
 1   tot_energy  288 non-null    float64
dtypes: float64(1), int64(1)
memory usage: 6.8 KB


In [25]:
# Preview the first ten rows of the frame.
df.head(10)

Unnamed: 0_level_0,tot_nb_dmd,tot_energy
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2000-01-01,1470,0.006076
2000-02-01,1519,0.006279
2000-03-01,1516,0.006266
2000-04-01,1499,0.006196
2000-05-01,1436,0.005935
2000-06-01,1425,0.00589
2000-07-01,1595,0.006593
2000-08-01,1339,0.005535
2000-09-01,1266,0.005233
2000-10-01,1228,0.005076


In [26]:
# Persist the frame to CSV; the index is written as the first column (pandas default).
df.to_csv('energy_antananarivo_cleaned.csv')