In [1]:
# Import des librairies
import os
import sys 
import shutil
import pandas as pd
from dateutil import parser
import traceback
import zipfile

### Lecture des fichiers

In [17]:
# Open the archive holding the raw CSV exports.
# NOTE(review): the archive handle is never closed in the visible cells.
zf = zipfile.ZipFile('brut.zip')

# Column labels handed to read_csv through `names=` for the status and weather files.
columns_status = [
    'date',
    'Station',
    'Status',
    'Vélos_dispos',
    'Emplacements_dispos',
]
columns_meteo = [
    'Timestamp', 'Status', 'Clouds', 'Humidity', 'Pressure', 'Rain',
    'WindGust', 'WindVarEnd', 'WindVarBeg', 'WindDeg', 'WindSpeed', 'Snow',
    'TemperatureMax', 'TemperatureMin', 'TemperatureTemp',
]

# Only the first 1000 rows of each file are loaded, so that exploration does not
# run on the whole dataset. Date columns are parsed while reading.
# Archive members are picked by position (entries 1, 2 and 3 of the listing).
df_velos = pd.read_csv(
    zf.open(zf.namelist()[1]),
    sep=';',
    nrows=1000,
)
df_status = pd.read_csv(
    zf.open(zf.namelist()[2]),
    sep=';',
    names=columns_status,
    nrows=1000,
    parse_dates=['date'],
)
df_meteo = pd.read_csv(
    zf.open(zf.namelist()[3]),
    sep=';',
    names=columns_meteo,
    nrows=1000,
    parse_dates=['Timestamp'],
)

### Suppression des données aberrantes

On commence par s'assurer que les colonnes des dates et heures contiennent des données cohérentes. Pour cela, on convertit ces colonnes au format datetime64. On considère que si aucune erreur ne ressort, c'est que les données ne sont pas aberrantes.

In [3]:
# Preview the first five rows of the weather DataFrame.
print('Voici un apperçu du DataFrame')
df_meteo.head()

Voici un apperçu du DataFrame


Unnamed: 0,Timestamp,Status,Clouds,Humidity,Pressure,Rain,WindGust,WindVarEnd,WindVarBeg,WindDeg,WindSpeed,Snow,TemperatureMax,TemperatureMin,TemperatureTemp
0,2014-11-14 09:35:38,clouds,40,100,1013.0,{u'3h': 0},,,,200.504,0.84,{},9.0,9.0,9.0
1,2014-11-14 09:45:05,mist,40,100,1014.0,{u'3h': 0},,,,200.504,0.84,{},10.0,10.0,10.0
2,2014-11-14 09:50:05,mist,40,100,1014.0,{u'3h': 0},,,,200.504,0.84,{},10.0,10.0,10.0
3,2014-11-14 09:55:05,clouds,40,100,1013.0,{u'3h': 0},,,,200.504,0.84,{},9.0,9.0,9.0
4,2014-11-14 10:00:04,mist,40,100,1014.0,{u'3h': 0},,,,200.504,0.84,{},10.0,10.0,10.0


In [4]:
# Summarise the weather DataFrame: column dtypes, non-null counts and memory usage.
df_meteo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
Timestamp          1000 non-null datetime64[ns]
Status             1000 non-null object
Clouds             1000 non-null int64
Humidity           1000 non-null int64
Pressure           1000 non-null float64
Rain               1000 non-null object
WindGust           1000 non-null object
WindVarEnd         1000 non-null object
WindVarBeg         1000 non-null object
WindDeg            1000 non-null float64
WindSpeed          1000 non-null float64
Snow               1000 non-null object
TemperatureMax     1000 non-null float64
TemperatureMin     1000 non-null float64
TemperatureTemp    1000 non-null float64
dtypes: datetime64[ns](1), float64(6), int64(2), object(6)
memory usage: 117.3+ KB


In [5]:
# Preview the first five rows of the station status DataFrame.
df_status.head()

Unnamed: 0,date,Station,Status,Vélos_dispos,Emplacements_dispos
0,2014-11-14 09:35:38,Duc,1,4,5
1,2014-11-14 09:35:38,Ospedale Maggiore,1,2,7
2,2014-11-14 09:35:38,Traversetolo,1,2,7
3,2014-11-14 09:35:38,Campus Chimica,1,4,5
4,2014-11-14 09:35:38,Stazione FF.SS.,1,9,10


In [6]:
# Summarise the station status DataFrame: column dtypes, non-null counts and memory usage.
df_status.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
date                   1000 non-null datetime64[ns]
Station                1000 non-null object
Status                 1000 non-null int64
Vélos_dispos           1000 non-null int64
Emplacements_dispos    1000 non-null int64
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 39.1+ KB


In [7]:
# Preview the first five rows of the station description DataFrame
# (system, station name, latitude, longitude, elevation).
df_velos.head()

Unnamed: 0,system,station,latitude,longitude,elevation
0,bicincitta_parma,01. Duc,44.807118,10.332934,51.076065
1,bicincitta_parma,02. Ospedale Maggiore,44.802263,10.306275,56.344078
2,bicincitta_parma,03. Traversetolo,44.781595,10.344492,58.324486
3,bicincitta_parma,04. Campus Chimica,44.766433,10.314547,76.587212
4,bicincitta_parma,05. Stazione FF.SS.,44.809888,10.327693,57.179089


La lecture des colonnes contenant les dates et heures semble s'être effectuée correctement.

#### Erreur de collecte 

In [8]:
# Drop the rows for which a collection error occurred (Status equal to 1).
# NOTE(review): the df_status preview shows Status == 1 on rows with plausible
# bike/slot counts — confirm that 1 really marks a failed collection and that
# this filter is not inverted.
df_status = df_status.loc[df_status['Status'] != 1]

### Différenciation des stations

In [18]:
# Index the weather observations by their timestamp. The guard keeps the cell
# safe to re-run: once 'Timestamp' has become the index it is no longer a
# column, and a second set_index('Timestamp') would raise KeyError.
if 'Timestamp' in df_meteo.columns:
    df_meteo = df_meteo.set_index('Timestamp')

# Group the observations into 10-minute bins, closed and labelled on the right edge.
# NOTE(review): this only builds a lazy Resampler — nothing is aggregated and the
# result is not assigned, so df_meteo keeps its original, irregular timestamps.
# Chain an aggregation and assign the result if a regular 10-minute grid is intended.
df_meteo.resample('10min', label='right', closed='right')

DatetimeIndexResampler [freq=<10 * Minutes>, axis=0, closed=right, label=right, convention=start, base=0]

In [19]:
# Show only the first ten rows instead of dumping the whole DataFrame.
df_meteo.head(10)

Unnamed: 0_level_0,Status,Clouds,Humidity,Pressure,Rain,WindGust,WindVarEnd,WindVarBeg,WindDeg,WindSpeed,Snow,TemperatureMax,TemperatureMin,TemperatureTemp
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2014-11-14 09:35:38,clouds,40,100,1013.00,{u'3h': 0},,,,200.504,0.84,{},9.00,9.00,9.00
2014-11-14 09:45:05,mist,40,100,1014.00,{u'3h': 0},,,,200.504,0.84,{},10.00,10.00,10.00
2014-11-14 09:50:05,mist,40,100,1014.00,{u'3h': 0},,,,200.504,0.84,{},10.00,10.00,10.00
2014-11-14 09:55:05,clouds,40,100,1013.00,{u'3h': 0},,,,200.504,0.84,{},9.00,9.00,9.00
2014-11-14 10:00:04,mist,40,100,1014.00,{u'3h': 0},,,,200.504,0.84,{},10.00,10.00,10.00
2014-11-14 10:05:05,mist,40,100,1014.00,{u'3h': 0},,,,200.504,0.84,{},10.00,10.00,10.00
2014-11-14 10:10:05,clouds,40,100,1013.00,{u'3h': 0},,,,200.504,0.84,{},9.00,9.00,9.00
2014-11-14 10:15:05,mist,40,100,1014.00,{u'3h': 0},,,,200.504,0.84,{},10.00,10.00,10.00
2014-11-14 10:20:04,mist,40,100,1014.00,{u'3h': 0},,,,200.504,0.84,{},10.00,10.00,10.00
2014-11-14 10:25:06,mist,40,100,1014.00,{u'3h': 0},,,,200.504,0.84,{},10.00,10.00,10.00
