In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# List the entries of the local 'data' directory to see which input files are available.
os.listdir('data')

['Env_QH.csv',
 'micro_sud3.pkl',
 'AllPM_QH.csv',
 'micro_sud3_normalized.pkl',
 'AllNO2_QH.csv']

In [3]:
# Load the pickled frame and turn its index into a regular 'index' column,
# then preview the first 100 rows.
# NOTE(review): pd.read_pickle can execute arbitrary code - only load trusted files.
normalized_path = 'data/micro_sud3_normalized.pkl'
df = pd.read_pickle(normalized_path).reset_index()
df.head(100)

Unnamed: 0,index,date,PM_ref,PM_6182,PM_6179,PM_617B,PM25_6182,PM25_6179,PM25_617B,NO2_ref,NO2_61FD,NO2_61F0,NO2_61EF,temp,rh,tgrad,pressure,pluvio
0,15,2017-09-28 14:00:00,16.2,-1.178505,-1.137844,-1.134624,-1.183081,-1.128074,-1.148204,10.1,-0.392423,-0.621107,-0.419097,1.986031,-1.114144,-0.922393,0.315942,-0.150524
1,16,2017-09-28 14:15:00,9.6,-1.108262,-1.085060,-1.121956,-1.101652,-1.071229,-1.128278,9.9,-0.392423,-0.621107,-0.419097,2.057032,-1.123212,-0.977185,0.335134,-0.150524
2,17,2017-09-28 14:30:00,10.3,-1.178505,-1.169515,-1.257077,-1.176817,-1.167865,-1.252817,16.1,-0.392423,-0.621107,-0.419097,2.080699,-1.232038,-1.086769,0.315942,-0.150524
3,18,2017-09-28 14:45:00,9.4,-1.137530,-1.000606,-1.206407,-1.139235,-1.008700,-1.222928,10.9,-0.392423,-0.621107,-0.419097,2.009698,-1.259245,-0.812809,0.315942,-0.150524
4,19,2017-09-28 15:00:00,10.7,-1.166798,-1.164236,-1.138846,-1.164290,-1.167865,-1.148204,16.0,-0.392423,-0.621107,-0.419097,1.867697,-1.141350,-0.922393,0.315942,-0.150524
5,20,2017-09-28 15:15:00,10.7,-1.166798,-1.201185,-1.037506,-1.158026,-1.196288,-1.038610,9.7,-0.392423,-0.621107,-0.419097,1.749363,-1.050662,-0.867601,0.315942,-0.150524
6,21,2017-09-28 15:30:00,9.6,-1.078994,-1.169515,-1.007948,-1.070333,-1.162181,-0.998757,10.0,-0.392423,-0.621107,-0.419097,1.654695,-0.959974,-0.812809,0.315942,-0.150524
7,22,2017-09-28 15:45:00,10.2,-1.032164,-1.132566,-0.995280,-1.032750,-1.122389,-0.988794,19.1,-0.392423,-0.621107,-0.419097,1.583695,-0.887423,-0.758017,0.315942,-0.150524
8,23,2017-09-28 16:00:00,9.8,-1.043872,-1.095617,-0.910830,-1.039014,-1.093967,-0.919052,15.0,-0.392423,-0.621107,-0.419097,1.536361,-0.851148,-0.758017,0.315942,-0.150524
9,24,2017-09-28 16:15:00,8.9,-1.043872,-1.042833,-1.181072,-1.039014,-1.037122,-1.173112,19.4,-0.392423,-0.621107,-0.419097,1.489027,-0.851148,-0.703225,0.335134,-0.150524


In [4]:
# Keep only the columns used below: the timestamp, NO2_ref, the three NO2_6xxx
# columns and temp / rh / tgrad / pressure / pluvio. Every other column is dropped.
kept_columns = [
    'date',
    'NO2_ref',
    'NO2_61FD',
    'NO2_61F0',
    'NO2_61EF',
    'temp',
    'rh',
    'tgrad',
    'pressure',
    'pluvio',
]
df = df[kept_columns]

# Premier modèle : DNN simple

In [5]:
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

def baseline_model(dense_size, input_dim, loss='mean_squared_error', optimizer='adam'):
    """Build and compile a network with one hidden layer and a single output.

    Args:
        dense_size: number of units in the hidden ReLU layer.
        input_dim: number of input features expected by the hidden layer.
        loss: loss passed to ``model.compile``.
        optimizer: optimizer passed to ``model.compile``.

    Returns:
        The compiled ``Sequential`` model. Its summary is printed as a side effect.
    """
    # Layers are instantiated in the same order as they are stacked.
    hidden_layer = Dense(
        dense_size,
        input_dim=input_dim,
        kernel_initializer='normal',
        activation='relu',
    )
    output_layer = Dense(1, kernel_initializer='normal')

    model = Sequential([hidden_layer, output_layer])
    model.compile(loss=loss, optimizer=optimizer)
    model.summary()
    return model

Using TensorFlow backend.


In [6]:
#df = df.reindex(np.random.permutation(df.index))

def split_dataframe(dataframe, percent):
    """Cut ``dataframe`` into two consecutive pieces, keeping the row order.

    The first piece holds the leading ``floor(percent * len(dataframe))`` rows
    and the second piece holds all remaining rows.
    """
    cut = int(np.floor(len(dataframe) * percent))
    head = dataframe[:cut]
    tail = dataframe[cut:]
    return head, tail

def dataframe_to_xy(df):
    """Convert a frame into a ``(X, y)`` pair of NumPy arrays.

    ``X`` stacks, in this order, the columns NO2_61FD, NO2_61F0, NO2_61EF,
    temp, rh, tgrad, pressure and pluvio; ``y`` holds the NO2_ref column.
    """
    feature_columns = ['NO2_61FD', 'NO2_61F0', 'NO2_61EF',
                       'temp', 'rh', 'tgrad', 'pressure', 'pluvio']
    features = np.array(df[feature_columns])
    target = np.array(df['NO2_ref'])
    return features, target

# Split the rows in their existing order (no shuffling): the first half is the
# training subset, the second half is cut again into validation and test halves.
df_train, df_holdout = split_dataframe(df, 0.5)
df_valid, df_test = split_dataframe(df_holdout, 0.5)

# Convert each subset into its (features, target) pair of arrays.
(X_train, y_train), (X_valid, y_valid), (X_test, y_test) = (
    dataframe_to_xy(subset) for subset in (df_train, df_valid, df_test)
)

In [None]:
# Plot the target values of each subset as red '+' markers, one figure per subset.
# Each figure gets a title and axis labels so the three otherwise identical-looking
# plots can be told apart.
for subset_name, targets in (('train', y_train),
                             ('validation', y_valid),
                             ('test', y_test)):
    plt.plot(targets, '+r')
    plt.title('NO2_ref (' + subset_name + ')')
    plt.xlabel('sample')
    plt.ylabel('NO2_ref')
    plt.show()

<IPython.core.display.Javascript object>

In [None]:
# Single-hidden-layer model: 32 hidden units, one input per feature column.
model = baseline_model(
    dense_size=32,
    input_dim=X_train.shape[1],
    loss='mean_squared_error',
    optimizer='adam',
)

# Stop training once val_loss has gone 10 epochs without improving.
early_stopping = EarlyStopping(
    monitor='val_loss',
    verbose=1,
    mode='auto',
    patience=10,
)

# epochs=5000 is only an upper bound; the callback above normally ends training earlier.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=5000,
    validation_data=(X_valid, y_valid),
    callbacks=[early_stopping],
    verbose=1,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 32)                288       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 321
Trainable params: 321
Non-trainable params: 0
_________________________________________________________________
Train on 1126 samples, validate on 563 samples
Epoch 1/5000
Epoch 2/5000
Epoch 3/5000
Epoch 4/5000
Epoch 5/5000
Epoch 6/5000
Epoch 7/5000
Epoch 8/5000
Epoch 9/5000
Epoch 10/5000
Epoch 11/5000
Epoch 12/5000
Epoch 13/5000
Epoch 14/5000
Epoch 15/5000
Epoch 16/5000
Epoch 17/5000
Epoch 18/5000
Epoch 19/5000
Epoch 20/5000
Epoch 21/5000
Epoch 22/5000
Epoch 23/5000
Epoch 24/5000
Epoch 25/5000
Epoch 26/5000
Epoch 27/5000
Epoch 28/5000
Epoch 29/5000
Epoch 30/5000
Epoch 31/5000
Epoch 32/5000
Epoch 33/5000
Epoch 34/5000
Epoch 35/5000
Epoch 36/500

In [None]:
# Predictions of the trained model on the test subset.
y_pred = model.predict(X_test)

# Learning curves: training loss (red) and validation loss (blue) per epoch.
plt.title('Coût et coût de validation')
line1, = plt.plot(history.history['loss'], label="Loss", linestyle='-', color='r')
line2, = plt.plot(history.history['val_loss'], label="Val loss", linestyle='-', color='b')
plt.legend(handles=[line1, line2], loc=1)

plt.show()

# Scatter of the test targets (y axis) against the predictions (x axis).
# The title now matches the axes: y_test is plotted as a function of y_pred.
plt.title('y_test en fonction de y_pred')

plt.plot(y_pred, y_test, '+')
plt.ylabel('Test')
plt.xlabel('Prédiction')
plt.show()

## DNN à 2 couches

In [None]:
from keras.layers import SimpleRNN

# Rebuild the working frame from the pickle so this section does not depend on
# the state left by earlier cells: keep the columns of interest, then move the
# index into a regular column.
selected_columns = [
    'date', 'NO2_ref',
    'NO2_61FD', 'NO2_61F0', 'NO2_61EF',
    'temp', 'rh', 'tgrad', 'pressure', 'pluvio',
]
df = pd.read_pickle('data/micro_sud3_normalized.pkl')[selected_columns].reset_index()

# Ordered split: first half for training, the rest halved into validation and test.
df_train, df_remaining = split_dataframe(df, 0.5)
df_valid, df_test = split_dataframe(df_remaining, 0.5)

# (features, target) arrays for each subset.
X_train, y_train = dataframe_to_xy(df_train)
X_valid, y_valid = dataframe_to_xy(df_valid)
X_test, y_test = dataframe_to_xy(df_test)

def simple_rnn_model(nb_units, dense_size, loss='mean_squared_error', optimizer='adam',
                     input_dim=None):
    """Build and compile a fully connected network with two hidden layers.

    Despite its name, this function creates no recurrent layer. The stack is
    ``Dense(dense_size, relu) -> Dense(dense_size // 2, relu) -> Dense(1)``.

    Args:
        nb_units: not used by the network; kept only so that existing
            positional calls keep working.
        dense_size: number of units in the first hidden layer; the second
            hidden layer has ``dense_size // 2`` units.
        loss: loss passed to ``model.compile``.
        optimizer: optimizer passed to ``model.compile``.
        input_dim: number of input features. When left to ``None`` it falls
            back to ``dense_size``, which is the historical behaviour (the
            hidden width and the feature count were forced to be equal).

    Returns:
        The compiled ``Sequential`` model. Its summary is printed as a side effect.
    """
    # Previously the input dimension was hard-wired to dense_size; keep that as
    # the default so existing callers build exactly the same network.
    if input_dim is None:
        input_dim = dense_size

    model = Sequential()
    model.add(Dense(dense_size, input_dim=input_dim, kernel_initializer='normal', activation='relu'))
    model.add(Dense(dense_size//2, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss=loss, optimizer=optimizer)
    model.summary()
    return model


# Arguments spelled out by keyword: nb_units=32 is not used by simple_rnn_model,
# and the feature count is what gets passed as dense_size.
model = simple_rnn_model(nb_units=32, dense_size=X_train.shape[1])

In [None]:
# Stop training once val_loss has gone 10 epochs without improving.
early_stopping = EarlyStopping(
    monitor='val_loss',
    verbose=1,
    mode='auto',
    patience=10,
)

# epochs=5000 is only an upper bound; the callback above normally ends training earlier.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=5000,
    validation_data=(X_valid, y_valid),
    callbacks=[early_stopping],
    verbose=1,
)

In [None]:
# Predictions of the trained model on the test subset.
y_pred = model.predict(X_test)

# Learning curves: training loss (red) and validation loss (blue) per epoch.
plt.plot(history.history['loss'], 'r-')
plt.plot(history.history['val_loss'], 'b-')
plt.show()

plt.title('y_pred, y_test')

# Predictions on the x axis, test targets on the y axis.
plt.plot(y_pred, y_test, '+')
plt.show()

# Fraction of the test samples whose target exceeds the threshold AND whose
# prediction exceeds it too. The previous formula divided the number of high
# predictions by the number of high targets, which counts false alarms as
# detections and can exceed 1.
high_threshold = 20
is_high_true = np.ravel(y_test) > high_threshold
is_high_pred = np.ravel(y_pred) > high_threshold
nb_high_true = np.sum(is_high_true)
if nb_high_true:
    percent_high_detected = np.sum(is_high_true & is_high_pred) / nb_high_true
else:
    # No high target in the test subset: the rate is undefined.
    percent_high_detected = float('nan')
print(percent_high_detected)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Scatter of the NO2_61FD column against the NO2_ref column, with labelled axes
# so the figure can be read on its own.
plt.plot(df['NO2_ref'], df['NO2_61FD'], '+')
plt.xlabel('NO2_ref')
plt.ylabel('NO2_61FD')

In [None]:
# Display any pending matplotlib figure.
# NOTE(review): with the inline backend, figures are normally rendered at the end of
# the cell that created them, so this call may display nothing - confirm.
plt.show()