In [106]:
# Import section
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import optimizers, metrics
from tensorflow.keras.layers import Normalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import RobustScaler
import pandas as pd
import numpy as np 
import math
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
import datetime
from datetime import date, timedelta
from clean import clean_all, get_data, get_bairros_data
from preproc import get_format, get_popfile, clean_pop_data, extract_ts
idx = pd.IndexSlice

In [100]:
# Data loading
data1, data2, data3, data4 = get_data()
data = clean_all(data1, data2, data3, data4, get_bairros_data())
preprocessed_data, preprocessed_data_1000 = get_format(data)
pop_clean = clean_pop_data()


KeyboardInterrupt



In [None]:
# Preprocessing data
def subsample_sequence(data, length, AR): # Return a shorter dataframe with specified 
                                                 # length for a specific barrio
    last_possible = data.shape[0] - length
    random_start = np.random.randint(0, last_possible)
    data_sample = data[random_start: random_start+length]
    data_sample = data_sample.loc[:,idx[:,AR]]

    return data_sample

def split_subsample_sequence(data, length, AR): # Return a random sequence of specified length

    data_subsample = subsample_sequence(data, length, AR)
    y_sample = data_subsample.iloc[length-31:]
    
    X_sample = data_subsample[0:length-31]
    X_sample = X_sample.values
    return np.array(X_sample), np.array(y_sample)

def get_X_y(data, n_sequences, length, AR): # Return a sepcific number of (X,y) samples of specified length
                                                   # for a specified bairro

    X, y = [], []

    for i in range(n_sequences):
        (xi, yi) = split_subsample_sequence(data, length, AR)
        X.append(xi)
        y.append(yi)
        
    X = np.array(X)
    y = np.array(y)
    return X, y

def get_train_test(data,n_sequences,length, AR): # Return train and test data

    len_ = int(0.8*data.shape[0])
    data_train = data[:len_]
    data_test = data[len_:]
    
    test_seq = math.floor(n_sequences/4)
    
    X_train, y_train = get_X_y(data_train, n_sequences, length, AR)
    X_test, y_test = get_X_y(data_test, test_seq, length, AR)
    
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[1],1)
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1],1)
    
    return X_train, y_train, X_test, y_test

In [None]:
 # Loading X_train/test, y_train/test for each AR in a global dictionnary

AR_list = preprocessed_data_1000.columns.levels[1].tolist()
all_data = {}

for AR in AR_list:
    data = get_train_test(preprocessed_data_1000, 2000, 200, AR)
    all_data[AR] = data

In [148]:
AR_list

['Anchieta',
 'Bangu',
 'Barra da Tijuca',
 'Botafogo',
 'Campo Grande',
 'Centro',
 'Cidade de Deus',
 'Complexo do Alemao',
 'Copacabana',
 'Guaratiba',
 'Ilha do Governador',
 'Inhauma',
 'Iraja',
 'Jacarepagua',
 'Jacarezinho',
 'Lagoa',
 'Madureira',
 'Mare',
 'Meier',
 'Pavuna',
 'Portuaria',
 'Ramos',
 'Realengo',
 'Rio Comprido',
 'Rocinha',
 'Santa Cruz',
 'Santa Teresa',
 'Sao Cristovao',
 'Tijuca',
 'Vila Isabel']

In [115]:
# Outside preprocessing 

for AR in AR_list:
    X_norm = Normalization()
    normalizer.adapt(all_data[AR][0])
    normalizer.adapt(all_data[AR][2])

In [145]:
# Creating a model
def init_model(AR):
    metric = metrics.MAPE
    opt = optimizers.RMSprop(learning_rate=0.005)
    
    model = models.Sequential()
    model.add(layers.LSTM(30, return_sequences=True, activation='tanh'))
    model.add(layers.Dense(5, activation='relu'))
    model.add(layers.Dense(31, activation='linear'))
    
    model.compile(loss='mse', 
                  optimizer=opt, 
                  metrics=[metric])
    
    return model

# Parameters
# def get_parameters(): 
es = EarlyStopping(monitor='val_loss', verbose=1, patience=5, restore_best_weights=True)

# Fitting the model
def fit(model, AR):
    hist = model.fit(all_data[AR][0], all_data[AR][1],
            validation_split=0.3,
            epochs=5, 
            batch_size=32,
            callbacks=[es], verbose=0)
    return hist

# # Evaluating the model 
# def evaluate(hist, AR):
#     results = hist.model.evaluate(all_data[AR][2], all_data[AR][3])
#     return results

# # All-in-1
# def modelling(AR):
#     results = evaluate(fit(init_model(AR), AR), AR)
#     return results

In [138]:
model = init_model('Centro')
fitted_model = fit(model, 'Centro')

In [146]:
results = evaluate(fitted_model, 'Centro')
results



[0.685863733291626, 29.732606887817383]

In [147]:
# Getting the results for each AR

all_results = {}

for AR in AR_list:
    all_results[AR] = modelling(AR)
    
all_results



{'Anchieta': [0.0007052274304442108, 43.19216537475586],
 'Bangu': [0.00038059073267504573, 31.299766540527344],
 'Barra da Tijuca': [0.000483615614939481, 49.03586959838867],
 'Botafogo': [0.0023192802909761667, 30.577455520629883],
 'Campo Grande': [5.748889816459268e-05, 31.537446975708008],
 'Centro': [0.7550690770149231, 28.683307647705078],
 'Cidade de Deus': [0.000908440153580159, 19729296.0],
 'Complexo do Alemao': [0.00013193904305808246, 6677155.5],
 'Copacabana': [0.010875527746975422, 55.954063415527344],
 'Guaratiba': [6.901000597281381e-05, 777216.3125],
 'Ilha do Governador': [0.00017346777894999832, 349548.15625],
 'Inhauma': [0.0011083482531830668, 295615.8125],
 'Iraja': [0.0010068053379654884, 28.063663482666016],
 'Jacarepagua': [0.000110312677861657, 27.325624465942383],
 'Jacarezinho': [5.9099231293657795e-05, 3595526.75],
 'Lagoa': [0.6943167448043823, 76.94197082519531],
 'Madureira': [0.0007411403930746019, 25.924497604370117],
 'Mare': [0.0001275737740797922, 

In [152]:
# getting the mean of all the results
mape = [all_results[AR][1] for AR in all_results]
mape_overall = np.mean(mape)
mape_overall

1822820.9762325287

In [113]:
# Plotting the results
import matplotlib.pyplot as plt
plt.plot(history.history['mean_absolute_percentage_error'])
plt.plot(history.history['val_mean_absolute_percentage_error'])

NameError: name 'history' is not defined

In [None]:
# Printing the results

res = model.evaluate(X_test, y_test, verbose=0)
print(f'MAPE on the test set : {res[1]:.0f} %')