In [2]:
# Import section
!pip install unidecode
!pip install fuzzymatcher
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import optimizers, metrics
from tensorflow.keras.layers import Normalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import RobustScaler
import pandas as pd
import numpy as np 
import math
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
import datetime
from datetime import date, timedelta
from clean import clean_all, get_data, get_bairros_data
from preproc import get_format, extract_ts
idx = pd.IndexSlice



In [8]:
def get_format(data):
    '''Build two date-by-AR tables of daily crime counts from a clean dataset.

    Returns a tuple ``(totals, per_1000)``:
    1. ``totals``   - number of crimes per day and AR (integers, 0 where a
       day/AR pair has no record);
    2. ``per_1000`` - number of crimes per 1000 inhabitants per day and AR
       (0 where a day/AR pair has no record).

    Both frames are indexed by date, with one ``("nb_crimes", AR)`` column per AR.

    Side effect: a "Date" column is written into the ``data`` frame passed in.
    '''
    data["Date"] = data["Date_Time"].dt.date

    # Daily number of recorded crimes for every (AR, Date) pair.
    daily_counts = (
        data.groupby(["AR", "Date"])
        .count()[["Crime_ID"]]
        .rename(columns={"Crime_ID": "nb_crimes"})
    )

    # Attach the yearly population of each AR; the year is read from the
    # "Date" level of the index and only serves as a temporary join key.
    population = clean_pop_data()
    crime_year = daily_counts.index.map(lambda key: key[1].year)
    counts_flat = daily_counts.assign(year_temp=crime_year).reset_index()
    # Inner join: rows without a matching (region, year) in the population
    # table are not carried into the per-1000 result.
    merged = pd.merge(
        left=counts_flat,
        right=population,
        left_on=["AR", "year_temp"],
        right_on=["administrative_regions", "Ano"],
    )

    # Crimes per 1000 inhabitants, stored under the same "nb_crimes" name.
    rates = (
        merged.drop(columns=["administrative_regions", "Ano"])
        .assign(nb_crimes_1000=lambda frame: frame.nb_crimes / frame.Populacao * 1000)
        .drop(columns=["nb_crimes", "Populacao", "year_temp"])
        .rename(columns={"nb_crimes_1000": "nb_crimes"})
        .set_index(["AR", "Date"])
    )

    # Pivot the AR level into columns and fill the missing day/AR pairs with 0.
    totals_wide = daily_counts.unstack(level=0).replace(np.nan, 0).astype(int)
    rates_wide = rates.unstack(level=0).replace(np.nan, 0)
    return totals_wide, rates_wide

def get_popfile(path="population_Rio.csv"):
    '''Read the raw population table from a comma-separated file.

    Parameters
    ----------
    path : str or path-like, default "population_Rio.csv"
        Location of the CSV file. The default keeps the previous behaviour of
        reading "population_Rio.csv" from the current working directory.

    Returns
    -------
    pandas.DataFrame
        The file content exactly as parsed by ``pd.read_csv``.
    '''
    return pd.read_csv(path, sep=",")

def clean_pop_data(raw_pop=None):
    '''Clean the population table and extend it to one row per region and year.

    Parameters
    ----------
    raw_pop : pandas.DataFrame, optional
        Raw population table with at least the columns "RegiaoAdministrativa",
        "Ano", "Populacao", "DensidadeBruta", "DensidadeLiquida" and
        "TaxaGeometrica". When omitted, the table is loaded with
        ``get_popfile()``. The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per (region, year) for every year between the smallest and the
        largest "Ano" of the input, with columns "Ano", "Populacao" and
        "administrative_regions" (plus any other input column that was not
        dropped). Years missing from the input get a linearly interpolated
        "Populacao".
    '''
    data = get_popfile() if raw_pop is None else raw_pop
    data = data.drop(columns=["DensidadeBruta", "DensidadeLiquida", "TaxaGeometrica"])

    # Drop the first whitespace-separated token of each region name
    # (the Roman numeral prefix) and keep the rest as the region label.
    region_tokens = data["RegiaoAdministrativa"].str.split().str[1:]
    data["administrative_regions"] = region_tokens.str.join(" ")
    data = data.drop(columns=["RegiaoAdministrativa"])

    # Every calendar year covered by the input, in 1-year steps.
    first_year, last_year = int(data["Ano"].min()), int(data["Ano"].max())
    year_df = pd.DataFrame({"Ano": range(first_year, last_year + 1)})

    # For each region: align on the annual grid, then fill the gaps.
    # Results are assigned back to the column instead of using
    # `frame[col].method(inplace=True)`, which acts on a temporary object and
    # is not guaranteed to update `frame` (no-op under pandas Copy-on-Write).
    annual_frames = []
    for region in data["administrative_regions"].unique():
        region_rows = data[data["administrative_regions"] == region]
        annual = year_df.merge(region_rows, how="left", on="Ano")
        annual["administrative_regions"] = annual["administrative_regions"].fillna(region)
        annual["Populacao"] = annual["Populacao"].interpolate(method="linear")
        annual_frames.append(annual)

    # Stack the per-region tables under a fresh 0..n-1 index.
    return pd.concat(annual_frames, ignore_index=True)

def extract_ts(df, AR):
    '''Extract the daily time series of one AR as a two-column frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Table indexed by "Date" with two-level columns ``("nb_crimes", <AR>)``,
        as returned by ``get_format``.
    AR : hashable
        Label of the administrative region to extract (second column level).

    Returns
    -------
    pandas.DataFrame
        New frame with columns "ds" (the dates converted with
        ``pd.to_datetime``) and "y" (the values of ``("nb_crimes", AR)``),
        on a fresh 0..n-1 index. ``df`` itself is left untouched.
    '''
    flat = df.reset_index()
    # After reset_index the former index appears as the column ("Date", "").
    # Building a new frame avoids assigning into a slice of `flat`, and the
    # whole column is converted in one vectorised call.
    return pd.DataFrame({
        "ds": pd.to_datetime(flat[("Date", "")]),
        "y": flat[("nb_crimes", AR)],
    })


In [9]:
# Load the four raw crime extracts and the bairros list, clean them, then
# derive the modelling tables and the yearly population table.
data1, data2, data3, data4 = (
    pd.read_csv(csv_name, sep=';', encoding='iso-8859-1')
    for csv_name in ('parte1.csv', 'parte2.csv', 'parte3.csv', 'parte4.csv')
)
bairros = pd.read_csv('bairros_lista.csv', encoding='iso-8859-1')

data = clean_all(data1, data2, data3, data4, bairros)
preprocessed_data, preprocessed_data_1000 = get_format(data)
pop_clean = clean_pop_data()

In [10]:
# Preprocessing data
def subsample_sequence(data, length, AR):
    '''Draw one random window of consecutive rows for a single AR.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with two-level columns whose second level holds the AR labels.
    length : int
        Number of consecutive rows in the window.
    AR : hashable
        Label selected on the second column level.

    Returns
    -------
    pandas.DataFrame
        ``length`` consecutive rows of ``data``, restricted to the columns of
        ``AR``. The start row is drawn uniformly with ``np.random.randint``
        among all positions that leave room for a full window.

    Raises
    ------
    ValueError
        If ``data`` has fewer than ``length`` rows.
    '''
    last_possible = data.shape[0] - length
    if last_possible < 0:
        raise ValueError(
            f"cannot draw a window of {length} rows from {data.shape[0]} rows"
        )
    # randint excludes its upper bound: +1 lets the final window
    # (starting at `last_possible`) be drawn, and makes the case
    # `len(data) == length` valid instead of raising.
    random_start = np.random.randint(0, last_possible + 1)
    data_sample = data.iloc[random_start: random_start + length]
    return data_sample.loc[:, pd.IndexSlice[:, AR]]

def split_subsample_sequence(data, length, AR):
    '''Draw one random window for `AR` and split it into input and target.

    The window of `length` rows comes from `subsample_sequence`. Its last
    31 rows form the target `y`; the rows before them form the input `X`.
    Both parts are returned as numpy arrays, in the order `(X, y)`.
    '''
    window = subsample_sequence(data, length, AR)
    cut = length - 31  # row where the 31-row target part starts

    targets = window.iloc[cut:]
    features = window[0:cut].values
    return np.array(features), np.array(targets)

def get_X_y(data, n_sequences, length, AR):
    '''Collect `n_sequences` random (X, y) samples of the given length for one AR.

    Each sample is produced by `split_subsample_sequence`; the inputs and the
    targets are stacked into two numpy arrays returned as `(X, y)`.
    '''
    samples = [
        split_subsample_sequence(data, length, AR)
        for _ in range(n_sequences)
    ]
    X = np.array([features for features, _ in samples])
    y = np.array([targets for _, targets in samples])
    return X, y

def get_train_test(data, n_sequences, length, AR):
    '''Build train and test samples for one AR from a chronological split.

    The first 80% of the rows feed the training samples and the remaining
    rows feed the test samples. `n_sequences` windows are drawn for training
    and a quarter of that number (rounded down) for testing. The inputs are
    reshaped to `(n_samples, n_steps, 1)`.

    Returns `(X_train, y_train, X_test, y_test)`.
    '''
    split_at = int(0.8 * data.shape[0])
    train_rows, test_rows = data[:split_at], data[split_at:]

    n_test_sequences = math.floor(n_sequences / 4)

    X_train, y_train = get_X_y(train_rows, n_sequences, length, AR)
    X_test, y_test = get_X_y(test_rows, n_test_sequences, length, AR)

    # Make the trailing feature axis of size 1 explicit.
    X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
    X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

    return X_train, y_train, X_test, y_test

In [11]:
 # Build (X_train, y_train, X_test, y_test) for every AR and keep them in one
# dictionary keyed by AR label.

N_SEQUENCES = 500       # training windows drawn per AR (test gets a quarter)
SEQUENCE_LENGTH = 120   # rows per window: inputs followed by the 31 target rows

# The windows are drawn with np.random; seeding makes the sampled datasets
# identical from one run of this cell to the next.
np.random.seed(42)

AR_list = preprocessed_data_1000.columns.levels[1].tolist()
all_data = {}

for AR in AR_list:
    # Assign straight into the dictionary: the previous version stored the
    # tuple in `data` first, which overwrote the cleaned DataFrame of the
    # loading cell and broke any later re-run of `get_format(data)`.
    all_data[AR] = get_train_test(preprocessed_data_1000, N_SEQUENCES, SEQUENCE_LENGTH, AR)

In [13]:
# Outside preprocessing: fit one Normalization layer per AR.
#
# The statistics are computed on the training inputs only. The previous
# version called adapt() a second time on the test inputs, which replaced the
# training statistics with test-set statistics, and it threw every fitted
# layer away at the next loop iteration.
# NOTE(review): nothing in this notebook applies these layers yet; add
# `normalizers[AR]` as the first layer of the model (or call it on the inputs)
# for the normalisation to take effect.
normalizers = {}

for AR in AR_list:
    normalizer = Normalization()
    normalizer.adapt(all_data[AR][0])  # index 0 = X_train
    normalizers[AR] = normalizer

In [1]:
# Creating a model
def init_model(AR):
    '''Create and compile a fresh two-layer LSTM regressor.

    Architecture: LSTM(30, sequences returned) -> LSTM(10) -> Dense(5, relu)
    -> Dense(31, linear). Compiled with MSE loss, RMSprop (learning rate
    0.005) and MAPE as the reported metric.

    `AR` is accepted for symmetry with the other helpers but is not used here.
    '''
    network = models.Sequential([
        layers.LSTM(30, return_sequences=True, activation='tanh'),
        layers.LSTM(10, activation='tanh'),
        layers.Dense(5, activation='relu'),
        layers.Dense(31, activation='linear'),
    ])

    network.compile(
        loss='mse',
        optimizer=optimizers.RMSprop(learning_rate=0.005),
        metrics=[metrics.MAPE],
    )
    return network

# Training parameters
# Early stopping shared by every fit: stop once val_loss has not improved for
# 10 epochs and restore the weights of the best epoch.
es = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
    verbose=1,
)

# Fitting the model
def fit(model, AR):
    '''Train `model` on the training samples of `AR` and return the Keras history.

    Uses the first two entries of `all_data[AR]` (X_train, y_train), holds out
    30% of them for validation, and trains for at most 20 epochs in batches of
    32 with the shared early-stopping callback `es`.
    '''
    X_train = all_data[AR][0]
    y_train = all_data[AR][1]
    return model.fit(
        X_train,
        y_train,
        validation_split=0.3,
        epochs=20,
        batch_size=32,
        callbacks=[es],
        verbose=0,
    )

# Evaluating the model 
def evaluate(hist, AR):
    '''Evaluate the model attached to `hist` on the test samples of `AR`.

    Uses entries 2 and 3 of `all_data[AR]` (X_test, y_test) and returns
    whatever `model.evaluate` returns.
    '''
    split = all_data[AR]
    trained_model = hist.model
    return trained_model.evaluate(split[2], split[3])

# All-in-1
def modelling(AR):
    '''Initialise, train and evaluate a model for `AR`; return the evaluation results.'''
    model = init_model(AR)
    history = fit(model, AR)
    return evaluate(history, AR)

NameError: ignored

In [None]:
# Train and evaluate one model per AR; results are keyed by AR label

all_results = {AR: modelling(AR) for AR in AR_list}

all_results

Restoring model weights from the end of the best epoch: 7.
Epoch 17: early stopping
Restoring model weights from the end of the best epoch: 1.
Epoch 11: early stopping
Restoring model weights from the end of the best epoch: 5.
Epoch 15: early stopping
Restoring model weights from the end of the best epoch: 6.
Epoch 16: early stopping
Restoring model weights from the end of the best epoch: 1.
Epoch 11: early stopping


In [None]:
# Average over all ARs of the second entry of each evaluation result
# (the MAPE metric the models were compiled with)
mape = [scores[1] for scores in all_results.values()]
mape_overall = np.mean(mape)
mape_overall