In [79]:
import os
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_log_error
from keras.layers import LSTM, Dense, Input, Bidirectional
from keras.optimizers import Adagrad, Adam
from keras.utils import to_categorical
from keras.models import load_model, Model
from keras.preprocessing.sequence import pad_sequences


In [80]:
# Directory, relative to the working directory, that the loader functions
# join with their CSV file names.
DATA_DIR = 'data'

def load_initial_data():
    """Read the original train and test CSV files from DATA_DIR.

    Returns:
        tuple: ``(train_data, test_data)`` DataFrames read from
        ``train.csv`` and ``test.csv`` respectively.
    """
    return tuple(
        pd.read_csv(os.path.join(DATA_DIR, file_name))
        for file_name in ('train.csv', 'test.csv')
    )

def load_extended_data():
    """Read the extended train and test CSV files from DATA_DIR.

    Returns:
        tuple: ``(train_data, test_data)`` DataFrames read from
        ``extended_train.csv`` and ``extended_test.csv`` respectively.
    """
    return tuple(
        pd.read_csv(os.path.join(DATA_DIR, file_name))
        for file_name in ('extended_train.csv', 'extended_test.csv')
    )

In [81]:
def _is_same_region(current, previous):
    """Return True when two region values match; two missing values count as equal."""
    if pd.isna(current) and pd.isna(previous):
        return True
    return current == previous


def extract_days_from_first_infection(data: pd.DataFrame, write_file=False):
    """Add a 'Days since first infection' column to ``data`` in place.

    Rows are scanned in order and addressed by position: column 1 and
    column 2 identify the group (read here as region and country), and
    column 6 is the value checked against zero (presumably the confirmed
    case count -- confirm against the CSV layout).

    Consecutive rows with the same region and country form one group. Inside
    a group the counter is 0 up to and including the first row whose column 6
    is non-zero, then increases by one on every later row whose column 6 is
    non-zero. The counter restarts at 0 when the group changes.

    Args:
        data: frame to annotate; it is modified in place.
        write_file: when True, the annotated frame is also written to
            ``added_days_train.csv`` in the working directory.

    Returns:
        None. The result is the new column on ``data``.
    """
    day_counts = []
    previous = None  # (region, country) of the preceding row; None before the first row
    current_count = 0
    found_case = False
    for row in data.values:
        region, country, cases = row[1], row[2], row[6]
        # Compare by value, not identity: equal strings may live in distinct
        # objects, and a missing region (NaN) must still match the next NaN.
        same_group = (
            previous is not None
            and _is_same_region(region, previous[0])
            and country == previous[1]
        )
        if not same_group:
            current_count = 0
            found_case = False
        if cases != 0:
            if found_case:
                current_count += 1
            else:
                # The first non-zero row itself is day 0.
                found_case = True
        day_counts.append(current_count)
        previous = (region, country)
    data['Days since first infection'] = day_counts
    if write_file:
        data.to_csv('added_days_train.csv', index=False)
    

def add_days_from_first_infection_test(train_data: pd.DataFrame, test_data: pd.DataFrame, write_file=False):
    """Continue each group's day counter from ``train_data`` into ``test_data``.

    Both frames are read by position: columns 1 and 2 form the group key.
    For every key the value in the LAST column of its last train row is
    remembered (presumably the 'Days since first infection' column -- confirm
    that it is still the final column when this is called). Each run of
    consecutive test rows with the same key then receives that value + 1,
    + 2, ... in order. The result is stored in ``test_data`` in place as
    'Days since first infection'.

    Args:
        train_data: frame whose last column holds the counter to continue.
        test_data: frame to annotate; it is modified in place.
        write_file: when True, the annotated test frame is also written to
            ``added_days_test.csv`` in the working directory.

    Returns:
        None.

    Raises:
        KeyError: if a test row's key never occurs in ``train_data``.
    """
    last_counts = {}
    for row in train_data.values:
        # A tuple key keeps region and country separate, so "ab"/"c" cannot
        # collide with "a"/"bc". str() makes missing values (NaN) hash and
        # compare consistently. Later rows overwrite earlier ones.
        last_counts[(str(row[1]), str(row[2]))] = row[-1]

    day_counts = []
    previous_key = None
    current_count = 0
    for row in test_data.values:
        key = (str(row[1]), str(row[2]))
        if key != previous_key:
            # New run of rows: resume right after the stored train value.
            current_count = last_counts[key] + 1
        else:
            current_count += 1
        day_counts.append(current_count)
        previous_key = key
    test_data['Days since first infection'] = day_counts
    if write_file:
        test_data.to_csv('added_days_test.csv', index=False)

In [82]:
def replace_missing_values_with_mean(data, write_file=False):
    """Return a copy of ``data`` in which zeros and NaNs are replaced by column means.

    Cells equal to 0 are masked to NaN, then every NaN in a numeric column is
    filled with that column's mean. The mean is taken over the ORIGINAL
    column, so the zeros being replaced still contribute to it. Non-numeric
    columns have no mean and are left untouched by the fill.

    Args:
        data: input frame; it is not modified.
        write_file: when True, the cleaned frame is also written to
            ``extra_features_improved.xlsx`` in the working directory.

    Returns:
        pandas.DataFrame: the cleaned frame.
    """
    # numeric_only=True keeps text columns from raising in the mean.
    column_means = data.mean(numeric_only=True)
    cleaned = data.mask(data == 0).fillna(column_means)
    if write_file:
        cleaned.to_excel('extra_features_improved.xlsx', index=False)
    # Return the result: rebinding the local name never reaches the caller.
    return cleaned

def merge_with_extra(train_df: pd.DataFrame, test_df: pd.DataFrame, extra_df: pd.DataFrame, write_file=False):
    """Left-join ``extra_df`` onto the train and test frames by 'Country/Region'.

    Rows of the train/test frames without a match in ``extra_df`` are kept,
    with missing values in the added columns. The inputs are not modified.

    Args:
        train_df: training frame containing a 'Country/Region' column.
        test_df: test frame containing a 'Country/Region' column.
        extra_df: frame of additional columns keyed by 'Country/Region'.
        write_file: when True, the merged frames are also written to
            ``extended_train_merged.csv`` and ``extended_test_merged.csv``
            in the working directory.

    Returns:
        tuple: ``(merged_train, merged_test)``.
    """
    merged_train = train_df.merge(extra_df, how='left', on='Country/Region')
    merged_test = test_df.merge(extra_df, how='left', on='Country/Region')
    if write_file:
        merged_train.to_csv('extended_train_merged.csv', index=False)
        merged_test.to_csv('extended_test_merged.csv', index=False)
    # merge() builds new frames, so they must be returned to reach the caller.
    return merged_train, merged_test
    

In [83]:
def root_mean_squared_log_error(y_true, y_predicted):
    """Return the square root of scikit-learn's mean squared log error.

    Args:
        y_true: ground-truth target values.
        y_predicted: predicted target values.
    """
    msle = mean_squared_log_error(y_true, y_predicted)
    return np.sqrt(msle)


def _rmsle_tensor_loss(y_true, y_pred):
    """Root mean squared log error built from Keras backend ops.

    A training loss must be expressed in tensor operations so that it can be
    evaluated and differentiated on Keras tensors; a NumPy / scikit-learn
    metric cannot be used for that.
    """
    from keras import backend as K

    # The Dense heads are unbounded, so clamp predictions at zero to keep
    # log(1 + y_pred) finite. NOTE(review): clamped predictions receive no
    # gradient from this loss -- confirm this is acceptable for training.
    y_pred = K.maximum(y_pred, 0.0)
    log_diff = K.log(1.0 + y_pred) - K.log(1.0 + y_true)
    return K.sqrt(K.mean(K.square(log_diff)))


def build_model(train_data):
    """Build and compile a two-output bidirectional LSTM regressor.

    The input shape is taken from the first element of ``train_data``. Two
    stacked bidirectional LSTM layers (32 units, softsign activation) feed
    two independent single-unit Dense heads. The model is compiled with
    Adagrad and an RMSLE loss on each head.

    Args:
        train_data: indexable collection of samples; ``train_data[0].shape``
            defines the model's input shape.

    Returns:
        The compiled Keras ``Model`` with outputs ``[output_1, output_2]``.
    """
    inputs = Input(shape=train_data[0].shape)
    lstm_1 = Bidirectional(LSTM(units=32, activation='softsign', return_sequences=True))(inputs)
    lstm_2 = Bidirectional(LSTM(units=32, activation='softsign'))(lstm_1)
    output_1 = Dense(1)(lstm_2)
    output_2 = Dense(1)(lstm_2)
    model = Model(inputs=inputs, outputs=[output_1, output_2])
    # NOTE(review): 'acc' is kept for compatibility with existing training
    # logs, but accuracy is not informative for a regression target.
    model.compile(optimizer=Adagrad(), loss=[_rmsle_tensor_loss, _rmsle_tensor_loss], metrics=['acc'])
    return model

In [133]:
def get_features(df, maxlen=None):
    """Turn a long-format frame into per-group feature and label arrays.

    Rows are grouped by ('Country/Region', 'Province/State'); missing
    provinces are replaced by '<placeholder>' on an internal copy so those
    groups are not dropped by the group-by. Columns are selected by
    position: labels are columns 16 and 17, features are columns 3..21
    except 5, 16 and 17. The selected values are cast to float.

    Args:
        df: input frame with at least 22 columns; it is not modified.
        maxlen: target length of the second (row-within-group) axis. Shorter
            sequences are padded with trailing zeros; sequences that are
            already at least this long are returned unchanged (never
            truncated). ``None`` disables padding.

    Returns:
        tuple: ``(features, labels)`` with shapes
        ``(n_groups, seq_len, 16)`` and ``(n_groups, seq_len, 2)``.

    Raises:
        ValueError: raised by ``np.stack`` when the groups do not all have
            the same number of rows (or when there are no groups).
    """
    feature_columns = [i for i in range(3, 22) if i not in (5, 16, 17)]
    label_columns = [16, 17]

    def pad_time_axis(data, target_len):
        # Append zero rows along axis 1 until it reaches target_len.
        missing = target_len - data.shape[1]
        if missing <= 0:
            return data
        return np.pad(data, ((0, 0), (0, missing), (0, 0)), mode='constant')

    # Work on a copy so the caller's 'Province/State' column keeps its NaNs.
    frame = df.copy()
    frame['Province/State'] = frame['Province/State'].fillna('<placeholder>')
    groups = np.stack([
        group.values
        for _, group in frame.groupby(['Country/Region', 'Province/State'])
    ])

    # Without an explicit maxlen keep the natural sequence length; the frame's
    # column count is unrelated to the time axis and must not be used here.
    seqlen = groups.shape[1] if maxlen is None else maxlen

    features = pad_time_axis(groups[:, :, feature_columns].astype(float), seqlen)
    labels = pad_time_axis(groups[:, :, label_columns].astype(float), seqlen)
    return features, labels

In [134]:
# Load the extended train/test frames via load_extended_data().
train, test = load_extended_data()
# Merging the extra features is currently disabled:
# extra_data = pd.read_excel('data/extra_features.xlsx')
# merge_with_extra(train, test, extra_data)
# Build the (features, labels) arrays with maxlen=62. As the cell's last
# expression, the returned tuple is what gets displayed as the cell output.
get_features(train, 62)

(array([[[ 33.        ,  65.        ,  45.64371021, ...,   2.3797546 ,
            2.58492694, 400.15337423],
         [ 33.        ,  65.        ,  45.64371021, ...,   2.3797546 ,
            2.58492694, 400.15337423],
         [ 33.        ,  65.        ,  45.64371021, ...,   2.3797546 ,
            2.58492694, 400.15337423],
         ...,
         [ 33.        ,  65.        ,  45.64371021, ...,   2.3797546 ,
            2.58492694, 400.15337423],
         [ 33.        ,  65.        ,  45.64371021, ...,   2.3797546 ,
            2.58492694, 400.15337423],
         [ 33.        ,  65.        ,  45.64371021, ...,   2.3797546 ,
            2.58492694, 400.15337423]],
 
        [[ 41.1533    ,  20.1683    ,  27.71428571, ...,   2.3797546 ,
           13.74473591, 400.15337423],
         [ 41.1533    ,  20.1683    ,  27.71428571, ...,   2.3797546 ,
           13.74473591, 400.15337423],
         [ 41.1533    ,  20.1683    ,  27.71428571, ...,   2.3797546 ,
           13.74473591, 400.1533