In [61]:
import os
import pandas as pd
import numpy as np

from keras.layers import LSTM, Dense, Input, Bidirectional
from keras.optimizers import Adagrad, Adam
from keras.losses import mean_squared_logarithmic_error
from keras.utils import to_categorical
from keras.models import load_model, Model
from keras.preprocessing.sequence import pad_sequences
import keras.backend as K


In [62]:
# Directory that the loader functions join with the CSV file names.
DATA_DIR = 'data'
# File that create_output_file opens for writing the predictions.
OUTPUT_FILE = 'submission.csv'
# Sequence length used for the train data (default padding length in get_features).
TRAIN_SEQ_SIZE = 62
# Sequence length of the test data; number of predicted steps written per sequence.
TEST_SEQ_SIZE = 43

def load_initial_data():
    """Read 'train.csv' and 'test.csv' from DATA_DIR.

    Returns:
        A ``(train, test)`` tuple of DataFrames, in that order.
    """
    frames = [
        pd.read_csv(os.path.join(DATA_DIR, file_name))
        for file_name in ('train.csv', 'test.csv')
    ]
    return frames[0], frames[1]

def load_extended_data():
    """Read 'extended_train.csv' and 'extended_test.csv' from DATA_DIR.

    Returns:
        A ``(train, test)`` tuple of DataFrames, in that order.
    """
    extended_train = pd.read_csv(os.path.join(DATA_DIR, 'extended_train.csv'))
    extended_test = pd.read_csv(os.path.join(DATA_DIR, 'extended_test.csv'))
    return extended_train, extended_test

In [63]:
def _same_label(left, right):
    """Return True when two labels match, treating two missing values as equal."""
    left_missing, right_missing = pd.isna(left), pd.isna(right)
    if left_missing or right_missing:
        return bool(left_missing and right_missing)
    return left == right


def extract_days_from_first_infection(data: pd.DataFrame, write_file=False):
    """Add a 'Days since first infection' column to ``data`` in place.

    Rows are read positionally: columns 1 and 2 hold the region and country
    labels that identify a location, and column 6 holds the case count
    (presumably the cumulative confirmed cases -- confirm against the CSV).
    Rows of one location are expected to be contiguous and in date order.

    For every location the counter stays at 0 up to and including the first
    row with a non-zero case count, then increases by one for every later row
    with a non-zero case count.

    Args:
        data: Frame to annotate; it is modified in place.
        write_file: When True, also write the frame to 'added_days_train.csv'.

    Returns:
        None.
    """
    result = list()
    prev_region, prev_country = None, None
    first_row = True
    current_count = 0
    found_case = False
    for value in data.values:
        region, country, cases = value[1], value[2], value[6]
        # Labels are compared by value (NaN-aware). An identity check would
        # only work when pandas happens to reuse the same string object.
        same_location = (
            not first_row
            and _same_label(region, prev_region)
            and _same_label(country, prev_country)
        )
        if not same_location:
            # New location: restart the counter.
            found_case = False
            current_count = 0
        if cases == 0:
            result.append(current_count)
        elif not found_case:
            # First row with cases is day 0 of the infection.
            found_case = True
            result.append(current_count)
        else:
            current_count += 1
            result.append(current_count)
        prev_region, prev_country = region, country
        first_row = False
    data['Days since first infection'] = result
    if write_file:
        data.to_csv('added_days_train.csv', index=False)
    

def add_days_from_first_infection_test(train_data: pd.DataFrame, test_data: pd.DataFrame, write_file=False):
    """Continue the per-location day counter from the train data into the test data.

    For each location (columns 1 and 2 of a row) the last value found in the
    final column of ``train_data`` is remembered; that column is assumed to be
    'Days since first infection' -- confirm the column order of the train
    frame. Test rows of a location then receive that value + 1, + 2, ... in
    row order. The result is stored in place in ``test_data`` as
    'Days since first infection'.

    Args:
        train_data: Train frame whose last column holds the day counter.
        test_data: Test frame to annotate; it is modified in place.
        write_file: When True, also write the test frame to 'added_days_test.csv'.

    Returns:
        None.

    Raises:
        KeyError: If a test location does not appear in ``train_data``.
    """
    max_counts = dict()
    result = list()
    for value in train_data.values:
        # Tuple keys keep the two labels separate. Plain concatenation would
        # merge e.g. ('ab', 'c') and ('a', 'bc') into the same key. str() keeps
        # missing labels comparable (every NaN becomes 'nan').
        max_counts[(str(value[1]), str(value[2]))] = value[-1]
    previous_key = None
    current_count = 0
    for value in test_data.values:
        key = (str(value[1]), str(value[2]))
        if previous_key is None or key != previous_key:
            # First test row of a location continues right after the train data.
            current_count = max_counts[key] + 1
        else:
            current_count += 1
        result.append(current_count)
        previous_key = key
    test_data['Days since first infection'] = result
    if write_file:
        test_data.to_csv('added_days_test.csv', index=False)

In [64]:
def replace_missing_values_with_mean(data, write_file=False):
    """Replace zeros (treated as missing) and NaNs with the column mean.

    The input frame is not modified; the cleaned copy is returned so the
    result is available even when no file is written.

    Args:
        data: DataFrame whose zero entries mark missing values.
        write_file: When True, also write the cleaned frame to
            'extra_features_improved.xlsx'.

    Returns:
        The cleaned DataFrame.
    """
    # Only numeric columns have a mean; without numeric_only=True recent pandas
    # raises on text columns.
    # NOTE(review): the mean is taken over the original values, zeros included,
    # exactly as before -- confirm whether zeros should be excluded.
    column_means = data.mean(numeric_only=True)
    data = data.mask(data == 0).fillna(column_means)
    if write_file:
        data.to_excel('extra_features_improved.xlsx', index=False)
    return data

def merge_with_extra(train_df: pd.DataFrame, test_df: pd.DataFrame, extra_df: pd.DataFrame, write_file=False):
    """Left-join the extra per-country features onto the train and test frames.

    The join key is the 'Country/Region' column. The input frames are not
    modified; the merged frames are returned so the result is available even
    when no file is written.

    Args:
        train_df: Train frame containing a 'Country/Region' column.
        test_df: Test frame containing a 'Country/Region' column.
        extra_df: Extra features keyed by 'Country/Region'.
        write_file: When True, also write the merged frames to
            'extended_train_merged.csv' and 'extended_test_merged.csv'.

    Returns:
        A ``(train_df, test_df)`` tuple with the merged frames.
    """
    train_df = train_df.merge(extra_df, how='left', on='Country/Region')
    test_df = test_df.merge(extra_df, how='left', on='Country/Region')
    if write_file:
        train_df.to_csv('extended_train_merged.csv', index=False)
        test_df.to_csv('extended_test_merged.csv', index=False)
    return train_df, test_df
    

In [65]:
def root_mean_squared_log_error(y_true, y_predicted):
    """Return the square root of Keras' mean squared logarithmic error.

    Args:
        y_true: Ground-truth values.
        y_predicted: Predicted values.
    """
    squared_log_error = mean_squared_logarithmic_error(y_true, y_predicted)
    return K.sqrt(squared_log_error)


def build_model(train_features, train_labels, build_version=1, load_path=None):
    """Load a saved model, or build, train and save a new one.

    Args:
        train_features: Training inputs; the shape of the first sample defines
            the model's input shape.
        train_labels: Training targets passed to ``fit``.
        build_version: 1 stacks two plain LSTM layers (16 and 8 units) and
            saves to 'model_v1.h5'; any other value stacks two bidirectional
            LSTM layers (32 units each) and saves to 'model_v2.h5'.
        load_path: When truthy, the model is loaded from this path and no
            training takes place.

    Returns:
        The loaded or freshly trained model. Its summary is printed first.
    """
    if load_path:
        network = load_model(load_path)
        network.summary()
        return network

    use_plain_lstm = build_version == 1
    first_units, second_units = (16, 8) if use_plain_lstm else (32, 32)

    def recurrent(units):
        # Both versions share the LSTM settings; version 2 wraps it in Bidirectional.
        cell = LSTM(units=units, activation='softsign', return_sequences=True)
        return cell if use_plain_lstm else Bidirectional(cell)

    inputs = Input(shape=train_features[0].shape)
    hidden = recurrent(first_units)(inputs)
    hidden = recurrent(second_units)(hidden)
    hidden = Dense(4, activation='relu')(hidden)
    # Two output values per time step, matching the two label columns.
    predictions = Dense(2)(hidden)

    network = Model(inputs=inputs, outputs=predictions)
    network.compile(optimizer=Adagrad(), loss=mean_squared_logarithmic_error, metrics=['acc'])
    network.fit(train_features, train_labels, batch_size=64, epochs=10)
    network.save('model_v1.h5' if use_plain_lstm else 'model_v2.h5')

    network.summary()
    return network

In [66]:
def get_features(df: pd.DataFrame, seq_len=TRAIN_SEQ_SIZE, for_train=True):
    """Turn a flat frame into padded per-location sequences.

    Rows are grouped by ('Country/Region', 'Province/State'); each group
    becomes one sequence. Columns are selected by position, so the frame must
    have the column layout of the extended train / test files.

    Side effect: missing 'Province/State' values in ``df`` are replaced in
    place with '<placeholder>'.

    Args:
        df: Frame to convert.
        seq_len: Length every sequence is padded or truncated to (at the end).
        for_train: When True, label columns 16 and 17 are extracted as well and
            excluded from the features.

    Returns:
        ``(features, labels)`` as float32 arrays; ``labels`` is None when
        ``for_train`` is False.
    """
    # groupby drops rows whose key is NaN by default, so fill the key first.
    df['Province/State'] = df['Province/State'].fillna('<placeholder>')
    if for_train:
        feature_columns = [i for i in range(3, 22) if i not in [5, 16, 17]]
        label_columns = [16, 17]
    else:
        feature_columns = [i for i in range(3, 20) if i != 5]
        label_columns = None
    # Keep one array per group instead of stacking them. Stacking needs every
    # group to have the same number of rows, while pad_sequences accepts
    # sequences of different lengths.
    groups = [group.values for _, group in df.groupby(['Country/Region', 'Province/State'])]
    features = pad_sequences(
        [group[:, feature_columns] for group in groups],
        seq_len, padding='post', truncating='post', dtype='float32')
    if label_columns is not None:
        labels = pad_sequences(
            [group[:, label_columns] for group in groups],
            seq_len, padding='post', truncating='post', dtype='float32')
        return features, labels
    return features, None

In [67]:
def create_output_file(model: Model, test_features):
    """Predict on ``test_features`` and write the results to OUTPUT_FILE.

    The shape of the prediction array is printed. The file starts with the
    header 'ForecastId,ConfirmedCases,Fatalities'; then, for every predicted
    sequence, the first TEST_SEQ_SIZE steps are written one per line with a
    running id that starts at 1.

    Args:
        model: Model used for prediction.
        test_features: Input passed to ``model.predict``.
    """
    predictions = model.predict(test_features)
    print(predictions.shape)
    forecast_id = 1
    with open(OUTPUT_FILE, 'w') as submission:
        submission.write('ForecastId,ConfirmedCases,Fatalities\n')
        for sequence in predictions:
            for step in range(TEST_SEQ_SIZE):
                cases, fatalities = sequence[step][0], sequence[step][1]
                row = ','.join((str(forecast_id), str(cases), str(fatalities)))
                submission.write(row + '\n')
                forecast_id += 1

In [68]:
def extra_main():
    """Run the pipeline on the extended data: load, featurize, train, write output."""
    extended_train, extended_test = load_extended_data()

    # seq_len is left at its default for both calls, so the test sequences are
    # padded to the train sequence length as well.
    features_train, labels_train = get_features(extended_train)
    features_test, _ = get_features(extended_test, for_train=False)

    trained_model = build_model(features_train, labels_train, build_version=2)
    create_output_file(trained_model, features_test)


# Run the full pipeline: load the extended data, train model version 2 and write the submission file.
extra_main()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 62, 16)            0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 62, 64)            12544     
_________________________________________________________________
bidirectional_4 (Bidirection (None, 62, 64)            24832     
_________________________________________________________________
dense_9 (Dense)              (None, 62, 4)             260       
_________________________________________________________________
dense_10 (Dense)             (None, 62, 2)             10        
Total params: 37,646
Trainable params: 37,646
Non-trainable params: 0
_________________________________________________________________
(284, 62, 