In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from scipy.optimize import curve_fit

In [2]:
# Directory that holds the input CSV files (train.csv / test.csv are read from here).
DATA_DIR = 'data'
# File name intended for the generated submission CSV.
OUTPUT_FILE = 'submission.csv'
# NOTE(review): the two sizes below are not referenced in the visible cells;
# presumably the number of days per region in the train / test sets - TODO confirm.
TRAIN_SEQ_SIZE = 62
TEST_SEQ_SIZE = 43

def extract_days_fist_infection(data):
    """Add a 'Days since first infection' column to `data`, in place.

    For every (Country_Region, Province_State) group the new column holds the
    running count of rows whose 'ConfirmedCases' is strictly positive, so it
    stays at 0 until the first positive row and grows by one on each positive
    row after that.

    The frame is mutated; nothing is returned.
    """
    flag_col = 'Confirmed>0'
    region_keys = ['Country_Region', 'Province_State']

    # Temporary boolean marker: True on rows with at least one confirmed case.
    data[flag_col] = data['ConfirmedCases'].gt(0)

    # Per-region running total of the marker.
    running_count = data.groupby(region_keys)[flag_col].cumsum()
    data['Days since first infection'] = running_count.astype(int)

    # The marker was only a helper; remove it again.
    data.drop(columns=[flag_col], inplace=True)


def load_initial_data():
    """Read the raw train and test CSV files from DATA_DIR.

    Returns a `(train_data, test_data)` tuple of DataFrames exactly as
    `pd.read_csv` produces them (no parsing options, no extra columns).
    """
    csv_paths = [os.path.join(DATA_DIR, file_name)
                 for file_name in ('train.csv', 'test.csv')]
    train_data, test_data = map(pd.read_csv, csv_paths)
    return train_data, test_data


def load_extended_data():
    """Load train/test CSVs indexed by date and add per-region day counters.

    Both frames are read from DATA_DIR, indexed by the parsed 'Date' column,
    and missing 'Province_State' values are replaced by '<placeholder>'.

    Columns added to the training frame (`labeled`):
      * 'Days since first infection' - via `extract_days_fist_infection`.
      * 'Dummy' - 1-based row counter inside each region.

    Columns added to the test frame (`unlabeled`):
      * 'Dummy' - forecast row counter inside each region, offset by the
        largest per-region training row count so it continues the training
        counter.
      * 'Days since first infection' - the region's last training value plus
        the forecast row counter.

    Returns
    -------
    tuple
        `(labeled, unlabeled)` DataFrames.
    """
    region_keys = ['Country_Region', 'Province_State']

    def read_indexed_by_date(file_name):
        # Dates are parsed after reading instead of through
        # `read_csv(date_parser=...)` with `pd.datetime`: both are deprecated
        # or removed in current pandas releases.
        frame = pd.read_csv(os.path.join(DATA_DIR, file_name))
        frame['Date'] = pd.to_datetime(frame['Date'], format='%Y-%m-%d')
        frame = frame.set_index('Date')
        # Give rows without a province a sentinel so they still form a group.
        frame['Province_State'] = frame['Province_State'].fillna('<placeholder>')
        return frame

    labeled = read_indexed_by_date('train.csv')
    unlabeled = read_indexed_by_date('test.csv')

    extract_days_fist_infection(labeled)
    labeled['Dummy'] = labeled.groupby(region_keys).cumcount() + 1

    n_days_training = labeled.groupby(region_keys)['Id'].count().max()

    # Last observed day counter per region, looked up per test row by region
    # key. A key-based join does not depend on the row order of the test file
    # or on every region having the same number of forecast rows.
    last_days = labeled \
        .groupby(region_keys)['Days since first infection'] \
        .max() \
        .rename('_last_day')
    last_day_per_row = unlabeled.join(last_days, on=region_keys)['_last_day']

    forecast_step = unlabeled.groupby(region_keys).cumcount() + 1

    # `.values` keeps the arithmetic positional: the Date index contains
    # duplicates (one entry per region), so label alignment is not wanted.
    unlabeled['Dummy'] = forecast_step.values + n_days_training
    unlabeled['Days since first infection'] = \
        last_day_per_row.values + forecast_step.values
    return labeled, unlabeled

In [4]:
def convert_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a sequence as a supervised-learning table of lagged columns.

    Parameters
    ----------
    data : list, array or DataFrame
        Observations, one row per time step. Anything `pd.DataFrame` accepts.
    n_in : int
        Number of lagged steps (t-n_in ... t-1) to use as inputs.
    n_out : int
        Number of steps (t ... t+n_out-1) to use as outputs.
    dropnan : bool
        Drop rows that contain NaN introduced by the shifting.

    Returns
    -------
    pd.DataFrame
        Columns are named 'var<j>(t-<i>)', 'var<j>(t)' and 'var<j>(t+<i>)'.
    """
    df = pd.DataFrame(data)
    # Count variables on the constructed frame so nested lists, 1-D arrays
    # and Series are all handled, not just flat lists and 2-D arrays.
    n_vars = df.shape[1]
    cols, names = [], []

    # Input sequence: t-n_in, ..., t-1
    for lag in range(n_in, 0, -1):
        cols.append(df.shift(lag))
        names += ['var%d(t-%d)' % (j + 1, lag) for j in range(n_vars)]

    # Output sequence: t, t+1, ..., t+n_out-1
    for step in range(n_out):
        cols.append(df.shift(-step))
        if step == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            # Both placeholders need a value; passing only the variable
            # number raised TypeError for every n_out > 1.
            names += ['var%d(t+%d)' % (j + 1, step) for j in range(n_vars)]

    # Combine all shifted copies side by side.
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    if dropnan:
        agg = agg.dropna()
    return agg

In [5]:
def linear_regression(labeled, unlabeled, dummy=False, remove_duplicates=False):
    """Fit one linear model per region and predict the forecast rows.

    Parameters
    ----------
    labeled : pd.DataFrame
        Training rows with 'Country_Region', 'Province_State',
        'ConfirmedCases', 'Fatalities' and the feature column.
    unlabeled : pd.DataFrame
        Forecast rows with the region keys, 'ForecastId' and the feature
        column.
    dummy : bool
        Use the 'Dummy' column as the single feature instead of
        'Days since first infection'.
    remove_duplicates : bool
        Only used when `dummy` is False: skip the leading rows of a region so
        that the number of remaining rows equals the number of distinct
        feature values.

    Returns
    -------
    pd.DataFrame
        Columns 'ForecastId', 'ConfirmedCases', 'Fatalities' (targets cast to
        int), indexed like `unlabeled`.
    """
    region_keys = ['Country_Region', 'Province_State']
    target_cols = ['ConfirmedCases', 'Fatalities']
    feature_col = 'Dummy' if dummy else 'Days since first infection'

    fit_models = {}
    for key, group in labeled.groupby(region_keys):
        start_idx = 0
        if remove_duplicates and not dummy:
            unique_days = len(group[feature_col].drop_duplicates(keep='last'))
            start_idx = len(group) - unique_days
        train = group.iloc[start_idx:]
        X = train[feature_col].values.reshape(-1, 1)
        y = train[target_cols].values
        fit_models[key] = LinearRegression().fit(X, y)
        print('linear regression => (%s, %s)' % (key[0], key[1]))

    # Write each region's predictions back to that region's own row
    # positions. groupby yields regions in sorted key order, which need not
    # be the row order of `unlabeled`; stacking group results and assigning
    # them positionally would then attach predictions to the wrong rows.
    # NaN-initialised so any row left unfilled fails loudly in the int cast.
    feature_values = unlabeled[feature_col].values
    predictions = np.full((len(unlabeled), len(target_cols)), np.nan)
    for key, positions in unlabeled.groupby(region_keys).indices.items():
        X = feature_values[positions].reshape(-1, 1)
        predictions[positions] = fit_models[key].predict(X)

    predictionsdf = unlabeled[['ForecastId']].copy()
    predictionsdf['ConfirmedCases'] = predictions[:, 0]
    predictionsdf['Fatalities'] = predictions[:, 1]
    return predictionsdf.astype({'ConfirmedCases': int, 'Fatalities': int})

In [6]:
def polynomial_regression(labeled, unlabeled, dummy=False, degree=4,
                          remove_duplicates=False):
    """Fit one polynomial model per region and predict the forecast rows.

    The single feature is expanded with `PolynomialFeatures(degree)` and a
    `LinearRegression` is fitted on the expanded features for each
    (Country_Region, Province_State) group.

    Parameters
    ----------
    labeled : pd.DataFrame
        Training rows with the region keys, 'ConfirmedCases', 'Fatalities'
        and the feature column.
    unlabeled : pd.DataFrame
        Forecast rows with the region keys, 'ForecastId' and the feature
        column.
    dummy : bool
        Use the 'Dummy' column as the feature instead of
        'Days since first infection'.
    degree : int
        Polynomial degree passed to `PolynomialFeatures`.
    remove_duplicates : bool
        Only used when `dummy` is False: skip the leading rows of a region so
        that the number of remaining rows equals the number of distinct
        feature values.

    Returns
    -------
    pd.DataFrame
        Columns 'ForecastId', 'ConfirmedCases', 'Fatalities' (targets cast to
        int), indexed like `unlabeled`.
    """
    region_keys = ['Country_Region', 'Province_State']
    target_cols = ['ConfirmedCases', 'Fatalities']
    feature_col = 'Dummy' if dummy else 'Days since first infection'
    # One transformer is enough: the input is always a single column.
    poly_features = PolynomialFeatures(degree=degree)

    fit_models = {}
    for key, group in labeled.groupby(region_keys):
        start_idx = 0
        if remove_duplicates and not dummy:
            unique_days = len(group[feature_col].drop_duplicates(keep='last'))
            start_idx = len(group) - unique_days
        train = group.iloc[start_idx:]
        X = poly_features.fit_transform(train[feature_col].values.reshape(-1, 1))
        y = train[target_cols].values
        fit_models[key] = LinearRegression().fit(X, y)
        print('polynomial regression => (%s, %s)' % (key[0], key[1]))

    # Write each region's predictions back to that region's own row
    # positions. groupby yields regions in sorted key order, which need not
    # be the row order of `unlabeled`; stacking group results and assigning
    # them positionally would then attach predictions to the wrong rows.
    # NaN-initialised so any row left unfilled fails loudly in the int cast.
    feature_values = unlabeled[feature_col].values
    predictions = np.full((len(unlabeled), len(target_cols)), np.nan)
    for key, positions in unlabeled.groupby(region_keys).indices.items():
        X = poly_features.fit_transform(feature_values[positions].reshape(-1, 1))
        predictions[positions] = fit_models[key].predict(X)

    predictionsdf = unlabeled[['ForecastId']].copy()
    predictionsdf['ConfirmedCases'] = predictions[:, 0]
    predictionsdf['Fatalities'] = predictions[:, 1]
    return predictionsdf.astype({'ConfirmedCases': int, 'Fatalities': int})

In [7]:
def sigmoid(X, M, beta, alpha):
    """Logistic curve `M / (1 + exp(-beta * (X - alpha)))`.

    Parameters
    ----------
    X : scalar or array-like
        Points at which to evaluate the curve.
    M : float
        Upper asymptote (value approached as the exponent goes to -inf).
    beta : float
        Growth rate.
    alpha : float
        Midpoint; the curve equals M / 2 at X == alpha.
    """
    # exp() overflows to inf for large positive exponents; the quotient is
    # then 0, which is the correct limit, so the overflow warning is noise.
    with np.errstate(over='ignore'):
        return M / (1 + np.exp(-beta * (X - alpha)))

def sigmoid_regression(labeled, unlabeled, dummy=False,
                       remove_duplicates=False):
    """Fit a logistic curve per region and predict the forecast rows.

    For each (Country_Region, Province_State) group two curves are fitted
    with `curve_fit(sigmoid, ...)`: one for 'ConfirmedCases' and one for
    'Fatalities'. If either fit raises RuntimeError, a two-output
    `LinearRegression` is fitted for that region instead.

    Parameters
    ----------
    labeled : pd.DataFrame
        Training rows with the region keys, 'ConfirmedCases', 'Fatalities'
        and the feature column.
    unlabeled : pd.DataFrame
        Forecast rows with the region keys, 'ForecastId' and the feature
        column.
    dummy : bool
        Use the 'Dummy' column as the feature instead of
        'Days since first infection'.
    remove_duplicates : bool
        Only used when `dummy` is False: skip the leading rows of a region so
        that the number of remaining rows equals the number of distinct
        feature values.

    Returns
    -------
    pd.DataFrame
        Columns 'ForecastId', 'ConfirmedCases', 'Fatalities' (targets cast to
        int), indexed like `unlabeled`.
    """
    region_keys = ['Country_Region', 'Province_State']
    target_cols = ['ConfirmedCases', 'Fatalities']
    feature_col = 'Dummy' if dummy else 'Days since first infection'
    initial_guess = [1000, 0.25, 100]  # starting (M, beta, alpha) for curve_fit

    fit_models = {}
    for key, group in labeled.groupby(region_keys):
        start_idx = 0
        if remove_duplicates and not dummy:
            unique_days = len(group[feature_col].drop_duplicates(keep='last'))
            start_idx = len(group) - unique_days
        train = group.iloc[start_idx:]
        X = train[feature_col].values
        y = train[target_cols].values
        y_c, y_f = y[:, 0], y[:, 1]
        try:
            model_c, _ = curve_fit(sigmoid, X, y_c,
                                   maxfev=15000, p0=initial_guess)
            model_f, _ = curve_fit(sigmoid, X, y_f,
                                   maxfev=15000, p0=initial_guess)
            fit_models[key] = {'cases': model_c, 'fatalities': model_f}
            print('sigmoid regression => (%s, %s)' % (key[0], key[1]))
        except RuntimeError as e:
            # Raised by curve_fit when the optimisation fails.
            print(e.args[0])
            print('fitting linear regression model instead')
            fit_models[key] = LinearRegression().fit(X.reshape(-1, 1), y)
            print('linear regression => (%s, %s)' % (key[0], key[1]))

    # Write each region's predictions back to that region's own row
    # positions. groupby yields regions in sorted key order, which need not
    # be the row order of `unlabeled`; stacking group results and assigning
    # them positionally would then attach predictions to the wrong rows.
    # NaN-initialised so any row left unfilled fails loudly in the int cast.
    feature_values = unlabeled[feature_col].values
    predictions = np.full((len(unlabeled), len(target_cols)), np.nan)
    for key, positions in unlabeled.groupby(region_keys).indices.items():
        X = feature_values[positions]
        model = fit_models[key]
        if isinstance(model, LinearRegression):
            y_pred = model.predict(X.reshape(-1, 1))
        else:
            y_pred = np.column_stack([sigmoid(X, *model['cases']),
                                      sigmoid(X, *model['fatalities'])])
        predictions[positions] = y_pred

    predictionsdf = unlabeled[['ForecastId']].copy()
    predictionsdf['ConfirmedCases'] = predictions[:, 0]
    predictionsdf['Fatalities'] = predictions[:, 1]
    return predictionsdf.astype({'ConfirmedCases': int, 'Fatalities': int})

In [8]:
def get_submissions(model_name, dummy_vars=False, poly_degree=4,
                    remove_duplicates=False):
    """Run the chosen model on the global frames and write the submission CSV.

    Reads the module-level `labeled` and `unlabeled` DataFrames, so those
    must exist before this is called, and writes the predictions to
    OUTPUT_FILE without the index.

    Parameters
    ----------
    model_name : str
        'linreg', 'polyreg' or 'sigmoid' (case-insensitive).
    dummy_vars : bool
        Forwarded as `dummy` to the model function.
    poly_degree : int
        Forwarded as `degree`; only used by 'polyreg'.
    remove_duplicates : bool
        Forwarded to the model function.

    Raises
    ------
    NotImplementedError
        If `model_name` is not one of the supported models.
    """
    model = model_name.lower()
    # Compare the lower-cased name in every branch so all three models are
    # matched case-insensitively.
    if model == 'linreg':
        predictions = linear_regression(labeled, unlabeled,
                                        dummy=dummy_vars,
                                        remove_duplicates=remove_duplicates)
    elif model == 'polyreg':
        predictions = polynomial_regression(labeled, unlabeled,
                                            dummy=dummy_vars,
                                            degree=poly_degree,
                                            remove_duplicates=remove_duplicates)
    elif model == 'sigmoid':
        predictions = sigmoid_regression(labeled, unlabeled,
                                         dummy=dummy_vars,
                                         remove_duplicates=remove_duplicates)
    else:
        # NotImplemented is a constant, not an exception class; calling it
        # raised TypeError instead of the intended error.
        raise NotImplementedError(f'{model} not implemented.')
    predictions.to_csv(OUTPUT_FILE, index=False)

In [11]:
# Load the date-indexed train/test frames. get_submissions reads `labeled`
# and `unlabeled` as module-level globals, so they must be bound here first.
labeled, unlabeled = load_extended_data()
# Fit the per-region sigmoid model and write the submission file.
# NOTE(review): poly_degree is only forwarded to the 'polyreg' model, so the
# value passed here has no effect on the sigmoid run.
get_submissions('sigmoid', dummy_vars=False, poly_degree=2,
                remove_duplicates=True)

  result = getattr(ufunc, method)(*inputs, **kwargs)


sigmoid regression => (Afghanistan, <placeholder>)
sigmoid regression => (Albania, <placeholder>)
sigmoid regression => (Algeria, <placeholder>)
sigmoid regression => (Andorra, <placeholder>)
sigmoid regression => (Angola, <placeholder>)
sigmoid regression => (Antigua and Barbuda, <placeholder>)
sigmoid regression => (Argentina, <placeholder>)




sigmoid regression => (Armenia, <placeholder>)
sigmoid regression => (Australia, Australian Capital Territory)
sigmoid regression => (Australia, New South Wales)
sigmoid regression => (Australia, Northern Territory)
sigmoid regression => (Australia, Queensland)
sigmoid regression => (Australia, South Australia)
sigmoid regression => (Australia, Tasmania)
sigmoid regression => (Australia, Victoria)
sigmoid regression => (Australia, Western Australia)
sigmoid regression => (Austria, <placeholder>)
sigmoid regression => (Azerbaijan, <placeholder>)
sigmoid regression => (Bahamas, <placeholder>)
sigmoid regression => (Bahrain, <placeholder>)
sigmoid regression => (Bangladesh, <placeholder>)
sigmoid regression => (Barbados, <placeholder>)
sigmoid regression => (Belarus, <placeholder>)
sigmoid regression => (Belgium, <placeholder>)
sigmoid regression => (Belize, <placeholder>)
sigmoid regression => (Benin, <placeholder>)
sigmoid regression => (Bhutan, <placeholder>)
sigmoid regression => (Bol

  result = getattr(ufunc, method)(*inputs, **kwargs)


sigmoid regression => (Dominican Republic, <placeholder>)
sigmoid regression => (Ecuador, <placeholder>)
sigmoid regression => (Egypt, <placeholder>)
sigmoid regression => (El Salvador, <placeholder>)
sigmoid regression => (Equatorial Guinea, <placeholder>)
sigmoid regression => (Eritrea, <placeholder>)
sigmoid regression => (Estonia, <placeholder>)
sigmoid regression => (Eswatini, <placeholder>)
sigmoid regression => (Ethiopia, <placeholder>)
sigmoid regression => (Fiji, <placeholder>)
sigmoid regression => (Finland, <placeholder>)
sigmoid regression => (France, <placeholder>)
sigmoid regression => (France, French Guiana)
sigmoid regression => (France, French Polynesia)
sigmoid regression => (France, Guadeloupe)
sigmoid regression => (France, Martinique)
sigmoid regression => (France, Mayotte)
sigmoid regression => (France, New Caledonia)
sigmoid regression => (France, Reunion)
sigmoid regression => (France, Saint Barthelemy)
sigmoid regression => (France, Saint Pierre and Miquelon)
s

sigmoid regression => (US, Washington)
sigmoid regression => (US, West Virginia)
sigmoid regression => (US, Wisconsin)
sigmoid regression => (US, Wyoming)
sigmoid regression => (Uganda, <placeholder>)
sigmoid regression => (Ukraine, <placeholder>)
sigmoid regression => (United Arab Emirates, <placeholder>)
sigmoid regression => (United Kingdom, <placeholder>)
sigmoid regression => (United Kingdom, Anguilla)
sigmoid regression => (United Kingdom, Bermuda)
sigmoid regression => (United Kingdom, British Virgin Islands)
sigmoid regression => (United Kingdom, Cayman Islands)
sigmoid regression => (United Kingdom, Channel Islands)
sigmoid regression => (United Kingdom, Falkland Islands (Malvinas))
sigmoid regression => (United Kingdom, Gibraltar)
sigmoid regression => (United Kingdom, Isle of Man)
sigmoid regression => (United Kingdom, Montserrat)
sigmoid regression => (United Kingdom, Turks and Caicos Islands)
sigmoid regression => (Uruguay, <placeholder>)
sigmoid regression => (Uzbekistan,