In [1]:
# Packages
import numpy as np
import sklearn 
import pandas as pd
import matplotlib.pyplot as plt
import scipy
from scipy.integrate import odeint
from scipy.optimize import minimize
from utility_code.utility import utils

In [2]:
trainrd2 = pd.read_csv("train_round2.csv")
datestofind = trainrd2['Date'] == '09-01-2020'
start = datestofind[datestofind == True].index[0]
datestofind2 = trainrd2['Date'] == '09-26-2020'
end = datestofind[datestofind == True].index[49]

bestsub = pd.read_csv("team31-nov29-2.csv")

validation = trainrd2.iloc[start:end,:]
def MAPE(pred, valid):
    pred = pred.reset_index()
    valid = valid.reset_index()
    pred = pred.astype('int64')
    valid = valid.astype('int64')
    v = pred.subtract(valid)
    v = v.divide(valid)
    v = v.abs()
    v = v.sum(axis = 0)
    #v = v[0]+v[1]+v[2]
    n = len(pred)
    return v/n

In [3]:
#import csvs
train = pd.read_csv("ucla2020-cs145-covid19-prediction/train.csv")
test = pd.read_csv("ucla2020-cs145-covid19-prediction/test.csv")
graph = pd.read_csv("ucla2020-cs145-covid19-prediction/graph.csv")
supp = pd.read_csv("data-test/raw_data_test.csv", skiprows=2, thousands=',')
supp = supp[supp['Location'].isin(train['Province_State'])]
supp['Population'] = supp['Number of COVID-19 Cases'].divide(supp['COVID-19 Cases per 1,000,000 Population']) * 1e6

states = pd.Series.unique(train['Province_State'])
num_states = len(states)

In [10]:
## MANIPULATES MODEL AND MODEL PARAMS
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge
import random

###################################################
#TRY DIFFERENT VALUES FOR PARAMETERS AND KEEP THE BEST
best_parameter_setup = { 'alpha': 1.6625501873415338,
                         'max_iter': 2267,
                         'tol': 0.0018737851734607867,
                         'random_state':229018,
                         'window_size':5,
                         'confirmed_error': 7.667212448385207e-05,
                         'deaths_error': 0.00016738204091735175  }

best_parameter_setups = [best_parameter_setup]
confirmed_error_best = 1 
deaths_error_best = 1

#standard deviations: try increasing sd_scale for more variance (or decreasing for opposite effect)
sd_scale = 8
alpha_sd = 0.2 * sd_scale;
max_iter_sd = 100 * sd_scale;
tol_sd = 5e-4 * sd_scale;

#returns sample taken from normal distribution, bounds the sample with min and max
DEFAULT_MIN = 1e-25
DEFAULT_MAX = 999999
def normal_sample (mean,sd,mn,mx,rnd = False):
    sample = max(mn, min(mx, np.random.normal(mean, sd)))
    if rnd:
        return int(sample)
    else:
        return sample


ITERATIONS = 10
parameter_setup = {}

print("Starting using previous best setup...")

for it in range(ITERATIONS+1):
    
    #apply normal "randomization"
    parameter_setup = {}
    if it > 0:
        parameter_setup['alpha'] = normal_sample( best_parameter_setups[-1]['alpha'], alpha_sd, DEFAULT_MIN, DEFAULT_MAX)
        parameter_setup['max_iter'] = normal_sample( best_parameter_setups[-1]['max_iter'], max_iter_sd, 1, DEFAULT_MAX, rnd=True)
        parameter_setup['tol'] = normal_sample( best_parameter_setups[-1]['tol'], tol_sd, DEFAULT_MIN, 20) 
        parameter_setup['random_state'] = random.randint(0,DEFAULT_MAX)
        parameter_setup['window_size'] = random.randint(1,14)
    else:
        parameter_setup = best_parameter_setups[-1]
    
    
    #choose a regressor
    reg = MultiOutputRegressor(estimator=Ridge(
                                    alpha=parameter_setup['alpha'],
                                    max_iter=parameter_setup['max_iter'],
                                    tol=parameter_setup['tol'],
                                    random_state = parameter_setup['random_state']))
        
###################################################

    ## MANIPULATES FEATURES FROM DF TO USE AND WINDOW SIZE
    #only look at the features in features list
    features = ['Confirmed','Deaths']
    num_features = len(features)

    #stratify by state (into state dictionary)
    statesdata = {}
    for s in states:
        statesdata[s] = train.loc[train['Province_State'] == s,features]

    
    ###################################################
    WINDOW_SIZE = parameter_setup['window_size']
    ###################################################
    
    state_feature_indices = utils.get_column_indices(statesdata['California'],features)

    #append the feature spaces from the W days prior (where W is the window length)
    new_features = []
    for day in range(WINDOW_SIZE):
        for f in features:
            new_features.append(f + "(-"+ str(WINDOW_SIZE-day) + " days)")
    all_new_features = new_features + features

    ## Set up dictionary of projections
    proj = {}

    ## Loop over states
    for s in states:

        a = statesdata[s]

        #fill the knn data using days from training set
        knndata = pd.DataFrame(columns = all_new_features)
        num_training_days = len(statesdata['California'])

        #fill the table
        for d in range(WINDOW_SIZE,num_training_days):
            knndata_row_index = knndata.shape[0]
            knn_row = utils.flatten_dataframe(a,slice(d-WINDOW_SIZE,d+1), state_feature_indices)
            utils.dataframe_append_row(knndata,knn_row,s,d)   

        # Actual recursive prediction
        days_to_predict = 26
        for d in range(days_to_predict):
            #x = knndata.drop(columns=features)
            #y = knndata.drop(columns=features)
            x = knndata.drop(['Confirmed', 'Deaths'], axis = 1)
            y = knndata[['Confirmed', 'Deaths']]
            toguess = 1
            trainx = x.head(len(x))
            trainy = y.head(len(y))
            # testy = y.tail(toguess)

            reg.fit(trainx, trainy)

            #rmv = [i for i in range(num_features)]
            #ftrs = knndata.drop(columns=knndata.columns[rmv]).tail(1)
            ftrs = knndata.drop(columns=knndata.columns[[0,1]]).tail(1)
            #ftrs.drop(columns=knndata.columns[[0,1]])

            ftrs.columns = knndata.columns[0:num_features*WINDOW_SIZE]

            new = reg.predict(ftrs)
            ftrs = np.append(ftrs, new)
            ftrs = ftrs.astype(int)
            #if d==0: print(ftrs)
            knndata = knndata.append(dict(zip(knndata.columns, ftrs)), ignore_index=True)

            # append to knndata
            #if d == 0: 
                #print(knndata)
        done = knndata.tail(days_to_predict)
        done = done[['Confirmed', 'Deaths']]
        #print(done)
        proj[s] = done

    ## Get ordering of states in test    
    order = test.loc[0:49,'Province_State']

    # format submission
    conf = []
    dead = []
    fid = 0
    for i in range(days_to_predict):
        for j in order:
            projection = proj[j].iloc[i]
            #print(j, 'day', i)
            conf.append(int(projection['Confirmed']))
            dead.append(int(projection['Deaths']))
            #print(fid)
            fid+=1

    test['Confirmed'] = conf
    test['Deaths'] = dead

###################################################
    errors = MAPE(test[['Confirmed', 'Deaths']], validation[['Confirmed', 'Deaths']])
    parameter_setup['confirmed_error'] = errors['Confirmed']
    parameter_setup['deaths_error'] = errors['Deaths']
    
    
    found_good_sol = False
    if(parameter_setup['confirmed_error'] < confirmed_error_best):
        confirmed_error_best = parameter_setup['confirmed_error']
        found_good_sol = True
    if(parameter_setup['deaths_error'] < deaths_error_best):
        deaths_error_best = parameter_setup['deaths_error']
        found_good_sol = True
    if found_good_sol:
        best_parameter_setups.append(parameter_setup)
        
    if found_good_sol:
        if it == 0:
            print("(*) SETUP:","Confirmed Error: "+str(parameter_setup['confirmed_error'])," | Deaths Error: "+str(parameter_setup['deaths_error']))
        else:
            print("(*) ITERATION "+str(it)+":","Confirmed Error: "+str(parameter_setup['confirmed_error'])," | Deaths Error: "+str(parameter_setup['deaths_error']))
    else:
        if it == 0:
            print("(-) SETUP:","Confirmed Error: "+str(parameter_setup['confirmed_error'])," | Deaths Error: "+str(parameter_setup['deaths_error']))
        else:
            print("(-) ITERATION "+str(it)+":","Confirmed Error: "+str(parameter_setup['confirmed_error'])," | Deaths Error: "+str(parameter_setup['deaths_error']))
        
        
print("Found",len(best_parameter_setups) - 1,"good solutions!")
###################################################

Starting using previous best setup...
(*) SETUP: Confirmed Error: 8.264506555201208e-05  | Deaths Error: 0.00017968191404611904
(-) ITERATION 1: Confirmed Error: 8.332579213953679e-05  | Deaths Error: 0.000186474359543878
(-) ITERATION 2: Confirmed Error: 9.794785079174812e-05  | Deaths Error: 0.0002007489199403362
(-) ITERATION 3: Confirmed Error: 9.128299743429795e-05  | Deaths Error: 0.00019119433704191385
(-) ITERATION 4: Confirmed Error: 9.812021535278148e-05  | Deaths Error: 0.0002007489199403362
(*) ITERATION 5: Confirmed Error: 8.000969883068725e-05  | Deaths Error: 0.0001782893056527195
(*) ITERATION 6: Confirmed Error: 9.138337344401038e-05  | Deaths Error: 0.00017420171962008045
(-) ITERATION 7: Confirmed Error: 8.000969883068725e-05  | Deaths Error: 0.0001782893056527195
(-) ITERATION 8: Confirmed Error: 8.009859641178407e-05  | Deaths Error: 0.0001782893056527195
(-) ITERATION 9: Confirmed Error: 8.279028563304487e-05  | Deaths Error: 0.00017968191404611904
(-) ITERATION 1

In [11]:
for setup in best_parameter_setups:
        print(setup)
        print()

{'alpha': 1.0527008892145706, 'max_iter': 1919, 'tol': 0.0015241377448979765, 'random_state': 906652, 'window_size': 8, 'confirmed_error': 8.264506555201208e-05, 'deaths_error': 0.00017968191404611904}

{'alpha': 1.0527008892145706, 'max_iter': 1919, 'tol': 0.0015241377448979765, 'random_state': 906652, 'window_size': 8, 'confirmed_error': 8.264506555201208e-05, 'deaths_error': 0.00017968191404611904}

{'alpha': 1.8381037054483904, 'max_iter': 2986, 'tol': 0.0012648811860279752, 'random_state': 75529, 'window_size': 4, 'confirmed_error': 8.000969883068725e-05, 'deaths_error': 0.0001782893056527195}

{'alpha': 1.725823170197946, 'max_iter': 2171, 'tol': 0.00667999073950969, 'random_state': 43448, 'window_size': 6, 'confirmed_error': 9.138337344401038e-05, 'deaths_error': 0.00017420171962008045}



In [6]:
#submission = test.drop(columns=['Date', 'Province_State'])
print(submission)
filename = "team31-dec1-1-nosub.csv"
#submission.to_csv(filename)

NameError: name 'submission' is not defined

In [None]:
print(MAPE(bestsub[['Confirmed', 'Deaths']], validation[['Confirmed', 'Deaths']]))
print("Error of current run")
print(MAPE(test[['Confirmed', 'Deaths']], validation[['Confirmed', 'Deaths']]))

In [None]:
submission['Confirmed']=bestsub['Confirmed']
submission['Deaths']=test['Deaths']