# Introduction

In [None]:
# In this file, we divide the counties in the US into 4 groups according to their number of reported deaths.
# We then train a separate model for each group of counties, using curve fitting and SVM.

# Dependencies

In [1]:
from linear_regression import LinearRegressor
from svm import SVM
from output import Output
from model_scoring_evaluation import generate_day_tag, score_all_predictions
import pandas as pd
import numpy as np

# Linear Regression

In [None]:
# First, we fit the death data from the counties that reported fewer than 14 deaths during the past two weeks.

In [2]:
# Linear-regression model for the first (lowest-count) group of counties.
# train() and test() are called without a group argument here; the later
# linear-regression cell passes 'mid' to both instead.
lr = LinearRegressor()
lr.train()
# The recorded output of this cell ends with
# "Predictions saved as models/LR/lr_burning.csv".
lr.test()

Day: 0. Training acc: 8.100000
Day: 1. Training acc: 7.900000
Day: 2. Training acc: 7.920000
Day: 3. Training acc: 7.370000
Day: 4. Training acc: 6.660000
Day: 5. Training acc: 6.300000
Day: 6. Training acc: 5.540000
Day: 7. Training acc: 4.640000
Day: 8. Training acc: 3.670000
Day: 9. Training acc: 3.160000
Day: 10. Training acc: 2.860000
Day: 11. Training acc: 2.530000
Day: 12. Training acc: 2.150000
Day: 13. Training acc: 1.870000
Predictions saved as models/LR/lr_burning.csv


In [None]:
# Then we can write our predictions to 'sample_submission.csv' (the cell below saves the result as 'submissions/submission_svm_lr_model.csv') in order to calculate the pinball loss.

In [3]:
# Copy the first group's predictions into the combined submission file.
source = 'models/LR/lr_burning.csv'
dst = 'submissions/submission_svm_lr_model.csv'
# Output is built with no argument in this cell only; every later cell
# passes the submission path to it.
# NOTE(review): presumably the default starts from a blank template - confirm in output.py.
output = Output()
output.save_submission(source, dst)

0/293293
100000/293293
200000/293293
Successfully saved!


In [None]:
# Secondly, we fit the death data from the counties that reported fewer than 140 deaths during the past two weeks.

In [5]:
# Fresh linear-regression model for the 'mid' group of counties; the same
# group tag is passed to both train() and test().
lr = LinearRegressor()
lr.train('mid')
# The recorded output of this cell ends with
# "Predictions saved as models/LR/lr_mid.csv".
lr.test('mid')

Day: 0. Training acc: 26.560000
Day: 1. Training acc: 29.960000
Day: 2. Training acc: 29.580000
Day: 3. Training acc: 29.220000
Day: 4. Training acc: 29.600000
Day: 5. Training acc: 28.230000
Day: 6. Training acc: 25.610000
Day: 7. Training acc: 21.670000
Day: 8. Training acc: 17.930000
Day: 9. Training acc: 17.020000
Day: 10. Training acc: 17.260000
Day: 11. Training acc: 13.400000
Day: 12. Training acc: 12.030000
Day: 13. Training acc: 9.910000
Predictions saved as models/LR/lr_mid.csv


In [None]:
# Again, we write our submission to our output file.

In [6]:
# Add the 'mid' linear-regression predictions to the combined submission.
dst = 'submissions/submission_svm_lr_model.csv'
source = 'models/LR/lr_mid.csv'
# Output is constructed on the destination path itself.
# NOTE(review): presumably so that values written by earlier cells are kept - confirm in output.py.
output = Output(dst)
output.save_submission(source, dst)

0/293293
100000/293293
200000/293293
Successfully saved!


# SVM

In [None]:
# Next, we use an SVM model to fit the counties with more deaths. We divide these counties into two groups and build a separate model for each.
# First, we fit the data of counties that reported more than 140 but fewer than 700 deaths during the past two weeks using an SVM model.

In [7]:
# SVM model for the 'mid2' group of counties; the same group tag is passed
# to both train() and test().
s = SVM()
s.train('mid2')
# The recorded output of this cell ends with
# "Predictions saved as models/SVM/svm_mid2.csv".
s.test('mid2')

Date: 0. Training acc: 52.620000
Date: 1. Training acc: 52.050000
Date: 2. Training acc: 49.590000
Date: 3. Training acc: 50.470000
Date: 4. Training acc: 49.130000
Date: 5. Training acc: 48.710000
Date: 6. Training acc: 48.140000
Date: 7. Training acc: 42.740000
Date: 8. Training acc: 39.460000
Date: 9. Training acc: 35.390000
Date: 10. Training acc: 37.360000
Date: 11. Training acc: 38.570000
Date: 12. Training acc: 37.890000
Date: 13. Training acc: 35.570000
Predictions saved as models/SVM/svm_mid2.csv


In [8]:
# Add the 'mid2' SVM predictions to the combined submission.
dst = 'submissions/submission_svm_lr_model.csv'
source = 'models/SVM/svm_mid2.csv'
# Output is constructed on the destination path itself.
# NOTE(review): presumably so that values written by earlier cells are kept - confirm in output.py.
output = Output(dst)
output.save_submission(source, dst)

0/293293
100000/293293
200000/293293
Successfully saved!


In [None]:
# Finally, we fit the data of counties that reported more than 700 deaths during the past two weeks.

In [8]:
# Fresh SVM model for the last (highest-count) group of counties.
# train() and test() are called without a group argument here, unlike the
# previous SVM cell, which passes 'mid2'.
s = SVM()
s.train()
# The recorded output of this cell ends with
# "Predictions saved as models/SVM/svm_outbreak.csv".
s.test()

Date: 0. Training acc: 86.470000
Date: 1. Training acc: 87.360000
Date: 2. Training acc: 87.950000
Date: 3. Training acc: 86.260000
Date: 4. Training acc: 82.430000
Date: 5. Training acc: 83.740000
Date: 6. Training acc: 84.630000
Date: 7. Training acc: 84.260000
Date: 8. Training acc: 85.710000
Date: 9. Training acc: 83.400000
Date: 10. Training acc: 83.860000
Date: 11. Training acc: 83.130000
Date: 12. Training acc: 76.950000
Date: 13. Training acc: 71.780000
Predictions saved as models/SVM/svm_outbreak.csv


In [9]:
# Add the last group's SVM predictions to the combined submission.
dst = 'submissions/submission_svm_lr_model.csv'
source = 'models/SVM/svm_outbreak.csv'
# Output is constructed on the destination path itself.
# NOTE(review): presumably so that values written by earlier cells are kept - confirm in output.py.
output = Output(dst)
output.save_submission(source, dst)

0/293293
100000/293293
200000/293293
Successfully saved!


# Pinball Loss

In [None]:
# Now we can calculate the pinball loss of our model to evaluate its performance.

In [10]:
# Score the combined submission file one forecast day at a time.
pred_file = 'submissions/submission_svm_lr_model.csv'
start_date = '2020-05-18'
predicted_length = 7

# generate_day_tag supplies the day labels to iterate over, built from the
# start date and the number of days requested.
date_list = generate_day_tag(start_date, predicted_length)
for day in date_list:
    # Two scoring passes over the same file and day: the default metric
    # first, then the identical call with mse=True.
    scores = score_all_predictions(pred_file, day, '2020-05-17', key='deaths')
    scores_mse = score_all_predictions(
        pred_file, day, '2020-05-17', key='deaths', mse=True
    )
    # Only the first element of each result is reported.
    pinball_value, mse_value = scores[0], scores_mse[0]
    print("Day %s: pinball=%f mse=%f" % (day, pinball_value, mse_value))

Day 2020-05-18: pinball=0.141824 mse=5.507602
Day 2020-05-19: pinball=0.229476 mse=9.346572
Day 2020-05-20: pinball=0.229755 mse=8.532733
Day 2020-05-21: pinball=0.206330 mse=5.901334
Day 2020-05-22: pinball=0.198107 mse=7.022960
Day 2020-05-23: pinball=0.162344 mse=4.504189
Day 2020-05-24: pinball=0.092802 mse=3.150791
