<a href="https://colab.research.google.com/github/cauchy221/ML2022Spring/blob/main/HW01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Homework 1: COVID-19 Cases Prediction (Regression)**

# Download data
Download the data from the Google Drive links below, or download it from Kaggle and upload it manually to the workspace.

In [2]:
# Fetch the training and test CSV files from Google Drive by file id and
# save them under the names given to --output.
# NOTE(review): newer gdown releases reportedly deprecate the `--id` flag — confirm against the installed version.
!gdown --id '1kLSW_-cW2Huj7bh84YTdimGBOJaODiOS' --output covid.train.csv
!gdown --id '1iiI5qROrAhZn-o4FPqsE97bMzDEFvIdg' --output covid.test.csv

Downloading...
From: https://drive.google.com/uc?id=1kLSW_-cW2Huj7bh84YTdimGBOJaODiOS
To: /content/covid.train.csv
100% 2.49M/2.49M [00:00<00:00, 160MB/s]
Downloading...
From: https://drive.google.com/uc?id=1iiI5qROrAhZn-o4FPqsE97bMzDEFvIdg
To: /content/covid.test.csv
100% 993k/993k [00:00<00:00, 75.0MB/s]


# Import packages
Import all packages that are needed.

In [3]:
# Numerical Operations
import math
import numpy as np

# Reading/Writing Data
import pandas as pd
import os
import csv

# Show Training Process in Progress Bar
from tqdm import tqdm

# Pytorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split  # can split training dataset into trining and validation

# For Plotting Learning Curve
from torch.utils.tensorboard import SummaryWriter

# Some Utility Functions
You don't have to modify this part, but you should understand what each function does.

In [4]:
from numpy.random.mtrand import triangular
def same_seed(seed):
  '''Make runs repeatable by seeding NumPy and PyTorch and pinning cuDNN settings.

  seed: value handed to every random number generator seeded here.
  '''
  # Seed the NumPy and PyTorch generators.
  np.random.seed(seed)
  torch.manual_seed(seed)
  # When CUDA is usable, seed the generators of all GPUs too.
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
  # Disable the cuDNN auto-tuner and request deterministic algorithms.
  torch.backends.cudnn.benchmark = False
  torch.backends.cudnn.deterministic = True

def train_valid_split(data_set, valid_ratio, seed):
  '''Split provided training data into training set and validation set

  data_set: array-like with one sample per row (converted with np.asarray).
  valid_ratio: fraction of rows placed in the validation set; the row count
    is truncated with int().
  seed: seed of the generator driving the random split, so the same seed
    yields the same split.

  Returns (train, valid) as NumPy arrays. Both keep the trailing dimensions
  of the input even when one of them is empty.
  '''
  rows = np.asarray(data_set)
  valid_set_size = int(valid_ratio * len(rows))
  train_set_size = len(rows) - valid_set_size
  train_set, valid_set = random_split(rows, [train_set_size, valid_set_size], generator=torch.Generator().manual_seed(seed))
  # Index the source array with each Subset's indices instead of calling
  # np.array() on the Subset: that path fetches rows one at a time through
  # __getitem__ and turns an empty split into a shape-(0,) array, which
  # breaks later column slicing.
  train_idx = np.asarray(train_set.indices, dtype=np.int64)
  valid_idx = np.asarray(valid_set.indices, dtype=np.int64)
  return rows[train_idx], rows[valid_idx]

def predict(test_loader, model, device):
  '''Run the model over every batch of the loader and gather its outputs.

  test_loader: iterable of input batches; each batch must support .to(device).
  model: module that is switched to evaluation mode and called on each batch.
  device: device each batch is moved to before the forward pass.

  Returns the per-batch predictions concatenated along dim 0 as a NumPy array.
  '''
  model.eval()  # evaluation mode
  collected = []
  for batch in tqdm(test_loader):  # progress bar over the batches
    inputs = batch.to(device)
    with torch.no_grad():  # forward pass without gradient tracking
      # move each result to the CPU before storing it
      collected.append(model(inputs).detach().cpu())
  return torch.cat(collected, dim=0).numpy()

# Dataset

In [5]:
class COVID19Dataset(Dataset):
  '''
  Dataset holding a feature matrix and, optionally, the matching targets.

  x: Features, stored as a float tensor
  y: Targets, stored as a float tensor; if None, items are features only (prediction)
  '''
  def __init__(self, x, y=None):
    # Targets are optional: keep None as-is, otherwise convert to a float tensor.
    self.y = None if y is None else torch.FloatTensor(y)
    self.x = torch.FloatTensor(x)

  def __len__(self):
    # Number of samples equals the number of feature rows.
    return len(self.x)

  def __getitem__(self, idx):
    features = self.x[idx]
    if self.y is not None:
      return features, self.y[idx]
    # No targets available: hand back the features alone.
    return features

# Neural Network Model
Try out different model architectures by modifying the class below

In [6]:
class My_Model(nn.Module):
  '''Fully connected network: input_dim -> 16 -> 8 -> 1 with ReLU after each hidden layer.'''
  def __init__(self, input_dim):
    super().__init__()
    # TODO: modify model's structure, be aware of dimensions
    # 2022/3/3 Try default structure with default input_dim
    hidden_sizes = (16, 8)
    stack = []
    width = input_dim
    # Build Linear+ReLU pairs front to back so layers are created in order.
    for size in hidden_sizes:
      stack.append(nn.Linear(width, size))
      stack.append(nn.ReLU())
      width = size
    stack.append(nn.Linear(width, 1))  # single-output head
    self.layers = nn.Sequential(*stack)

  def forward(self, x):
    # Drop dim 1 of the network output: (B, 1) -> (B)
    return self.layers(x).squeeze(1)

# Feature Selection
Not all features are useful. Carefully choose them by modifying the function below.

In [7]:
def select_feat(train_data, valid_data, test_data, select_all=True):
  '''Select useful features

  train_data, valid_data: 2-D arrays whose last column is the target.
  test_data: 2-D array of features. Only the leading columns that line up
    with the training features are used, so it may or may not carry a
    trailing target column.
  select_all: if True keep every feature column, otherwise keep the
    hard-coded index list below.

  Returns (x_train, x_valid, x_test, y_train, y_valid).
  '''
  y_train, y_valid = train_data[:,-1], valid_data[:,-1]
  raw_x_train, raw_x_valid = train_data[:,:-1], valid_data[:,:-1]
  # Slice test data to the training feature count rather than always
  # dropping its last column: unconditionally dropping it discards a real
  # feature when test data has no target column and makes the column
  # selection below go out of bounds.
  n_features = raw_x_train.shape[1]
  raw_x_test = test_data[:,:n_features]

  if select_all:
    feat_idx = list(range(n_features))
  else:
    feat_idx = [0,1,2,3,4,5] # TODO: Select suitable columns

  return raw_x_train[:,feat_idx], raw_x_valid[:,feat_idx], raw_x_test[:,feat_idx], y_train, y_valid

# Training Loop

In [None]:
def trainer(train_loader, valid_loader, model, config, device):
  criterion = nn.MSELoss(reduction='mean')  # loss function

  # define optimization algorithm
  # 