In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch.optim import SGD
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

seed = 3047
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)

In [47]:
# !pip install wandb
import wandb
wandb.login

<function wandb.sdk.wandb_login.login(anonymous: Union[Literal['must', 'allow', 'never'], NoneType] = None, key: Union[str, NoneType] = None, relogin: Union[bool, NoneType] = None, host: Union[str, NoneType] = None, force: Union[bool, NoneType] = None, timeout: Union[int, NoneType] = None) -> bool>

In [48]:
epoch = 100
lr = 0.01
batch = 32
OPTIMIZER = 'SGD'

CHECKPOINT = None
weight_d = 1e-3
momentum = 0.9
gamma = 0.8
step = 20

data_norm = True
SCHEDULER = False

C = 1
device = 'cuda:0'
WANDB = True

In [49]:
"""********************************************* 
  Self-defined
 *********************************************"""
def wandb_update():
    config = wandb.config
    config.epochs = epoch
    config.learning_rate = lr
    config.batch_size = batch
    config.optimizer = OPTIMIZER

    config.checkpoint = CHECKPOINT
    config.weight_d = weight_d
    config.momentum = momentum
    config.gamma = gamma
    config.step = step

    config.data_norm = data_norm
    config.scheduler = SCHEDULER

In [50]:
# !gdown 1o0m3jyfmetUOJ146TqHuEGUWwQyC7JXV
# !gdown 1B5OC3R0yM8F7yjoYOKu3t08QZalcr7DC
# !gdown 1THvOuf_EOn6c_6TLy0Bqs23BP2NraBR2

In [51]:
class SVM(nn.Module):
  def __init__(self):
    # TODO design your model
    super(SVM, self).__init__() 
    self.w = nn.Parameter(torch.randn((1, 1024)).to(torch.float32))
    self.f = nn.Sequential(
                  nn.Linear(107, 1024),
                  nn.Dropout(0.5),
                )
  def transform(self, x):
    x = self.f(x)
    return x
  def kernel(self, x):
    pass
  def forward(self, x):
    f = torch.matmul(self.transform(x), self.w.T)
    
    return f

In [52]:
class HingeLoss(nn.Module):
  def __init__(self, C):
    super(HingeLoss, self).__init__()  
    self.C = C
  def forward(self, y, f):
    loss = 0
    for i in range(len(y)):
      loss = loss + max(0, 1-y[i]*f[i])  # define Hinge loss
    loss = loss * self.C
    return loss


In [53]:
def cal_mu_std():
  X = pd.read_csv("./dataset/train.csv")
  mu = X.drop(['y'], axis=1).mean() # The mean of whole features except label y
  std = X.drop(['y'], axis=1).std() # The std of whole features except label y

  return mu, std

class TrainDataset(Dataset):
  def __init__(self, split, mu=None, std=None):
    X = pd.read_csv(f"{split}.csv")
    
    Y = X['y'].values.reshape(-1) * 2 - 1
    self.mu, self.std = mu, std
    if data_norm:
      X = self.normalize(X.drop(['y'], axis=1), self.mu, self.std)
    X = np.concatenate((X, np.ones((X.shape[0], 1))), 1)
    self.Y = torch.from_numpy(Y).to(torch.float32)
    self.X = torch.from_numpy(X).to(torch.float32)

  def normalize(self, X, mu=None, std=None):
    continuous_feat = ["age", "fnlwgt", "capital_gain", "capital_loss", "hours_per_week"]
    # X[0] = (X-mu)/std
    for i in range(len(continuous_feat)):
       X[continuous_feat[i]] = (X[continuous_feat[i]]-mu[continuous_feat[i]])/std[continuous_feat[i]]
    
    return X
  
  def __len__(self):
    return self.X.size(0)

  def __getitem__(self, idx):
    return self.X[idx], self.Y[idx]

class TestDataset(Dataset):
  def __init__(self, mu, std):
    X = pd.read_csv("./dataset/X_test")
    if data_norm:
      X = self.normalize(X, mu, std)
    X = np.concatenate((X, np.ones((X.shape[0], 1))), 1)
    self.X = torch.from_numpy(X).to(torch.float32)

  def normalize(self, X, mu_x, std_x):
    # X = (X-mu_x)/std_x
    continuous_feat = ["age", "fnlwgt", "capital_gain", "capital_loss", "hours_per_week"]
    for i in range(len(continuous_feat)):
       X[continuous_feat[i]] = (X[continuous_feat[i]]-mu_x[continuous_feat[i]])/std_x[continuous_feat[i]]

    return X
  
  def __len__(self):
    return self.X.size(0)

  def __getitem__(self, idx):
    return self.X[idx]

In [54]:
def train(train_data, val_data, model, optim, C, device='cuda:0', epoch=None):
    objective = HingeLoss(C)
    steps = 0
    best = 0

    for e in range(epoch):
      train_total_loss = 0
      for tr in train_data:
        steps += 1
        x_train, y_train = tr
        x_train, y_train = x_train.to(device), y_train.to(device)
        pred = model(x_train).squeeze(1)
        loss = objective(pred, y_train) + 1 / 2 * torch.sum(model.w[:-1] ** 2)
        
        optim.zero_grad()
        loss.backward()
        optim.step()

        train_total_loss += (loss.item() / len(train_data))
        
        if steps % 1000 == 0:
          model.eval()
          with torch.no_grad():
            acc = []
            for val in val_data:
              x_val, y_val = val
              x_val , y_val = x_val.to(device), y_val.to(device)
              pred = model(x_val).squeeze(1)
              pred = (pred > 0) * 2 - 1
              
              result = (y_val == pred)
              acc += [(float(result.sum()) / result.size(0))]
            acc = sum(acc) / len(acc)
            print(f'Steps {steps}| Train Loss = {train_total_loss}| Val acc = {acc}')
            if acc > best:
              torch.save(model.state_dict(), './model/best_handcraft.ckpt')
              best = acc
          model.train()
          
          if WANDB:
            wandb.log({"lr": optim.param_groups[0]['lr'],
                        "train_acc": acc,
                        "train_loss": train_total_loss})                   
      if SCHEDULER == True:
        scheduler.step()
    return model

In [None]:
mu, std = cal_mu_std()
trainset = TrainDataset('./dataset/train', mu, std)
devset = TrainDataset('./dataset/val', mu, std)
testset = TestDataset(mu, std)

train_dataloader = DataLoader(trainset, batch, True, drop_last=False)
val_dataloader = DataLoader(devset, 1, False)
test_dataloader = DataLoader(testset, 1, False)

model = SVM().to(device)
model.train()
'''Optim Prepare'''
if OPTIMIZER == 'adam':
    optim = torch.optim.Adam(model.parameters(), weight_decay=weight_d, lr=lr)
elif OPTIMIZER == 'SGD':
    optim = torch.optim.SGD(model.parameters(), lr=lr)
else:
    raise ValueError("Optimizer not supported.")
if WANDB:
  wandb.init(project='MLHW5')
  wandb_update()
if SCHEDULER == True:
  scheduler = torch.optim.lr_scheduler.StepLR(optim, step_size=step, gamma=gamma)
model = train(train_dataloader, val_dataloader, model, optim, C, device, epoch)

In [56]:
best_model = model
best_model.load_state_dict(torch.load('./model/best_handcraft.ckpt'))
best_model = best_model.eval()

y_test = []
for x in test_dataloader:
  x = x.to(device)
  y = best_model(x)
  y_test.append(((y > 0) * 1).item())



In [57]:
import csv
with open('./testing_result/predict_handcraft.csv', 'w', newline='') as csvf:
    # 建立 CSV 檔寫入器
    writer = csv.writer(csvf)
    writer.writerow(['id','label'])
    for i in range(len(y_test)):
      writer.writerow( [i + 1, int(y_test[i])] )

In [58]:
# from google.colab import drive
# drive.mount("/content/drive/")

In [59]:
# ls

In [60]:
# cd 'MLHW5'