In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from scripts.mcmc_samplers import SGHMCSampler,LossModule
import scripts.corruptDataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from scripts.linear import Linear
import random
import itertools
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from collections import OrderedDict

# Width of the network's input layer. It has to equal the number of feature
# columns the dataset class keeps after dropping 'url', 'timedelta' and 'shares'.
N_FEATURES = 58

# Device string: the first CUDA GPU when torch can see one, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [2]:
class OnlineNewsPopularityDataset(Dataset):
    """Online News Popularity data as a binary-classification ``Dataset``.

    Features are every CSV column except ``url``, ``timedelta`` and ``shares``,
    min-max scaled per column. The label is 1 when ``shares >= 1400`` and 0
    otherwise.
    """

    def __init__(self, csv_file, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file holding the dataset.
            transform (callable, optional): Called as ``transform(x, y)`` on
                each sample; its return value is handed out as the sample.
                When None, the raw ``(x, y)`` pair is returned unchanged.
        """
        self.df = pd.read_csv(csv_file, header=0)
        # Remove spaces from the column names so the bare names used below match.
        self.df.columns = self.df.columns.str.replace(' ', '')

        features = self.df.drop(['url', 'timedelta', 'shares'], axis=1)
        # NOTE(review): the scaler is fit on every row of the file, so rows that
        # are later held out for testing also contribute to the min/max.
        self.X = MinMaxScaler().fit_transform(features)

        # Explicit 0/1 cast: unlike LabelEncoder, the mapping does not depend on
        # which classes happen to be present in the file.
        self.Y = (self.df['shares'] >= 1400).to_numpy().astype(np.int64)
        self.transform = transform

    def __len__(self):
        """Number of samples (rows of the feature matrix)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``(x, y)``, passed through ``transform`` if set."""
        if torch.is_tensor(idx):
            idx = idx.tolist()
        x, y = self.X[idx, :], self.Y[idx]
        # transform is optional; skip it instead of calling None.
        if self.transform is not None:
            x, y = self.transform(x, y)
        return (x, y)


class ToTensor(object):
    """Convert one (features, label) sample to torch tensors."""

    def __call__(self, x, y):
        """Return ``x`` as a float32 CPU tensor and ``y`` as an int64 tensor.

        Args:
            x: Array-like of features; copied through ``np.array``.
            y: Label (scalar or array-like of labels).

        Returns:
            tuple: ``(features, label)``. Size-1 dimensions are squeezed out of
            the label, so a scalar label becomes a 0-dim tensor.
        """
        features = torch.from_numpy(np.array(x)).float()
        # Build the label directly as int64. Routing it through the default
        # float32 torch.Tensor first would round integers above 2**24.
        label = torch.tensor(y, dtype=torch.long).squeeze()
        return features, label


# Fixed seed so the 80/20 train/test partition is the same on every run.
SPLIT_SEED = 0

transform = ToTensor()
dataset = OnlineNewsPopularityDataset('dataUCI/OnlineNewsPopularity.csv', transform=transform)

# 80% of the samples for training, the remainder for testing.
n_train = int(len(dataset) * 0.8)
lengths = [n_train, len(dataset) - n_train]
train_set, test_set = torch.utils.data.random_split(
    dataset, lengths, generator=torch.Generator().manual_seed(SPLIT_SEED))

train_loader = DataLoader(train_set, batch_size=256,
                          shuffle=True, num_workers=0)
# The whole test split is served as one batch, so worker processes add nothing;
# num_workers=0 also avoids pickling the notebook-defined dataset for workers.
test_loader = DataLoader(test_set, batch_size=len(test_set),
                         shuffle=False, num_workers=0)

In [3]:
class ANNet(nn.Module):
    """Fully connected classifier: input -> 64 -> 128 -> 256 -> 64 -> 16 -> 2.

    ReLU follows every layer except the last; dropout (p=0.2) is applied once,
    after the first layer. The output is a row-wise log-softmax over 2 classes.
    """

    def __init__(self, input_size):
        """
        Args:
            input_size (int): Number of input features per sample.
        """
        super(ANNet, self).__init__()

        self.inputLayer = Linear(input_size, 64)
        # nn.ModuleList (not a plain list) registers the hidden layers as
        # submodules, so their weights show up in named_parameters() and
        # state_dict() and follow .to(device). A plain list hides them.
        # assumes scripts.linear.Linear is an nn.Module subclass — TODO confirm
        self.hiddenLayers = nn.ModuleList(
            [Linear(64, 128), Linear(128, 256), Linear(256, 64), Linear(64, 16)])
        self.outputLayer = Linear(16, 2)
        self.act = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Map a (batch, input_size) tensor to (batch, 2) log-probabilities."""
        x = self.dropout(self.act(self.inputLayer(x)))
        for hl in self.hiddenLayers:
            x = self.act(hl(x))
        return F.log_softmax(self.outputLayer(x), dim=1)

In [None]:
# Build the network and run the SGHMC sampler from scripts.mcmc_samplers on the
# training loader; the sampler's `sampled_weights` are kept for evaluation.
model = ANNet(input_size=N_FEATURES)
# The model emits log-probabilities, hence NLLLoss; 'sum' adds up the per-sample
# losses of a batch instead of averaging them.
loss_fn = torch.nn.NLLLoss(reduction='sum')
# corruptFunction is fully qualified because `import scripts.corruptDataset`
# binds only the name `scripts`; a bare `corruptDataset` raises NameError.
lm = LossModule(model, train_loader, loss_fn,
                temperature=1, scaling=None, non_curated=0,
                corruptFunction=scripts.corruptDataset.corruptONP)
SGHMC = SGHMCSampler(lm, num_burn_in_steps=3000, lr=0.005, keep_every=100)
SGHMC.sample(train_loader, test_loader, model, nsamples=200)
sampled_weights = SGHMC.sampled_weights

In [None]:
# Accuracy of the sample-averaged prediction on the (single-batch) test loader.
test_data, test_labels = next(iter(test_loader))

if len(sampled_weights) == 0:
    raise ValueError("sampled_weights is empty; run the sampler before evaluating")

# The network contains dropout, which must be switched off for evaluation.
model.eval()

# Nothing in this cell moves the model, so follow the device its parameters are
# already on rather than the global `device`; otherwise a CUDA accumulator would
# be added to CPU model outputs.
model_device = next(model.parameters()).device
test_data = test_data.to(model_device)

param_names = [name for name, _ in model.named_parameters()]
outputs = torch.zeros([len(test_labels), 2], device=model_device)
for set_params in sampled_weights:
    # assumes each sample is a sequence of numpy arrays ordered like
    # model.named_parameters() — TODO confirm against the sampler
    state_dict = OrderedDict(
        (name, torch.from_numpy(set_params[k])) for k, name in enumerate(param_names))
    model.load_state_dict(state_dict, strict=False)
    with torch.no_grad():
        outputs += model(test_data)

# NOTE(review): the model returns log-probabilities, so this is the mean of
# log-probs across samples, not the mean of probabilities — confirm intended.
outputs = outputs / len(sampled_weights)
_, predicted = torch.max(outputs, 1)
# Labels come from the loader on the CPU; compare there.
predicted = predicted.cpu()

total = test_labels.size(0)
correct = (predicted == test_labels).sum().item()
accuracy = 100 * correct / total
print(accuracy)

In [None]:
# Removed a stray trailing `')` that made this line a SyntaxError.
# NOTE(review): `names` is not defined in any visible cell, and the URL points
# at a directory rather than a data file — confirm both before running.
af = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00291/', sep=',', names=names)
