# Water potability

In [17]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import math
import torch.optim as optim
from torch.nn import CrossEntropyLoss


## Load data, split train/validate/test/

In [2]:
potability = pd.read_csv('water_potability.csv')
potability.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,0.587349,0.577747,0.386298,0.568199,0.647347,0.292985,0.654522,0.795029,0.630115,0
1,0.643654,0.4413,0.314381,0.439304,0.514545,0.356685,0.377248,0.202914,0.520358,0
2,0.388934,0.470876,0.506122,0.524364,0.561537,0.142913,0.249922,0.401487,0.219973,0
3,0.72582,0.715942,0.506141,0.521683,0.751819,0.148683,0.4672,0.658678,0.242428,0
4,0.610517,0.532588,0.237701,0.270288,0.495155,0.494792,0.409721,0.469762,0.585049,0


In [3]:
data_len = potability.shape[0]
train_len = math.floor(data_len * 0.7)
val_len = math.floor(data_len * 0.2)
test_len = math.floor(data_len * 0.1)

In [4]:
train_data = potability.iloc[0:train_len, :]
val_data = potability.iloc[train_len:train_len+val_len, :]
test_data = potability.iloc[train_len + val_len:, :]

## Setup tensor datasets

In [23]:
def setupTensorDataset(data):
    # Load the different columns into two PyTorch tensors
    X = torch.tensor(potability.iloc[:, -1].to_numpy())
    y = torch.tensor(potability.iloc[:, 0:-1].to_numpy())
    dataset = TensorDataset(torch.tensor(X), torch.tensor(y).float())
    return dataset

In [40]:
train_dataset = setupTensorDataset(train_data)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=2)

val_dataset = setupTensorDataset(val_data)
val_loader = DataLoader(val_dataset, shuffle=True, batch_size=2)

test_dataset = setupTensorDataset(test_data)
test_loader = DataLoader(test_dataset, shuffle=True, batch_size=2)


  dataset = TensorDataset(torch.tensor(X), torch.tensor(y).float())


## Create model

In [64]:
model = nn.Sequential(
    nn.Linear(9, 5),
    nn.Linear(5, 1),
    nn.Sigmoid()
)

In [66]:
num_epochs = 1
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.95) # stochastic gradient descent with default params
criterion = nn.BCELoss()

for epoch in range(num_epochs):
    training_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        # set gradients to 0
        optimizer.zero_grad()
        # get feature and target
        target, feature = data
        # forward pass
        pred = model(feature)
        # compute loss and gradients
        loss = criterion(pred.float(), target.float())
        loss.backward()
        # update model params
        optimizer.step()
        # calculate and sum losses
        training_loss += loss.item()
    epoch_loss = training_loss / len(train_loader) # len = number of batches in dataset
    print(epoch_loss)

ValueError: Using a target size (torch.Size([2])) that is different to the input size (torch.Size([2, 1])) is deprecated. Please ensure they have the same size.

In [60]:
pred.reshape(2).shape

torch.Size([2])

In [57]:
target.shape

torch.Size([2])

In [43]:
m = nn.Sigmoid()
loss = nn.BCELoss()
input = torch.randn(3, 2, requires_grad=True)
target = torch.rand(3, 2, requires_grad=False)
output = loss(m(input), target)
output.backward()

In [46]:
target

tensor([[0.9969, 0.6069],
        [0.9487, 0.5640],
        [0.3844, 0.3170]])

In [48]:
m(input)

tensor([[0.5518, 0.6075],
        [0.5211, 0.2968],
        [0.1378, 0.1513]], grad_fn=<SigmoidBackward0>)

In [51]:
input

tensor([[ 0.2079,  0.4369],
        [ 0.0845, -0.8627],
        [-1.8334, -1.7247]], requires_grad=True)