# Improving models

In [3]:
import pandas as pd
import numpy as np

# Load the animal table from the CSV file and preview its first five rows
animals = pd.read_csv('animals.csv')
animals.head(n=5)

Unnamed: 0,name,hair,feathers,eggs,milk,predator,fins,legs,tail,type
0,skimmer,0,1,1,0,1,0,2,1,2
1,gull,0,1,1,0,1,0,2,1,2
2,seahorse,0,0,1,0,0,1,0,1,4
3,tuatara,0,0,1,0,1,0,4,1,3
4,squirrel,1,0,0,1,0,0,2,1,1


In [15]:
# Feature matrix: skip the leading name column and the trailing type column,
# then convert what remains to a NumPy array
features = animals.iloc[:, slice(1, -1)]
X = features.to_numpy()
X

array([[0, 1, 1, 0, 1, 0, 2, 1],
       [0, 1, 1, 0, 1, 0, 2, 1],
       [0, 0, 1, 0, 0, 1, 0, 1],
       [0, 0, 1, 0, 1, 0, 4, 1],
       [1, 0, 0, 1, 0, 0, 2, 1]])

In [16]:
# Target (ground truth): the last column of the table, as a NumPy array
target = animals.iloc[:, -1]
y = target.to_numpy()
y

array([2, 2, 4, 3, 1])

In [17]:
import torch
from torch.utils.data import TensorDataset

# Wrap the feature matrix and the labels as float32 tensors in one dataset,
# so that indexing it yields an (input, label) pair
dataset = TensorDataset(
    torch.tensor(X).to(torch.float32),
    torch.tensor(y).to(torch.float32),
)

In [19]:
# Index the dataset to fetch a single (input, label) pair and show it
input_sample, label_sample = sample = dataset[0]
print("input: %s, label: %s" % (input_sample, label_sample))

input: tensor([0., 1., 1., 0., 1., 0., 2., 1.]), label: tensor(2.)


In [21]:
from torch.utils.data import DataLoader

# Loader settings: number of samples per iteration, and whether to shuffle
batch_size, shuffle = 2, True

# Build the dataloader, then walk through it one mini-batch at a time
dataloader = DataLoader(dataset, shuffle=shuffle, batch_size=batch_size)
for batch_inputs, batch_labels in dataloader:
    print("inputs: %s, labels: %s" % (batch_inputs, batch_labels))
    

inputs: tensor([[0., 1., 1., 0., 1., 0., 2., 1.],
        [0., 0., 1., 0., 1., 0., 4., 1.]]), labels: tensor([2., 3.])
inputs: tensor([[1., 0., 0., 1., 0., 0., 2., 1.],
        [0., 0., 1., 0., 0., 1., 0., 1.]]), labels: tensor([1., 4.])
inputs: tensor([[0., 1., 1., 0., 1., 0., 2., 1.]]), labels: tensor([2.])


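Another example: loading DataFrame columns into tensors, wrapping them in a dataset and dataloader, and passing the features through a small model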

In [None]:
# Second example: turn DataFrame columns into tensors, wrap them in a
# dataset/dataloader, and run the features through a small linear model.
# NOTE(review): `dataframe` is not defined in any visible cell; it has to be
# loaded before this cell can run — confirm where it comes from.
# This cell rebinds the notebook-level names `features`, `target`, `dataset`,
# `dataloader` and `y` that earlier cells created.

# Load the different columns into two PyTorch tensors
features = torch.tensor(dataframe[['ph', 'Sulfate', 'Conductivity', 'Organic_carbon']].to_numpy()).float()
target = torch.tensor(dataframe['Potability'].to_numpy()).float()

# Create a dataset from the two generated tensors
dataset = TensorDataset(features, target)

# Create a dataloader using the above dataset
dataloader = DataLoader(dataset, shuffle=True, batch_size=2)
# Pull one mini-batch of (inputs, labels) from the loader
x, y = next(iter(dataloader))

# Create a model using the nn.Sequential API.
# The layers are reached through the already-imported `torch` module, because
# the bare name `nn` is never imported and would raise NameError on a fresh
# kernel.
model = torch.nn.Sequential(torch.nn.Linear(4, 2), torch.nn.Linear(2, 1))
# The model is applied to the full feature tensor, not only the batch `x`
output = model(features)
print(output)