<a href="https://colab.research.google.com/github/anirban1221/Learning_PyTorch/blob/main/exploring_Dataset_and_Dataloader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.datasets import make_classification
X, y = make_classification(
    n_samples=10,       # Number of samples
    n_features=2,       # Number of features
    n_informative=2,    # Number of informative features
    n_redundant=0,      # Number of redundant features
    n_classes=2,        # Number of classes
    random_state=42     # For reproducibility
)

In [3]:
import torch
X_tensor=torch.tensor(X,dtype=torch.float32)
y_tensor=torch.tensor(y,dtype=torch.long)


In [5]:
y_tensor

tensor([1, 0, 0, 0, 0, 1, 1, 1, 1, 0])

In [17]:
from torch.utils.data import Dataset,DataLoader

class CustomDataset(Dataset):
  def __init__(self,features,labels):
    self.features=features
    self.labels=labels

  def __len__(self):
    return self.features.shape[0]

  def __getitem__(self,index):

    return self.features[index],self.labels[index]

In [18]:
dataset=CustomDataset(X,y)

In [19]:
dataset[2]

(array([-2.8953973 ,  1.97686236]), np.int64(0))

In [20]:
dataloader=DataLoader(dataset,batch_size=2,shuffle=True)

In [21]:
for batch_features, batch_labels in dataloader:
  print(batch_features)
  print(batch_labels)

tensor([[-1.9629, -0.9923],
        [-1.1402, -0.8388]], dtype=torch.float64)
tensor([0, 0])
tensor([[ 1.0683, -0.9701],
        [-0.7206, -0.9606]], dtype=torch.float64)
tensor([1, 0])
tensor([[-0.9382, -0.5430],
        [-0.5872, -1.9717]], dtype=torch.float64)
tensor([1, 0])
tensor([[ 1.7273, -1.1858],
        [-2.8954,  1.9769]], dtype=torch.float64)
tensor([1, 0])
tensor([[1.8997, 0.8344],
        [1.7774, 1.5116]], dtype=torch.float64)
tensor([1, 1])


In [None]:
## parallelization of work by using workers in pytorch
## using samplers to sample the batches
## uaing num_workers

# using drop_last-when last batch is tooo small than the other batches then
## uf required then we drop the last batch

In [26]:
## improving the breast cancer pipeline using dataset and dataloader class

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('https://raw.githubusercontent.com/gscdit/Breast-Cancer-Detection/refs/heads/master/data.csv')

df.drop(columns=['id', 'Unnamed: 32'], inplace= True)
X=df.iloc[:,1:]
y=df.iloc[:,0]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42,test_size=0.2)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

X_train_tensor = torch.from_numpy(X_train.astype(np.float32))
X_test_tensor = torch.from_numpy(X_test.astype(np.float32))
y_train_tensor = torch.from_numpy(y_train.astype(np.float32))
y_test_tensor = torch.from_numpy(y_test.astype(np.float32))

In [31]:
from torch.utils.data import DataLoader, Dataset
class CustomDataset(Dataset):
  def __init__(self,features,labels):
    self.features=features
    self.labels=labels

  def __len__(self):
    return self.features.shape[0]

  def __getitem__(self,index):
    return self.features[index], self.labels[index]

In [32]:
train_dataset=CustomDataset(X_train_tensor,y_train_tensor)
test_dataset=CustomDataset(X_test_tensor,y_train_tensor)


In [33]:
train_loader=DataLoader(train_dataset,batch_size=32,shuffle=True)
test_loader=DataLoader(test_dataset,batch_size=32,shuffle=True)

In [34]:
import torch.nn as nn


class NeuralNetwork(nn.Module):

  def __init__(self, num_features):
    super().__init__()
    self.linear=nn.Linear(num_features, 1)
    self.sigmoid=nn.Sigmoid()

  def forward(self,features):
    out=self.linear(features)
    out=self.sigmoid(out)
    return out

  def loss_function(self,y_pred,y):
    epsilon=1e-7
    y_pred=torch.clamp(y_pred,epsilon,1-epsilon)
    loss=-(y_train_tensor*torch.log(y_pred)+(1-y_train_tensor)*torch.log(1-y_pred)).mean()
    return loss

In [35]:
learning_rate=0.1
epochs=25

In [37]:
model=NeuralNetwork(X_train_tensor.shape[1])
optimizer=torch.optim.SGD(model.parameters(),lr=learning_rate)
loss_function=nn.BCELoss()

In [38]:
## taining_pipeline

for epoch in range(epochs):
  for batch_features, batch_labels in train_loader:
    y_pred =model(batch_features)
    loss=loss_function(y_pred,batch_labels.reshape(-1,1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  print(f'Epoch: {epoch+1}, Loss: {loss.item()}')

Epoch: 1, Loss: 0.27292436361312866
Epoch: 2, Loss: 0.13215091824531555
Epoch: 3, Loss: 0.1509459912776947
Epoch: 4, Loss: 0.12025129795074463
Epoch: 5, Loss: 0.07564710825681686
Epoch: 6, Loss: 0.043471984565258026
Epoch: 7, Loss: 0.1943916529417038
Epoch: 8, Loss: 0.05381624028086662
Epoch: 9, Loss: 0.07553450018167496
Epoch: 10, Loss: 0.07878528535366058
Epoch: 11, Loss: 0.03829490393400192
Epoch: 12, Loss: 0.09634768217802048
Epoch: 13, Loss: 0.1193113923072815
Epoch: 14, Loss: 0.10041489452123642
Epoch: 15, Loss: 0.1525832861661911
Epoch: 16, Loss: 0.004809085745364428
Epoch: 17, Loss: 0.03569712117314339
Epoch: 18, Loss: 0.16428633034229279
Epoch: 19, Loss: 0.021954627707600594
Epoch: 20, Loss: 0.029906338080763817
Epoch: 21, Loss: 0.15545041859149933
Epoch: 22, Loss: 0.03519034385681152
Epoch: 23, Loss: 0.1325482577085495
Epoch: 24, Loss: 0.006034291349351406
Epoch: 25, Loss: 0.04661354422569275


In [39]:
## evaluating the model
with torch.no_grad():
  y_pred = model.forward(X_test_tensor)
  y_pred = (y_pred > 0.6).float()

In [40]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test_tensor,y_pred)

0.9824561403508771