In [5]:
# Implementing a logistic regression model from scratch in PyTorch on the breast cancer dataset


import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [40]:
# Prepare dataset

# Load the breast cancer dataset that ships with scikit-learn
bc = datasets.load_breast_cancer()

# Features stay as-is; the labels are turned into a single column
x = bc.data
y = bc.target.reshape(-1, 1)

# Put features and label side by side in one table (label is the last column)
df = pd.DataFrame(np.hstack((x, y)))

df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


In [109]:
# Extracting insights from the dataset

# Shape of the combined table as (rows, columns)

df.shape  # 569 rows and 31 columns: the feature columns plus the appended target column

(569, 31)

In [105]:
# List the columns that contain at least one missing value
missing_cols = df.columns[df.isnull().any().to_numpy()].tolist()
print(f'missing_cols: {missing_cols}')

missing_cols: []


In [101]:
# Hold out 20% of the samples for testing (fixed random_state for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=14
)

# Standardize the features; the scaler is fitted on the training split only
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Turn every split into a float32 PyTorch tensor
X_train, X_test, y_train, y_test = (
    torch.from_numpy(split.astype(np.float32))
    for split in (X_train, X_test, y_train, y_test)
)

# Keep the targets as explicit (n_samples, 1) column tensors, matching the model's output shape
y_train = y_train.view(X_train.shape[0], 1)
y_test = y_test.view(y_test.shape[0], 1)


# Create a Logistic Regression Model
class LogisticRegression(nn.Module):
    """Binary logistic regression: a single linear layer followed by a sigmoid.

    Args:
        input_size: Number of input features per sample.
    """

    def __init__(self, input_size):
        super().__init__()
        # One output unit; the sigmoid in forward() squashes it to a value between 0 and 1.
        self.linear = nn.Linear(input_size, 1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the sigmoid of the result."""
        logits = self.linear(x)
        return torch.sigmoid(logits)

# Model
# Seed PyTorch's RNG before building the model: nn.Linear initialises its
# weights randomly, so without a seed the loss curve and accuracy change on
# every run. The value mirrors the random_state used for the train/test split.
torch.manual_seed(14)
n_samples, n_features = x.shape
model = LogisticRegression(input_size=n_features)

# Loss and optimizer
num_epochs = 100
learning_rate = 0.1
# BCELoss takes probabilities as input, which the model's sigmoid output provides
criterion = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Training loop: every epoch uses the whole training set in a single batch
for epoch in range(num_epochs):
    # Predict on the training data and measure the binary cross-entropy
    y_pred = model(X_train)
    loss = criterion(y_pred, y_train)

    # Backpropagate, then apply one optimizer update
    loss.backward()
    optimizer.step()

    # Clear the gradients so they do not accumulate into the next epoch
    optimizer.zero_grad()

    # Report the loss on every 10th epoch (counting from 1)
    if not (epoch + 1) % 10:
        print(f'epoch: {epoch+1}, loss = {loss.item():.4f}')


# Model evaluation on the held-out test set (no gradient tracking needed)
with torch.no_grad():
    y_predicted = model(X_test)
    # Round the sigmoid outputs to the nearest class label (0 or 1)
    y_predicted_cls = torch.round(y_predicted)
    # Accuracy = number of matching predictions / number of test samples
    acc = torch.eq(y_predicted_cls, y_test).sum() / float(y_test.size(0))
    print(f'accuracy: {acc.item():.4f}')

epoch: 10, loss = 0.2494
epoch: 20, loss = 0.1871
epoch: 30, loss = 0.1591
epoch: 40, loss = 0.1424
epoch: 50, loss = 0.1312
epoch: 60, loss = 0.1230
epoch: 70, loss = 0.1166
epoch: 80, loss = 0.1116
epoch: 90, loss = 0.1074
epoch: 100, loss = 0.1039
accuracy: 0.9737
