In [40]:
import pandas as pd
data = pd.read_csv("/content/cancer_data.csv")
print(data.head())

         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  radius_worst  texture_worst  perimeter_worst  area_wor

In [41]:
# Remove duplicates
data = data.drop_duplicates()


In [42]:
# Remove outliers using Z-score
import numpy as np
from scipy.stats import zscore
numeric_cols = data.select_dtypes(include=[np.number]).columns
z_scores = zscore(data[numeric_cols])
import pandas as pd
z_scores_df = pd.DataFrame(z_scores, columns=numeric_cols)
outliers = (np.abs(z_scores_df) > 3).any(axis=1)
data_cleaned = data[~outliers].reset_index(drop=True)

print("Original shape:", data.shape)
print("Cleaned shape:", data_cleaned.shape)

Original shape: (569, 32)
Cleaned shape: (487, 32)


In [43]:
# Label Encoding for target column
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data_cleaned.loc[:, "diagnosis"] = le.fit_transform(data_cleaned["diagnosis"]).astype(int)

# Features and target
X = data_cleaned.drop("diagnosis", axis=1)
y = data_cleaned["diagnosis"]


In [44]:
# Feature selection (ANOVA)
from sklearn.feature_selection import SelectKBest, f_classif
selector = SelectKBest(f_classif, k=10)
selector.fit(X, y)
selected_features = X.columns[selector.get_support()]
X_anova = X[selected_features]

# Final dataset
new_data = pd.concat([X_anova, y], axis=1)
new_data["diagnosis"] = new_data["diagnosis"].astype(int)  # Ensure diagnosis is integer type
print(new_data.head())

X_new = new_data.drop("diagnosis", axis=1)
y_new = new_data["diagnosis"]


   radius_mean  perimeter_mean  area_mean  concavity_mean  \
0        20.57          132.90     1326.0          0.0869   
1        19.69          130.00     1203.0          0.1974   
2        20.29          135.10     1297.0          0.1980   
3        12.45           82.57      477.1          0.1578   
4        18.25          119.60     1040.0          0.1127   

   concave points_mean  area_se  radius_worst  perimeter_worst  area_worst  \
0              0.07017    74.08         24.99            158.8      1956.0   
1              0.12790    94.03         23.57            152.5      1709.0   
2              0.10430    94.44         22.54            152.2      1575.0   
3              0.08089    27.19         15.47            103.4       741.6   
4              0.07400    53.91         22.88            153.2      1606.0   

   concave points_worst  diagnosis  
0                0.1860          1  
1                0.2430          1  
2                0.1625          1  
3               

In [45]:
# Normalize features
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_new = scaler.fit_transform(X_new)

In [46]:
# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_new, y_new, test_size=0.2, random_state=42)

In [47]:
# Convert to PyTorch tensors
import torch
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
y_test = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)


In [48]:
# Neural Network Model

import torch.nn as nn
import torch.optim as optim

class ANN_Model(nn.Module):
    def __init__(self, input_dim):
        super(ANN_Model, self).__init__()
        self.fc1 = nn.Linear(input_dim, 32)
        self.fc2 = nn.Linear(32, 16)
        self.fc3 = nn.Linear(16, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.relu(self.fc1(x))
        out = self.relu(self.fc2(out))
        out = self.sigmoid(self.fc3(out))
        return out

input_dim = X_train.shape[1]
model = ANN_Model(input_dim)


In [49]:
# Loss & Optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [50]:
# Training

epochs = 100
for epoch in range(epochs):
    # Forward pass
    outputs = model(X_train)
    loss = criterion(outputs, y_train)

    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

Epoch [10/100], Loss: 0.7096
Epoch [20/100], Loss: 0.6963
Epoch [30/100], Loss: 0.6834
Epoch [40/100], Loss: 0.6683
Epoch [50/100], Loss: 0.6486
Epoch [60/100], Loss: 0.6246
Epoch [70/100], Loss: 0.5960
Epoch [80/100], Loss: 0.5620
Epoch [90/100], Loss: 0.5231
Epoch [100/100], Loss: 0.4808


In [51]:
# Evaluation

with torch.no_grad():
    y_pred = model(X_test)
    y_pred_classes = (y_pred >= 0.5).float()

    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    acc = accuracy_score(y_test, y_pred_classes)
    prec = precision_score(y_test, y_pred_classes)
    rec = recall_score(y_test, y_pred_classes)
    f1 = f1_score(y_test, y_pred_classes)

    print("\nModel Evaluation Metrics:")
    print(f"Accuracy: {acc*100:.2f}%")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1 Score: {f1:.4f}")


Model Evaluation Metrics:
Accuracy: 92.86%
Precision: 1.0000
Recall: 0.8205
F1 Score: 0.9014
