In [1]:
import numpy as np
import random
import logreg as lr

In [2]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [3]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17) 
  
# data (as pandas dataframes) 
X = breast_cancer_wisconsin_diagnostic.data.features 
ydat = breast_cancer_wisconsin_diagnostic.data.targets 
  
# metadata 
print(breast_cancer_wisconsin_diagnostic.metadata) 
  
# variable information 
print(breast_cancer_wisconsin_diagnostic.variables) 

{'uci_id': 17, 'name': 'Breast Cancer Wisconsin (Diagnostic)', 'repository_url': 'https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic', 'data_url': 'https://archive.ics.uci.edu/static/public/17/data.csv', 'abstract': 'Diagnostic Wisconsin Breast Cancer Database.', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 569, 'num_features': 30, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Diagnosis'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1993, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C5DW2B', 'creators': ['William Wolberg', 'Olvi Mangasarian', 'Nick Street', 'W. Street'], 'intro_paper': {'ID': 230, 'type': 'NATIVE', 'title': 'Nuclear feature extraction for breast tumor diagnosis', 'authors': 'W. Street, W. Wolberg, O. Mangasarian', 'venue': 'Electronic imaging', 'year': 1993, 'journal': None, 'DOI': '1

In [4]:
X=np.array(X)
ydat=np.array(ydat)
y=np.array([[int(yi=='M')] for yi in ydat])

In [5]:
#Augments feature vector x with x[0]=1 for bias
N=np.shape(X)[0]
X_aug=np.hstack((np.full((N, 1), 1),X))
m=np.shape(X_aug)[1]

In [6]:
#Perform feature scaling on X
scaler = preprocessing.StandardScaler().fit(X_aug)
X_scaled = scaler.transform(X_aug)

In [7]:
#Splits off 60% of the set into a training set, 20% into validation and test set each
X_train, X_testval, y_train, y_testval = train_test_split(X_scaled, y, test_size=0.40)
X_test, X_val, y_test, y_val = train_test_split(X_testval, y_testval, test_size=0.50)

In [8]:
print(np.shape(X),np.shape(X_train),np.shape(X_val),np.shape(X_test))

print(np.shape(y),np.shape(y_train),np.shape(y_val),np.shape(y_test))

(569, 30) (341, 31) (114, 31) (114, 31)
(569, 1) (341, 1) (114, 1) (114, 1)


In [9]:
M_train=np.sum(y_train)
M_val=np.sum(y_val)
M_test=np.sum(y_test)

N_train=np.shape(y_train)[0]
N_val=np.shape(y_val)[0]
N_test=np.shape(y_test)[0]

B_train=N_train-M_train
B_val=N_val-M_val
B_test=N_test-M_test

print(f"Training set: M: {M_train}, B: {B_train}")
print(f"Validation set: M: {M_val}, B: {B_val}")
print(f"Test set: M: {M_test}, B: {B_test}")

Training set: M: 132, B: 209
Validation set: M: 43, B: 71
Test set: M: 37, B: 77


In [54]:
w_sgd=lr.optimize(X_train,y_train,100000,.00025,15,10000)

Step 0: CE loss [[4.27626888]]
Step 10000: CE loss [[0.80537942]]
Step 20000: CE loss [[0.4502795]]
Step 30000: CE loss [[0.28214032]]
Step 40000: CE loss [[0.20872789]]
Step 50000: CE loss [[0.1768803]]
Step 60000: CE loss [[0.15683072]]
Step 70000: CE loss [[0.14312853]]
Step 80000: CE loss [[0.13272653]]
Step 90000: CE loss [[0.1246679]]
Final results: CE loss [[0.11819774]]


In [56]:
#Evaluates cross-entropy of validation set based on trained weights to tune hyperparameters
valentropy=lr.entropy(w_sgd,X_val,y_val)
print(valentropy)

[[0.11264025]]


In [58]:
testentropy=lr.entropy(w_sgd,X_test,y_test)
print(testentropy)

[[0.07116624]]


In [64]:
#Makes binary prediction on test set fom trained weights (sig(z)>0.5)
ypred_test=lr.ybin(w_sgd,X_test)

In [84]:
#Calculates TP TN FP FN on test set
TP=np.dot(ypred_test.T,y_test)
TN=np.dot((1-ypred_test).T,1-y_test)
FP=np.dot(ypred_test.T,1-y_test)
FN=np.dot((1-ypred_test).T,y_test)

print(TP+TN+FP+FN)

[[114]]


In [92]:
#calculates metrics
precision=TP/(TP+FP)
recall=TP/(TP+FN)
accuracy=(TP+TN)/(TP+TN+FP+FN)
F1=2*precision*recall/(precision+recall)
print(f"Precision: {precision}, recall: {recall}, accuracy: {accuracy}, F1: {F1}")

Precision: [[0.92307692]], recall: [[0.97297297]], accuracy: [[0.96491228]], F1: [[0.94736842]]
