In [1]:
# !pip install ucimlrepo
from ucimlrepo import fetch_ucirepo 
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

In [47]:
# fetch dataset 
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17) 
  
# data (as pandas dataframes) 
df = breast_cancer_wisconsin_diagnostic.data.features 
labels = breast_cancer_wisconsin_diagnostic.data.targets 

In [48]:
def data_preparation(df, labels):
    X = df.copy()
    y = labels.copy()
    X['mean_radius'] = (X['radius1'] + X['radius2'] + X['radius3']) / 3
    X['mean_area'] = (X['area1'] + X['area2'] + X['area3']) / 3
    X['mean_perimeter'] = (X['perimeter1'] + X['perimeter2'] + X['perimeter3']) / 3
    
    high_corr_columns = ['radius1', 'radius2', 'radius3', 'perimeter1', 'perimeter2', 'perimeter3', 'area1', 'area2', 'area3']
    X.drop(columns=high_corr_columns, inplace=True)
    
    numerical_columns = ['texture1', 'smoothness1', 'compactness1', 'concavity1',
       'concave_points1', 'symmetry1', 'fractal_dimension1', 'texture2',
       'smoothness2', 'compactness2', 'concavity2', 'concave_points2',
       'symmetry2', 'fractal_dimension2', 'texture3', 'smoothness3',
       'compactness3', 'concavity3', 'concave_points3', 'symmetry3',
       'fractal_dimension3', 'mean_radius', 'mean_area', 'mean_perimeter']
    
    for col in numerical_columns:
        X[col] = np.cbrt(X[col])
        
        IQR = X[col].quantile(0.75) - X[col].quantile(0.25)

        # Compute bounds
        lower_bound = X[col].quantile(0.25) - 1.5 * IQR
        upper_bound = X[col].quantile(0.75) + 1.5 * IQR

        # Set values outside the bounds to NaN
        X[col] = np.where(
            (X[col] <= lower_bound) | (X[col] >= upper_bound),
            np.nan,
            X[col]
        )
        
        X['target'] = np.where(y == 'M', 1, 0)
        X.dropna(inplace=True)
        X.reset_index(drop=True)
        y = X['target']
        X.drop(columns=['target'], inplace=True)
        X = X.to_numpy()
        X = np.column_stack((np.ones(X.shape[0]), X))
        return X, y

In [49]:
X, y = data_preparation(df, labels)

In [50]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [51]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [53]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_val)
print('Accuracy:', accuracy_score(y_val, y_pred))

Accuracy: 0.9646017699115044


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [54]:
print('Confusion Matrix:')
print(confusion_matrix(y_val, y_pred))

Confusion Matrix:
[[71  3]
 [ 1 38]]
