# Stage 1. Understanding The Problem
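Predict whether a patient's breast tumor is benign or malignant using 699 patient samples and 9 features (683 samples are complete; 16 have a missing `Bare_nuclei` value).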

# Stage 2. Data ingestion/loading
Data is from https://archive.ics.uci.edu/dataset/15/breast+cancer+wisconsin+original

In [1]:
import os
# Query the current working directory before switching.
os.getcwd()

# Change the kernel's working directory to the project folder.
# NOTE(review): this is an absolute, machine-specific Windows path; it will not
# exist on other machines -- consider a relative or configurable path instead.
%cd "C:\Users\Angela\OneDrive\Desktop\ANA680"

C:\Users\Angela\OneDrive\Desktop\ANA680


In [2]:
import  numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from ucimlrepo import fetch_ucirepo

# Download the Breast Cancer Wisconsin (Original) dataset, UCI repository id 15.
BreastCancer = fetch_ucirepo(id=15)

# Features and targets come back as separate pandas DataFrames.
X, y = BreastCancer.data.features, BreastCancer.data.targets

# Stage 3. Data exploration and evaluation

##### Data Exploration
Deal with missing values

In [11]:
# Dimensions of the feature frame as (number of rows, number of columns)
X.shape

(699, 9)

In [12]:
# Dimensions of the target frame; its row count should match X
y.shape

(699, 1)

In [14]:
# List the feature names; the length of this index is the number of features
X.columns

Index(['Clump_thickness', 'Uniformity_of_cell_size',
       'Uniformity_of_cell_shape', 'Marginal_adhesion',
       'Single_epithelial_cell_size', 'Bare_nuclei', 'Bland_chromatin',
       'Normal_nucleoli', 'Mitoses'],
      dtype='object')

In [15]:
# Name of the target column
y.columns

Index(['Class'], dtype='object')

In [17]:
# Summary statistics for the non-object (numeric) features, rounded to two
# decimals and transposed so that each feature is shown as one row.
(
    X.select_dtypes(exclude=["object"])
    .describe()
    .round(2)
    .T
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Clump_thickness,699.0,4.42,2.82,1.0,2.0,4.0,6.0,10.0
Uniformity_of_cell_size,699.0,3.13,3.05,1.0,1.0,1.0,5.0,10.0
Uniformity_of_cell_shape,699.0,3.21,2.97,1.0,1.0,1.0,5.0,10.0
Marginal_adhesion,699.0,2.81,2.86,1.0,1.0,1.0,4.0,10.0
Single_epithelial_cell_size,699.0,3.22,2.21,1.0,2.0,2.0,4.0,10.0
Bare_nuclei,683.0,3.54,3.64,1.0,1.0,1.0,6.0,10.0
Bland_chromatin,699.0,3.44,2.44,1.0,2.0,3.0,5.0,10.0
Normal_nucleoli,699.0,2.87,3.05,1.0,1.0,1.0,4.0,10.0
Mitoses,699.0,1.59,1.72,1.0,1.0,1.0,1.0,10.0


In [21]:
# Count missing values per feature, largest count first
missing_per_feature = X.isna().sum().sort_values(ascending=False)
print(missing_per_feature.to_string())

Bare_nuclei                    16
Clump_thickness                 0
Uniformity_of_cell_size         0
Uniformity_of_cell_shape        0
Marginal_adhesion               0
Single_epithelial_cell_size     0
Bland_chromatin                 0
Normal_nucleoli                 0
Mitoses                         0


In [25]:
# Replace missing values with each column's median. Only Bare_nuclei has
# missing entries (16 rows); its median is 1.0 -- the 3.54 shown by describe()
# is the column mean, not the median.
X = X.fillna(X.median())

In [38]:
# Confirm the imputation worked: every per-column missing count should now be 0
print(X.isnull().sum())

Clump_thickness                0
Uniformity_of_cell_size        0
Uniformity_of_cell_shape       0
Marginal_adhesion              0
Single_epithelial_cell_size    0
Bare_nuclei                    0
Bland_chromatin                0
Normal_nucleoli                0
Mitoses                        0
dtype: int64


##### Create and Assess Machine Learning Models
First we need to create a test set with 25% of the dataset and set it aside

In [39]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows as the test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=7)

In [53]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report

# Train the model on the training data.
# y_train is a single-column DataFrame (shape (n_samples, 1)), but scikit-learn
# expects a 1-D label array. Flatten it explicitly so fit() no longer emits the
# DataConversionWarning ("y = column_or_1d(y, warn=True)"); the fitted model is
# the same because scikit-learn performs this flattening internally.
logReg = LogisticRegression()
logReg.fit(X_train, y_train.to_numpy().ravel())

# Predict the class label for each held-out test row
predictions = logReg.predict(X_test)

# Per-class precision / recall / F1 and overall accuracy on the test set
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           2       0.96      0.96      0.96       114
           4       0.93      0.93      0.93        61

    accuracy                           0.95       175
   macro avg       0.95      0.95      0.95       175
weighted avg       0.95      0.95      0.95       175



  y = column_or_1d(y, warn=True)
