# Logistic Regression Model for Binary Classification of Breast Cancer

## Import Dependencies

In [None]:
import numpy as np
import pandas as pd
import sklearn.datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## Collect and Process Data

In [None]:
# Fetch the breast cancer dataset that ships with scikit-learn.
tumor_dataset = sklearn.datasets.load_breast_cancer()

# Build a feature table (one column per feature name) and append the
# pre-assigned binary target as a trailing 'label' column.
tumor_df = pd.DataFrame(
    data=tumor_dataset.data,
    columns=tumor_dataset.feature_names,
).assign(label=tumor_dataset.target)

In [None]:
# Missing-value check: print the column dtypes and non-null counts,
# then show how many null entries each column contains.
tumor_df.info()
tumor_df.isna().sum()

In [None]:
# statistical summary (descriptive statistics) of the dataframe columns
tumor_df.describe()

In [None]:
# target distribution check: number of samples for each value of 'label'
tumor_df['label'].value_counts()

In [None]:
# mean of every feature column, computed separately for each 'label' value
tumor_df.groupby('label').mean()

## Split Data into Training and Testing Sets

In [None]:
# Separate the predictors from the target column. `columns=` already
# selects the axis, so no additional `axis` argument is needed.
features = tumor_df.drop(columns='label')
target = tumor_df['label']

# Hold out 20% of the rows for testing. random_state is fixed so the split
# is reproducible; omit it to get a different split on every run.
features_trn, features_tst, target_trn, target_tst = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=2,
)

## Train Model

### Logistic Regression

In [None]:
# initialize model
# The features are not scaled, so the default solver does not converge
# within the default 100 iterations and emits a ConvergenceWarning.
# Give it a larger iteration budget so the fit can run to convergence.
model = LogisticRegression(max_iter=10000)

In [None]:
# fit the classifier on the training split (features -> label)
model.fit(X=features_trn, y=target_trn)

## Model Evaluation

### Accuracy Score

In [None]:
# accuracy on training data
features_trn_prediction = model.predict(features_trn)
training_data_accuracy = accuracy_score(target_trn, features_trn_prediction)

print('Accuracy on training data:\n', training_data_accuracy)

# accuracy on held-out test data
features_tst_prediction = model.predict(features_tst)
testing_data_accuracy = accuracy_score(target_tst, features_tst_prediction)

# label fixed: this value is the test-set accuracy, not the training one
print('Accuracy on test data:\n', testing_data_accuracy)

## Build Predictive System

In [None]:
# replace this data with input vector to be classified
# (one value per feature, in the same order as the training columns)
in_data = (
    20.57, 17.77, 132.9, 1326, 0.08474, 0.07864, 0.0869, 0.07017, 0.1812, 0.05667,
    0.5435, 0.7339, 3.398, 74.08, 0.005225, 0.01308, 0.0186, 0.0134, 0.01389, 0.003532,
    24.99, 23.41, 158.8, 1956, 0.1238, 0.1866, 0.2416, 0.186, 0.275, 0.08902,
)

# use numpy array
in_data_arr = np.asarray(in_data, dtype=float)

# reshape to predict for one datapoint (1 row, n_features columns)
in_data_reshaped = in_data_arr.reshape(1, -1)

# The model was fitted on a DataFrame with named columns; predicting on a
# bare ndarray makes scikit-learn warn about missing feature names. Wrap the
# sample in a one-row DataFrame that carries the training column names.
in_data_df = pd.DataFrame(in_data_reshaped, columns=features.columns)

prediction = model.predict(in_data_df)

# label 0 is reported as cancerous, any other label as benign
diagnosis = 'Cancerous' if prediction[0] == 0 else 'Benign'

print('Predicted diagnosis:\n', diagnosis)