# FINAL PROJECT — LOGISTIC REGRESSION MODEL

## Import Packages & Read in the Data

In [23]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, plot_confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [6]:
breastCancer = pd.read_csv('/Users/benvanzyll/Desktop/CPSC393/FinalProject/breast-cancer.csv')
breastCancer.drop('id', axis=1, inplace=True)
breastCancer.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## Encode Diagnosis Label (0 for Benign, 1 for Malignant)

In [7]:
diagnosis_num = {'B':0, 'M':1}
breastCancer['diagnosis'] = breastCancer['diagnosis'].map(diagnosis_num)
breastCancer.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## Split Data into Training, Validation, and Testing Sets

In [8]:
features = breastCancer.drop('diagnosis', axis=1)
labels = breastCancer['diagnosis']

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.33, random_state=42)

# Check to make sure each set is the current size (0.7, 0.2, 0.1)
for dataset in [y_train, y_val, y_test]:
    print(round(len(dataset) / len(labels), 2))

0.7
0.2
0.1


## Define Function for Calculating Optimal Hyperparameters

In [9]:
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

## Build Model & Perform 5-Fold Cross Validation

In [10]:
lr = LogisticRegression(max_iter=10000)

In [8]:
parameters = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

cv = GridSearchCV(lr, parameters, cv=5)
cv.fit(X_train, y_train.values.ravel())

print_results(cv)

BEST PARAMS: {'C': 1000}

0.93 (+/-0.052) for {'C': 0.001}
0.942 (+/-0.044) for {'C': 0.01}
0.945 (+/-0.044) for {'C': 0.1}
0.95 (+/-0.048) for {'C': 1}
0.952 (+/-0.046) for {'C': 10}
0.955 (+/-0.059) for {'C': 100}
0.972 (+/-0.033) for {'C': 1000}


------------------------------------------
Best parameters to try on validation set:
    
    {'C': 1000} = 0.972
    {'C': 100}  = 0.955
    {'C': 10}   = 0.952

## Try 3 Best Hyperparameter Settings on Validation Set

In [11]:
lr1 = LogisticRegression(C=1000, max_iter=10000)
lr1.fit(X_train, y_train.values.ravel())

LogisticRegression(C=1000, max_iter=10000)

In [12]:
lr2 = LogisticRegression(C=100, max_iter=10000)
lr2.fit(X_train, y_train.values.ravel())

LogisticRegression(C=100, max_iter=10000)

In [11]:
lr3 = LogisticRegression(C=10, max_iter=10000)
lr3.fit(X_train, y_train.values.ravel())

LogisticRegression(C=10, max_iter=10000)

In [15]:
for lr in [lr1, lr2, lr3]:
    y_pred = lr.predict(X_val)
    accuracy = round(accuracy_score(y_val, y_pred), 3)
    precision = round(precision_score(y_val, y_pred), 3)
    recall = round(recall_score(y_val, y_pred), 3)
    print('C: {} -- A: {} / P: {} / R: {}'.format(lr.C,
                                                    accuracy,
                                                    precision,
                                                    recall))

C: 1000 -- A: 0.982 / P: 1.0 / R: 0.957
C: 100 -- A: 0.982 / P: 1.0 / R: 0.957
C: 10 -- A: 0.982 / P: 1.0 / R: 0.957


------------------------------------------
We will select a C value of 10; C value is a regularization
hyperparameter, and as C increases, regularization increaseses,
subsequently making the model more generalizable for future
unseen data. Because we can have just as accurate of a model with
a lower C value, we will go with the lower C value for our final
model to test on the test set, as there is no point in making the
model even more complex when accuracy, precision, and recall do
not improve at all when doing so.

## Test Final Model on Testing Set and Evaluate Results

In [21]:
y_pred = lr3.predict(X_test)
accuracy = round(accuracy_score(y_test, y_pred), 3)
precision = round(precision_score(y_test, y_pred), 3)
recall = round(recall_score(y_test, y_pred), 3)
print('C: {} -- A: {} / P: {} / R: {}'.format(lr3.C,
                                                accuracy,
                                                precision,
                                                recall))

C: 10 -- A: 0.982 / P: 0.941 / R: 1.0


------------------------------------------
Our final model performs very well on unseen data.
Performance was as good as validation data, and an accuracy score of
0.982, precision of 0.941, and recall of 1.0 is abnormally good. Likely due
to the simplicity of the data and its already linearly separable state.

In [12]:
log_odds = lr3.coef_[0]

pd.DataFrame(log_odds,
            X_train.columns,
            columns=['coef'])\
            .sort_values(by='coef', ascending=False)

Unnamed: 0,coef
concavity_worst,4.298551
concave points_worst,3.372849
symmetry_worst,3.06763
concave points_mean,2.323731
concavity_mean,2.194536
smoothness_worst,2.187933
smoothness_mean,1.310484
symmetry_mean,1.283606
compactness_worst,1.163168
radius_worst,0.915286


In [14]:
odds = np.exp(lr3.coef_[0])
pd.DataFrame(odds,
             X_train.columns,
             columns=['coef'])\
             .sort_values(by='coef', ascending=False)

Unnamed: 0,coef
concavity_worst,73.593046
concave points_worst,29.16148
symmetry_worst,21.490912
concave points_mean,10.213715
concavity_mean,8.975831
smoothness_worst,8.916766
smoothness_mean,3.707968
symmetry_mean,3.609633
compactness_worst,3.200055
radius_worst,2.497489


In [22]:
confusion_matrix(y_test, y_pred)

array([[40,  1],
       [ 0, 16]])

In [36]:
plot_confusion_matrix(lr3, y_test, y_pred)



ValueError: X has 1 features, but LogisticRegression is expecting 20 features as input.