# Logistic Regresison Brest Cancer Case study

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
dts = pd.read_csv('breast_cancer.csv')
X = dts.iloc[:, 1:-1].values
y = dts.iloc[:, -1].values

In [3]:
y = np.reshape(y, (-1,1))

## Splitting dataset into training and test set

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.25)

## Training the logistic regression model on Training set

In [5]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)

LogisticRegression(random_state=0)

## Predicting tet set result

In [6]:
y_pred = clf.predict(X_test)

In [7]:
print(y_pred)

[2 2 4 4 2 2 2 4 2 2 4 2 4 2 2 2 4 4 4 2 2 2 4 2 4 4 2 2 2 4 2 4 4 2 2 2 4
 4 2 4 2 2 2 2 2 2 2 4 2 2 4 2 4 2 2 2 4 4 2 4 2 2 2 2 2 2 2 2 4 4 2 2 2 2
 2 2 4 2 2 2 4 2 4 2 2 4 2 4 4 2 4 2 4 4 2 4 4 4 4 2 2 2 4 4 2 2 4 2 2 2 4
 2 2 4 2 2 2 2 2 2 2 4 2 2 4 4 2 4 2 4 2 2 4 2 2 4 2 4 2 2 2 4 2 2 2 4 4 2
 4 2 4 2 2 2 2 2 4 4 2 4 4 4 4 2 4 2 2 2 2 2 2]


## computing Confusion Matrix and accuracy

In [8]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accu = accuracy_score(y_test, y_pred)
print("Test set Accuracy: {:.3f}%".format(accu*100))

[[103   4]
 [  5  59]]
Test set Accuracy: 94.737%


## Computing the accuracy with k-Fold Cross Validation

In [9]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=clf, cv=10, X=X_train, y=y_train)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 96.87 %
Standard deviation: 1.57 %


## Applying Grid Search to find the best model and the best parameters

In [10]:
from sklearn.model_selection import GridSearchCV
parameters = [{'penalty': ['l1', 'l2', 'elasticnet'], 'C':[0.25, 0.5, 0.75, 1], 'random_state': [0, 42, 50, 69, 129], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}]
grid_search = GridSearchCV(estimator=clf, 
                           param_grid=parameters, 
                           scoring='accuracy', 
                           cv=10, 
                           n_jobs=-1)
grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.3f}%".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 97.270%
Best Parameters: {'C': 0.25, 'penalty': 'l1', 'random_state': 0, 'solver': 'saga'}


## Linear Regression Model with best parameters 

In [11]:
from sklearn.linear_model import LogisticRegression
clf_2 = LogisticRegression(random_state=0, C=0.25, penalty='l1', solver='saga')
clf_2.fit(X_train, y_train)

LogisticRegression(C=0.25, penalty='l1', random_state=0, solver='saga')

In [12]:
y_pred_2 = clf_2.predict(X_test)
print(y_pred_2)

[2 2 4 4 2 2 2 4 2 2 4 2 4 2 2 2 4 4 4 2 2 2 4 2 4 4 2 2 2 4 2 4 4 2 2 2 4
 4 2 4 2 2 2 2 2 2 2 4 2 2 4 2 4 2 2 2 4 4 2 4 2 2 2 2 2 2 2 2 4 4 2 2 2 2
 2 2 4 2 2 2 4 2 4 2 2 4 2 4 4 2 4 2 4 2 2 4 4 4 2 2 2 2 4 4 2 2 4 2 2 2 4
 2 2 4 2 2 2 2 2 2 2 4 2 2 4 4 2 4 2 4 2 2 4 2 2 4 2 4 2 2 2 2 2 2 2 4 4 2
 4 2 4 2 2 2 2 2 4 4 2 4 4 4 4 2 4 2 2 2 2 2 2]


In [13]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=clf_2, cv=10, X=X_train, y=y_train)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 97.27 %
Standard deviation: 2.77 %
