# Logistic Regression

In [1]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv("breast_cancer.csv")
df.shape
df = df.iloc[:,:-1]
x = df.iloc[:,2:].values
y = df.iloc[:,1].values
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=700)
print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

(455, 30) (114, 30)
(455,) (114,)


In [2]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression()
lr_model.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [3]:
from sklearn.metrics import confusion_matrix, accuracy_score
p_score = lr_model.predict_proba(x_train)[:,1]
for t in np.arange(0,1,0.05):
    class_labels = p_score >= t
    print("Threshold is", t)
    print(confusion_matrix(y_train,class_labels.astype(int)))
    print()

Threshold is 0.0
[[  0 288]
 [  0 167]]

Threshold is 0.05
[[234  54]
 [  0 167]]

Threshold is 0.1
[[259  29]
 [  1 166]]

Threshold is 0.15000000000000002
[[266  22]
 [  1 166]]

Threshold is 0.2
[[275  13]
 [  2 165]]

Threshold is 0.25
[[277  11]
 [  4 163]]

Threshold is 0.30000000000000004
[[278  10]
 [  7 160]]

Threshold is 0.35000000000000003
[[279   9]
 [  8 159]]

Threshold is 0.4
[[279   9]
 [ 10 157]]

Threshold is 0.45
[[279   9]
 [ 11 156]]

Threshold is 0.5
[[279   9]
 [ 12 155]]

Threshold is 0.55
[[281   7]
 [ 12 155]]

Threshold is 0.6000000000000001
[[283   5]
 [ 12 155]]

Threshold is 0.65
[[286   2]
 [ 13 154]]

Threshold is 0.7000000000000001
[[287   1]
 [ 15 152]]

Threshold is 0.75
[[287   1]
 [ 16 151]]

Threshold is 0.8
[[287   1]
 [ 19 148]]

Threshold is 0.8500000000000001
[[288   0]
 [ 21 146]]

Threshold is 0.9
[[288   0]
 [ 26 141]]

Threshold is 0.9500000000000001
[[288   0]
 [ 31 136]]



In [20]:
c_25 = p_score > 0.25
print("Training:\n",confusion_matrix(y_train,c_25.astype(int)))
p_score_test = lr_model.predict_proba(x_test)[:,1]
c_25_test = p_score_test > 0.25
print("Testing:\n",confusion_matrix(y_test,c_25_test.astype(int)))

Training:
 [[277  11]
 [  4 163]]
Testing:
 [[62  7]
 [ 3 42]]


# Support Vector Machine

In [14]:
from sklearn.svm import SVC

In [15]:
from sklearn.model_selection import GridSearchCV
svm_classifier = SVC()
param_grid = {
    "C":[0.001,0.01,0.1,1.0],
    "kernel":["linear","sigmoid"],
    "gamma":[0.001,0.01,0.1,1.0]
}
grid = GridSearchCV (estimator=svm_classifier, param_grid=param_grid, cv=5)
grid.fit(x_train, y_train)
print(grid.best_estimator_)
print("\nBest Score: ",grid.best_score_)

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

Best Score:  0.9648351648351648


In [19]:
print("Testing Acc Score",accuracy_score(y_test, grid.predict(x_test)))
print("Training:\n",confusion_matrix(y_train, grid.predict(x_train)))
print("Testing:\n",confusion_matrix(y_test, grid.predict(x_test)))

Testing Acc Score 0.8771929824561403
Training:
 [[284   4]
 [  9 158]]
Testing:
 [[62  7]
 [ 7 38]]
