# Imports

In [2]:
import numpy as np
import sklearn
import sklearn.model_selection as ms
import sklearn.metrics as metrics
import sklearn.datasets as dataSets
import sklearn.naive_bayes

In [3]:
dataSet = dataSets.load_breast_cancer()
X_train, X_test, y_train, y_test = ms.train_test_split(dataSet.data, dataSet.target, test_size=0.2)

# A

In [4]:
from sklearn.linear_model import LogisticRegression

In [40]:
lr = LogisticRegression(C=1.0,random_state=10, penalty='l2', tol=1e-4, intercept_scaling=1, fit_intercept=True,
                        solver="lbfgs", max_iter=6000)
lr.fit(X_train, y_train)
pred = lr.predict(X_test)

acc = metrics.accuracy_score(y_test, pred)
cm = metrics.confusion_matrix(y_test, pred)
auc = metrics.roc_auc_score(y_test, pred)

print(f'Acc: {acc}')
print(f'Auc: {auc}')
print(f'confusion matrix \n {cm} ')

Acc: 0.956140350877193
Auc: 0.956439393939394
confusion matrix 
 [[46  2]
 [ 3 63]] 


# B

In [41]:
from sklearn.naive_bayes import GaussianNB

In [42]:
clf = GaussianNB()
clf.fit(X_train, y_train)

pred = clf.predict(X_test)

acc = metrics.accuracy_score(y_test, pred)
cm = metrics.confusion_matrix(y_test, pred)
auc = metrics.roc_auc_score(y_test, pred)

print(f'Acc: {acc}')
print(f'Auc: {auc}')
print(f'confusion matrix \n {cm} ')

Acc: 0.9736842105263158
Auc: 0.96875
confusion matrix 
 [[45  3]
 [ 0 66]] 


# C

Lr

In [43]:
from sklearn import metrics

In [44]:
X_train.shape

(455, 30)

In [45]:
scores = []

for i in range(X_train.shape[1]):
    clf = LogisticRegression(random_state=10, penalty='l2', tol=1e-4, C=1.0, intercept_scaling=1, fit_intercept=True,
                             solver="lbfgs", max_iter=5000)
    data = X_train[:, i].reshape(X_train.shape[0], -1)
    clf.fit(data, y_train)
    pred = clf.predict(X_test[:, i].reshape(X_test.shape[0], -1))
    scores.append((i, metrics.roc_auc_score(y_test, pred)))

sorted_by_points = sorted(scores, key=lambda tup: tup[1], reverse=True)
sorted_by_points

[(20, 0.9071969696969697),
 (0, 0.896780303030303),
 (2, 0.896780303030303),
 (23, 0.896780303030303),
 (22, 0.8920454545454545),
 (3, 0.8863636363636365),
 (13, 0.8702651515151516),
 (26, 0.8494318181818182),
 (12, 0.8238636363636364),
 (10, 0.8077651515151515),
 (25, 0.7926136363636362),
 (27, 0.7708333333333334),
 (6, 0.7291666666666667),
 (1, 0.6609848484848485),
 (21, 0.65625),
 (5, 0.5577651515151516),
 (28, 0.5520833333333333),
 (7, 0.5416666666666667),
 (4, 0.5),
 (8, 0.5),
 (9, 0.5),
 (11, 0.5),
 (14, 0.5),
 (15, 0.5),
 (16, 0.5),
 (17, 0.5),
 (18, 0.5),
 (19, 0.5),
 (24, 0.5),
 (29, 0.5)]

In [46]:
selected_features = []
for i in range(20):
    selected_features.append(sorted_by_points[i][0])

In [47]:
clf = LogisticRegression(random_state=0, penalty='l2', tol=1e-4, C=1.0, intercept_scaling=1, fit_intercept=True,
                         solver="lbfgs", max_iter=6000)
clf.fit(X_train[:, selected_features], y_train)
pred = clf.predict(X_test[:, selected_features])
acc = metrics.accuracy_score(pred, y_test)
cm = metrics.confusion_matrix(pred, y_test)

print("LR results")
print(f'Acc: {acc}')
print(f'Auc: {auc}')
print(f'confusion matrix \n {cm} ')

LR results
Acc: 0.956140350877193
Auc: 0.96875
confusion matrix 
 [[46  3]
 [ 2 63]] 


Gussian

In [48]:
scores = []

for i in range(X_train.shape[1]):
    clf = GaussianNB()
    data = X_train[:, i].reshape(X_train.shape[0], -1)
    clf.fit(data, y_train)
    pred = clf.predict(X_test[:, i].reshape(X_test.shape[0], -1))
    scores.append((i, metrics.roc_auc_score(y_test, pred)))

sorted_by_points = sorted(scores, key=lambda tup: tup[1], reverse=True)
sorted_by_points

[(7, 0.9460227272727273),
 (27, 0.9176136363636365),
 (22, 0.9119318181818181),
 (23, 0.90625),
 (2, 0.9015151515151515),
 (20, 0.8986742424242424),
 (0, 0.8806818181818181),
 (3, 0.8806818181818181),
 (6, 0.8731060606060607),
 (13, 0.8570075757575758),
 (26, 0.8522727272727273),
 (5, 0.8352272727272728),
 (25, 0.8087121212121212),
 (10, 0.8077651515151515),
 (12, 0.8077651515151515),
 (17, 0.7225378787878787),
 (8, 0.7073863636363636),
 (28, 0.6931818181818181),
 (1, 0.6609848484848485),
 (24, 0.6581439393939394),
 (21, 0.65625),
 (4, 0.6534090909090909),
 (29, 0.6524621212121212),
 (16, 0.6212121212121212),
 (15, 0.5946969696969697),
 (18, 0.5625),
 (11, 0.5),
 (14, 0.5),
 (19, 0.5),
 (9, 0.49526515151515155)]

In [49]:
selected_features = []
for i in range(3):
    selected_features.append(sorted_by_points[i][0])

In [50]:
clf = LogisticRegression(random_state=0, penalty='l2', tol=1e-4, C=1.0, intercept_scaling=1, fit_intercept=True,
                         solver="lbfgs", max_iter=6000)
clf.fit(X_train[:, selected_features], y_train)
pred = clf.predict(X_test[:, selected_features])
acc = metrics.accuracy_score(pred, y_test)
cm = metrics.confusion_matrix(pred, y_test)

print("Gaussian results")
print(f'Acc: {acc}')
print(f'Auc: {auc}')
print(f'confusion matrix \n {cm} ')

Gaussian results
Acc: 0.8947368421052632
Auc: 0.96875
confusion matrix 
 [[42  6]
 [ 6 60]] 


# D

In [51]:
def convert_to_probebilties(train_x: np.ndarray, train_y: np.ndarray, test_x : np.ndarray):
    new_train_x = []
    new_test_x = []
    print(train_x.shape)
    for i in range(train_x.shape[1]):
        clf = LogisticRegression(random_state=0, penalty='l2', tol=1e-4, C=1.0, intercept_scaling=1, fit_intercept=True,
                                 solver="lbfgs", max_iter=5000)
        data = train_x[:, i].reshape(train_x.shape[0], -1)
        clf.fit(data, train_y)
        pred_x = clf.predict_proba(train_x[:, i].reshape(train_x.shape[0], -1))[:, 0]
        new_train_x.append(pred_x)
        pred_x = clf.predict_proba(test_x[:, i].reshape(test_x.shape[0], -1))[:, 0]
        new_test_x.append(pred_x)

    new_train_x = np.array(new_train_x).transpose()
    new_test_x = np.array(new_test_x).transpose()
    return new_train_x , new_test_x

In [52]:
X_train , X_test = convert_to_probebilties(X_train, y_train , X_test)
X_train

(455, 30)


array([[0.05076963, 0.25126848, 0.03957998, ..., 0.31475403, 0.30821986,
        0.35424038],
       [0.02062627, 0.24421788, 0.01962399, ..., 0.4317036 , 0.33253879,
        0.36358378],
       [0.17794082, 0.15359751, 0.14071122, ..., 0.33249268, 0.29404015,
        0.35469981],
       ...,
       [0.13893757, 0.34948601, 0.13646848, ..., 0.3299278 , 0.41872355,
        0.36260147],
       [0.00297811, 0.32569808, 0.00303387, ..., 0.3572694 , 0.38013467,
        0.36632196],
       [0.22151814, 0.14169968, 0.20262395, ..., 0.38265355, 0.36722425,
        0.35750196]])

In [53]:
X_test

array([[0.03723625, 0.33301002, 0.02896847, ..., 0.27851783, 0.37194253,
        0.36023898],
       [0.21977611, 0.06850118, 0.21462377, ..., 0.29003275, 0.34584171,
        0.3573207 ],
       [0.98952117, 0.74313165, 0.99430909, ..., 0.47509726, 0.42519581,
        0.36815584],
       ...,
       [0.98626996, 0.32621761, 0.98829766, ..., 0.3701379 , 0.35626125,
        0.35559021],
       [0.07927595, 0.30728704, 0.06093443, ..., 0.30653256, 0.36771119,
        0.35730471],
       [0.11642285, 0.21457253, 0.08676224, ..., 0.26670651, 0.32048533,
        0.3552419 ]])

now repeate part a and b

In [57]:
lr = sklearn.naive_bayes.MultinomialNB()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)

acc = metrics.accuracy_score(y_test, pred)
cm = metrics.confusion_matrix(y_test, pred)
auc = metrics.roc_auc_score(y_test, pred)

print('MultinomialNB')
print(f'Acc: {acc}')
print(f'Auc: {auc}')
print(f'confusion matrix \n {cm} ')

MultinomialNB
Acc: 0.9035087719298246
Auc: 0.9024621212121213
confusion matrix 
 [[43  5]
 [ 6 60]] 


In [58]:
clf = GaussianNB()
clf.fit(X_train, y_train)

pred = clf.predict(X_test)
acc = metrics.accuracy_score(y_test, pred)
cm = metrics.confusion_matrix(y_test, pred)
auc = metrics.roc_auc_score(y_test, pred)

print(f'Gussian')
print(f'Acc: {acc}')
print(f'Auc: {auc}')
print(f'confusion matrix \n {cm} ')

Gussian
Acc: 0.956140350877193
Auc: 0.9621212121212122
confusion matrix 
 [[48  0]
 [ 5 61]] 


In [59]:
clf = LogisticRegression(random_state=10, penalty='l2', tol=1e-4, C=1.0, intercept_scaling=1, fit_intercept=True,
                        solver="lbfgs", max_iter=8000)
clf.fit(X_train, y_train)

pred = clf.predict(X_test)
acc = metrics.accuracy_score(y_test, pred)
cm = metrics.confusion_matrix(y_test, pred)
auc = metrics.roc_auc_score(y_test, pred)

print('LogisticRegression')
print(f'Acc: {acc}')
print(f'Auc: {auc}')
print(f'confusion matrix \n {cm} ')

LogisticRegression
Acc: 0.9824561403508771
Auc: 0.9820075757575757
confusion matrix 
 [[47  1]
 [ 1 65]] 
