In [1]:
import numpy as np
import pandas as pd

In [2]:
# FIX separate import of libraries from actual code
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score 
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.datasets import load_breast_cancer

In [3]:
# FIX use separate cells to define functions
def load_cancer_data(): 
  cancer = load_breast_cancer()     
  data = np.c_[cancer.data, cancer.target]
  columns = np.append(cancer.feature_names, ["target"])
  return pd.DataFrame(data, columns=columns)

In [4]:
all_data = load_cancer_data()

In [12]:
# FIX use meaningfull names e.g. validation_data, training_data instead of delayed_data (отложенные данные?) and all_data
validation_data = all_data[int(0.8 * len(all_data)):]
training_data = all_data[:int(0.8 * len(all_data))]

# FIXIT bad assumption that targets are always at the end (better specify target column name and 
# split the data according to the name of this variable)
training_labels = training_data[training_data.columns[-1]].values
training_feature_matrix = training_data[training_data.columns[:-1]].values

validation_labels = validation_data[validation_data.columns[-1]].values
validation_feature_matrix = validation_data[validation_data.columns[:-1]].values


train_feature_matrix, test_feature_matrix, train_labels, test_labels = train_test_split(
    training_feature_matrix, training_labels, test_size=0.33, random_state=42)

## Use sklearn logistic regression

In [13]:
lp = np.exp(np.linspace(-5, 5, 11))

logistic_regression_classifier = LogisticRegression(max_iter=1000)
param_grid = [{'penalty': ['l1', 'l2'],
               'C': lp       
              }]

searcher = GridSearchCV(logistic_regression_classifier, param_grid, cv=5,  verbose=10, n_jobs = 4)
searcher.fit(training_feature_matrix, training_labels)

Fitting 5 folds for each of 22 candidates, totalling 110 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    3.0s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    3.1s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    3.2s
[Parallel(n_jobs=4)]: Batch computation too fast (0.1892s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    3.3s
[Parallel(n_jobs=4)]: Batch computation too fast (0.1344s.) Setting batch_size=4.
[Parallel(n_jobs=4)]: Done  36 tasks      | elapsed:    3.6s
[Parallel(n_jobs=4)]: Done  62 tasks      | elapsed:    5.3s
[Parallel(n_jobs=4)]: Done 110 out of 110 | elapsed:    7.9s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid=[{'penalty': ['l1', 'l2'], 'C': array([6.73795e-03, 1.83156e-02, 4.97871e-02, 1.35335e-01, 3.67879e-01,
       1.00000e+00, 2.71828e+00, 7.38906e+00, 2.00855e+01, 5.45982e+01,
       1.48413e+02])}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

In [14]:
searcher.best_score_

0.9714285714285714

In [15]:
searcher.best_params_

{'C': 20.085536923187668, 'penalty': 'l1'}

In [19]:
test_predicted_labels = searcher.best_estimator_.predict(test_feature_matrix)
print(classification_report(test_labels, test_predicted_labels))

              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98        65
         1.0       0.99      0.99      0.99        86

   micro avg       0.99      0.99      0.99       151
   macro avg       0.99      0.99      0.99       151
weighted avg       0.99      0.99      0.99       151



In [20]:
classifier = LogisticRegression(max_iter=1000)

classifier.fit(train_feature_matrix, train_labels)
validation_predicted_labels = classifier.predict(validation_feature_matrix)
print(classification_report(validation_labels , validation_predicted_labels))

              precision    recall  f1-score   support

         0.0       0.76      0.96      0.85        26
         1.0       0.99      0.91      0.95        88

   micro avg       0.92      0.92      0.92       114
   macro avg       0.87      0.94      0.90       114
weighted avg       0.94      0.92      0.92       114





In [21]:
validation_feature_matrix.shape

(114, 30)

In [61]:
x_train = feature_matrix.reshape(-1, 1).astype('float32')

In [63]:
x_train.shape

(8730, 1)

In [68]:
tensor_y.shape

torch.Size([291, 1])

## Use PyTorch

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import torch.utils.data as utils
from torch.optim import Adam , Adadelta, SGD
from sklearn.preprocessing import OneHotEncoder

In [47]:
x_train = training_feature_matrix.reshape(-1, 1).astype('float32')
y_train = training_labels.reshape(-1, 1).astype('float32')

tensor_x = torch.stack([torch.Tensor(i) for i in training_feature_matrix.astype('float64')])
# FIXIT if you use one hot encoding, you should pass labels on One-Hot-Encoding format
tensor_y = torch.stack([torch.Tensor(i) for i in 
                        OneHotEncoder(sparse=False).fit_transform((training_labels).reshape(-1, 1)).astype('float32')]) 

my_dataset = utils.TensorDataset(tensor_x, tensor_y) 
batch_size = 50
my_dataloader = utils.DataLoader(my_dataset, batch_size=batch_size,
                                 shuffle=True, num_workers=4)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [48]:
class LogisticRegression(nn.Module):
    def __init__(self, input_size, num_classes):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_size, num_classes) 
        
    def forward(self, x):
        out = self.linear(x)
        # out = nn.Sigmoid()(out) # avoid this step, we need only 
        return out

# FIXIT why do we need this function?
def get_param_values():
    return w.data[0][0], b.data[0]

In [53]:
model = LogisticRegression(30, 2)
criterion = nn.CrossEntropyLoss() # This criterion combines nn.LogSoftmax() and nn.NLLLoss() in one single class!
# categorical cross entropy
optimizer = Adam(model.parameters(), 0.01)

[w, b] = model.parameters()

In [7]:
torch.cuda.is_available()

True

In [54]:
epoch_number = 30
for epoch in range(epoch_number):
    model.train()
    epoch_loss = 0
    batch_number = 0
    for batch_id, batch_sample in enumerate(my_dataloader):       
        #print(x[0].shape)
        #print(x[1])
        optimizer.zero_grad()
        points = batch_sample[0]
        
        predicted_probabilities = model(points)
        #print(trans.shape)
        #print(x[1].shape)
        loss = criterion(predicted_probabilities, torch.max(batch_sample[1], 1)[1])

        epoch_loss += loss.item()
        batch_number += 1
        loss.backward()
        optimizer.step()
    print("loss on epoch {} = {}".format(epoch, epoch_loss / batch_number))

loss on epoch 0 = 64.1955135345459
loss on epoch 1 = 31.68964672088623
loss on epoch 2 = 15.890056228637695
loss on epoch 3 = 9.68368957042694
loss on epoch 4 = 4.351440572738648
loss on epoch 5 = 1.991914439201355
loss on epoch 6 = 1.1975868850946427
loss on epoch 7 = 1.119466695189476
loss on epoch 8 = 0.9694336377084255
loss on epoch 9 = 1.0040420204401017
loss on epoch 10 = 0.815358966588974
loss on epoch 11 = 0.9001766711473465
loss on epoch 12 = 0.6263689577579499
loss on epoch 13 = 0.5624534502625466
loss on epoch 14 = 0.5720000777393579
loss on epoch 15 = 0.4342592859175056
loss on epoch 16 = 0.4412039041519165
loss on epoch 17 = 0.5433797866106034
loss on epoch 18 = 0.3401086062192917
loss on epoch 19 = 0.305942979734391
loss on epoch 20 = 0.27461996615165846
loss on epoch 21 = 0.23795140013098717
loss on epoch 22 = 0.26984078623354435
loss on epoch 23 = 0.27948376489803195
loss on epoch 24 = 0.21755223721265793
loss on epoch 25 = 0.15225974165368825
loss on epoch 26 = 0.21412

In [None]:
classifier.fit(train_feature_matrix, train_labels)
y_ans = clf.predict(delayed_feature_matrix)
print(classification_report(delayed_labels , y_ans))

In [56]:
x_delayed = torch.stack([torch.Tensor(i) for i in validation_feature_matrix.astype('float64')])

In [57]:
x_delayed.shape

torch.Size([114, 30])

In [58]:
torch_results = model(x_delayed)

In [59]:
torch_labels = torch.max(torch_results, 1)[1]

In [60]:
accuracy_scoretorch_labels


tensor([1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
        0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1])