# Hometask for the logistic regression.
## Dataset: breast_cancer
### Made by Andrey Stasishin

In [40]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import numpy as np
import heapq
import pandas

In [41]:
cancer_dataset = load_breast_cancer()
data = np.c_[cancer_dataset.data, cancer_dataset.target]
columns = np.append(cancer_dataset.feature_names, ["target"])
frame = pandas.DataFrame(data, columns=columns)

In [42]:
X = frame[frame.columns[:-1]]
y = frame.target

In [43]:
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=1)

In [44]:
algo = LogisticRegression()
arr = cross_val_score(algo, x_train, y_train, cv=3, scoring='accuracy')
print (arr)
print (np.mean(arr))

[ 0.90977444  0.93984962  0.96969697]
0.939773676616


In [45]:
algo.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [46]:
accuracy_score(algo.predict(x_test), y_test)

0.95321637426900585

In [47]:
algo.get_params().keys()

dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'solver', 'tol', 'verbose', 'warm_start'])

In [58]:
from time import time
from scipy.stats import randint as sp_randint
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


# specify parameters and distributions to sample from
param_dist = {"C": sp_randint.rvs(0.1, 2),
              "max_iter": sp_randint.rvs(10,300),
              "penalty": ["l1","l2"]}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(algo, param_distributions=param_dist,
                                   n_iter=n_iter_search)

# start = time()
# random_search.fit(x_train, y_train)
# print("RandomizedSearchCV took %.2f seconds for %d candidates"
#      " parameter settings." % ((time() - start), n_iter_search))
# report(random_search.cv_results_)
random_search

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=20, n_jobs=1,
          param_distributions={'C': 1, 'max_iter': 223, 'penalty': ['l1', 'l2']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)