# Find optimal logistic regression parameters for cancer patients

import statements

In [5]:
import pandas as pd
import numpy as np
import json
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score

load data

In [6]:
# Load the breast-cancer data set and combine features and labels into one
# DataFrame whose last column is named 'target'.
cancer = load_breast_cancer()
df = pd.DataFrame(
    np.c_[cancer['data'], cancer['target']],
    columns=np.append(cancer['feature_names'], ['target']),
)
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
# X is every column except the trailing 'target'; y is the 1-D label vector.
X = df.iloc[:, :-1].to_numpy()
y = df['target'].to_numpy()

In [7]:
# Inspect the type of the label vector built in the previous cell.
type(y)

numpy.ndarray

generate test and train data

In [8]:
# Hold out 20% of the samples as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

set up logistic regression and configure GridSearchCV to find the optimal parameters

In [13]:
# Base estimator: L2-penalised logistic regression without an intercept term.
# Only the non-default settings are spelled out; `multi_class` is omitted because
# it is deprecated in recent scikit-learn releases and has no effect on a binary target.
model = LogisticRegression(penalty='l2', fit_intercept=False, solver='liblinear')

# Grid to explore: C = 1..59 for each of the two solvers.
# The grid values override the penalty/solver set on `model` above.
params = {
    'penalty': ['l2'],
    'C': list(range(1, 60)),
    'solver': ['newton-cg', 'liblinear'],
}

# error_score=0.0 scores a parameter combination that fails to fit as 0 instead of raising.
clf = GridSearchCV(model, params, error_score=0.0)

get the cross-validated accuracy of the parameters chosen by GridSearchCV

In [14]:
%%time
arr = cross_val_score(clf.fit(X, y), X, y, cv=3, scoring='accuracy')
arr.mean()

Wall time: 33.3 s


write solution

In [15]:
# Display the mean of the cross-validation scores held in `arr`.
arr.mean()

0.95253875429314017

In [None]:
# Persist the best parameter combination found by the grid search as pretty-printed JSON.
with open('classification_params_example.json', 'w') as fh:
    serialized = json.dumps(clf.best_params_, indent=4, sort_keys=True, separators=(',', ': '))
    fh.write(serialized)