# Simple Machine Learning Examples

### Import libraries and define functions

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, recall_score,confusion_matrix

def _plot_c_path(model, scores, scores_sem, best_c, c1se):
    """
    Plot the mean cross-validation score (with SEM error bars) against log(C) and mark
    the best-score model and the parsimonious model with vertical lines
    :param model: fitted LogisticRegressionCV model (its coefs_paths_ and Cs_ are read)
    :param scores: mean score per C value
    :param scores_sem: standard error of the mean of the score per C value
    :param best_c: C value of the best-score model (scalar)
    :param c1se: C value of the parsimonious model (scalar)
    """
    # Number of non-zero fold-averaged coefficients per C value;
    # the -1 is to make sure the intercept is not included
    included_vars = np.sum(model.coefs_paths_[1.0].mean(axis=0) != 0,  axis=1) - 1
    # Take only 30 evenly spaced samples from included_vars to label the top axis
    included_vars = included_vars[
        [int(item) for item in np.linspace(0, len(included_vars) - 1, 30)]]

    fig, ax = plt.subplots(figsize=(8, 5))
    ax3 = ax.twiny()
    ax3.set_xticks(np.arange(0, len(included_vars) + 2), [''] + list(included_vars) + [''], font = 'Cambria', fontsize = 12)
    ax3.tick_params(width=0, length = 0)
    ax3.set_xlabel('Included Variables', font = "Cambria", fontsize = 18)
    ax.axvline(x=np.log(best_c), color='grey', ls='-', lw=1, label='Best Score Model')
    ax.axvline(x=np.log(c1se), color='grey', ls='-.', lw=1, label='Parsimonious Model')
    ax.errorbar(np.log(model.Cs_), scores, scores_sem, fmt='o', linewidth=1,
                color='grey', mfc='royalblue', mec='none', capsize=4)
    ax.legend()
    x_axis_text = np.round(ax.get_xticks()[1:-1],1)
    y_axis_text = np.round(ax.get_yticks()[1:-1],1)
    ax.set_xticks(ticks =x_axis_text, labels =  x_axis_text,font = 'Cambria', fontsize = 12)
    ax.set_yticks(ticks =y_axis_text, labels =  y_axis_text,font = 'Cambria', fontsize = 12)
    ax.set_xlabel('log(C)',font = 'Cambria', fontsize = 18)
    ax.set_ylabel('Accuracy',font = 'Cambria', fontsize = 18)


def choose_model(model, c_type = 'best', plot_result = True):
    """
    This function chooses a model from a set of models identified using LogisticRegressionCV
    It can return the best model (model.C_) or the most parsimonious model, which is the model with the
    smallest C whose score is within 1 standard error from the best score. If no smaller C qualifies,
    the parsimonious model is the best model itself.
    The per-fold scores and coefficient paths are read under the class key 1.0.
    :param model: logistic regression model with cross validation (LogisticRegressionCV)
    :param c_type: type of C value to return,
                   can be either 'best' for best model, or 'par' for the most parsimonious model
    :param plot_result: whether to plot the result and show the best model and parsimonious model on the same figure
    :return: C value for the chosen model
    :raises Warning: if c_type is neither 'best' nor 'par'
    """
    # Validate up front so that an invalid request does not produce a figure first.
    # The exception type is kept as Warning for backward compatibility.
    if c_type not in ('best', 'par'):
        raise Warning("c_type can only be set to 'best' or 'par'")

    c_vals = np.asarray(model.Cs_)
    best_c = model.C_[0]  # model.C_ is an array; work with the scalar value

    fold_scores = model.scores_[1.0]  # one row per fold, one column per C value
    n_folds = fold_scores.shape[0]

    # Position of the best C within the tested grid
    best_c_ind = int(np.argmin(np.abs(c_vals - best_c)))

    scores = fold_scores.mean(axis=0)
    scores_sem = fold_scores.std(axis=0) / np.sqrt(n_folds)

    # Get 1 standard error of the mean (SEM) from the best accuracy,
    # According to Friedman, Hastie, and Tibshirani (2010): since risk curves are estimated
    # with errors, it is better to err on the side of parsimony
    best_sem = scores_sem[best_c_ind]

    # Find the first (smallest) C below the best one whose score is within one SEM of the
    # best score. When there is none (including when the best C is the first grid value),
    # fall back to the best C instead of failing on an empty result.
    candidates = np.where(scores[best_c_ind] - scores[0:best_c_ind] < best_sem)[0]
    c1se_ind = candidates[0] if candidates.size else best_c_ind
    c1se = c_vals[c1se_ind]

    if plot_result:
        _plot_c_path(model, scores, scores_sem, best_c, c1se)

    if c_type == 'best':
        return best_c
    return c1se


def model_performance(model,X_test,y_test, feature_names=None):
    """
    Print the coefficients of a fitted binary classifier and its sensitivity, specificity and accuracy
    on a test set
    :param model: the model to be tested; its intercept_, coef_ and predict() are used
    :param X_test: feature matrix of the test set
    :param y_test: true labels of the test set
    :param feature_names: names of the features, in the same order as the model coefficients.
                          If None, the names are taken from the module-level `data` frame
                          (all columns except the first and the last), as before
    :return: None; the coefficients and the scores are printed
    """
    if feature_names is None:
        # Backward-compatible default: read the names from the global data frame
        feature_names = data.columns[1:-1]

    # let's have a look at the coefficients and see if anything was removed
    print('Coefficients:')
    coefs = [model.intercept_[0]] + list(model.coef_[0])
    # coefficients that are exactly zero are shown as "-"
    coefs = [str(np.round(item,3)) if item!=0 else "-" for item in coefs]
    coef_names = ['intercept'] + list(feature_names)
    coefficients = pd.DataFrame(coefs, index=coef_names,columns=['value'])
    print(coefficients)
    print('\n Scores:')

    y_pred = model.predict(X_test)
    # Pass the model's class labels (when available) so the matrix keeps one row/column per
    # class even if a class is missing from y_test and y_pred
    labels = getattr(model, 'classes_', None)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=labels).ravel()
    # A score is undefined (nan) when the test set holds no sample of the corresponding class
    sens = tp / (tp + fn) if (tp + fn) else float('nan')
    spec = tn / (tn + fp) if (tn + fp) else float('nan')
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    print(pd.DataFrame([[sens, spec, accuracy]], columns=['Sensitivity', 'Specificity', 'Accuracy']))


### Load the data, impute missing values and have a look at the data

In [2]:
# Location of the CSV file to analyse
data_dir = r'\\klinik.uni-wuerzburg.de\homedir\userdata11\Sawalma_A\data\Documents\12874_2019_681_MOESM1_ESM.csv'

# Read the file and keep only the first 150 rows
data = pd.read_csv(data_dir)
data = data.iloc[:150]

# Replace every NaN with the mean of its column, then restore the column names
# (the imputer output is rebuilt into a data frame)
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
imp_data = pd.DataFrame(imp.fit_transform(data.copy()), columns=data.columns)
imp_data

Unnamed: 0,ID,thickness,cell_size,cell_shape,adhesion,epithelial_size,bare_nuclei,bland_cromatin,normal_nucleoli,mitoses,class
0,1000025.0,5.0,1.0,1.0,1.0,2.0,1.000000,3.0,1.0,1.0,0.0
1,1002945.0,5.0,4.0,4.0,5.0,7.0,10.000000,3.0,2.0,1.0,0.0
2,1015425.0,3.0,1.0,1.0,1.0,2.0,2.000000,3.0,1.0,1.0,0.0
3,1016277.0,6.0,8.0,8.0,1.0,3.0,4.000000,3.0,7.0,1.0,0.0
4,1017023.0,4.0,1.0,1.0,3.0,2.0,1.000000,3.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
145,1184840.0,1.0,1.0,3.0,1.0,2.0,3.787671,2.0,1.0,1.0,0.0
146,1185609.0,3.0,4.0,5.0,2.0,6.0,8.000000,4.0,1.0,1.0,1.0
147,1185610.0,1.0,1.0,1.0,1.0,3.0,2.000000,2.0,1.0,1.0,0.0
148,1187457.0,3.0,1.0,1.0,3.0,8.0,1.000000,5.0,8.0,1.0,0.0


### Create training and test datasets

In [7]:
# Choose features: every column except the first one and the last one;
# the last column is used as the target
X = imp_data.iloc[:,1:-1]
y = imp_data.iloc[:,-1]

# Divide into training and test sets (25% of the rows go to the test set)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10)

# scale both X_train and X_test for faster convergence
# The scaler is fitted on the training set only; the test set is transformed with the
# training mean and standard deviation so that both sets are on the same scale and no
# information from the test set is used
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


### Create and fit the model

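Choose a number of C values for the model to test. The model will be fitted for every C value on each cross-validation fold, and the scores are then averaged over the folds so that the C values can be compared.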

Here is an illustration of the cross-validation procedure from the [scikit-learn website](https://scikit-learn.org/stable/_images/grid_search_cross_validation.png):

<img src="https://scikit-learn.org/stable/_images/grid_search_cross_validation.png" width=440 height=305 />

In [None]:
# Grid of 50 candidate C values, log-spaced from 1e-3 to 1e1.
# The smaller the C value, the higher the penalty is.
c_vals = np.logspace(-3, 1, 50)

# L1-penalised logistic regression, 10-fold cross validation, scored by accuracy.
# scoring = 'neg_mean_squared_error' is also possible, but remember to multiply the scores with -1
cv_model = LogisticRegressionCV(
    Cs=c_vals,
    penalty='l1',
    cv=10,
    tol=0.001,
    solver='saga',
    scoring='accuracy',
)

# Fit the cross-validated model to the training data set
cv_model.fit(X_train, y_train)

# Best-score model: take its C value (and plot the scores), refit on the training set,
# then report the performance on the test set
best_c = choose_model(model=cv_model, c_type='best', plot_result=True)
best_model = LogisticRegression(C=best_c, penalty='l1', tol=0.001, solver='saga')
best_model = best_model.fit(X_train, y_train)
model_performance(best_model, X_test, y_test)

# Parsimonious model: same steps with the 'par' C value, without plotting again
par_c = choose_model(model=cv_model, c_type='par', plot_result=False)
par_model = LogisticRegression(C=par_c, penalty='l1', tol=0.001, solver='saga')
par_model = par_model.fit(X_train, y_train)
model_performance(par_model, X_test, y_test)