In [20]:
import sys
sys.path.append('/Users/samrelins/Documents/LIDA/ace_project/')
from data_prep.data_prep import *
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression, LogisticRegression

def fit_lm_and_return_plot(data, logistic=False, show_decision_line=False):
    """
    Fit a single-feature linear or logistic model and plot it over the data

    :param data: DataFrame with a numeric "x" column and a binary "y" column
    :param logistic: fit LogisticRegression and plot P(y = 1) if True,
        otherwise fit LinearRegression and plot its raw predictions
    :param show_decision_line: add a dashed red vertical line at the x value
        where the fitted curve crosses 0.5
    :return: plotly figure holding the scatter, the fitted curve and the
        optional decision line
    """
    clf = LogisticRegression() if logistic else LinearRegression()
    clf.fit(data["x"].values.reshape(-1, 1), data["y"])

    y_labels = data["y"].apply(lambda y: "y = 1" if y else "y = 0")
    fig = px.scatter(data, x="x", y="y", color=y_labels)

    # Evaluate the fitted model on a fixed grid of x values to draw its curve.
    x_values = np.arange(0, 10, 0.1)
    x_grid = x_values.reshape(-1, 1)
    if logistic:
        # Column 1 of predict_proba is the probability of the second class.
        y_pred_values = clf.predict_proba(x_grid)[:, 1]
    else:
        y_pred_values = clf.predict(x_grid)
    fig.add_trace(
        go.Scatter(
            x=x_values,
            y=y_pred_values,
            mode="lines",
            line=go.scatter.Line(color="gray"),
            showlegend=False)
    )

    if show_decision_line:
        # coef_ / intercept_ are arrays whose shape differs between the two
        # estimators. Pull out plain floats explicitly: calling float() on an
        # array with ndim > 0 is deprecated since NumPy 1.25.
        slope = float(np.ravel(clf.coef_)[0])
        intercept = float(np.ravel(clf.intercept_)[0])
        # Logistic: p = 0.5 exactly where the linear predictor equals 0.
        # Linear: the boundary is where the prediction itself equals 0.5.
        target = 0.0 if logistic else 0.5
        # A perfectly flat fit never crosses the threshold, so there is no
        # boundary to draw (and dividing by the slope would fail).
        if slope != 0:
            decision_value = (target - intercept) / slope
            fig.add_trace(
                go.Scatter(
                    x=[decision_value] * 2,
                    y=[-.1, 1.1],
                    mode="lines",
                    line=go.scatter.Line(color="red", dash="dash"),
                    showlegend=False)
            )
    fig.update_yaxes(range=[-.1, 1.1])
    fig.update_xaxes(range=[-1, 10.5])
    return fig

# Toy 1-D classification data: 20 points scattered around x = 2 labelled 0,
# followed by 20 points scattered (more tightly) around x = 4 labelled 1.
eg_data = dict(
    x=np.hstack((np.random.normal(loc=2, scale=1, size=20),
                 np.random.normal(loc=4, scale=0.5, size=20))),
    y=np.repeat([0.0, 1.0], 20),
)
eg_df = pd.DataFrame(data=eg_data)

## Logistic Regression

This notebook is the first of three "deep-dives" into the most successful of
the ACE models trained so far. We'll begin with logistic regression as it is
by far the simplest of the techniques used. We'll get
the theory out of the way first:

### The Theory

Logistic regression is very similar to linear regression - indeed,
we can quickly explain it by comparing the two.

Linear regression attempts to directly predict a continuous output by
computing a "line of best fit" - the optimal solution to the following
formula:

$y =  \beta_{0} + \beta_{1}x_{1} + \beta_{2}x_{2} + ... + \beta_{n}x_{n} + \epsilon$

which is a linear combination of the predictive parameters ($x_{i}$'s) and model
weights ($\beta_{i}$'s) that best fits the outcome variable / label
($y$), with an error term $\epsilon$ representing the inevitable
deviations from this line.

It's a fantastic tool for prediction tasks as it's simple, doesn't require
lots of data and the output is easy to interpret. If we want to understand
the effect one of our variables has on the output, say $x_{3}$, we simply
need to look at the weight associated with it, $\beta_{3}$ - increasing
$x_{3}$ by one unit increases the output prediction $y$ by the value of
$\beta_{3}$ (holding all other variables constant).

However, linear regression is not well suited to classification tasks. If we
were to fit a linear
regression model to a
really simple classification
 task, the solution might look like this:

In [21]:
# Plain linear regression fitted to the toy data, without a decision line.
lin_reg_plot = fit_lm_and_return_plot(
    data=eg_df,
    logistic=False,
    show_decision_line=False,
)
lin_reg_plot.show()

This is obviously not an optimal solution to the problem. Our true y's are
all zeros and ones, but the linear model predicts zero / one
at single, very specific, points and is wrong everywhere else - particularly
so at the extremes where the classification should be most obvious!

You might argue that we could simply classify those predictions above 0.5 as
 one and the others zero. This can be visualised by adding a decision
 boundary to the above plot:

In [22]:
# Same linear fit, now with the 0.5 decision line drawn on top.
lin_reg_plot = fit_lm_and_return_plot(
    data=eg_df,
    logistic=False,
    show_decision_line=True,
)
lin_reg_plot.show()

This may now seem like a reasonable solution - the model chooses a decent
decision boundary. But consider the following:

The examples that lie at the extremes of our classification groups (in this
case those x values approaching 0 or 6) shouldn't sway our decisions - they
are easy to classify and don't contribute to the confusion where the classes
overlap. But, if we add a number of examples to the extremes of one of the
groups, in this case we'll add some extra examples classified as y = 1, we see
the following:

In [23]:
# Ten extra, easy-to-classify y = 1 examples drawn around x = 8.
extra_values = pd.DataFrame(
    {"x": np.random.normal(8, 1, 10),
     "y": np.ones(10)}
)
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# pd.concat is the supported replacement and, like append, keeps each
# frame's original index labels.
eg_df_ext = pd.concat([eg_df, extra_values])

In [24]:
# Linear fit on the extended data, with the 0.5 decision line.
lin_reg_plot_ext = fit_lm_and_return_plot(
    data=eg_df_ext,
    logistic=False,
    show_decision_line=True,
)
lin_reg_plot_ext.show()

The linear regression model's predictions change dramatically and it moves its
boundary towards these new extreme values, which is obviously not desirable.

A better solution, accounting for all of the above issues, is to re-formulate
the linear regression function. Instead of directly outputting numeric
predictions of y, we'll re-formulate the model function so that it outputs
probabilities that $y = 1$. To do this, we take the output of the same linear
regression formula we had before, but this time we'll rename the output
$\theta$ to avoid confusing it with a prediction for $y$:

$\theta =  \beta_{0} + \beta_{1}x_{1} + \beta_{2}x_{2} + ... + \beta_{n}x_{n} +
\epsilon$

We take the $\theta$ value and calculate our output, the probability that
$y = 1$, using a "link function" (the logistic, or sigmoid, function) that
scales $\theta$ to between zero and one:

$p(y = 1 | \boldsymbol x) = \frac{1}{1 + e^{-\theta}} $

The result is predictions that look like this:

In [13]:
# Logistic fit on the original toy data, with its p = 0.5 decision line.
plot_options = dict(logistic=True, show_decision_line=True)
log_reg_plot = fit_lm_and_return_plot(eg_df, **plot_options)
log_reg_plot.show()

You can see that, now we've adjusted the model, the predictions are all
probabilities - they lie very close to zero or one at the extremes and then sharply curve as they approach the decision boundary. These
predictions seem to do a much better job of fitting to the data. Moreover,
if we add the same additional extreme values as before, the model's predictions and decision boundary remain unmoved:

In [14]:
# Logistic fit on the extended data, with its p = 0.5 decision line.
log_reg_plot_ext = fit_lm_and_return_plot(
    data=eg_df_ext,
    show_decision_line=True,
    logistic=True,
)
log_reg_plot_ext.show()

### Fitting to ACE Data

Great! With that done we can move on to fitting the model to the ACE dataset.
 This can be done with a few lines of code given we already found the best
 parameters and data preparation method in the previous notebook:

In [45]:
# Load the original ACE data and split it into train / test sets.
data_loc = "/Users/samrelins/Documents/LIDA/ace_project/data/ace_data_orig.csv"
ace_data_orig = pd.read_csv(data_loc)
X_train, y_train, X_test, y_test = return_train_test(ace_data_orig)

# Encode and scale the features using the "one_hot" categorical encoder.
X_train, X_test = encode_and_scale(X_train=X_train,
                                   y_train=y_train,
                                   X_test=X_test,
                                   cat_encoder="one_hot")

# L1-penalised, class-balanced logistic regression with a fixed C value.
log_reg_clf = LogisticRegression(
    C=0.08858667904100823,
    penalty="l1",
    solver="liblinear",
    class_weight="balanced",
    max_iter=10000,
)

log_reg_clf.fit(X_train, y_train)

LogisticRegression(C=0.08858667904100823, class_weight='balanced',
                   max_iter=10000, penalty='l1', solver='liblinear')

In [46]:
from sklearn.metrics import (make_scorer, confusion_matrix, precision_score,
                             f1_score, roc_auc_score, accuracy_score,
                             recall_score)

# custom scoring functions for CV loop
def _confusion_cell_scorer(row, col):
    """
    Build a scorer that returns one cell of the binary confusion matrix

    :param row: index of the true label (0 = negative, 1 = positive)
    :param col: index of the predicted label (0 = negative, 1 = positive)
    :return: scorer callable with the (estimator, X, y) signature
    """
    def _cell(y, y_pred):
        # Pinning labels keeps the matrix 2x2 even when a fold contains a
        # single class; without it the matrix collapses to 1x1 and indexing
        # [1][...] raises IndexError.
        # NOTE(review): assumes targets are encoded as 0 / 1 - confirm.
        return confusion_matrix(y, y_pred, labels=[0, 1])[row][col]

    return make_scorer(_cell)


true_neg = _confusion_cell_scorer(0, 0)
false_neg = _confusion_cell_scorer(1, 0)
true_pos = _confusion_cell_scorer(1, 1)
false_pos = _confusion_cell_scorer(0, 1)
# zero_division=0 scores a fold with no positive predictions as 0 instead of
# emitting an undefined-metric warning.
precision = make_scorer(precision_score, zero_division=0)

# dict of scoring functions
SCORING = {
    "f1": make_scorer(f1_score),
    # NOTE(review): this scorer feeds hard class predictions to
    # roc_auc_score rather than probabilities / decision scores, so the
    # value is not a ranking-based AUC. Left unchanged to keep previously
    # reported numbers comparable.
    "roc_auc": make_scorer(roc_auc_score),
    "accuracy": make_scorer(accuracy_score),
    "recall": make_scorer(recall_score),
    # Reuse the scorer defined above so zero_division handling is consistent.
    "precision": precision,
    "true_pos": true_pos,
    "true_neg": true_neg,
    "false_pos": false_pos,
    "false_neg": false_neg
}


def score_classifier(clf, X, y):
    """
    Evaluate a classifier with every scorer registered in the SCORING dict

    :param clf: classifier handed to each scorer
    :param X: feature matrix handed to each scorer
    :param y: target labels handed to each scorer
    :return: dict mapping each SCORING key to the value its scorer returned
    """
    return {
        metric: score_fn(clf, X, y)
        for metric, score_fn in SCORING.items()
    }

# Score the fitted model on the held-out test split.
score_classifier(clf=log_reg_clf, X=X_test, y=y_test)

{'f1': 0.2857142857142857,
 'roc_auc': 0.5543117744610282,
 'accuracy': 0.6273291925465838,
 'recall': 0.4444444444444444,
 'precision': 0.21052631578947367,
 'true_pos': 12,
 'true_neg': 89,
 'false_pos': 45,
 'false_neg': 15}

In [58]:
# Names of the features whose fitted coefficient is non-zero.
X_train.columns[log_reg_clf.coef_[0] != 0]

Index(['referral_from_GP', 'age', 'ox_sat', 'resp_rate', 'heart_rate'], dtype='object')

In [55]:
# Display the raw coefficient array of the fitted classifier.
log_reg_clf.coef_

array([[ 0.        ,  0.        ,  0.18114178,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0. 

In [57]:
# Boolean mask marking which fitted coefficients are non-zero.
log_reg_clf.coef_ != 0

array([[False, False,  True, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False,  True,  True,  True,  True, False]])