In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold, cross_validate
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import precision_recall_curve, roc_auc_score, auc
from ast import literal_eval
import datetime
from sklearn.calibration import calibration_curve

%cd ../
from src import create_fake_patients

#### Prepare data

In [None]:
# Outcome label modelled as the positive class throughout the notebook.
replace_type = 'hip'

# Size of the code vocabulary; one-hot feature columns are named '0' .. str(num_read_codes - 1).
num_read_codes = 512
list_of_read_codes = list(map(str, range(num_read_codes)))

In [None]:
# Three independently generated synthetic cohorts: one for cross-validation /
# training and two hold-out sets.
# max_nodes reuses num_read_codes so the generator and the one-hot feature list
# (list_of_read_codes) cannot drift apart.
# NOTE(review): assumes max_nodes is the size of the code vocabulary - confirm
# against create_fake_patients.create_fake_patient_df.
cv_patients = create_fake_patients.create_fake_patient_df(num_patients=9000, max_events=100, max_nodes=num_read_codes)
test_patients = create_fake_patients.create_fake_patient_df(num_patients=2000, max_events=100, max_nodes=num_read_codes)
recal_test_patients = create_fake_patients.create_fake_patient_df(num_patients=2000, max_events=100, max_nodes=num_read_codes)

In [None]:
def prep_data_for_log_reg(patients, replace_type, num_codes=None):
    """
    Take the TGCNN ready table and convert it into a one hot encoding table.

    Parameters
    ----------
    patients : pd.DataFrame
        Patient table. The columns 'replace_type', 'indices' and 'gender' are
        read here. Each entry of 'indices' is an iterable of sub-sequences
        whose element at position 1 is used as the event code.
    replace_type : str
        Outcome label to keep; rows labelled 'none' are always kept as well.
    num_codes : int, optional
        Number of code columns ('0' .. str(num_codes - 1)) that must exist in
        the result. Defaults to the notebook-level ``num_read_codes``.

    Returns
    -------
    pd.DataFrame
        A new frame (the input is not modified) holding the filtered patient
        columns plus one 0/1 column per event code, with string column names.
        'gender' values 'M' / 'F' are recoded to 1 / 0.
    """
    if num_codes is None:
        # Fall back to the notebook-level configuration value.
        num_codes = num_read_codes

    keep_rows = (patients['replace_type'] == 'none') | (patients['replace_type'] == replace_type)
    # Work on an explicit copy: assigning into a filtered slice is chained
    # assignment, which warns today and silently does nothing under pandas
    # copy-on-write.
    patients = patients.loc[keep_rows].copy()

    # One list of event codes per patient (reversed order, as before; the
    # order does not affect the one-hot encoding).
    event_seqs = [[entry[1] for entry in indices][::-1] for indices in patients['indices']]

    # Turn list of event codes to ohe columns. Dense output is requested
    # directly: the result was always densified afterwards, and the previous
    # check against the literal dtype 'Sparse[int32, 0]' is platform-dependent.
    mlb = MultiLabelBinarizer()
    ohe = pd.DataFrame(mlb.fit_transform(event_seqs), index=patients.index, columns=mlb.classes_)

    with_ohe = patients.join(ohe)

    # Recode gender with .loc so the assignment reliably hits with_ohe itself.
    with_ohe.loc[with_ohe['gender'] == 'M', 'gender'] = 1
    with_ohe.loc[with_ohe['gender'] == 'F', 'gender'] = 0
    with_ohe.columns = with_ohe.columns.astype(str)

    # Add every code column that never occurred in this cohort, filled with
    # zeros. A single concat avoids inserting hundreds of columns one by one.
    expected_columns = [str(code) for code in range(num_codes)]
    missing_columns = [col for col in expected_columns if col not in with_ohe.columns]
    if missing_columns:
        zero_block = pd.DataFrame(0, index=with_ohe.index, columns=missing_columns)
        with_ohe = pd.concat([with_ohe, zero_block], axis=1)

    with_ohe.columns = with_ohe.columns.astype(str)

    return with_ohe

# One-hot encode each cohort with the same settings (CV, hold-out 1, hold-out 2).
cv_ohe, test_ohe, post_recal_ohe = (
    prep_data_for_log_reg(cohort, replace_type)
    for cohort in (cv_patients, test_patients, recal_test_patients)
)

#### Calibration plot function

In [None]:
def plot_calibration(y_test, pred_probs):
    """
    Plot a calibration (reliability) curve with 20 bins.

    Parameters
    ----------
    y_test : iterable
        True labels; 'none' is mapped to 0 and every other value to 1.
    pred_probs : array-like, 2-D
        Predicted probabilities; column 0 is the one that gets plotted.

    Notes
    -----
    ``calibration_curve`` is called without ``normalize``: that argument was
    deprecated in scikit-learn 1.1 and removed in 1.3, so passing it raises a
    TypeError on current versions.
    """
    true_y = np.array([0 if item == 'none' else 1 for item in y_test])
    pred_y = np.array(pred_probs)
    # NOTE(review): column 0 is the positive class only if the model's
    # classes_ ordering puts it first (e.g. string labels, 'hip' < 'none').
    # For a model trained on 0/1 labels the positive class is column 1 -
    # confirm against the caller.
    class0_prob = pred_y[:, 0]
    fop, mpv = calibration_curve(true_y, class0_prob, n_bins=20)

    fig, ax = plt.subplots()
    ax.plot([0, 1], [0, 1], linestyle='--')  # perfectly calibrated reference
    ax.plot(mpv, fop, marker='.')
    ax.set_xlabel('Mean Predicted Value')
    ax.set_ylabel('Fraction of Positives')
    plt.show()

#### Logistic Regression Function

In [None]:
# Function for logistic regression modelling

def log_reg_model(x_train: pd.DataFrame, y_train: list, x_test: pd.DataFrame, y_test: list,
                  x_test2: pd.DataFrame, y_test2: list, data_type_str: str,
                  positive_label: str = 'hip'):
    """
    Cross-validate, fit and evaluate an L2-penalised logistic regression.

    Steps:
      1. 5-fold stratified cross-validation on the training data (accuracy and
         ROC AUC); per-fold scores plus mean/std are written to
         ``temp_LR_<data_type_str>_scores.csv``.
      2. The model is refitted on all training data.
      3. Positive-class probabilities and true labels of both hold-out sets are
         saved as .npy files under ``pred_proba_and_true/``.
      4. A confusion matrix heatmap and ACC / AUROC / AUPRC are reported for
         the first hold-out set.

    Parameters
    ----------
    x_train, x_test, x_test2 : pd.DataFrame
        Feature tables for training, hold-out 1 and hold-out 2.
    y_train, y_test, y_test2 : iterable of str
        Outcome labels; ``positive_label`` becomes 1, everything else 0.
    data_type_str : str
        Tag embedded in every output file name.
    positive_label : str, default 'hip'
        Label treated as the positive class.

    Returns
    -------
    LogisticRegression
        The model fitted on the full training data.
    """
    import os  # local import: only needed to create the output directory

    y_train = [1 if x == positive_label else 0 for x in y_train]
    y_test = [1 if x == positive_label else 0 for x in y_test]
    y_test2 = [1 if x == positive_label else 0 for x in y_test2]

    model = LogisticRegression(penalty='l2', max_iter=10000, solver='newton-cg')
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=0)
    metric_scoring = ['accuracy', 'roc_auc']
    n_scores = cross_validate(model, x_train, y_train, scoring=metric_scoring, cv=cv, n_jobs=-1, return_train_score=True)

    # One row per score type, one column per fold; mean/std are taken over the
    # fold columns only (the 'mean' column must not leak into 'std').
    n_folds = cv.get_n_splits()
    scores_df = pd.DataFrame.from_dict(n_scores, orient='index')
    scores_df['mean'] = scores_df.iloc[:, 0:n_folds].mean(axis=1)
    scores_df['std'] = scores_df.iloc[:, 0:n_folds].std(axis=1)

    # Flatten to a single row with columns such as 'test_accuracy_split0'.
    df_out = scores_df.stack()
    df_out.index = df_out.index.map('{0[0]}_split{0[1]}'.format)
    df_out = df_out.to_frame().T

    df_out.to_csv("temp_LR_"+data_type_str+"_scores.csv")
    print("Cross validation results:")
    print(f"Mean accuracy: {np.mean(n_scores.get('test_accuracy'))*100:.3f}% ({np.std(n_scores.get('test_accuracy'))*100:.3})")
    print(f"Mean AUC: {np.mean(n_scores.get('test_roc_auc')):.3f} ({np.std(n_scores.get('test_roc_auc')):.3})")

    # Final model: refit on all of the training data (fit returns the model itself).
    no_cv_fit = model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    pred_probs = model.predict_proba(x_test)[:,1]

    # Use the x_test2 parameter here; the previous code read a notebook global
    # (X_test2) and only worked while that global happened to exist.
    y_pred_proba2 = model.predict_proba(x_test2)[:,1]

    # Persist probabilities and true labels for both hold-out sets. The target
    # directory is created first so a fresh checkout does not fail here.
    os.makedirs('pred_proba_and_true', exist_ok=True)
    holdouts = (
        ('holdout1', pred_probs, y_test),
        ('holdout2', y_pred_proba2, y_test2),
    )
    for holdout_name, proba, true_labels in holdouts:
        with open('pred_proba_and_true/LR_'+data_type_str+'_'+holdout_name+'_proba.npy', 'wb') as f:
            np.save(f, proba)
        with open('pred_proba_and_true/LR_'+data_type_str+'_'+holdout_name+'_true.npy', 'wb') as f:
            np.save(f, true_labels)

    print("Test data confusion matrix")

    # labels=[0, 1] guarantees a 2x2 matrix even if one class is absent, so the
    # four-way unpacking below cannot fail.
    cm = metrics.confusion_matrix(y_test, predictions, labels=[0, 1])
    TN, FP, FN, TP = cm.ravel()
    plt.figure(figsize=(9,9))
    # fmt='d' shows the counts as integers instead of scientific notation.
    sns.heatmap(cm, annot=True, fmt='d', linewidths=.5, square = True, cmap = 'Blues')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    all_sample_title = 'Test Confusion Matrix'
    plt.title(all_sample_title, size = 15)
    plt.show()

    # Overall accuracy
    ACC = (TP+TN)/(TP+FP+FN+TN)

    # Ranking metrics need the positive-class probabilities; hard 0/1
    # predictions collapse both curves to a single operating point.
    model_precision, model_recall, _ = precision_recall_curve(y_test, pred_probs)

    # AUROC
    AUROC = roc_auc_score(y_test, pred_probs)

    # AUPRC
    AUPRC = auc(model_recall, model_precision)

    print("Test results:")
    print(f"ACC: {round(ACC,2)}\nAUROC: {round(AUROC,2)}\nAUPRC: {round(AUPRC,2)}")

    return no_cv_fit

#### Logistic Regression: All demographics and all Read Codes

In [None]:
# Feature set: the three demographic columns followed by every code indicator.
model_variables = ['gender', 'imd_quin', 'age_at_label_event'] + list_of_read_codes

# Inputs / labels for cross-validation and the two hold-out sets.
lr_cv_input, lr_cv_label = cv_ohe[model_variables], cv_ohe['replace_type'].to_list()
lr_test_input, lr_test_label = test_ohe[model_variables], test_ohe['replace_type'].to_list()
X_test2, y_test2 = post_recal_ohe[model_variables], post_recal_ohe['replace_type'].to_list()

no_cv_fit = log_reg_model(
    lr_cv_input,
    lr_cv_label,
    lr_test_input,
    lr_test_label,
    X_test2,
    y_test2,
    data_type_str='all_demo_all_codes_1999_to_one_year_L2',
)

# One row of fitted coefficients, labelled by feature name.
coefficients = no_cv_fit.coef_
coef_df = pd.DataFrame(coefficients, columns=model_variables)
print("Model intercept:", no_cv_fit.intercept_)

print("\nOdds of Model Coefficients:")
# Exponentiate every coefficient.
odds = np.exp(coef_df)
# Optional export:
# odds.to_csv('baseline_models/odds_from_log_reg/odds_all_demo_all_codes_1999_to_one_year_L2.csv', index=False)
odds

#### Logistic Regression: Only Read Codes

In [None]:
# Feature set: code indicator columns only.
model_variables = list_of_read_codes

# Inputs / labels for cross-validation and the two hold-out sets.
lr_cv_input, lr_cv_label = cv_ohe[model_variables], cv_ohe['replace_type'].to_list()
lr_test_input, lr_test_label = test_ohe[model_variables], test_ohe['replace_type'].to_list()
X_test2, y_test2 = post_recal_ohe[model_variables], post_recal_ohe['replace_type'].to_list()

no_cv_fit = log_reg_model(
    lr_cv_input,
    lr_cv_label,
    lr_test_input,
    lr_test_label,
    X_test2,
    y_test2,
    data_type_str='only_read_codes_1999_to_one_year_L2',
)

# One row of fitted coefficients, labelled by feature name.
coefficients = no_cv_fit.coef_
coef_df = pd.DataFrame(coefficients, columns=model_variables)
print("Model intercept:", no_cv_fit.intercept_)

print("\nOdds of Model Coefficients")
# Exponentiate every coefficient.
odds = np.exp(coef_df)
# Optional export:
# odds.to_csv('../evaluation/odds_from_log_reg/odds_only_read_codes_1999_to_one_year_L2.csv', index=False)
odds

#### Logistic Regression: Only demographics

In [None]:
# Feature set: demographic columns only.
model_variables = ['gender', 'imd_quin', 'age_at_label_event']
print(model_variables)

# Inputs / labels for cross-validation and the two hold-out sets.
lr_cv_input, lr_cv_label = cv_ohe[model_variables], cv_ohe['replace_type'].to_list()
lr_test_input, lr_test_label = test_ohe[model_variables], test_ohe['replace_type'].to_list()
X_test2, y_test2 = post_recal_ohe[model_variables], post_recal_ohe['replace_type'].to_list()

no_cv_fit = log_reg_model(
    lr_cv_input,
    lr_cv_label,
    lr_test_input,
    lr_test_label,
    X_test2,
    y_test2,
    data_type_str='only_demo_1999_to_one_year_L2',
)

# One row of fitted coefficients, labelled by feature name.
coefficients = no_cv_fit.coef_
coef_df = pd.DataFrame(coefficients, columns=model_variables)
print("Model intercept:", no_cv_fit.intercept_)

print("\nOdds of Model Coefficients")
# Exponentiate every coefficient.
odds = np.exp(coef_df)
# Optional export:
# odds.to_csv('../evaluation/odds_from_log_reg/odds_only_demo_1999_to_one_year_L2.csv', index=False)
odds