In [1]:
import numpy as np
import pandas as pd
import random
import sys
import shap
import matplotlib.pyplot as plt

from scipy.stats import randint
from scipy.stats.mstats import winsorize

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Seed NumPy's global RNG and the stdlib RNG so the notebook is reproducible.
seed = 123
np.random.seed(seed)
random.seed(seed)
k = 5  # not used in this cell; presumably a fold count for later cells -- TODO confirm

# Patient identifiers and the target column ('dep') for the train/test splits.
train_ids_labs = pd.read_csv('../objects/train_ids_labs.csv')
test_ids_labs = pd.read_csv('../objects/test_ids_labs.csv')
id_train, id_test = train_ids_labs['PATIENT_ID'], test_ids_labs['PATIENT_ID']
y_train, y_test = train_ids_labs['dep'], test_ids_labs['dep']

# Assemble the wide feature matrices from one CSV per summary statistic.
# The first file ('means') contributes all of its columns; every later file
# contributes only the columns whose name contains 'dynamic'.
# The per-file frames are collected in lists and concatenated once, instead of
# repeatedly concatenating onto an initially empty DataFrame.
train_parts, test_parts = [], []
for idx, name in enumerate(['means', 'medians', 'sds', 'rmssds']):
    train = pd.read_csv('../objects/train_' + name + '.csv')
    test = pd.read_csv('../objects/test_' + name + '.csv')
    if idx > 0:
        train = train.filter(like='dynamic', axis=1)
        test = test.filter(like='dynamic', axis=1)
    train_parts.append(train)
    test_parts.append(test)

X_train = pd.concat(train_parts, axis=1)
X_test = pd.concat(test_parts, axis=1)

print(X_train.shape)
print(X_test.shape)

# Define the Winsorizer class
class Winsorizer(BaseEstimator, TransformerMixin):
    """Column-wise winsorization whose cut-offs are learned during ``fit``.

    ``fit`` stores, for every column, the lower and upper values that
    ``scipy.stats.mstats.winsorize`` (default ``inclusive=(True, True)``)
    would clip to on that data. ``transform`` then clips any data to those
    stored bounds. On NaN-free data ``fit(X).transform(X)`` therefore gives
    the same values as winsorizing each column of ``X`` directly, while new
    data (e.g. a test set) is clipped with the training thresholds instead
    of thresholds re-estimated from its own tails.

    Parameters
    ----------
    limits : tuple of (float or None, float or None), default=(0.05, 0.05)
        Fractions of the lowest and highest values to winsorize. ``None`` or
        ``0`` on a side disables clipping on that side; ``limits=None``
        disables clipping entirely.

    Attributes
    ----------
    lower_ : ndarray of shape (n_features,)
        Per-column lower clipping value (``-inf`` when that side is disabled).
    upper_ : ndarray of shape (n_features,)
        Per-column upper clipping value (``+inf`` when that side is disabled).
    """

    def __init__(self, limits=(0.05, 0.05)):
        self.limits = limits

    def fit(self, X, y=None):
        """Learn per-column clipping bounds from ``X``; ``y`` is ignored.

        NaNs are left out when the bounds are computed.
        """
        values = np.asarray(X, dtype=float)
        if values.ndim == 1:
            values = values.reshape(-1, 1)

        low_limit, up_limit = (None, None) if self.limits is None else self.limits

        n_features = values.shape[1]
        lower = np.full(n_features, -np.inf)
        upper = np.full(n_features, np.inf)
        for j in range(n_features):
            col = values[:, j]
            ordered = np.sort(col[~np.isnan(col)])
            n = ordered.size
            if n == 0:
                continue  # nothing to learn from; leave this column unclipped
            # Same order-statistic indices as scipy's winsorize: the int(limit * n)
            # most extreme values on each side are replaced by their nearest
            # retained neighbour.
            if low_limit:
                lower[j] = ordered[min(int(low_limit * n), n - 1)]
            if up_limit:
                upper[j] = ordered[max(n - int(up_limit * n) - 1, 0)]

        self.lower_ = lower
        self.upper_ = upper
        return self

    def transform(self, X):
        """Clip each column of ``X`` to the bounds learned in ``fit``.

        A DataFrame input yields a float DataFrame with the same index and
        columns; any other array-like yields a float ndarray. NaNs pass
        through unchanged.
        """
        if isinstance(X, pd.DataFrame):
            clipped = np.clip(X.to_numpy(dtype=float), self.lower_, self.upper_)
            return pd.DataFrame(clipped, index=X.index, columns=X.columns)
        return np.clip(np.asarray(X, dtype=float), self.lower_, self.upper_)

# Set up pipeline object: winsorize -> standardize -> class-weighted logistic regression.
model = Pipeline([
    ('winsorizer', Winsorizer()),
    ('scaler', StandardScaler()),
    # max_iter is raised from scikit-learn's default of 100: with the default,
    # lbfgs stopped at the iteration limit on this (already standardized) data
    # and emitted a ConvergenceWarning.
    ('model', LogisticRegression(class_weight='balanced', random_state=seed, max_iter=1000)),
])

# Fit the model
model.fit(X_train, y_train)

(3200, 278)
(800, 278)


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [4]:
# Pull the fitted steps out of the pipeline by their step names
winsorizer, scaler, logreg = (
    model.named_steps[step] for step in ('winsorizer', 'scaler', 'model')
)

# Rebuild the classifier's input for the test set: winsorize first, then standardize
X_test_wins = winsorizer.transform(X_test)
X_test_scaled = scaler.transform(X_test_wins)

# Build an explainer for the logistic regression, passing the scaled test set
# as its data argument, and compute SHAP values for those same rows
explainer = shap.Explainer(logreg, X_test_scaled)
shap_values = explainer.shap_values(X_test_scaled)

# Draw the summary plot without showing it, save the figure to disk, then close it
shap.summary_plot(
    shap_values,
    X_test_scaled,
    feature_names=X_test.columns,
    show=False,
)
plt.savefig('../results/wideshap_plot.png', dpi=300, bbox_inches='tight')
plt.close()