# In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pyreadr
import lightgbm as lgb
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import dalex as dx

# Look up R's readRDS through rpy2, read the .Rds file and convert the
# resulting R object into a pandas DataFrame in one step.
readRDS = robjects.r['readRDS']
df = pandas2ri.rpy2py_dataframe(readRDS('do_class_ohe.Rds'))

def clean_dataset(df):
    """Return a float64 copy of *df* without rows holding NaN or infinite values.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` that contain no NaN, ``inf`` or ``-inf`` in any
        column, with every column cast to ``np.float64``. The original row
        labels are kept.

    Raises
    ------
    AssertionError
        If ``df`` is not a ``pd.DataFrame``.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # dropna() returns a new frame, so the caller's object is left untouched.
    complete_rows = df.dropna()
    # `axis` is passed by keyword: pandas 2.x no longer accepts it positionally.
    has_non_finite = complete_rows.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return complete_rows[~has_non_finite].astype(np.float64)

# Drop rows with missing / infinite values and cast every column to float64.
df_clean = clean_dataset(df)

# Features are selected positionally: every column except the first two and
# the last one. The last column is used as the target.
X = df_clean.iloc[:, 2:-1].values
y = df_clean.iloc[:, -1].values

# Stratified 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

# Fit the logistic regression once; this is the model used by every step below.
model = LogisticRegression(solver='liblinear', random_state=0).fit(X_train, y_train)
model.coef_

# Score the hold-out set; column 1 of predict_proba belongs to the second
# entry of model.classes_.
probs = model.predict_proba(X_test)
preds = probs[:, 1]
# NOTE(review): in the visible part of this file Roc_Analysis is defined below
# this call, so the helper definitions must be executed first; running the
# file top-to-bottom as a plain script raises NameError here.
Roc_Analysis(y_test, preds)

# Named view of the same cleaned data for the dalex explainer (df_clean from
# above is reused; nothing has modified it since it was built).
# NOTE(review): the model was fitted on the positional slice iloc[:, 2:-1];
# this presumes the three columns dropped here are exactly the first two and
# the last one, in that layout -- confirm against the data.
X_withnames = df_clean.drop(columns=["BKG_BOOKING_ID","TKT_TICKET_NUMBER1","FCP_UPGRADED_FLAG.Y"])
y_withnames = df_clean["FCP_UPGRADED_FLAG.Y"]

# Wrap the fitted logistic model for dalex and run its model_parts() analysis.
# (`mp_rf` holds the result for the logistic model despite the `rf` suffix.)
logit_exp = dx.Explainer(model, X_withnames, y_withnames, label = "Logit Pipeline")
mp_rf = logit_exp.model_parts()
mp_rf.result
mp_rf.plot()

# dalex model_profile() restricted to two selected variables.
pd_logit = logit_exp.model_profile(variables = ['FCP_FLIGHT_DISTANCE', 'BKG_BOOKING_WINDOW_D'])
pd_logit.plot()

##########FUNCTIONS FOR PYTHON WORKFLOW###########
import sklearn.metrics as metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# The functions below plot the ROC curve and determine the optimal cutoff (the point where tpr = 1 - fpr)
def Find_Optimal_Cutoff(target, predicted):
    """Locate the score threshold where sensitivity equals specificity.

    The ROC points are computed with ``metrics.roc_curve`` and the point whose
    ``tpr - (1 - fpr)`` is closest to zero is selected. A figure showing
    ``tpr`` and ``1 - fpr`` across the threshold index is displayed and the
    selected row is printed.

    Parameters
    ----------
    target : array-like
        True labels, forwarded to ``metrics.roc_curve`` as its first argument.
    predicted : array-like
        Scores, forwarded to ``metrics.roc_curve`` as its second argument.

    Returns
    -------
    pd.DataFrame
        One-row frame with the columns ``fpr``, ``tpr``, ``1-fpr``, ``tf`` and
        ``thresholds`` describing the selected point.
    """
    fpr, tpr, threshold = metrics.roc_curve(target, predicted)
    roc = pd.DataFrame({
        'fpr': fpr,
        'tpr': tpr,
        '1-fpr': 1 - fpr,
        'tf': tpr - (1 - fpr),
        'thresholds': threshold,
    })

    # Sensitivity and specificity cross where |tf| is smallest. np.argmin
    # returns the first such position on ties; the double brackets keep the
    # selection a one-row DataFrame rather than a Series.
    best_position = int(np.argmin(roc['tf'].abs().to_numpy()))
    optimal = roc.iloc[[best_position]]

    # The x axis is the position in the threshold array, not a rate, which is
    # why the tick labels are blanked out.
    _, ax = plt.subplots()
    ax.plot(roc['tpr'], label='Sensitivity (tpr)')
    ax.plot(roc['1-fpr'], color='red', label='Specificity (1 - fpr)')
    ax.set_xlabel('Threshold index')
    ax.set_ylabel('Rate')
    ax.set_title('Sensitivity and specificity by threshold')
    ax.set_xticklabels([])
    ax.legend(loc='best')
    plt.show()

    print(optimal)
    return optimal


def Roc_Plot(fpr, tpr, threshold, roc_auc):
    """Draw the ROC curve on the current axes and show the figure.

    Parameters
    ----------
    fpr, tpr : array-like
        Coordinates of the curve (x and y respectively).
    threshold : array-like
        Accepted for signature compatibility; not used by the plot.
    roc_auc : float
        Area under the curve, shown in the legend with two decimals.
    """
    auc_label = 'AUC = %0.2f' % roc_auc
    unit_interval = [0, 1]

    axes = plt.gca()
    axes.set_title('Receiver Operating Characteristic')
    axes.plot(fpr, tpr, 'b', label=auc_label)
    axes.legend(loc='lower right')
    # Dashed red diagonal from (0, 0) to (1, 1) as a visual reference.
    axes.plot([0, 1], [0, 1], 'r--')
    axes.set_xlim(unit_interval)
    axes.set_ylim(unit_interval)
    axes.set_xlabel('1 - Specificity')
    axes.set_ylabel('Sensitivity')
    plt.show()
    
    
def Roc_Analysis(target, predicted):
    """Run the full ROC workflow for one set of labels and scores.

    Computes the ROC points and the area under the curve, then delegates to
    ``Find_Optimal_Cutoff`` and ``Roc_Plot`` (in that order).

    Parameters
    ----------
    target : array-like
        True labels, forwarded to ``metrics.roc_curve``.
    predicted : array-like
        Scores, forwarded to ``metrics.roc_curve``.
    """
    # Curve points and AUC are computed up front, before any plotting starts.
    false_pos_rate, true_pos_rate, cutoffs = metrics.roc_curve(target, predicted)
    area_under_curve = metrics.auc(false_pos_rate, true_pos_rate)

    # Cutoff search first, ROC plot second.
    Find_Optimal_Cutoff(target, predicted)
    Roc_Plot(
        fpr=false_pos_rate,
        tpr=true_pos_rate,
        threshold=cutoffs,
        roc_auc=area_under_curve,
    )