In [None]:
from collections import OrderedDict; from dash import ALL, dcc, html, Input, MATCH, Output, State;
from flask import Markup; from IPython.display import display, Markdown;
from matplotlib.ticker import MaxNLocator; from plotly.subplots import make_subplots;
from plotly.tools import mpl_to_plotly; from sklearn import svm,tree;
from sklearn.metrics import accuracy_score, roc_curve, auc;
from scipy.stats import iqr,kurtosis,median_abs_deviation,mode,pearsonr,spearmanr,skew; 
from tensorflow.keras import layers, models, losses; 
from torch.utils.data import TensorDataset, DataLoader;
import dash; import dash_useful_components as duc; import math; import matplotlib.pyplot as plt; 
import networkx as nx; import numpy as np; import numpy.random as rnd; import pandas as pd;
import os; import PIL; import plotly.express as px; import plotly.graph_objects as go; 
import requests as rq; import seaborn as sns; import sklearn as skl; import sklearn.linear_model as lm; 
import sklearn.discriminant_analysis as lda; import statistics as stttx;
import statsmodels as sm; import statsmodels.api as sma; import statsmodels.formula.api as smf; 
import requests as rq; import tensorflow as tf; import torch; import torch.nn as nn; 
import torch.nn.functional as F; import torch.optim as optim; import torchvision; 
import torchvision.transforms as transforms; import urllib.request; import warnings;

# file settings
# NOTE: category=Warning is the base class, so every warning type is silenced.
warnings.simplefilter(action='ignore', category=Warning)
sns.set()
np.set_printoptions(threshold=np.inf)
# pandas display limits (each option set once)
pd.set_option('display.max_rows',999)
pd.set_option('display.max_columns', 12)
pd.set_option('display.width', 200)

# global settings
MRL = 3  # Measure Rounding Level: number of decimals passed to round() by the calc* helpers
folder = "datasources"
# loadData() reads these by position, so the order matters
files = ["cohorts.csv", "datafile.csv", "measures.csv", "propositions.csv", "schema.csv"]
# captions keyed by title / step identifier
subtitles = {
    'Title1': "Thakor Lab",
    'Title2': "Cerebrovascular Autoregulation and Post-Cardiac Arrest Resuscitation Therapies Team",
    'Title3': "Statistical Analysis GUI, v1.0",
    'Step00': "Select Response Variable: ",
    'Step04': "Check Predictor Variable(s) to test: ",
    'Step06': "Select Measures to Display: ",
    'Step07': "Select Model Proposition to Calculate: ",
    'Step09': "Enter the Configuration Settings for the selected model (default values pre-entered): ",
    'Step09.01': "Random Seed: ",
    'Step09.02': "Percent of Data to use in Training vs Testing: ",
    'Step09.03': "Include Intercept value in Model: ",
    'Step09.04': "Magnitude of Iteration Limit: ",
    'Step09.05': "Percentile to use as Threshold for Binarization of Response Variable: ",
    'Step09.06': "Number of Layers in Model: ",
    'Step09.07': "Branches or Nodes per Layers of Model: ",
    'Step09.08': "Learning Rate of Model: ",
    'Step12.01': "Histogram of Response Variable Values",
    'Step12.02': "Histogram(s) of Predictor Variable Values",
    'Step13': "Table of Requested Measure(s)",
    'Step14': "Details of Requested Model",
}

# globalized variables:
# df_Cohorts; df_DataProc; df_Meas; df_Prop; df_Schema; 
# namesMeas; namesPred; namesProp; namesResp;
# uniList

def loadData():
    """Load the CSV data sources and publish them, with derived name lookups, as globals.

    The five files are read by position from ``files`` inside ``folder``. After that,
    1-based lookup dictionaries are built with genFieldDict:
    namesResp / namesPred from the schema ('Response' / 'Predictor' flags),
    namesMeas / namesProp from the measures / propositions tables ('Ready' flag),
    and uniList from the measures table ('Variate' flag).
    """
    global df_Cohorts, df_DataProc, df_Meas, df_Prop, df_Schema
    global namesResp, namesPred, namesMeas, namesProp, uniList

    def readSource(position):
        # same path construction for every file: <folder>/<file name>
        return pd.read_csv(folder+'/'+files[position])

    df_Cohorts = readSource(0)
    df_DataProc = readSource(1)
    df_Meas = readSource(2)
    df_Prop = readSource(3)
    df_Schema = readSource(4)

    namesResp = genFieldDict(['Response'],df_Schema)
    namesPred = genFieldDict(['Predictor'],df_Schema)
    namesMeas = genFieldDict(['Ready'],df_Meas)
    namesProp = genFieldDict(['Ready'],df_Prop)
    uniList = genFieldDict(['Variate'],df_Meas)

def genFieldDict(reqList,df):
    """Build a 1-based {position: name} dictionary of the fields selected by flag column(s).

    reqList -- flag column names handed to buildQuery (which compares them to 1 by default)
    df      -- dataframe holding the flag columns and a 'Column' column of field names
    Returns the 'Column' values of the matching rows, enumerated starting at 1.
    """
    selected = df.query(buildQuery(reqList))
    names = selected.Column.to_numpy().flatten()
    return {position: name for position, name in enumerate(names, 1)}
    
def buildQuery(colList,valList = [1], ander=True):
    """Build a DataFrame.query() expression of the form "col == val <conj> col == val ...".

    colList -- column names to test (at least one)
    valList -- values to compare against; a single value is applied to every column,
               otherwise columns and values are paired by position
    ander   -- True joins the conditions with " and ", False with " or "
    Returns the query string.

    The default list for valList is only read, never mutated.
    """
    conj = " and " if ander else " or "
    eqstr = " == "
    if len(colList) > 1:
        vals = valList if len(valList) > 1 else valList*len(colList)
        queryDict = dict(zip(colList, vals))
        # join() places the conjunction only BETWEEN conditions, so nothing has to be
        # trimmed afterwards (the old slice also cut the tail off the last condition)
        return conj.join(str(k) + eqstr + str(v) for k, v in queryDict.items())
    return str(colList[0]) + eqstr + str(valList[0])
    
def genMeas(varResp, varPred, varMeas):
    """Compute the requested measures for the response and each selected predictor.

    varResp -- key into namesResp for the response column
    varPred -- keys into namesPred for the predictor columns (may be empty)
    varMeas -- keys into namesMeas for the measures to compute
    Returns a dict with 'resp': {measure: value} holding the univariate measures of the
    response and, when varPred is non-empty, 'pred': {predictor: {measure: value}}.
    Measures not listed in uniList are computed for each predictor against the response.
    """
    requested = [namesMeas[key] for key in varMeas]
    respSeries = df_DataProc[namesResp[varResp]]

    respStats = {}
    for name in requested:
        if name in uniList.values():
            respStats[name] = calcSwitch(name,respSeries)

    summary = {}
    if len(varPred) > 0:
        perPredictor = {}
        for key in varPred:
            column = namesPred[key]
            series = df_DataProc[column]
            perPredictor[column] = {
                name: (calcSwitch(name,series) if name in uniList.values()
                       else calcSwitch(name,series,respSeries))
                for name in requested
            }
        summary['pred'] = perPredictor
    summary['resp'] = respStats
    return summary

def calcSwitch(measName,varA,varB=[]):
    """Dispatch a measure name to its calc* function and return the result for display.

    measName -- measure identifier, e.g. 'Mean' or 'Correlation-Pearson'
    varA     -- values of the variable being measured
    varB     -- second variable, used by the bivariate measures only (default is never mutated)
    Returns "value(pvalue)" as a string for measures that yield a statistic and a p-value,
    the rounded number for the other measures, and '' for an unrecognised name.
    """
    def statWithP(pair):
        # render a (statistic, p-value) pair as a single string
        val, pval = pair
        return str(val)+"("+str(pval)+")"

    retVal = ''
# Bivariate Measures
    if (measName=='Chi-Squared-Independence'):
        retVal = statWithP(calcChiSq(varA,varB))
    elif (measName=='Correlation-Pearson'):
        retVal = statWithP(calcCorrP(varA,varB))
    elif (measName=='Correlation-Spearman'):
        retVal = statWithP(calcCorrS(varA,varB))
    elif (measName=='Covariance'):
        retVal = calcCovar(varA,varB)
# Univariate Measures
    elif (measName=='Interquartile-Range'):
        retVal = calcIQR(varA)
    elif (measName=='Kurtosis'):
        retVal = calcKurtosis(varA)
    elif (measName=='Maximum'):
        retVal = calcMax(varA)
    elif (measName=='Mean'):
        retVal = calcMean(varA)
    elif (measName=='Median'):
        retVal = calcMedian(varA)
    elif (measName=='Median-Absolute-Deviation'):
        retVal = calcMAD(varA)
    elif (measName=='Minimum'):
        retVal = calcMin(varA)
    elif (measName=='Mode'):
        retVal = calcMode(varA)
    elif (measName=='Normality'):
        # calcNormality takes one sample; passing varB as well raised a TypeError
        retVal = statWithP(calcNormality(varA))
    elif (measName=='Range'):
        retVal = calcRange(varA)
    elif (measName=='Relative-Standard-Deviation'):
        retVal = calcRSD(varA)
    elif (measName=='Skew'):
        retVal = calcSkew(varA)
    elif (measName=='Standard-Deviation'):
        retVal = calcStd(varA)
    return retVal

# BEGIN MEASURE-SPECIFIC FUNCTIONS
# BEGIN BIVARIATE MEASURES
def calcChiSq(varA,varB):
    """Run scipy's chi-squared contingency test on the two variables stacked as rows.

    Returns [statistic, p-value], each rounded to MRL decimals.
    """
    # chi2_contingency is not among the file-level scipy.stats imports, so bring it in here
    from scipy.stats import chi2_contingency
    # NOTE(review): the raw values are passed as the observed table; confirm that the
    # inputs are counts rather than measurements before relying on this statistic.
    retV, retP, _, _ = chi2_contingency(np.array([varA,varB]), correction=False)
    return [round(retV,MRL), round(retP,MRL)]

def calcCorrP(varA,varB):
    """Pearson correlation of varA and varB as [coefficient, p-value], rounded to MRL decimals."""
    coefficient, pValue = pearsonr(varA,varB)
    return [round(coefficient,MRL), round(pValue,MRL)]

def calcCorrS(varA,varB):
    """Spearman correlation of varA and varB as [coefficient, p-value], rounded to MRL decimals."""
    coefficient, pValue = spearmanr(varA,varB)
    return [round(coefficient,MRL), round(pValue,MRL)]

def calcCovar(varA,varB):
    """Covariance between varA and varB, rounded to MRL decimals.

    Taken from the off-diagonal entry of the 2x2 matrix returned by np.cov.
    """
    # the parameters are varA/varB; the previous body referenced undefined names a and b
    return round(np.cov(varA,varB)[0][1],MRL)

# END BIVARIATE MEASURES
# BEGIN UNIVARIATE MEASURES
def calcIQR(varList):
    """Interquartile range of varList (scipy.stats.iqr), rounded to MRL decimals."""
    spread = iqr(varList)
    return round(spread,MRL)

def calcKurtosis(varList):
    """Kurtosis of varList (scipy.stats.kurtosis with default arguments), rounded to MRL decimals."""
    tailedness = kurtosis(varList)
    return round(tailedness,MRL)

def calcMax(varList):
    """Largest value in varList, rounded to MRL decimals."""
    largest = max(varList)
    return round(largest,MRL)

def calcMedian(varList):
    """Median of varList (statistics.median), rounded to MRL decimals."""
    middle = stttx.median(varList)
    return round(middle,MRL)

def calcMAD(varList):
    """Median absolute deviation of varList (scipy.stats.median_abs_deviation), rounded to MRL decimals."""
    deviation = median_abs_deviation(varList)
    return round(deviation,MRL)

def calcMean(varList):
    """Arithmetic mean of varList (np.mean), rounded to MRL decimals."""
    average = np.mean(varList)
    return round(average,MRL)

def calcMin(varList):
    """Smallest value in varList, rounded to MRL decimals."""
    smallest = min(varList)
    return round(smallest,MRL)

def calcMode(varList): 
    """Most frequent value in varList (scipy.stats.mode), rounded to MRL decimals."""
    # scipy.stats.mode returns a (mode, count) result object, which cannot be rounded
    # directly; take the mode field. Depending on the SciPy version that field is a
    # scalar or a length-1 array, so flatten it before picking the value.
    modal = np.ravel(mode(varList).mode)[0]
    return round(modal,MRL)

def calcNormality(varList):
    """Test varList for normality and return [statistic, p-value], rounded to MRL decimals.

    Uses the Shapiro-Wilk test. The earlier placeholder called spearmanr on a single
    1-D sample, which needs at least two variables and therefore raised an error.
    """
    # NOTE(review): Shapiro-Wilk is chosen here as the normality test; swap in another
    # test (e.g. scipy.stats.normaltest) if the analysis plan calls for it.
    from scipy.stats import shapiro
    retV, retP = shapiro(varList)
    return [round(retV,MRL), round(retP,MRL)]

def calcRange(varList):
    """Difference between the rounded maximum and rounded minimum of varList, rounded to MRL decimals."""
    upper = calcMax(varList)
    lower = calcMin(varList)
    return round(upper-lower,MRL)

def calcRSD(varList):
    """Relative standard deviation (standard deviation divided by mean), rounded to MRL decimals."""
    # Divide the unrounded statistics and round once at the end. Rounding the mean first
    # turned small means into 0 and distorted the ratio.
    return round(np.std(varList)/np.mean(varList),MRL)

def calcSkew(varList):
    """Skewness of varList (scipy.stats.skew), rounded to MRL decimals."""
    asymmetry = skew(varList)
    return round(asymmetry,MRL)

def calcStd(varList):
    """Standard deviation of varList (np.std with default arguments), rounded to MRL decimals."""
    dispersion = np.std(varList)
    return round(dispersion,MRL)
# END UNIVARIATE MEASURES
# END MEASURE-SPECIFIC FUNCTIONS

def genModel(varResp, varPred, varProp, d_Conf):
    """Collect the selected response/predictor columns and fit the chosen proposition.

    varResp -- key into namesResp for the response column
    varPred -- keys into namesPred for the predictor columns
    varProp -- key into namesProp for the model proposition
    d_Conf  -- configuration dictionary passed through to modelSwitch
    Returns whatever modelSwitch returns for the resolved names and data.
    """
    responseValues = df_DataProc[namesResp[varResp]]
    predictorFrame = pd.DataFrame()
    # names are resolved lazily, one per iteration, exactly as columns are copied
    for predictorColumn in (namesPred[key] for key in varPred):
        predictorFrame[predictorColumn] = df_DataProc[predictorColumn]
    proposition = namesProp[varProp]
    return modelSwitch(proposition,responseValues,predictorFrame,d_Conf)

def modelSwitch(propName,valsResp,valsPred,d_Conf):
    """Fit the named proposition on a random training split and score it on both splits.

    propName -- proposition identifier, e.g. 'Linear-Regression' or 'Neural-Network'
    valsResp -- response values (pandas Series)
    valsPred -- predictor values (pandas DataFrame, one column per predictor)
    d_Conf   -- configuration dictionary; this function reads 'Seed', 'TrainPct' and
                'BinThresh' and forwards the dictionary to the model builders
    Returns a dict with 'model', 'error' ({'train', 'test'}) and, for propositions
    flagged for binarization in df_Prop, 'roc' ({'train', 'test'} from assessAUC).
    Raises KeyError for a proposition name that has no builder.
    """
# TODO Display: Scatter plot(s) for Proposition:Linear Regression, Proposition:LDA, Proposition:SVM
# Node-Graph for Proposition:Decision Tree and Proposition:Neural Network
# Univariate: Box-and-whisker plot
# Bivariate: Scatter Plots
# TODO Propositions: Decision Tree (allows for inferences on Nominal data and has high interpretability)
# Linear Discriminant Analysis
# Support Vector Machine
    retVal = {}
    # Seeded random split: a row goes to training when its uniform draw is below TrainPct
    np.random.seed(d_Conf['Seed'])
    sample = np.random.uniform(size = len(valsResp.index)) < d_Conf['TrainPct']
    trainResp = valsResp[sample]; testResp = valsResp[~sample]
    trainPred = valsPred[sample]; testPred = valsPred[~sample]
    # Collapse the flag lookup to a plain bool: a pandas Series cannot be used in `if`
    binclass = bool((df_Prop[df_Prop.Column==propName].Binthresh==1).any())
    if binclass:
        # binarize the response at the configured percentile of the full response
        thresh = np.percentile(valsResp,d_Conf['BinThresh'])
        trainResp = (trainResp > thresh).astype(int)
        testResp = (testResp > thresh).astype(int)
    if (propName=='Neural-Network'):
        model = modelNN(trainResp,trainPred,d_Conf)
        retVal['model'] = model
        modTrain = model(convNNType(trainPred)).detach().numpy().T[0]
        modTest =  model(convNNType(testPred)).detach().numpy().T[0]
    else:
        builders = {
            'Decision-Tree': modelDecTree,
            'Linear-Discriminant-Analysis': modelLDA,
            'Linear-Regression': modelLinReg,
            'Logistic-Regression': modelLogReg,
            # previously routed to modelLDA by mistake
            'Support-Vector-Machine': modelSVM,
        }
        retVal['model'] = builders[propName](trainResp,trainPred,d_Conf)
        modTrain = retVal['model'].predict(trainPred)
        modTest = retVal['model'].predict(testPred)
    retVal['error'] = {
        'train': assessErr(trainResp.to_numpy(),modTrain,binclass),
        'test': assessErr(testResp.to_numpy(),modTest,binclass),
    }
    if binclass:
        retVal['roc'] = {
            'train': assessAUC(trainResp.to_numpy(),modTrain),
            'test': assessAUC(testResp.to_numpy(),modTest),
        }
    return retVal

# BEGIN PROPOSITION-SPECIFIC FUNCTIONS
def modelDecTree(resp,pred,d_Conf):
    """Fit a default DecisionTreeClassifier on pred -> resp; d_Conf is accepted but not read."""
    classifier = tree.DecisionTreeClassifier()
    classifier.fit(pred,resp)
    return classifier

def modelLDA(resp,pred,d_Conf):
    """Fit a LinearDiscriminantAnalysis model on pred -> resp with tol taken from d_Conf['LearnRate']."""
    tolerance = d_Conf['LearnRate']
    classifier = lda.LinearDiscriminantAnalysis(tol=tolerance)
    classifier.fit(pred,resp)
    return classifier

def modelLinReg(resp,pred,d_Conf):
    """Fit a LinearRegression on pred -> resp; d_Conf['Intercept'] controls fit_intercept."""
    useIntercept = d_Conf['Intercept']
    regressor = lm.LinearRegression(fit_intercept=useIntercept)
    regressor.fit(pred,resp)
    return regressor

def modelLogReg(resp,pred,d_Conf):
    """Fit a LogisticRegression on pred -> resp.

    d_Conf['Intercept'] controls fit_intercept and d_Conf['Magnitude'] is used as max_iter.
    """
    settings = {'fit_intercept': d_Conf['Intercept'], 'max_iter': d_Conf['Magnitude']}
    classifier = lm.LogisticRegression(**settings)
    classifier.fit(pred,resp)
    return classifier

def modelSVM(resp,pred,d_Conf):
    """Fit a linear-kernel support vector classifier on pred -> resp.

    d_Conf['Magnitude'] is used as max_iter and d_Conf['LearnRate'] as tol.
    """
    # only the `svm` module is imported from sklearn; a bare `SVC` name does not exist here
    model = svm.SVC(kernel='linear',max_iter=d_Conf['Magnitude'],tol=d_Conf['LearnRate'])
    model.fit(pred,resp)
    return model

def convNNType(df):
    """Convert a pandas object (via its .values array) into a float32 torch tensor."""
    rawArray = df.values
    tensor = torch.from_numpy(rawArray)
    return tensor.to(torch.float32)

def modelNN(rdat,pdat,d_Conf):
    """Train a fully connected ReLU network to predict rdat from pdat.

    rdat   -- response values (pandas Series or single-column DataFrame)
    pdat   -- predictor values (pandas DataFrame)
    d_Conf -- configuration dictionary; reads 'Nodes' (width of every hidden layer),
              'Layers' (number of hidden layers), 'LearnRate' (Adam learning rate),
              'Magnitude' (number of full-batch training steps) and, if present,
              'Seed' (torch random seed)
    Returns the trained nn.Sequential model with a single output unit.
    """
    outSz = 1
    # seed torch so weight initialisation is repeatable for a given configuration
    seed = d_Conf.get('Seed')
    if seed is not None:
        torch.manual_seed(seed)
    pdat = convNNType(pdat)
    # Give the target the same (N, 1) shape as the network output. A 1-D target would
    # broadcast against the output inside MSELoss and compare every prediction with
    # every target value.
    rdat = convNNType(rdat).reshape(-1,outSz)
    inSz = pdat.shape[1]
    modelGraph = OrderedDict([('inLayer', nn.Linear(inSz,d_Conf['Nodes'])),('relu1', nn.ReLU())])
    # hidden layers beyond the first; the range is empty when Layers <= 1
    for idx in range(1,d_Conf['Layers']):
        modelGraph['hl'+str(idx)] = nn.Linear(d_Conf['Nodes'],d_Conf['Nodes'])
        modelGraph['relu'+str(1+idx)] = nn.ReLU()
    modelGraph['outLayer'] = nn.Linear(d_Conf['Nodes'],outSz)
    model = nn.Sequential(modelGraph)
    lossFn = nn.MSELoss(reduction='sum')
    # named `optimizer` so the imported torch.optim alias `optim` is not shadowed
    optimizer = torch.optim.Adam(model.parameters(), d_Conf['LearnRate'])
    for _ in range(d_Conf['Magnitude']):
        currLoss = lossFn(model(pdat),rdat)
        optimizer.zero_grad()
        currLoss.backward()
        optimizer.step()
    return model
# END PROPOSITION-SPECIFIC FUNCTIONS

def assessErr(truth,prediction,bindata=False):
    """Score predictions against the truth.

    With bindata truthy, returns the mean absolute difference (unrounded);
    otherwise returns the root-mean-square error rounded to 2 decimals.
    """
    if (bindata): # Percent Error
        residual = np.subtract(truth,prediction)
        return sum(abs(residual)/len(truth))
    # RMS Error
    squared = (np.subtract(truth,prediction))**2
    return round(math.sqrt(sum(squared)/len(truth)),2)

def assessAUC(truth,prediction):
    """Return [false-positive rates, true-positive rates, area under the ROC curve]."""
    falsePos, truePos, _ = roc_curve(truth,prediction)
    area = auc(falsePos, truePos)
    return [falsePos, truePos, area]

def makeROC(inDict,title):
    """Build a plotly ROC figure from an assessAUC-style [fpr, tpr, auc] sequence.

    The figure holds the diagonal reference line and the dashed ROC curve; the title
    reports the area rounded to MRL decimals followed by the given title text.
    """
    diagonal = go.Scatter(x=[0, 1], y=[0, 1], line=dict(color='navy', width=2))
    fig = go.Figure()
    fig.add_trace(diagonal)
    curve = go.Scatter(x=inDict[0], y=inDict[1], line=dict(color='darkorange', width=2, dash='dash'))
    fig.add_trace(curve)
    heading = 'ROC curve (area = '+str(round(inDict[2],MRL))+') for '+title
    fig.update_layout(
        title=heading,
        xaxis_title='False Positive Rate',
        yaxis_title='True Positive Rate',
    )
    return fig

def setDiff(listA,listB):
    """Return the distinct items of listA that do not occur in listB (order not guaranteed)."""
    onlyInA = set(listA) - set(listB)
    return list(onlyInA)

def setInt(listA,listB):
    """Return the distinct items present in both listA and listB (order not guaranteed)."""
    shared = set(listA) & set(listB)
    return list(shared)
# END LOGIC FUNCTIONS

# Populate the global dataframes and name lookups as soon as this cell runs.
loadData(); 

In [18]:
pip install -r requirements.txt

Collecting sklearn
  Using cached sklearn-0.0.tar.gz (1.1 kB)
Collecting statistics
  Using cached statistics-1.0.3.5.tar.gz (8.3 kB)
Collecting torchvision
  Using cached torchvision-0.12.0-cp39-cp39-win_amd64.whl (1.0 MB)
Building wheels for collected packages: sklearn, statistics
  Building wheel for sklearn (setup.py): started
  Building wheel for sklearn (setup.py): finished with status 'done'
  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1310 sha256=5185367f38ef615806d39655ab706f8981fb9ffad2823f1e7b51ccbfd5f08134
  Stored in directory: c:\users\19734\appdata\local\pip\cache\wheels\e4\7b\98\b6466d71b8d738a0c547008b9eb39bf8676d1ff6ca4b22af1c
  Building wheel for statistics (setup.py): started
  Building wheel for statistics (setup.py): finished with status 'done'
  Created wheel for statistics: filename=statistics-1.0.3.5-py3-none-any.whl size=7454 sha256=d8d660a7944c07e796469820c281ef6069ca7c29f11b95e7469adb3150d7e983
  Stored in directory: c:\users\1