# Setup

In [2]:
import pandas as pd
from datetime import datetime
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import math
import scipy.spatial.distance as ssd
from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

def clean_cols(df):
    """Normalize the column labels of ``df`` in place and return ``df``.

    Every label is converted to ``str``, lowercased, and each character that
    is not an ASCII letter or digit is replaced with an underscore
    (e.g. ``"Subject ID"`` -> ``"subject_id"``, ``"M/F"`` -> ``"m_f"``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` attribute is overwritten.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (not a copy).

    Notes
    -----
    Labels that differ only in special characters or case map to the same
    cleaned name (``"a b"`` and ``"a-b"`` both become ``"a_b"``); no
    de-duplication is performed.
    """
    import re

    # str() lets non-string labels (e.g. integer headers) through instead of
    # failing on the missing .lower() method.
    df.columns = [
        re.sub(r'[^a-zA-Z0-9]', '_', str(label).lower())
        for label in df.columns
    ]

    return df

# EDA

In [119]:
# Directory holding the OASIS MRI CSV exports. Set the MRI_DATA_DIR environment
# variable to run this notebook on another machine; the default is the original
# author's local OneDrive folder.
MRI_DATA_DIR = os.environ.get(
    "MRI_DATA_DIR",
    r"C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\mri_alzheimers",
)

# df1: longitudinal visits, df2: cross-sectional scans.
df1 = pd.read_csv(os.path.join(MRI_DATA_DIR, "oasis_longitudinal.csv"))
df2 = pd.read_csv(os.path.join(MRI_DATA_DIR, "oasis_cross-sectional.csv"))

In [3]:
# EDA Report
from pandas_profiling import ProfileReport

# Profile the longitudinal frame, write the report to an HTML file in the
# current working directory, and leave the report object as the cell's last
# expression so the notebook renders it inline.
pr1=ProfileReport(df1)
pr1.to_file(output_file="pandas_profiling1.html")
pr1

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(HTML(value='Generate report structure'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Render HTML'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Export report to file'), FloatProgress(value=0.0, max=1.0), HTML(value='')))






# Data Prep

In [120]:
# Prepare the longitudinal data for fitting with Leaspy
# Data Prep
from leaspy import Leaspy, Data, AlgorithmSettings, IndividualParameters, __watermark__

# Reload the raw CSV so this cell can be re-run on its own: the pop() calls
# below consume the 'age' and 'subject_id' columns.
# Set MRI_DATA_DIR in the environment to point at the data on another machine.
MRI_DATA_DIR = os.environ.get(
    "MRI_DATA_DIR",
    r"C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\mri_alzheimers",
)
df1 = pd.read_csv(os.path.join(MRI_DATA_DIR, "oasis_longitudinal.csv"))

# Clean columns
df1 = clean_cols(df1)

# Rename age -> TIME and subject_id -> ID, and move both to the front
df1.insert(0, 'TIME', df1.pop('age'))
df1.insert(0, 'ID', df1.pop('subject_id'))

# Make df distinct by id and time (first non-null value per column is kept)
df1 = df1.groupby(['ID', 'TIME']).first().reset_index()

# Filter df. The explicit copy makes the column assignments below write to an
# independent frame rather than to a slice of the grouped frame.
df1 = df1[['ID', 'TIME', 'group', 'mr_delay', 'm_f', 'hand', 'educ', 'ses',
           'mmse', 'cdr', 'etiv', 'nwbv', 'asf']].copy()

# Change values to numbers.
# Note that 'Demented' and 'Converted' are both encoded as 1.
df1['group'] = df1['group'].replace(['Nondemented', 'Demented', 'Converted'], ['0', '1', '1'])
df1['m_f'] = df1['m_f'].replace(['M', 'F'], ['0', '1'])
df1['hand'] = df1['hand'].replace(['R', 'L'], ['0', '1'])

# Convert feature columns to numeric (raises on any value not mapped above)
df1[['group', 'm_f', 'hand']] = df1[['group', 'm_f', 'hand']].apply(pd.to_numeric)

# Normalize features
# NOTE(review): no normalization is actually applied here. The fit output
# further down reports very large noise values for unscaled columns such as
# mr_delay and etiv; presumably the logistic model expects each feature on a
# common [0, 1] scale -- confirm against the Leaspy documentation.

# Store data into dataframe object
df1 = df1.set_index(['ID', 'TIME'])
# IDs with at least two visits.
# NOTE(review): `indices` is computed but not used to filter the frame passed
# to Data.from_dataframe below -- confirm whether single-visit subjects
# should be excluded.
indices = [idx for idx in df1.index.unique('ID') if df1.loc[idx].shape[0] >= 2]
data = Data.from_dataframe(df1)

# LEASPY: LEArning Spatiotemporal Patterns in Python

In [5]:
# Atoti

# import atoti as tt
# session = tt.create_session()
# at_df=session.read_pandas(df1, table_name="at_df")
# cube = session.create_cube(at_df)
# session.visualize()

In [6]:
# Calibration settings for the stochastic MCMC-SAEM algorithm.
algo_settings = AlgorithmSettings('mcmc_saem',
                                  n_iter=5000,           # n_iter defines the number of iterations
                                  seed=0,                # Fixed seed so the stochastic fit is reproducible across runs
                                  progress_bar=True)     # To display a nice progression bar during calibration

# Logistic model with two sources and one noise estimate per feature.
leaspy = Leaspy("logistic",
                source_dimension=2, # Optional
                noise_model='gaussian_diagonal', # Optional: To get a noise estimate per feature keep it this way (default)
                )

In [7]:
# Calibrate the model on the prepared data. This is the slow step of the
# notebook: the recorded run of 5000 iterations took about 10 minutes.
leaspy.fit(data, settings=algo_settings)

|##################################################|   5000/5000 iterations
Fit with `mcmc_saem` took: 10m 9s
The standard deviation of the noise at the end of the fit is:
group: 0.10%
mr_delay: 87104.57%
m_f: 48.29%
hand: 0.00%
educ: 1390.56%
ses: 185.98%
mmse: 2666.67%
cdr: 17.13%
etiv: 149626.86%
nwbv: 3.30%
asf: 24.05%


# Path Segmentation - MRI Alzheimer's Data

In [121]:
import os
# Switch the process working directory to the local GitHub checkout folder.
# NOTE(review): presumably done so the `path_segmentation` package below is
# importable; it also changes where relative output paths are written.
os.chdir(r"C:\Users\A4023862\Documents\GitHub")
# NOTE(review): star import -- the helpers called in later cells
# (remove_insig_features, jaccard_dist, cluster_agg, non0elem_dist,
# log_reg_train, log_reg_test) are not defined in this notebook and presumably
# come from here; consider importing them by name.
from path_segmentation.functions import *

In [123]:
# Clustering - Agglomerative
n_clusters = 10

# NOTE(review): remove_insig_features prints "10 % data count = 36" for this
# call, so its second argument appears to act as a percentage threshold rather
# than a cluster count -- confirm that passing n_clusters here is intended.
df2=remove_insig_features(df1,n_clusters,['ID', 'TIME'])
df2_jac=jaccard_dist(df2, [])
# Use n_clusters rather than a repeated literal so the cluster count is set in one place.
trainer_result,trainer_result_reduced,labels=cluster_agg(df2_jac, df2, df1, n_clusters, 'ward')
trainer_result_reduced.shape,trainer_result.shape

10 % data count = 36
Significant Columns are: ID TIME group m_f educ ses cdr
Data Dimensions: (369, 5)
Jaccard Dataframe Dimensions: (369, 369)
Cluster counts:
 1    84
2    62
4    53
3    32
5    31
0    27
6    26
7    19
9    18
8    17
Name: cluster, dtype: int64
Silhouette Score: 0.33566838474360433


((369, 6), (369, 12))

In [131]:
# Clustering - KMeans
from sklearn.cluster import KMeans

def cluster_k(jaccard_df,df_reduced,df,n_clusters):
    """Cluster with KMeans and attach the labels to copies of both frames.

    Parameters
    ----------
    jaccard_df : array-like
        Matrix handed to ``KMeans.fit_predict`` and, as a precomputed
        distance matrix, to ``silhouette_score``.
    df_reduced, df : pandas.DataFrame
        Frames that are copied and given a ``'cluster'`` column.
    n_clusters : int
        Number of KMeans clusters.

    Returns
    -------
    tuple
        ``(result, result_reduced, labels)``: the labelled copy of ``df``,
        the labelled copy of ``df_reduced``, and the label array.

    Prints the cluster sizes and the silhouette score.
    """
    # NOTE(review): KMeans is fit on the rows of jaccard_df as feature vectors,
    # while the silhouette score treats the same matrix as pairwise distances
    # -- confirm this is the intended setup.
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    labels = kmeans.fit_predict(jaccard_df)

    result = df.copy()
    result_reduced = df_reduced.copy()
    for frame in (result, result_reduced):
        frame['cluster'] = labels

    print("Cluster counts:\n", result['cluster'].value_counts())

    # Silhouette Score
    silhouette = metrics.silhouette_score(jaccard_df, labels, metric='precomputed')
    print("Silhouette Score:", silhouette)

    return result, result_reduced, labels

# Number of KMeans clusters for this run.
n_clusters = 20

# NOTE(review): remove_insig_features prints "20 % data count = 73" for this
# call, so its second argument appears to act as a percentage threshold rather
# than a cluster count -- confirm that passing n_clusters here is intended.
df2=remove_insig_features(df1,n_clusters,['ID', 'TIME'])
df2_jac=jaccard_dist(df2, [])
# Overwrites trainer_result / trainer_result_reduced / labels produced by the
# agglomerative cell above.
trainer_result,trainer_result_reduced,labels=cluster_k(df2_jac, df2, df1, n_clusters)
trainer_result_reduced.shape,trainer_result.shape

20 % data count = 73
Significant Columns are: ID TIME group m_f educ ses
Data Dimensions: (369, 4)
Jaccard Dataframe Dimensions: (369, 369)
Cluster counts:
 4     34
16    31
7     31
13    29
3     24
6     24
1     19
9     18
15    17
8     17
12    16
2     16
5     15
14    15
0     13
18    12
17    11
19    11
10    10
11     6
Name: cluster, dtype: int64
Silhouette Score: 0.442760466022005


((369, 5), (369, 12))

In [10]:
# Per-cluster summary of the clustered frame; the 'cluster' label column itself
# is passed in the drop list.
# NOTE(review): judging by the helper's name and the printed table (one row per
# cluster, one column per feature), the values are presumably counts of
# non-zero entries -- confirm against non0elem_dist.
drop_col_list = ['cluster']
cluster_analysis_df = non0elem_dist(trainer_result,drop_col_list,n_clusters)
print(cluster_analysis_df)

   group  mr_delay  m_f  hand  educ  ses  mmse  cdr  etiv  nwbv  asf
0      0        17    0     0    27   27    27    2    27    27   27
1     84        46   84     0    84   75    82   71    84    84   84
2     62        35    0     0    62   54    62   62    62    62   62
3     32        20    0     0    32   30    32   29    32    32   32
4      0        33   53     0    53   53    53    0    53    53   53
5      0        19   31     0    31   31    31    0    31    31   31
6      0        15   26     0    26   26    26    0    26    26   26
7      2        11    0     0    19   19    19    0    19    19   19
8      0        12    0     0    17   17    17    0    17    17   17
9      0        11   18     0    18   18    18    0    18    18   18


In [82]:
# Training

# Target column, train fraction, columns to drop, and columns to dummy-encode.
label = 'cluster'
train_split = 0.8
drop_col_list = []
dummy_cols=['m_f', 'group']

# Returns the held-out test data, the training column list and the fitted model.
# NOTE(review): the recorded output reports 120 of 150 patients in the training
# set, so the split is presumably done per patient via the 'ID' argument --
# confirm in log_reg_train.
test_data,train_col_list,model = log_reg_train(trainer_result_reduced,label,train_split,drop_col_list,'ID', dummy_cols)

Total Patients: 150
Patients in Training Set: 120
Patients in Test Set: 30
Training Data with dummies shape: (298, 8)
No. of Clusters: 10
Training Logistic Regression Model...
Accuracy of logistic regression classifier on train set: 96.98%


In [85]:
# Testing

# train_col_list, model, test_data and dummy_cols come from the training cell.
label = 'cluster'
# NOTE(review): presumably 0 selects labelled evaluation (the recorded output
# includes a classification report) as opposed to pure prediction -- confirm
# against log_reg_test.
test_or_pred = 0
log_reg_test(test_data,label,train_col_list,model,test_or_pred, 'ID', dummy_cols)

Patients in Test Set: 30
Test Data with dummies shape: (71, 7)
Predictions from Logistic Regression Model...
[4 4 1 1 2 2 2 2 4 4 4 4 1 1 3 3 3 6 6 8 7 7 2 2 2 9 9 2 2 4 4 4 3 3 0 0 2
 2 1 1 9 9 5 5 3 3 3 3 4 4 4 4 4 7 7 1 1 4 4 1 1 4 4 4 4 4 3 3 4 4 4]
No. of Clusters: 9
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00        10
           2       0.91      1.00      0.95        10
           3       1.00      0.92      0.96        12
           4       0.54      1.00      0.70        13
           5       1.00      1.00      1.00         2
           6       1.00      0.12      0.22        16
           7       1.00      1.00      1.00         4
           8       0.00      0.00      0.00         0
           9       0.50      1.00      0.67         2

    accuracy                           0.79        71
   macro avg       0.80      0.80      0.75        71
weighted avg       0.89

# AC_TPC - MRI Alzheimer's Data

In [136]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

# import random
# import os, sys

# from tensorflow.python.ops.rnn import _transpose_batch_time
# from sklearn.model_selection import train_test_split

# #performance metrics
# from sklearn.metrics import roc_auc_score, average_precision_score
# from sklearn.metrics import normalized_mutual_info_score, homogeneity_score, adjusted_rand_score
# from sklearn.metrics.cluster import contingency_matrix

# #user defined
# from utils_log import save_logging, load_logging
# from data_loader import import_data
# from class_AC_TPC import AC_TPC, initialize_embedding

RuntimeError: module compiled against API version 0xe but this version of numpy is 0xd

ImportError: SystemError: <built-in method __contains__ of dict object at 0x00000217069B6A80> returned a result with an error set

# EMR Bots Data

In [58]:
# read_file = pd.read_csv (r'C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\emr\100000-Patients\PatientCorePopulatedTable.txt',sep='\t')
# read_file.to_csv (r'C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\emr\100000-Patients\PatientCorePopulatedTable.csv', index=None)
# read_file = pd.read_csv (r'C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\emr\100000-Patients\AdmissionsCorePopulatedTable.txt',sep='\t')
# read_file.to_csv (r'C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\emr\100000-Patients\AdmissionsCorePopulatedTable.csv', index=None)
# read_file = pd.read_csv (r'C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\emr\100000-Patients\AdmissionsDiagnosesCorePopulatedTable.txt',sep='\t')
# read_file.to_csv (r'C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\emr\100000-Patients\AdmissionsDiagnosesCorePopulatedTable.csv', index=None)
# read_file = pd.read_csv (r'C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\emr\100000-Patients\LabsCorePopulatedTable.txt',sep='\t')
# read_file.to_csv (r'C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\emr\100000-Patients\LabsCorePopulatedTable.csv', index=None)

# df1 = pd.read_csv (r'C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\emr\10000-Patients\PatientCorePopulatedTable.csv')
# df2 = pd.read_csv (r'C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\emr\10000-Patients\AdmissionsCorePopulatedTable.csv')
# Admission diagnoses: this load is kept active because the following cells
# inspect df3 and would raise NameError on a fresh kernel without it.
df3 = pd.read_csv (r'C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\emr\10000-Patients\AdmissionsDiagnosesCorePopulatedTable.csv')
# df4 = pd.read_csv (r'C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\emr\10000-Patients\LabsCorePopulatedTable.csv')
# ICD-10 diagnosis code -> CCSR category mapping sheet; the first spreadsheet
# row is skipped so that the second row supplies the column headers.
ccsr = pd.read_excel (r'C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\CCSR\ICD10 DIAGNOSIS_CCSR-Reference-File-v2021_0.xlsx', sheet_name = 'ICD DX_to_CCSR_Mapping', 
                      skiprows=1)

In [44]:
# (rows, columns) of the admission-diagnoses frame.
df3.shape

(36143, 4)

In [41]:
# Number of distinct primary-diagnosis descriptions (a missing value, if
# present, counts as one distinct entry).
len(df3['PrimaryDiagnosisDescription'].unique())

2618

In [42]:
# Preview the first rows of the admission-diagnoses frame.
df3.head()

Unnamed: 0,PatientID,AdmissionID,PrimaryDiagnosisCode,PrimaryDiagnosisDescription
0,E74E9DF1-D8FD-41BC-8CDE-226CFE318E0B,1,E09.42,Drug or chemical induced diabetes mellitus wit...
1,E74E9DF1-D8FD-41BC-8CDE-226CFE318E0B,2,O29.123,Cardiac failure due to anesthesia during pregn...
2,E74E9DF1-D8FD-41BC-8CDE-226CFE318E0B,3,M84.561,"Pathological fracture in neoplastic disease, r..."
3,3AB69ECE-65F4-4D04-9E87-54E73C2DB4A8,1,G52.3,Disorders of hypoglossal nerve
4,3AB69ECE-65F4-4D04-9E87-54E73C2DB4A8,2,C40.31,Malignant neoplasm of short bones of right low...


In [60]:
# Normalize the CCSR mapping's column labels (lowercase, special characters
# replaced by underscores) and preview the result.
ccsr=clean_cols(ccsr)
ccsr.head()

Unnamed: 0,icd_10_cm_code,icd_10_cm_code_1,icd_10_cm_code_description,request_labels__added_by_tr_,ccsr_category,ccsr_category_description,inpatient_default_ccsr__y_n_x_,outpatient_default_ccsr__y_n_x_,rationale
0,A000,A00.0,"Cholera due to Vibrio cholerae 01, biovar chol...",,DIG001,Intestinal infection,Y,Y,06 Infectious conditions
1,A000,A00.0,"Cholera due to Vibrio cholerae 01, biovar chol...",,INF003,Bacterial infections,N,N,06 Infectious conditions
2,A001,A00.1,"Cholera due to Vibrio cholerae 01, biovar eltor",,DIG001,Intestinal infection,Y,Y,06 Infectious conditions
3,A001,A00.1,"Cholera due to Vibrio cholerae 01, biovar eltor",,INF003,Bacterial infections,N,N,06 Infectious conditions
4,A009,A00.9,"Cholera, unspecified",,DIG001,Intestinal infection,Y,Y,06 Infectious conditions
