# Initial EDA

### Goals:
- ~~Filter data for only completed studies~~
- Create NLP features
- Filter feature list using relative information gain (RIG)
- Save processed data


In [1]:
import pandas as pd
# import sklearn.tree
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import KBinsDiscretizer
# from sklearn.tree import DecisionTreeClassifier
# from sklearn import tree
from sklearn.pipeline import make_pipeline, Pipeline, FeatureUnion

from autotest_data.helper_functions import ModelTransformer, SampleExtractor, DenseTransformer

Load the data saved from the database.

In [2]:
# Load the raw main table and the keywords table that were exported from the database.
main_path, keywords_path = (
    "../data/00raw/main_table.csv.gz",
    "../data/00raw/keywords_table.csv.gz",
)
df = pd.read_csv(main_path)
kw_df = pd.read_csv(keywords_path)

It was not clear how to identify completed studies in the database
(there were two candidate flags: `completion_date_type` and `enrollment_type`).

Cross-tabulating those two flags to pick a filtering strategy.

In [3]:
# Rows: completion-date flag; columns: enrollment flag.
pd.crosstab(
    index=df['completion_date_type'],
    columns=df['enrollment_type'],
)

enrollment_type,Actual,Anticipated
completion_date_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Actual,1164,0
Anticipated,192,982


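Apply the filter. Every study with an `Actual` completion date also has `Actual` enrollment (1164 vs. 0 in the table above), so filtering on `completion_date_type` alone is sufficient.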

In [4]:
# Keep only studies whose completion date is an actual (not anticipated) date,
# then restrict the keywords table to the study ids that survived the filter.
is_completed = df["completion_date_type"].eq("Actual")
df = df.loc[is_completed]

completed_ids = df["nct_id"].unique()
kw_df = kw_df.loc[kw_df["nct_id"].isin(completed_ids)]

Verify no missing data in the target column (enrollment)

In [5]:
# Missing-value count per column; `enrollment` (the target) should be zero.
df.isnull().sum(axis=0)

nct_id                  0
start_date              0
verification_date       0
completion_date         0
completion_date_type    0
study_type              0
brief_title             0
official_title          0
description             0
enrollment              0
enrollment_type         0
dtype: int64

### Investigate methods for selecting features by RIG

- First create a token-count / TF-IDF matrix from a text column
- Then try using `mutual_info_classif`
    + not happy with the estimation process, even after discretizing the target
- Then try computing it with a decision tree
    + `feature_importances_` are normalized!
    + digging into the tree gives the optimal entropy change; still need to weight the child entropies, average them, and divide

In [6]:
# X, Y = df["brief_title"], df[["enrollment"]]

In [7]:
# cvec = TfidfVectorizer(
#     stop_words="english",
#     max_features=50,
# )
#
# discritizer = KBinsDiscretizer(
#     n_bins=6,
#     encode="ordinal",
# )

In [8]:
# X_vec = cvec.fit_transform(X)
# Y_discrete = discritizer.fit_transform(Y)

In [9]:
# res = dict(zip(cvec.get_feature_names_out(),
#                mutual_info_classif(X_vec, Y_discrete[:,0], discrete_features=True)
#                ))

In [10]:
# dt = DecisionTreeClassifier(
#     criterion="entropy",
#     max_depth=1,
# )
#
# dt.fit(X_vec[:,[11]], Y_discrete[:,0])
# dt.tree_.node_count
# tree.plot_tree(dt)
# dt.tree_.impurity
# dt.tree_.capacity
# dt.tree_.n_node_samples


### Create pipeline for vectorizing token counts

In [11]:
# Settings shared by every text column's vectorizer.
# Plain token counts (not TF-IDF) are used to be safe with no train/test split.
binary = False  # False -> keep raw counts instead of 0/1 presence flags
feats = 50      # vocabulary size cap per text column


def text_pipeline(column_name, binary_flag, max_feats):
    """Build the sub-pipeline that turns one text column into token-count features.

    Parameters
    ----------
    column_name : str
        Name of the text column handed to ``SampleExtractor``.
    binary_flag : bool
        Forwarded to ``CountVectorizer(binary=...)``.
    max_feats : int
        Forwarded to ``CountVectorizer(max_features=...)``.

    Returns
    -------
    Pipeline
        Unfitted pipeline with the steps ``'text'`` -> ``'dummify'`` -> ``'densify'``.

    Notes
    -----
    ``SampleExtractor`` and ``DenseTransformer`` are project helpers; presumably
    they select the column and convert the sparse counts to a dense array --
    confirm against ``autotest_data.helper_functions``.
    """
    vectorizer = CountVectorizer(
        binary=binary_flag,
        max_features=max_feats,
        stop_words="english",
    )
    steps = [
        ('text', SampleExtractor([column_name])),
        ('dummify', vectorizer),
        ('densify', DenseTransformer()),
    ]
    return Pipeline(steps)

# One token-count branch per text column, plus a branch for the remaining columns.
text_columns = ["brief_title", "official_title", "description"]
passthrough_columns = ['nct_id', 'enrollment']

branches = [
    (text_col, text_pipeline(text_col, binary, feats))
    for text_col in text_columns
]
branches.append(
    ('cont_features', Pipeline([
        ('continuous', SampleExtractor(passthrough_columns)),  # potential bug when extracting single column
    ]))
)

pipeline = Pipeline([
    ('features', FeatureUnion(branches)),
])

# learn the vocabularies for each column
pipeline.fit(df)

# create column headers from vocabularies: "<text column>_<token>" for each
# text branch (in union order), followed by the untouched columns
fitted_union = pipeline.named_steps['features']
col_names = []
for branch_name, branch in fitted_union.transformer_list[:len(text_columns)]:
    vocabulary = branch.named_steps['dummify'].get_feature_names_out()
    col_names.extend(branch_name + "_" + token for token in vocabulary)
col_names.extend(passthrough_columns)

# generate transformed data_frame
X = pd.DataFrame(pipeline.transform(df), columns=col_names)

In [12]:
# (rows, columns) of the vectorized frame: up to `feats` token columns for each
# of the three text columns, plus nct_id and enrollment.
X.shape

(1164, 152)

In [13]:
# Persist the vectorized main table; nct_id is a regular column here, so the
# positional index is not written.
X.to_csv(
    path_or_buf="../data/01interim/main_table_vectorized.csv.gz",
    index=False,
)

In [24]:
# Naive tabulation of whole keywords: study x keyword counts, summed per keyword,
# showing the 50 most frequent. Too many distinct keywords, so switching to tokens.
keyword_by_study = pd.crosstab(kw_df["nct_id"], kw_df["name"])
keyword_totals = keyword_by_study.sum(axis=0)
keyword_totals.sort_values().tail(50)

name
brain stimulation     2
quality of life       3
Balance               3
Glucose               3
Bioavailability       3
postoperative pain    3
Quality of Life       3
hypertension          3
Inflammation          3
Hepatic Impairment    3
memory                3
Sleep                 3
manual therapy        3
spasticity            3
Healthy subjects      3
Walking               3
Mental Disorders      3
Rehabilitation        3
Pain                  3
Asthma                3
tDCS                  3
Stress                3
Heart Failure         3
Polyphenols           3
HBV                   3
breast cancer         3
Depression            3
Multiple sclerosis    3
eczema                3
microbiome            3
atopic dermatitis     3
simulation            3
children              3
mortality             4
immunogenicity        4
mHealth               4
general anesthesia    4
cancer                4
inflammation          4
Telemedicine          4
Nursing               4
safety     

In [39]:
# Vectorizer for the keyword strings: presence flags for the 50 most frequent
# 1- to 3-grams after English stop-word removal.
keyword_vectorizer_options = dict(
    stop_words="english",
    max_features=50,
    ngram_range=(1,3),
    binary=True,
)
cvec = CountVectorizer(**keyword_vectorizer_options).fit(kw_df["name"])
cvec.get_feature_names_out()

array(['agents', 'anesthesia', 'block', 'brain', 'breast', 'cancer',
       'care', 'chronic', 'cognitive', 'control', 'diabetes', 'disease',
       'disorder', 'disorders', 'education', 'exercise', 'failure',
       'function', 'health', 'healthy', 'heart', 'heart failure',
       'intervention', 'life', 'low', 'lung', 'monitoring', 'non', 'pain',
       'patient', 'performance', 'physical', 'postoperative', 'pressure',
       'primary', 'quality', 'quality life', 'respiratory', 'risk',
       'sleep', 'stimulation', 'stroke', 'surgery', 'syndrome', 'therapy',
       'training', 'treatment', 'type', 'type diabetes', 'vaccine'],
      dtype=object)

In [45]:
# One row per keyword record, one column per learned token; then total hits per token.
keyword_tokens = cvec.transform(kw_df["name"])
token_names = cvec.get_feature_names_out()
X = pd.DataFrame(keyword_tokens.todense(), columns=token_names)
X.sum()

agents           11
anesthesia       13
block            11
brain            12
breast           10
cancer           36
care             27
chronic          14
cognitive        14
control           9
diabetes         24
disease          34
disorder         11
disorders        10
education        11
exercise         19
failure          12
function         12
health           17
healthy          14
heart            14
heart failure    11
intervention     17
life             13
low              16
lung             10
monitoring       12
non              12
pain             38
patient          14
performance      10
physical         11
postoperative    11
pressure         13
primary          10
quality          16
quality life     11
respiratory      11
risk             11
sleep             9
stimulation       9
stroke           16
surgery          22
syndrome         10
therapy          32
training         13
treatment        13
type             17
type diabetes     9
vaccine          13


In [42]:
# Spot check: list the keyword rows containing an arbitrary term so the row count
# can be compared with that term's column total.
has_term = kw_df["name"].str.lower().str.contains("syndrome")
kw_df.loc[has_term]

Unnamed: 0,nct_id,name
427,NCT03338465,"Greater Trochanteric Pain Syndrome,"
1243,NCT03501420,Primary Sjögren's Syndrome
1267,NCT03503461,Gordon syndrome
1456,NCT03514420,Kobberling-Dunnigan syndrome (type 1 and 2)
2048,NCT03547609,Kabuki Syndrome
2059,NCT03548259,Carpal Tunnel Syndrome
2190,NCT03555617,Irritable Bowel Syndrome
3072,NCT03602482,postural tachycardia syndrome
3461,NCT03688516,Williams-Beuren syndrome
4167,NCT04075032,metabolic syndrome


In [56]:
# kw_df was filtered, so its index no longer matches X's index; attach nct_id
# positionally via the raw values instead of relying on index alignment, then
# collapse the keyword rows to one row of token totals per study.
study_ids = kw_df["nct_id"].values
X = X.assign(nct_id=study_ids).groupby("nct_id").sum()
X

Unnamed: 0_level_0,agents,anesthesia,block,brain,breast,cancer,care,chronic,cognitive,control,...,stimulation,stroke,surgery,syndrome,therapy,training,treatment,type,type diabetes,vaccine
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NCT01893658,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NCT02689713,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NCT02744079,0,0,0,0,1,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NCT02771600,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NCT02779543,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NCT05127083,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
NCT05134493,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
NCT05141851,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NCT05169476,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
# Sanity check: the number of distinct study ids should equal the row count of X.
kw_df["nct_id"].nunique(dropna=True)

633

In [55]:
# Left over from investigating a row mismatch: non-null entries per study id.
kw_df.groupby(by="nct_id").count()

Unnamed: 0_level_0,name
nct_id,Unnamed: 1_level_1
NCT01893658,5
NCT02689713,4
NCT02744079,10
NCT02771600,4
NCT02779543,1
...,...
NCT05127083,1
NCT05134493,7
NCT05141851,1
NCT05169476,3


In [58]:
# After the groupby, nct_id lives only in X's index. Writing with index=False
# alone would drop the study ids and leave the file impossible to join back to
# the main table, so move the index into a regular column before saving.
X.reset_index().to_csv("../data/01interim/keywords_table_vectorized.csv.gz", index=False)