# Create Training Set


This is the first notebook in a series of Jupyter Notebooks linked to the project titled "__Developing Mechanism-Based Models for Complex Toxicology Study Endpoints Using Standardized Electronic Submission Data__".  They should be somewhat ordinal, i.e., should be run in order as each Notebook could depend on results from a previous notebook.  

This notebook requires a dependency script, `send.py`, which contains an object that is a live instance to the database and can be queried, etc.  This script requires two variables to be configured, `db_dir` and `send_db_file`.  `db_dir` should point to the directory that contains the SQLlite database and `send_db_file` should be the SQLite database file itself. 

This notebook will create a training set for hepatotoxicity modeling.  The only input needed is what animal/species to make the training set for.  This could be any controlled terminology for SPECIES in SEND.

A training set consists of a set of animals with features which are extracted clinical chemistry results, body weights and animal sex as well as histopathology findings classified as being either liver necrosis, steatosis, or cholestasis.  

A data folder will be created for each species and a file called `{species}_training_data.csv`.

In [2]:
# import necessary libraries

import pandas as pd, numpy as np
import re, os
from scipy import stats

## Choose species from controlled terminology

Define the variable `species` to be the target species for which to make a training set.  

In [3]:
# Define species to make a training set
# and make a seperate folder to store
# all the resulting data

species = 'RAT'

if not os.path.exists('data'):
    os.mkdir('data')
    
species_data = os.path.join('data', species)
if not os.path.exists(species_data):
    os.mkdir(species_data)

## Histopathology classification functions

The following functions will classify all the liver results in SEND as cholestasis, steatosis, or necrosis. 

In [4]:
def regrex_match(regrex, string: str):
    """ generic function to just max a string of text """
    pattern = re.compile(regrex, re.IGNORECASE)
    match = pattern.search(string)
    if match:
        return True
    return False


def is_cholestasis(finding: str):
    """ takes a text string from an MI finding and classifies whether or not
        the finding can be classified as cholestasis

        wikipedia link for Cholestasis: https://en.wikipedia.org/wiki/Cholestasis
    """
    regrex = r'chol(e|o|a)|bil(i|e)'
    return regrex_match(regrex, finding)

def is_steatosis(finding: str):

    """ takes a text string from an MI finding and classifies whether or not
        the finding can be classified as steatosis

        wikipedia link for steatosis: https://en.wikipedia.org/wiki/Steatosis
    """
    steatosis_regrex = r'fat|lipid|vacuol|acc|steat|congest'
    increased_regrex = r'decreas|lower'
    return regrex_match(steatosis_regrex, finding) and not regrex_match(increased_regrex, finding)

def is_necrosis(finding: str):
    """ takes a text string from an MI finding and classifies whether or not
        the finding can be classified as necrosis

        """
    steatosis_regrex = r'necros|fibros|degen|atroph|apop|deplet'
    return regrex_match(steatosis_regrex, finding)


def classify_helper(helper_fx, findings):
    """ goes through all the findings and returns if the helper function results in true """
    for finding in findings:
        if helper_fx(finding):
            return 1
    return 0

from functools import partial
classify_steatosis = partial(classify_helper, is_steatosis)
classify_cholestasis = partial(classify_helper, is_cholestasis)
classify_necrosis = partial(classify_helper, is_necrosis)

def get_classified_liver_results():
    """ will pull all liver results and classfy them as either necrosis, steatosis, cholestasis """
    mi = send_db.generic_query('SELECT STUDYID, USUBJID, MISTRESC FROM MI WHERE MISPEC="LIVER"')
    mi['STEATOSIS'] = mi.groupby(['STUDYID', 'USUBJID'])['MISTRESC'].transform(classify_steatosis)
    mi['CHOLESTASIS'] = mi.groupby(['STUDYID', 'USUBJID'])['MISTRESC'].transform(classify_cholestasis)
    mi['NECROSIS'] = mi.groupby(['STUDYID', 'USUBJID'])['MISTRESC'].transform(classify_necrosis)
    mi['MISTRESC'] = mi.groupby(['STUDYID', 'USUBJID'])['MISTRESC'].transform(lambda x: ';'.join(x))
    return mi.drop_duplicates(['STUDYID', 'USUBJID'])

## Treatment phase filtering functions

The following function will filter animals to a specified treatment phase

In [5]:
def filter_sacrifice_phase(df, phase='treatment'):
    
    import dateutil.parser
    import isodate
    
    """
    filters a dataframe containing animals (STUDYID and USUBJID) to only
    contain animals that were sacrificed during a specific phase
    either, screening recovery or treatment

    """

    starting_columns = df.columns


    ds = send_db.generic_query('SELECT STUDYID, USUBJID, DSDECOD, DSSTDTC '
                               'FROM DS WHERE DSDECOD != "RECOVERY SACRIFICE"')
    ds['SACTIME'] = ds.DSSTDTC.apply(lambda x: dateutil.parser.isoparse(x))

    df = df.merge(ds)
    dm = send_db.generic_query('SELECT STUDYID, USUBJID, ARMCD FROM DM').merge(df)
    ta = send_db.generic_query('SELECT STUDYID, ETCD, ARMCD, lower(EPOCH) as EPOCH FROM TA')

    animal_epochs = dm.merge(ta, on=['STUDYID', 'ARMCD'])

    se = send_db.generic_query('SELECT STUDYID, USUBJID, ETCD, SESTDTC, SEENDTC FROM SE')

    animal_times = animal_epochs.merge(se, on=['STUDYID', 'USUBJID', 'ETCD'])

    te = send_db.generic_query('SELECT STUDYID, ETCD, TEDUR FROM TE')
    animal_times = animal_times.merge(te, on=['STUDYID', 'ETCD'])

    # convert times to date python dates for easier comparison

    animal_times['EPOCH_START'] = animal_times.SESTDTC.apply(lambda x: dateutil.parser.isoparse(x))
    animal_times['ELEMENT_DUR'] = animal_times.TEDUR.apply(lambda x: isodate.parse_duration(x) if x else x)

    # animal_times['EPOCH_END'] = np.nan

    # sometimes end times are not populated,
    # for these cases in the TE domain contains
    # the duration so we can add these start date
    # to get the time.

    animal_times.loc[animal_times.SEENDTC != '', 'EPOCH_END'] = animal_times.loc[animal_times.SEENDTC != '', 'SEENDTC'].apply(lambda x: dateutil.parser.isoparse(x))
    animal_times.loc[animal_times.SEENDTC == '', 'EPOCH_END'] = animal_times.loc[animal_times.SEENDTC == '', 'EPOCH_START'] + pd.to_timedelta(animal_times.loc[animal_times.SEENDTC == '', 'ELEMENT_DUR'])
    animal_times = animal_times[animal_times.SACTIME.between(animal_times.EPOCH_START, animal_times.EPOCH_END)]

    # now we need to find the trial element that
    # a subject was sacrificed within


    screening_terms = "pre.*(tr(ea)?t|dos|test|study|exposure)|acclimat|screen|baseline|allocat|random"
    recovery_terms = "recovery|post.*(tr(ea)?t|dos|test|study|exposure)"
    treatment_terms = "tr(ea)?t|dos|test|exposure"
    treatment_terms_not = "off|non|free|holiday"


    def is_screening(epochs):
        return (epochs.str.contains(screening_terms)).all()

    def is_recovery(epochs):
        return (epochs.str.contains(recovery_terms)).all()

    def is_treatment(epochs):
        return (epochs.str.contains(treatment_terms) & ~(epochs.str.contains(treatment_terms_not))).all()

    if phase == 'screening':
        fx = is_screening
    elif phase == 'recovery':
        fx = is_recovery
    else:
        fx = is_treatment

    # group by study and subject in case there are multiple elements per code
    # and remove those that are note
    final_animals = animal_times[animal_times.groupby(['STUDYID', 'USUBJID'])['EPOCH'].transform(fx)]
    return final_animals.drop_duplicates(['STUDYID', 'USUBJID'])[starting_columns]

## Lab classification functions

One function for extracting only numeric results from the LB domain

In [6]:
def filter_text(x):
    """ returns null if x does not contain a valid numeric response,
    else it extracts that using a regex pattern """
    digit_pattern = r'[-+]?([0-9]*\.[0-9]+|[0-9]+)'
    digit_extract = re.search(digit_pattern, str(x))
    if digit_extract:
        return float(digit_extract.group(0))
    return np.nan


In `send.py` there is an object called `send_db` that contains an active connection to the current send database.

In [7]:
from send import send_db

# check to make there is a valid connection by counting
# the number of studies in the databases

num_studies = len(send_db.generic_query('SELECT DISTINCT STUDYID FROM AN'))
print(f"There are {num_studies} studies in the SEND db")

There are 1895 studies in the SEND db


In [8]:
# get a list of animals from the SEND DB
# and filter based off of species
animals = send_db.get_all_animals()
target_animals = animals[animals.SPECIES == species]

print(f"There are {len(target_animals)} {species} out of {len(animals)} records in the SEND DB")

There are 95464 RAT out of 170440 records in the SEND DB


In [9]:
# the modeling set should only contain
# terminal animals (i.e., not recovery animals)
# for that, we use the function filter_sacrifice_phase

target_animals = filter_sacrifice_phase(target_animals, phase='treatment')

print(f"There are {len(target_animals)} {species}")

  return func(self, *args, **kwargs)


There are 29304 RAT


### Identify studies with control groups 

Because we normalize the results to control groups, we need to filter out the animals that are in studies that do not have an negative control group that can be easily determined. 

In [10]:
# add the specific set codes for these animals
setcodes = send_db.generic_query('SELECT STUDYID, USUBJID, SETCD FROM DM')
target_animals = target_animals.merge(setcodes, on=['STUDYID', 'USUBJID'])

# need to now go through each group by study and identify
# the control animals.  We only include animals that
# are in a study with a negative control.

tx = send_db.generic_query("SELECT STUDYID, SETCD, TXVAL FROM TX WHERE TXPARMCD == 'TCNTRL'")
target_animals = target_animals.merge(tx, on=['STUDYID', 'SETCD'], how='left')

good_animals = []

standAlonesWords = ["placebo", "untreated", "sham"]
currentModifiers = ["negative", "saline", "peg", "vehicle", "citrate", "dextrose", "water", "air"]
control_expression = r'|'.join(standAlonesWords + currentModifiers)

for study, data in target_animals.groupby('STUDYID'):
    if (data.TXVAL.value_counts().shape[0] >= 1) and (
    data.TXVAL.str.contains(control_expression, case=False, na=False).any()):
        good_animals.append(data)

good_animals = pd.concat(good_animals)


# add a column to the dataframe that identifies a ra rat as a control or not
good_animals.loc[good_animals.TXVAL.str.contains(control_expression, case=False, na=False), 'IS_CONTROL'] = True
good_animals.loc[~good_animals.TXVAL.str.contains(control_expression, case=False, na=False), 'IS_CONTROL'] = False


print(f'There are {good_animals.shape[0]} {species} with a valid control group')

There are 27935 RAT with a valid control group


## Clinical chemistry results

In [12]:
# pull all the clin chem results from 
# the LB domain an filter out any results
# where we cant extract a numericaly value
lb = send_db.generic_query('SELECT STUDYID, '
                           'USUBJID, '
                           'LBSTRESC, '
                           'LBTESTCD, '
                           'upper(LBCAT) as LBCAT, '
                           'upper(LBSCAT) as LBSCAT, '
                           'upper(LBSPEC) as LBSPEC, '
                           'LBSTRESU FROM LB')
lb = lb[lb.USUBJID.isin(good_animals.USUBJID)]
lb.loc[:, 'LBSTRESC'] = lb.LBSTRESC.apply(filter_text)

Here we consider each unique LBTESTCD and LBSPEC to be a unique test and take the max response for each animal throughout the study.  For studies that have multiple units for a specific LBTESTCD-LBSPEC pair, we take the unit with the highest amount.  

__TODO__: See below.

In [13]:
lb = lb[lb.LBSTRESC.notnull()]
lb.loc[:, 'LBSTRESC_MAX'] = lb.groupby(['STUDYID', 'USUBJID',  'LBTESTCD', 'LBSPEC'])['LBSTRESC'].transform('max')
max_responses = lb.drop_duplicates(['STUDYID', 'USUBJID',  'LBTESTCD', 'LBSPEC', 'LBSTRESC_MAX'])
max_responses.loc[max_responses.LBSPEC == '', 'LBSPEC'] = 'UNSPECIFIED'
max_responses.loc[:, 'LBTESTCD_SPEC'] = max_responses.LBTESTCD + '-' + max_responses.LBSPEC

converted_tests = []


for gp, gp_data in max_responses.groupby(['STUDYID', 'LBTESTCD_SPEC']):

    # if there are more than one unit
    # for a test, take the unit thats most populated
    # Studies (at least in RATS confirmed) will have 
    # one unique unit and the rest blank.  Its safe
    # to assume these are all the same.  Otherwise,
    # if there are multiple units, take the highest
    # and remove the rest. 
    # TODO: to extend this for all cases.  
    if (gp_data.LBSTRESU.value_counts().shape[0] > 1) and (not (gp_data.LBSTRESU == '').any()):

        best_unit = gp_data.LBSTRESU.value_counts().index[0]

        unit_data = gp_data[gp_data.LBSTRESU == best_unit]
        converted_tests.append(unit_data)
    else:
        converted_tests.append(gp_data)


converted_tests = pd.concat(converted_tests)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Normalize the LB results to control groups for each lab test. We also add the classified liver results.  We do a left merge when adding because any animal without histopath results can be thought of as no disease.  This is due because often only control and high dose groups will get tested, and if they dont see anything they will not do histopath on the middle groups. 

__TODO__:
Consider not keeping animals without histopath results.  We don't necessarily need more inactive responses.....

In [14]:
# pivot the table to put it in wide format as oppost to long format
animals_pivot = converted_tests.pivot_table(index=['STUDYID', 'USUBJID'], 
                                            columns='LBTESTCD_SPEC', 
                                            values='LBSTRESC_MAX').reset_index()

# this line below was necessary 
# prior to loading it on the hive
# i think whatever version of pandas
# it must have been fixed
# del animals_pivot.columns.name

# get classifications for each animal
mi = get_classified_liver_results()

# identify the columns that contain clinical chemistry tests
tests = animals_pivot.columns[~animals_pivot.columns.isin(['STUDYID', 'USUBJID', 'SPECIES', 'SEX', 'IS_CONTROL',
                                                           'NECROSIS', 'CHOLESTASIS', 'STEATOSIS'])]

animals_with_mi = animals_pivot.merge(mi, how='left').merge(good_animals[['STUDYID', 'USUBJID', 'SPECIES', 'SEX', 
                                                                          'IS_CONTROL']])

for disease in ['NECROSIS', 'CHOLESTASIS', 'STEATOSIS']:
    animals_with_mi.loc[animals_with_mi[disease].isnull(), disease] = 0

for study, data in animals_with_mi.groupby(['STUDYID', 'SEX']):
    control_animals_mean = data[data.IS_CONTROL][tests].mean()
    animals_with_mi.loc[data.index, tests] = data[tests].divide(control_animals_mean)

## Add body weights

Create a set of body weight features as well.  These consist of fitting a linear regression line to the body weights throught the study and taking the slope and y interecept, as well as taking a different of the final weight to the animals first weight. 

These features are also normalized to control and contain `_NORM`.

In [15]:
bw = send_db.generic_query('SELECT STUDYID, USUBJID, BWSTRESN, BWSTRESU, BWTESTCD, BWDY FROM BW')
animals_bw = animals_with_mi[['STUDYID', 'USUBJID', 'IS_CONTROL', 'SEX']].merge(bw)
# # animals_bw = animals_bw[animals_bw.BWSTRESU != '']

# # identify control animals for normalization
# animals_bw['IS_CONTROL'] = animals_bw['IS_CONTROL'].astype(bool)

# these are the functions that will actually create
# the features from body weight data
def difference_fx(data):
    first_weight = data.sort_values(by='BWDY')['BWSTRESN'].iloc[0]
    second_weight = data.sort_values(by='BWDY')['BWSTRESN'].iloc[-1]
    return second_weight - first_weight

def slope_fx(data):

    slope, intercept, r_value, p_value, std_err = stats.linregress(data.BWDY, data.BWSTRESN)
    return slope

def intercept_fx(data):

    slope, intercept, r_value, p_value, std_err = stats.linregress(data.BWDY, data.BWSTRESN)
    return intercept

animals_bw_diff = animals_bw.groupby('USUBJID').apply(difference_fx).reset_index()
animals_bw_diff.columns = ['USUBJID', 'BWDIFF']

animals_bw = animals_bw.merge(animals_bw_diff)

animals_bw_slope = animals_bw.groupby('USUBJID').apply(slope_fx).reset_index()
animals_bw_slope.columns = ['USUBJID', 'BWSLOPE']

animals_bw = animals_bw.merge(animals_bw_slope)

animals_bw_int = animals_bw.groupby('USUBJID').apply(intercept_fx).reset_index()
animals_bw_int.columns = ['USUBJID', 'BWINTCEPT']

animals_bw = animals_bw.merge(animals_bw_int)

new_tests = ['BWDIFF', 'BWSLOPE', 'BWINTCEPT']
names = list(map(lambda x: '{}_NORM'.format(x), new_tests))

for name in names:
    animals_bw[name] = np.nan

animals_bw = animals_bw[['STUDYID', 'USUBJID', 'IS_CONTROL', 'SEX'] + new_tests + names].drop_duplicates()
animals_bw.index = animals_bw.USUBJID
for study, data in animals_bw.groupby(['STUDYID', 'SEX']):
    control_animals_mean = data[data.IS_CONTROL][new_tests].mean()

    for name in names:
        animals_bw.loc[data.index, name] = data[name.replace('_NORM', '')] / control_animals_mean[
            name.replace('_NORM', '')]



## Write results

Merge the final data frame and cache. 

In [16]:
final_data = animals_with_mi.merge(animals_bw.reset_index(drop=True).drop(['IS_CONTROL', 'SEX'], axis=1), 
                                   how='left', on=['STUDYID', 'USUBJID'])
training_file = os.path.join(species_data, f'{species}_training_data.csv')
final_data.to_csv(training_file)

print(f"There are {final_data.shape[0]} training points.")

There are 17285 training points.
