# Assign Metabolites to molecular classes 
## Assign molecular classes to superclasses

In [4]:
import json

import pandas as pd

In [5]:
# Read the processed combined-metabolites CSV and use its 'i' column as the row index.
data = (
    pd.read_csv(r'../data/processed/combined_metabolites_data_with_model_params.csv')
    .set_index('i')
)

In [2]:
# Lookup from lipid class abbreviation to its superclass name.
# Built from (superclass, members) groups; keys are inserted in the order listed.
lipid_classes = {
    lipid_class: superclass
    for superclass, members in (
        # LIPIDS
        ('Glycerolipid', ("TG", "Alkenyl-DG")),
        ('Fatty Acyl',   ("AC", "FA")),
        ('Sterol Lipid', ("CE",)),
        ('Sphingolipid', ("SM", "Cer[NS]")),
        ('Phospholipid', (
            "LysoPE", "LysoPC", "PC", "PI",
            "Plasmenyl-PC", "Plasmanyl-PC", "Plasmenyl-PE", "PE", "Plasmanyl-PE",
        )),
        # Every unknown / unidentified label shares one catch-all superclass
        ('Unidentified', ("unknown", 'Unidentified')),
    )
    for lipid_class in members
}


In [6]:
# Print the ID of every identified metabolite row, walking the table from last row to first.
is_identified_metabolite = (data['Type'] == 'metabolite') & (data['ID'] != 'unknown')
for metabolite_id in data.loc[is_identified_metabolite, 'ID'].iloc[::-1]:
    print(metabolite_id)

1,5-Anhydro-D-glucitol
2-Mercaptoethanol
2-Naphthalenesulfonic acid
3-Hydroxybutyric acid
3-Indoxyl sulphate
3-Methylhistidine
4-Guanidinobutyric acid
4-Hydroxybenzaldehyde
4-Hydroxybutyric acid (GHB)
4-Oxoproline
5,5-Dimethylhydantoin
8-Hydroxyquinoline
Acetyl-β-methylcholine
Acrylic acid
Adenosine 5'-monophosphate
Alanine
alpha-Glycerylphosphorylcholine
Anhydrohexose
Anserine
Arginine
Ascorbic acid 2-sulfate
Asparagine
Beta alanine
Beta-D-Glucopyranuronic acid
Betaine
Carnitine
Choline
Citramalic acid
Creatine
Creatinine
Cytidine
Ergothioneine
Ethyl-beta-D-glucuronide
Gluconic acid
Glucose
Glutamic acid
Glutamine
Glyceric acid
Guanidinosuccinic acid
Hexose sugar
Iditol
Hippuric acid
Histidine
Hydrocinnamic acid
Hydroxycinnamic acid
Isoleucine
Lactic Acid
Leucine
Methylaminopyrimidine
Methylhistidine
N-Acetyl-L-aspartic acid
Pyroglutamic acid
N-Acetylneuraminic acid
N-Acetylornithine
N-Isovalerylglycine
N-Methyl-2-pyrrolidone
N3,N4-Dimethyl-L-arginine
N6,N6,N6-Trimethyl-L-lysine
Nicot

In [14]:
# Lookup from metabolite ID to its molecular class.
# Each key must appear exactly once: a repeated key in a dict literal silently
# overwrites the earlier value, which hides conflicting assignments.
metab_classes = {
    "1,5-Anhydro-D-glucitol":           'Carbohydrate',
    "2-Mercaptoethanol":                'Alcohol',
    "2-Naphthalenesulfonic acid":       'Organic acid',
    "3-Hydroxybutyric acid":            'Organic acid',
    "3-Indoxyl sulphate":               'Organic acid',
    "3-Methylhistidine":                'Amino acid derivative',
    "4-Guanidinobutyric acid":          'Organic acid',
    "4-Hydroxybenzaldehyde":            'Alcohol',

    "4-Hydroxybutyric acid (GHB)":      'Organic acid',
    "4-Oxoproline":                     'Amino acid derivative',
    "5,5-Dimethylhydantoin":            'Amino acid derivative',
    "8-Hydroxyquinoline":               'Alcohol',
    # Both the 'β' and the ASCII 'beta' spelling are keyed
    "Acetyl-β-methylcholine":           'Amino acid derivative',
    "Acetyl-beta-methylcholine":        'Amino acid derivative',
    "Acrylic acid":                     'Organic acid',
    "Adenosine 5'-monophosphate":       'Nitrogen heterocycle',
    "Alanine":                          'Amino acid',
    "alpha-Glycerylphosphorylcholine":  'Organic acid',
    "Anhydrohexose":                    'Carbohydrate',

    "Anserine":                       'Amino acid derivative',
    "Arginine":                       'Amino acid',
    "Ascorbic acid 2-sulfate":        'Organic acid',
    "Asparagine":                     'Amino acid',
    "Beta alanine":                   'Amino acid derivative',
    "Beta-D-Glucopyranuronic acid":   'Carbohydrate',
    "Betaine":                        'Amino acid derivative',  # N-trimethyl glycine (ammonium)
    "Carnitine":                      'Organic acid',
    "Choline":                        'Alcohol',
    "Citramalic acid":                'Organic acid',
    "Creatine":                       'Organic acid',

    "Creatinine":               'Nitrogen heterocycle',
    "Cytidine":                 'Nitrogen heterocycle',
    "Ergothioneine":            'Amino acid derivative',
    "Ethyl-beta-D-glucuronide": 'Carbohydrate',
    "Gluconic acid":            'Carbohydrate',
    "Glucose":                  'Carbohydrate',
    "Glutamic acid":            'Amino acid',
    "Glutamine":                'Amino acid',
    "Glyceric acid":            'Carbohydrate',
    "Guanidinosuccinic acid":   'Amino acid derivative',
    "Hexose sugar":             'Carbohydrate',

    "Iditol":                   'Carbohydrate',
    "Hippuric acid":            'Organic acid',
    "Histidine":                'Amino acid',
    "Hydrocinnamic acid":       'Organic acid',
    "Hydroxycinnamic acid":     'Organic acid',
    "Isoleucine":               'Amino acid',
    "Lactic Acid":              'Organic acid',
    "Leucine":                  'Amino acid',
    "Methylaminopyrimidine":    'Nitrogen heterocycle',
    "Methylhistidine":          'Amino acid derivative',
    "N-Acetyl-L-aspartic acid": 'Amino acid derivative',

    "Pyroglutamic acid":           'Amino acid derivative',
    "N-Acetylneuraminic acid":     'Carbohydrate',
    "N-Acetylornithine":           'Amino acid derivative',
    "N-Isovalerylglycine":         'Amino acid derivative',
    "N-Methyl-2-pyrrolidone":      'Nitrogen heterocycle',
    "N3,N4-Dimethyl-L-arginine":   'Amino acid derivative',
    "N6,N6,N6-Trimethyl-L-lysine": 'Amino acid derivative',
    "Nicotinamide":                'Nitrogen heterocycle',
    "Nicotinamide 1-oxide":        'Nitrogen heterocycle',
    "Ornithine":                   'Amino acid derivative',
    "Pentose sugar":               'Carbohydrate',
    "Phenylacetylglycine":         'Amino acid derivative',

    "Phenylalanine":     'Amino acid',
    "Pipecolic acid":    'Organic acid',
    "Proline":           'Amino acid',
    "Pseudouridine":     'Nitrogen heterocycle',
    # "Pyroglutamic acid" is already keyed above; the repeated entry was removed
    "Quinic acid":       'Organic acid',
    "Ribose":            'Carbohydrate',
    "Serine":            'Amino acid',
    "Stachydrine":       'Amino acid derivative',  # Proline Betaine (N-dimethyl ammonium)

    "Taurine":               'Amino acid derivative',
    "Threonic acid":         'Carbohydrate',  # sugar acid derived from Threose
    "Threonine":             'Amino acid',
    "Indole-3-acrylic acid": 'Organic acid',  # Also Nitrogen heterocycle (indole)
    "Trigonelline":          'Organic acid',  # also Nitrogen heterocycle (pyridine)
    "Tryptophan":            'Amino acid',
    "Tyrosine":              'Amino acid',
    "Uric acid":             'Nitrogen heterocycle',
    "Uridine":               'Nitrogen heterocycle',
    "Urocanic acid":         'Nitrogen heterocycle',

    # Disabled per-species AC entries, kept for reference
#     'AC 5:0': '', 
#     'AC 4:0': '', 
#     'AC 3:0': '', 
#     'AC 2:0': '', 
#     'AC 18:2': '',
#     'AC 18:1': '', 
#     'AC 18:0': '', 
#     'AC 16:1': '', 
#     'AC 16:0': '', 
#     'AC 14:1': '', 
#     'AC 14:0': '',

    # Identity entries: each class name maps to itself
    'Alcohol':               'Alcohol',
    'Amino acid':            'Amino acid',
    'Amino acid derivative': 'Amino acid derivative',
    'Carbohydrate':          'Carbohydrate',
    'Nitrogen heterocycle':  'Nitrogen heterocycle',
    'Organic acid':          'Organic acid',
}

In [17]:
# Combine both lookups into one table. Lipid entries go in first; metabolite
# entries are applied afterwards, so they win on any shared key.
superclasses = dict(lipid_classes)
superclasses.update(metab_classes)
superclasses
# json.dump(superclasses, open(r'../data/metadata/compound_superclasses.json', 'w'))

{'TG': 'Glycerolipid',
 'Alkenyl-DG': 'Glycerolipid',
 'AC': 'Fatty Acyl',
 'FA': 'Fatty Acyl',
 'CE': 'Sterol Lipid',
 'SM': 'Sphingolipid',
 'Cer[NS]': 'Sphingolipid',
 'LysoPE': 'Phospholipid',
 'LysoPC': 'Phospholipid',
 'PC': 'Phospholipid',
 'PI': 'Phospholipid',
 'Plasmenyl-PC': 'Phospholipid',
 'Plasmanyl-PC': 'Phospholipid',
 'Plasmenyl-PE': 'Phospholipid',
 'PE': 'Phospholipid',
 'Plasmanyl-PE': 'Phospholipid',
 'unknown': 'Unidentified',
 'Unidentified': 'Unidentified',
 '1,5-Anhydro-D-glucitol': 'Carbohydrate',
 '2-Mercaptoethanol': 'Alcohol',
 '2-Naphthalenesulfonic acid': 'Organic acid',
 '3-Hydroxybutyric acid': 'Organic acid',
 '3-Indoxyl sulphate': 'Organic acid',
 '3-Methylhistidine': 'Amino acid derivative',
 '4-Guanidinobutyric acid': 'Organic acid',
 '4-Hydroxybenzaldehyde': 'Alcohol',
 '4-Hydroxybutyric acid (GHB)': 'Organic acid',
 '4-Oxoproline': 'Amino acid derivative',
 '5,5-Dimethylhydantoin': 'Amino acid derivative',
 '8-Hydroxyquinoline': 'Alcohol',
 'Acety

# Original code for making File Grouping (fg) dataframe

In [1]:
## For easier grouping of files based on RBG vs FBG, analytical order, pairing of samples, and rat label,
## create a dataframe file_grouping that contains all labels and groups per analysis file.
## Columns = rawfile name; lipidex final results filename; order; RBG vs FBG; rat number labels
## NOTE(review): this cell needs `pd` (from the import cell) and `fr_raw` in the kernel.
## `fr_raw` is not defined anywhere in the visible notebook -- confirm where it is loaded.

# Forward slashes resolve on every OS and match the processed-data path used earlier in the notebook
file_grouping = pd.read_csv(r'../data/metadata/20210729_AJ_Toh_RatPlasma_Sequence_Exported.csv', skiprows=1)
# Number the runs 1..N from the actual sequence length instead of a hard-coded row count
file_grouping['analytical_run_order'] = range(1, len(file_grouping) + 1)
file_grouping = file_grouping[['File Name', 'analytical_run_order']]

# The dot is escaped so that only column labels containing a literal ".raw" are kept
rawfile_cols = fr_raw.filter(regex=r'\.raw').columns

# Dictionary of rawfile names to Lipidex final results file names (which includes (F1), (F2), etc. in column name)
# A sequence file name is paired with a results column when it is a substring of it;
# if several columns contain the same name, the last one wins.
sample_name_to_file_name_map = {
    sfile: file
    for file in rawfile_cols.to_list()
    for sfile in file_grouping['File Name'].to_list()
    if sfile in file
}

# correlates rawfile name to lipidex final results filename
# (raises KeyError if a sequence file name matched no results column)
file_grouping['fr_name'] = file_grouping['File Name'].apply(lambda name: sample_name_to_file_name_map[name])

# Extract the rat label: the text between 'Glucose_T' and 'M_2021' in the file name.
# Rows whose file name lacks 'Glucose_T' are left as NaN.
has_rat_label = file_grouping['File Name'].str.contains('Glucose_T')
file_grouping['rat_label'] = (
    file_grouping.loc[has_rat_label, 'File Name']
    .str.split('Glucose_T').str[1]
    .str.split('M_2021').str[0]
    .astype('float')
)

def parse_bg_type(filename):
    """Return 'RBG' or 'FBG' according to which tag occurs in ``filename``.

    'RBG' is tested first, so it wins when both tags are present.
    Returns None when neither tag occurs.
    """
    for tag in ('RBG', 'FBG'):
        if tag in filename:
            return tag
    return None

# Tag each run whose file name contains 'RBG' or 'FBG' with that label; other rows stay NaN
bg_rows = file_grouping['File Name'].str.contains('RBG|FBG')
file_grouping['bg_type'] = file_grouping.loc[bg_rows, 'File Name'].apply(parse_bg_type)

# quant_file is True for the runs used for data analysis: every run except orders 1-6 and 67
excluded_run_orders = [1, 2, 3, 4, 5, 6, 67]
file_grouping['quant_file'] = ~file_grouping['analytical_run_order'].isin(excluded_run_orders)

# Week label: the 7th '_'-separated token of the file name with its 'wk' suffix removed
sample_rows = file_grouping['File Name'].str.contains('Glucose_T')
week_token = file_grouping.loc[sample_rows, 'File Name'].str.split('_').str[6]
file_grouping['week'] = week_token.str.split('wk').str[0].astype('float').astype('Int64')

# Lipidex file number: the text captured between '(F' and ')' in fr_name
file_grouping['lipidex_file_number'] = file_grouping['fr_name'].str.extract(r'\(F(.*?)\)').astype('int')

# Run date: the 6th '_'-separated token of the file name, parsed for quant files only
quant_file_names = file_grouping.loc[file_grouping['quant_file'], 'File Name']
file_grouping['date'] = pd.to_datetime(quant_file_names.str.split('_').str[5])

def gluc_tol(rat_label):
    """Classify a rat by glucose tolerance from its numeric label.

    Returns 'normal', 'diabetic' or 'impaired'; returns None for any label
    that is not listed (NaN included).
    """
    # Original email was wrong, 1091 and 1093 are normal, NOT 1092 and 1093.
    groups = (
        ('normal',   [1091, 1093]),
        ('diabetic', [1076, 1082, 1101]),
        ('impaired', [1092, 1060, 1102, 1062, 1074]),
    )
    for status, labels in groups:
        if rat_label in labels:
            return status
    return None

# Translate each rat label into its glucose-tolerance group (None where the label is unlisted or missing)
file_grouping['glucose_tolerance'] = file_grouping['rat_label'].map(gluc_tol)

# Include measured blood glucose value (random or fasted) in column 'bg'
# bg = obs.loc[(obs['animal'].isin(animals_used)) & (obs['BG type'].isin(['RBG', 'FBG']))]\
#        [['animal', 'BG type', 'BG', 'weeks old note']].dropna()
# bg['week'] = bg['weeks old note'].str.split(' ').str[0]
# bg = bg.drop('weeks old note', axis=1)
# bg = bg[(bg['week'] == '8') | (bg['week'] == '9') | (bg['week'] == '10')]  ## Why won't .drop(.isin(['4', '11', '12'])) work???
# bg['week'] = bg['week'].astype('int').astype('Int64')
# bg['animal'] = bg['animal'].astype('float')
# # rename cols so that merge('left') will work to include BG 
# bg = bg.rename({'BG type': 'bg_type', 'animal': 'rat_label', 'BG':'bg'}, axis=1)

# file_grouping = file_grouping.merge(bg, how='left')

# file_grouping.to_csv(r'..\data\metadata\file_grouping.csv', index=False)

# Last '_'-separated token of the file name for every run that is not a quant file
not_quant = ~file_grouping['quant_file']
non_data_filenames = file_grouping.loc[not_quant, 'File Name'].str.split('_').str[-1]

file_grouping

NameError: name 'pd' is not defined

# Original Animal Phenotypes DF

In [None]:
# # obs is the dataframe of observations collected by collaborators. 
# # data were originally in a pivoted format, so observations were cleaned into "tidy data format"
# obs = pd.read_excel(r'..\data\metadata\20210421_FBGRBGOGTT_Huishi sample sheet.xlsx', sheet_name='cleaned')
# # Remove the T and M from T1060M
# obs['animal'] = obs['animal'].str.strip('TM').astype('int')
# obs['datetime'] = obs['when'].apply(dateutil.parser.parse)

# animal_phenotypes = pd.read_excel(r'..\data\metadata\20210421_FBGRBGOGTT_Huishi sample sheet.xlsx', sheet_name='animal phenotypes')
# animal_phenotypes['animal'] = animal_phenotypes['animal'].str.strip('TM').astype('int')

# fr_quant_filenames_in_analytical_order = file_grouping[file_grouping['quant_file'] == True]['fr_name'].to_list()

# animals_used = [1091, 1093, 1060, 1062, 1074, 1092, 1102, 1076, 1082, 1101]
# diabetic = [1076, 1082, 1101]
# impaired = [1060, 1062, 1074, 1092, 1102]
# normal = [1091, 1093]