In [1]:
import glob
import os

import pandas as pd

In [2]:
def load_df(path, encoding="utf-8"):
    """Read a tab-separated file and derive a short table name from its file name.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the tab-separated text file.
    encoding : str, default "utf-8"
        Text encoding handed to ``pandas.read_csv``.

    Returns
    -------
    tuple of (str, pandas.DataFrame)
        The file name up to its first ``.`` (directories stripped) and the
        parsed table.
    """
    loaded_pd = pd.read_csv(path, sep="\t", encoding=encoding)
    # os.path.basename honours the platform's separator (glob yields
    # backslashes on Windows) and os.fspath lets callers pass pathlib.Path.
    file_name = os.path.basename(os.fspath(path))
    pd_name = file_name.split(".")[0]
    return pd_name, loaded_pd

In [3]:
# Collect the paths of every .txt table inside the expression-data folder.
paths_expression = glob.glob(pathname="data/expression data/*.txt")

In [4]:
# One (name, dataframe) tuple per expression file, in the order glob listed them.
expression_data = [load_df(expression_path) for expression_path in paths_expression]

In [5]:
# Load the remaining top-level tables as (name, dataframe) tuples.
# NOTE: glob.glob makes no ordering guarantee, so the order of other_data is
# filesystem-dependent; prefer looking tables up by name over position.
paths_other = glob.glob("data/*.txt")
aligner_path = "data/phenotypes_id_aligner.txt"
# Compare normalised paths: glob reports "data\\phenotypes_id_aligner.txt" on
# Windows, which never equals the literal above, and the aligner file would
# then be decoded as UTF-8 instead of latin-1.
aligner_norm = os.path.normpath(aligner_path)
other_data = []
for path in paths_other:
    if os.path.normpath(path) == aligner_norm:
        other_data.append(load_df(path, encoding="latin"))
    else:
        other_data.append(load_df(path))

## We have loaded the dataframes into lists

Each list contains tuples (name, dataframe)
- expression_data: tables with gene expressions
- other_data: all the remaining tables

### Predicting immune response

We decide to use the gene expression tables as predictors: gene expression is the best indicator of the phenotype, so we first check whether any of the expression data that is of interest to us is missing.

In [15]:
# Name of the first loaded expression table (element 0 of its (name, dataframe) tuple).
# Which table comes first depends on the order in which glob returned the paths.
expression_data[0][0]

'Muscle_CD'

In [42]:
# Load the PhenoID -> disease table from a pickle on disk.
# NOTE(review): no cell visible in this section writes this file, so it has to
# exist before this cell runs. Only unpickle files you trust: unpickling can
# execute arbitrary code.
pheno_ids_df = pd.read_pickle("processed_data/PhenoID_Disease.pkl")

In [44]:
# Inspect the loaded table (indexed by PhenoID, with Shown_pheno and Disease columns).
pheno_ids_df

Unnamed: 0_level_0,Shown_pheno,Disease
PhenoID,Unnamed: 1_level_1,Unnamed: 2_level_1
X657,H5N1 influenza A virus mortality rate 30 days ...,Influenza
X893,H5N1 influenza A virus survival time (10^4 EID...,Influenza
X3655,"Malaria infection lethality, Plasmodium yoelii...",Malaria
X4233,"Malaria susceptibility, murine Plasmodium yoel...",Malaria
X593,"H1N1 (PR8) influenza A virus (2x10E3 FFU), med...",Influenza
X1868,"H1N1 (PR8) influenza A virus (2x10E3 FFU), med...",Influenza
X2558,"H1N1 (PR8) influenza A virus (2x10E3 FFU), med...",Influenza
X544,"H1N1 (PR8) influenza A virus (2x10E3 FFU), med...",Influenza
X1481,"H1N1 (PR8) influenza A virus (2x10E3 FFU), med...",Influenza
X2637,"H1N1 (PR8) influenza A virus (2x10E3 FFU), med...",Influenza


In [46]:
# Pick the phenotype table by name: glob's ordering is not guaranteed, so a
# positional index into other_data can point at a different table elsewhere.
phenotype_df = dict(other_data)["Phenotype"]
# Left join on the PhenoID index: phenotypes without an entry in pheno_ids_df
# keep their row and get NaN in the columns the join adds.
pheno_joined = phenotype_df.set_index("PhenoID").join(pheno_ids_df)

In [54]:
# Split the joined table on whether a Disease value is present and persist each half.
has_disease = pheno_joined["Disease"].notna()
pheno_joined[has_disease].to_pickle("processed_data/pheno_joined_present.pkl")
pheno_joined[~has_disease].to_pickle("processed_data/pheno_joined_absent.pkl")

In [39]:
# For each phenotype id, split the strain columns of its row into those that
# hold a value (pheno_present) and those that are entirely missing (pheno_absent).
# NOTE: both results are overwritten on every iteration, so after the loop they
# describe the last id in pheno_ids only.
pheno_ids = ["X3873"]
strains = []
# Look the table up by name; its position in other_data depends on glob's ordering.
phenotype_df = dict(other_data)["Phenotype"]
for pheno_id in pheno_ids:
    # Restored row filter (the original subscript was empty, a SyntaxError).
    pheno_df = phenotype_df[phenotype_df["PhenoID"] == pheno_id]
    all_missing = pheno_df.isnull().all()
    # .iloc[:, 1:] drops the first remaining column (PhenoID) so only strains are left.
    pheno_present = pheno_df.loc[:, ~all_missing].iloc[:, 1:].transpose()
    # Assumes exactly one row matched, so the transpose has a single column.
    pheno_present.columns = ["Expression"]
    pheno_absent = pheno_df.loc[:, all_missing].transpose()
    pheno_absent.columns = ["Expression"]

In [35]:
# Show the phenotype row for X3873. The table is looked up by name because its
# position inside other_data depends on glob's unspecified ordering.
phenotype_df = dict(other_data)["Phenotype"]
phenotype_df[phenotype_df["PhenoID"] == "X3873"]

Unnamed: 0,PhenoID,B6D2F1,D2B6F1,C57BL.6J,DBA.2J,BXD1,BXD2,BXD5,BXD6,BXD8,...,BXD90,BXD91,BXD93,BXD94,BXD95,BXD98,BXD99,BXD100,BXD101,BXD102
5089,X3873,,,100.22,158.28,,,,,,...,,,,,,,,76.12,,111.26


In [41]:
# Inspect the columns that were entirely null for the phenotype row processed by the loop.
pheno_absent

Unnamed: 0,Expression
B6D2F1,
D2B6F1,
BXD1,
BXD2,
BXD5,
...,...
BXD94,
BXD95,
BXD98,
BXD99,


In [12]:
# Names of all loaded expression tables, in load order.
[label for label, _ in expression_data]

['Muscle_CD',
 'Cerebellum',
 'Liver_HFD',
 'Striatum',
 'Liver_CD',
 'VTA',
 'Adrenal_Female',
 'Hypothalamus_Female',
 'Amygdala_Female',
 'Liver_metabolite_HFD',
 'Spleen_Male',
 'Pituitary_Female',
 'ScWAT_HFD',
 'ScWAT_CD',
 'Brain_INIA',
 'PFC',
 'Muscle_metabolite_HFD',
 'Muscle_metabolite_CD',
 'Heart_CD',
 'Pituitary_Male',
 'Hypothalamus_Male',
 'Heart_HFD',
 'Midbrain',
 'BAT',
 'Muscle_HFD',
 'Nucleus_accumbens',
 'Kidney_Female',
 'Brain_UTHSC',
 'Amygdala_Male',
 'Adrenal_Male',
 'Kidney_Male',
 'Hippocampus',
 'Bone_Femur',
 'Gastrointestinal',
 'LiverProt_CD',
 'LiverProt_HFD',
 'Spleen_Female',
 'Liver_metabolite_CD',
 'Lung',
 'Eye']

In [13]:
# Names of the remaining tables, in load order.
[label for label, _ in other_data]

['map_BXD', 'Phenotype', 'genotype_BXD', 'phenotypes_id_aligner']

In [61]:
# Phenotype descriptions in the "Immune" category, ordered by the Strains column
# (descending). The aligner table is looked up by name because its position in
# other_data depends on glob's unspecified ordering.
aligner_df = dict(other_data)["phenotypes_id_aligner"]
immune_rows = aligner_df[aligner_df["Category"] == "Immune"]
list(immune_rows.sort_values("Strains", ascending=False)["Phenotype"])

['Infectious disease, immune system: H5N1 influenza A virus mortality rate 30 days after infection (10^4 EID-50 of HK213 virus in 30 microliters saline) [% death]',
 'Infectious disease, immune system: H5N1 influenza A virus survival time (10^4 EID-50 of HK213 virus in 30 microliters saline) [days, max to 30]',
 'Immune system: ELISA-3x, IgG class antibody binding to TSHR A-subunit protein in ELISA 4 weeks after 3 immunizations with TSHR A-subunit adenovirus [OD490 nm]',
 'Immune system, endocrinology: serum inhibition of TSH binding to the TSHR before immunization [% inhibition] ',
 'Immune system, endocrinology: Serum inhibition of TSH binding to the TSHR 4 weeks after 3 immunizations with TSHR A-subunit adenovirus in young adult females [% inhibition]',
 'Immune system, endocrine system: TBI-2x, serum inhibition of TSH binding to the TSHR 1 week after 2 immunizations with TSHR A-subunit adenovirus [% inhibition]',
 'Immune system: ELISA-2x, IgG class antibody binding to TSHR A-subun

In [None]:
# Rows of the aligner table in the "Activity" category, ordered by the Strains
# column (descending). Looked up by name, since positions in other_data follow
# glob's unspecified ordering.
aligner_df = dict(other_data)["phenotypes_id_aligner"]
aligner_df[aligner_df["Category"] == "Activity"].sort_values("Strains", ascending=False)

In [44]:
# Find the aligner row whose Phenotype text matches the H5N1 survival-time
# description exactly. The table is looked up by name, since positions in
# other_data follow glob's unspecified ordering.
aligner_df = dict(other_data)["phenotypes_id_aligner"]
h5n1_survival_label = "Infectious disease, immune system: H5N1 influenza A virus survival time (10^4 EID-50 of HK213 virus in 30 microliters saline) [days, max to 30]"
aligner_df[aligner_df["Phenotype"] == h5n1_survival_label]

Unnamed: 0,PhenoID,PhenoID.1,RecordID,Phenotype,Shown_pheno,Category,Category_ID,color,Phenotype_post,Phenotype_pre,Authors,Senior.Author,Abbreviation,Quantitive.trait,Tissues,Diet,Strains
1601,X893,893,10866,"Infectious disease, immune system: H5N1 influe...",H5N1 influenza A virus survival time (10^4 EID...,Immune,1602.0,5.0,"Infectious disease, immune system: H5N1 influe...","Infectious disease, immune system: H5N1 influe...","Boon ACM, deBeauchamp J, Hollmann A, Luke J, K...",Webby,,Yes,,,69


In [42]:
# Phenotype descriptions ordered by the Strains column (descending), skipping
# the first 16 rows of that ordering.
# NOTE(review): the reason for the cut-off of 16 is not visible here; confirm.
# The table is looked up by name, since positions in other_data follow glob's
# unspecified ordering; .iloc makes the positional row slice explicit.
aligner_df = dict(other_data)["phenotypes_id_aligner"]
list(aligner_df.sort_values("Strains", ascending=False).iloc[16:]["Phenotype"])

['Eye, visual system, morphology: Eye weight, fresh unfixed, corrected for age [mg]',
 'Central nervous system, morphology: Forebrain weight, adjusted by age, sex, body weight and BXD epoch [mg]',
 'Central nervous system, cardiovascular system: Infarct volume in neocortex, binary score, following experimental stroke (permanent middle cerebral artery and ipsilateral common carotid artery occlusion) in adult males and females (2- to 4-months old), 24 ',
 'Central nervous system, cardiovascular system: Infarct volume in neocortex (log transformed) following experimental stroke (permanent middle cerebral artery and ipsilateral common carotid artery occlusion) in adult males and females (2- to 4-months old), ',
 'Central nervous system, cardiovascular system: Infarct volume in neocortex following experimental stroke (permanent middle cerebral artery and ipsilateral common carotid artery occlusion) in adult males and females (2- to 4-months old), 24 hr survival (al',
 'Eye, visual system: I