# HNE Analysis & Transcriptomic Prediction
*   HNE Data: OSD-557
*   RNA Seq Data: OSD-255

In [5]:
%%capture
# Colab-only setup cell; %%capture hides everything this cell prints.
# Install and import import_ipynb.
# NOTE(review): presumably needed so the Methods notebook can be imported as a module in the next cell — confirm.
!pip install import_ipynb
import import_ipynb
from google.colab import drive
# Flush and unmount Drive first, then mount it under the relative path "mnt".
drive.flush_and_unmount()
drive.mount("mnt")

In [6]:
# Bind the shared "Methods" notebook on the mounted Drive to `m`; later cells call its helpers
# (read_phenotype_data, read_meta_data, read_rnaseq_data, drop_nans, filter_genes, mygene).
# NOTE(review): passing a file path to __import__ presumably only works through the import hook
# installed by `import import_ipynb` in the setup cell — confirm.
m = __import__("mnt/MyDrive/Colab Notebooks/Methods")

Collecting scanpy
  Downloading scanpy-1.10.4-py3-none-any.whl.metadata (9.3 kB)
Collecting anndata>=0.8 (from scanpy)
  Downloading anndata-0.11.1-py3-none-any.whl.metadata (8.2 kB)
Collecting legacy-api-wrap>=1.4 (from scanpy)
  Downloading legacy_api_wrap-1.4.1-py3-none-any.whl.metadata (2.1 kB)
Collecting pynndescent>=0.5 (from scanpy)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Collecting session-info (from scanpy)
  Downloading session_info-1.0.0.tar.gz (24 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting umap-learn!=0.5.0,>=0.5 (from scanpy)
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting array-api-compat!=1.5,>1.4 (from anndata>=0.8->scanpy)
  Downloading array_api_compat-1.9.1-py3-none-any.whl.metadata (1.6 kB)
Collecting stdlib_list (from session-info->scanpy)
  Downloading stdlib_list-0.11.0-py3-none-any.whl.metadata (3.3 kB)
Downloading scanpy-1.10.4-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
pd.set_option('display.max_rows', 5)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score


# Module-level StandardScaler instance; it is not referenced by the cells shown in this part of the notebook.
scaler = StandardScaler()

In [8]:
# Two independent registries keyed by dataset label (e.g. "HNE", "RNA"):
# `data` holds the measurement tables, `meta` the matching sample metadata.
data, meta = {}, {}

# HNE Init & Filtering

In [9]:
# Allow up to 50 rows when a DataFrame is displayed.
pd.options.display.max_rows = 50

# Read the OSD-557 HNE immunostaining results through the Methods helper
# and show the resulting table.
data["HNE"] = m.read_phenotype_data(
    "557",
    "LSDS-1_immunostaining_microscopy_HNEtr_Transformed_Reusable_Results",
)
data["HNE"]

Unnamed: 0,Source Name,Sample Name,sumcount,sumEC,sunarea,dentotal,denEC
0,F15,F15_Mouse_Eye,124,24,1.149408,107.881593,20.880308
1,F16,F16_Mouse_Eye,97,28,0.946872,102.442566,29.57105
2,F17,F17_Mouse_Eye,159,28,0.875718,181.56529,31.973762
3,F18,F18_Mouse_Eye,90,23,0.650403,138.37566,35.362669
4,F19,F19_Mouse_Eye,106,15,1.086317,97.577417,13.808125
5,F20,F20_Mouse_Eye,172,28,1.235269,139.240973,22.667135
6,GC15,GC15_Mouse_Eye,66,24,1.097939,60.112605,21.859129
7,GC16,GC16_Mouse_Eye,58,20,1.186757,48.872669,16.852644
8,GC17,GC17_Mouse_Eye,57,16,1.378913,41.336894,11.603339
9,GC18,GC18_Mouse_Eye,64,19,0.955575,66.975401,19.883322


In [10]:
# Read the OSD-557 sample metadata, then preview only the identifier and
# spaceflight-factor columns.
meta["HNE"] = m.read_meta_data("557")
meta["HNE"].loc[:, ["Source Name", "Sample Name", "Factor Value[Spaceflight]"]]

Unnamed: 0,Source Name,Sample Name,Factor Value[Spaceflight]
0,F10,F10_Mouse_Eye,Space Flight
1,F12,F12_Mouse_Eye,Space Flight
2,F13,F13_Mouse_Eye,Space Flight
3,F14,F14_Mouse_Eye,Space Flight
4,F15,F15_Mouse_Eye,Space Flight
5,F16,F16_Mouse_Eye,Space Flight
6,F17,F17_Mouse_Eye,Space Flight
7,F18,F18_Mouse_Eye,Space Flight
8,F19,F19_Mouse_Eye,Space Flight
9,F20,F20_Mouse_Eye,Space Flight


# RNA-Seq Analysis & Filtering

In [11]:
# Read the OSD-255 RNA-seq table "255_rna_seq_STAR_Unnormalized_Counts" through the Methods helper.
data["RNA"] = m.read_rnaseq_data("255_rna_seq_STAR_Unnormalized_Counts")
# In the saved output the "Unnamed: 0" column holds the gene / ERCC identifiers and
# every other column is one GSM sample.
data["RNA"]

Unnamed: 0.1,Unnamed: 0,GSM3932693,GSM3932694,GSM3932695,GSM3932696,GSM3932697,GSM3932698,GSM3932699,GSM3932700,GSM3932701,GSM3932702,GSM3932703,GSM3932704,GSM3932705,GSM3932706,GSM3932707,GSM3932708
0,ENSMUSG00000000001,236,292,329,269,175,257,293,258,244,267,256,253,194,285,253,289
1,ENSMUSG00000000003,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,ENSMUSG00000000028,27,38,36,32,12,27,31,36,36,43,26,22,21,27,32,44
3,ENSMUSG00000000031,8,2,2,25,0,3,6,8,1,7,1,2,3,4,3,1
4,ENSMUSG00000000037,8,20,24,24,11,19,11,19,19,13,10,15,6,20,30,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56835,ERCC-00164,4,0,5,0,0,0,5,4,2,0,1,1,0,1,0,1
56836,ERCC-00165,12,14,20,8,7,14,23,11,16,19,7,6,10,16,7,9
56837,ERCC-00168,2,3,0,4,2,3,1,2,1,0,4,0,0,0,2,0
56838,ERCC-00170,25,39,43,21,31,25,63,41,30,22,15,34,31,45,32,51


In [12]:
# Read the OSD-255 sample metadata. Its "Sample Name" (GSM accession) and "Source Name"
# (animal ID) columns are used further down to relabel the RNA sample columns.
meta["RNA"] = m.read_meta_data("255")
meta["RNA"]

Unnamed: 0,Source Name,Sample Name,Characteristics[Organism],Term Source REF,Term Accession Number,Characteristics[Strain],Term Source REF.1,Term Accession Number.1,Characteristics[Animal Source],Characteristics[Sex],...,Parameter Value[absorbed radiation dose rate],Unit.6,Term Source REF.15,Term Accession Number.15,Parameter Value[ionizing radiation],Term Source REF.16,Term Accession Number.16,Parameter Value[ionizing radiation categorized by source],Term Source REF.17,Term Accession Number.17
0,GC9,GSM3932693,Mus musculus,NCBITAXON,http://purl.bioontology.org/ontology/NCBITAXON...,C57BL/6J,EFO,http://www.ebi.ac.uk/efo/EFO_0000606,Jackson Laboratory,Male,...,,Not Applicable,OSD,https://osdr.nasa.gov/,non-irradiated,RBO,http://purl.obolibrary.org/obo/RBO_00005025,Not Applicable,NCIT,http://purl.obolibrary.org/obo/NCIT_C48660
1,GC11,GSM3932694,Mus musculus,NCBITAXON,http://purl.bioontology.org/ontology/NCBITAXON...,C57BL/6J,EFO,http://www.ebi.ac.uk/efo/EFO_0000606,Jackson Laboratory,Male,...,,Not Applicable,OSD,https://osdr.nasa.gov/,non-irradiated,RBO,http://purl.obolibrary.org/obo/RBO_00005025,Not Applicable,NCIT,http://purl.obolibrary.org/obo/NCIT_C48660
2,GC15,GSM3932695,Mus musculus,NCBITAXON,http://purl.bioontology.org/ontology/NCBITAXON...,C57BL/6J,EFO,http://www.ebi.ac.uk/efo/EFO_0000606,Jackson Laboratory,Male,...,,Not Applicable,OSD,https://osdr.nasa.gov/,non-irradiated,RBO,http://purl.obolibrary.org/obo/RBO_00005025,Not Applicable,NCIT,http://purl.obolibrary.org/obo/NCIT_C48660
3,GC16,GSM3932696,Mus musculus,NCBITAXON,http://purl.bioontology.org/ontology/NCBITAXON...,C57BL/6J,EFO,http://www.ebi.ac.uk/efo/EFO_0000606,Jackson Laboratory,Male,...,,Not Applicable,OSD,https://osdr.nasa.gov/,non-irradiated,RBO,http://purl.obolibrary.org/obo/RBO_00005025,Not Applicable,NCIT,http://purl.obolibrary.org/obo/NCIT_C48660
4,GC17,GSM3932697,Mus musculus,NCBITAXON,http://purl.bioontology.org/ontology/NCBITAXON...,C57BL/6J,EFO,http://www.ebi.ac.uk/efo/EFO_0000606,Jackson Laboratory,Male,...,,Not Applicable,OSD,https://osdr.nasa.gov/,non-irradiated,RBO,http://purl.obolibrary.org/obo/RBO_00005025,Not Applicable,NCIT,http://purl.obolibrary.org/obo/NCIT_C48660
5,GC18,GSM3932698,Mus musculus,NCBITAXON,http://purl.bioontology.org/ontology/NCBITAXON...,C57BL/6J,EFO,http://www.ebi.ac.uk/efo/EFO_0000606,Jackson Laboratory,Male,...,,Not Applicable,OSD,https://osdr.nasa.gov/,non-irradiated,RBO,http://purl.obolibrary.org/obo/RBO_00005025,Not Applicable,NCIT,http://purl.obolibrary.org/obo/NCIT_C48660
6,GC19,GSM3932699,Mus musculus,NCBITAXON,http://purl.bioontology.org/ontology/NCBITAXON...,C57BL/6J,EFO,http://www.ebi.ac.uk/efo/EFO_0000606,Jackson Laboratory,Male,...,,Not Applicable,OSD,https://osdr.nasa.gov/,non-irradiated,RBO,http://purl.obolibrary.org/obo/RBO_00005025,Not Applicable,NCIT,http://purl.obolibrary.org/obo/NCIT_C48660
7,GC20,GSM3932700,Mus musculus,NCBITAXON,http://purl.bioontology.org/ontology/NCBITAXON...,C57BL/6J,EFO,http://www.ebi.ac.uk/efo/EFO_0000606,Jackson Laboratory,Male,...,,Not Applicable,OSD,https://osdr.nasa.gov/,non-irradiated,RBO,http://purl.obolibrary.org/obo/RBO_00005025,Not Applicable,NCIT,http://purl.obolibrary.org/obo/NCIT_C48660
8,F9,GSM3932701,Mus musculus,NCBITAXON,http://purl.bioontology.org/ontology/NCBITAXON...,C57BL/6J,EFO,http://www.ebi.ac.uk/efo/EFO_0000606,Jackson Laboratory,Male,...,0.237,milligray per day,UO,http://purl.obolibrary.org/obo/UO_0010063,space radiation,RBO,http://purl.obolibrary.org/obo/RBO_00000002,outer space,RBO,http://purl.obolibrary.org/obo/ENVO_01000637
9,F11,GSM3932702,Mus musculus,NCBITAXON,http://purl.bioontology.org/ontology/NCBITAXON...,C57BL/6J,EFO,http://www.ebi.ac.uk/efo/EFO_0000606,Jackson Laboratory,Male,...,0.237,milligray per day,UO,http://purl.obolibrary.org/obo/UO_0010063,space radiation,RBO,http://purl.obolibrary.org/obo/RBO_00000002,outer space,RBO,http://purl.obolibrary.org/obo/ENVO_01000637


In [18]:
# Start the "Filtered_RNA" stage by passing the raw counts through the Methods helper drop_nans.
# NOTE(review): the helper is not visible here; it presumably removes entries containing NaN.
# The saved preview looks identical to the preview of the raw table — confirm it has any effect.
data["Filtered_RNA"] = m.drop_nans(data["RNA"])
data["Filtered_RNA"]

Unnamed: 0.1,Unnamed: 0,GSM3932693,GSM3932694,GSM3932695,GSM3932696,GSM3932697,GSM3932698,GSM3932699,GSM3932700,GSM3932701,GSM3932702,GSM3932703,GSM3932704,GSM3932705,GSM3932706,GSM3932707,GSM3932708
0,ENSMUSG00000000001,236,292,329,269,175,257,293,258,244,267,256,253,194,285,253,289
1,ENSMUSG00000000003,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,ENSMUSG00000000028,27,38,36,32,12,27,31,36,36,43,26,22,21,27,32,44
3,ENSMUSG00000000031,8,2,2,25,0,3,6,8,1,7,1,2,3,4,3,1
4,ENSMUSG00000000037,8,20,24,24,11,19,11,19,19,13,10,15,6,20,30,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56835,ERCC-00164,4,0,5,0,0,0,5,4,2,0,1,1,0,1,0,1
56836,ERCC-00165,12,14,20,8,7,14,23,11,16,19,7,6,10,16,7,9
56837,ERCC-00168,2,3,0,4,2,3,1,2,1,0,4,0,0,0,2,0
56838,ERCC-00170,25,39,43,21,31,25,63,41,30,22,15,34,31,45,32,51


In [19]:
# Apply the Methods helper filter_genes with drop='non-coding' to the filtered table.
# NOTE(review): in the saved run this cell ended with "ParseError: mismatched tag" and every
# later cell has no execution count — confirm the helper works before trusting downstream cells.
data["Filtered_RNA"] = m.filter_genes(data["Filtered_RNA"], drop='non-coding')
data["Filtered_RNA"]

ParseError: mismatched tag: line 62, column 2 (<string>)

In [None]:
def filter_gene_expression(df, expression_level, min_fraction=0.8):
    """Remove genes that are lowly expressed in most samples.

    A gene (row) is "low" in a sample when its value is strictly below
    ``expression_level``. Rows that are low in at least ``min_fraction`` of the
    sample columns are removed; all other rows are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Expression table whose first column is the gene identifier and whose
        remaining columns each hold the values of one sample.
    expression_level : int or float
        Exclusive cut-off: values ``< expression_level`` count as low.
        With non-negative counts, a cut-off of 0 therefore removes nothing.
    min_fraction : float, default 0.8
        Fraction of samples (0 < min_fraction <= 1) in which a gene must be
        low for it to be removed.

    Returns
    -------
    pandas.DataFrame
        The kept rows of ``df`` with their original index labels and columns.

    Raises
    ------
    ValueError
        If ``min_fraction`` is not in the interval (0, 1].
    """
    if not 0 < min_fraction <= 1:
        raise ValueError("min_fraction must be in the interval (0, 1]")

    # Everything after the identifier column is sample data; .iloc makes the
    # positional intent explicit.
    sample_values = df.iloc[:, 1:]
    n_samples = sample_values.shape[1]
    if n_samples == 0:
        # No sample columns: no row can pass the filter (same as before).
        return df.iloc[:0]

    # Vectorised count of low samples per gene (no Python-level row loop).
    low_sample_count = (sample_values < expression_level).sum(axis=1)

    # Compare the exact per-gene fraction with the cut-off. The previous
    # int(0.8 * n_samples) truncated the threshold (12 instead of 12.8 for 16
    # samples), which removed genes that were low in only 75% of the samples.
    keep = (low_sample_count / n_samples) < min_fraction

    return df[keep]

In [None]:
# First expression filter, with a cut-off of 0.
# NOTE(review): filter_gene_expression compares with a strict `<`; the table holds unnormalized
# counts, so presumably no value is below 0 and this call removes nothing. Confirm the intended
# cut-off (e.g. 1 would target genes with zero counts).
data["Filtered_RNA"] = filter_gene_expression(data["Filtered_RNA"], expression_level=0)
data["Filtered_RNA"]

In [None]:
# Second expression filter: remove genes whose value is below 50 in most samples
# (the exact sample-count threshold is computed inside filter_gene_expression).
data["Filtered_RNA"] = filter_gene_expression(data["Filtered_RNA"], expression_level=50)
data["Filtered_RNA"]

In [None]:
# Work on a copy: the previous stage produced this frame by boolean filtering,
# and writing into it directly triggers SettingWithCopyWarning and mutates the
# stored table behind the reader's back.
rna_seq = data["Filtered_RNA"].copy()

# Convert Ensembl gene IDs to gene symbols and
# remove any genes that do not have a gene symbol.
mg = m.mygene.MyGeneInfo()
ens = list(rna_seq['Unnamed: 0'])
ginfo = mg.querymany(ens, scopes='ensembl.gene')

drop_list = list()      # queried IDs for which a hit carried no 'symbol'
hit_list = list()       # symbols assigned so far, in assignment order
seen_symbols = set()    # same content as hit_list, for O(1) membership tests
id_to_symbol = dict()   # Ensembl ID -> symbol; the first usable hit per ID wins
for g in ginfo:
    if 'symbol' not in g:
        drop_list.append(g['query'])
    elif g['symbol'] not in seen_symbols:
        # Each symbol is handed out once; a later ID that resolves to an
        # already-used symbol keeps its Ensembl ID.
        seen_symbols.add(g['symbol'])
        hit_list.append(g['symbol'])
        id_to_symbol.setdefault(g['query'], g['symbol'])

# Apply every rename in one pass instead of one full-column scan per gene.
rna_seq['Unnamed: 0'] = rna_seq['Unnamed: 0'].map(
    lambda gene_id: id_to_symbol.get(gene_id, gene_id)
)

print('RNA-seq data shape before: ', rna_seq.shape)
rna_seq = rna_seq[~rna_seq['Unnamed: 0'].isin(drop_list)]
print('RNA-seq data shape after: ', rna_seq.shape)

data["Filtered_RNA"] = rna_seq

In [None]:
# Promote the filtered table to the working "RNA" entry.
# Take a copy rather than a second reference: the column relabelling that
# follows is done in place and would otherwise also rename the columns of
# data["Filtered_RNA"].
data["RNA"] = data["Filtered_RNA"].copy()

In [None]:
# Lookup from GSM accession ("Sample Name") to animal ID ("Source Name"),
# taken row by row from the RNA metadata.
sample_to_source = dict(zip(meta["RNA"]["Sample Name"], meta["RNA"]["Source Name"]))

# Relabel the sample columns of the RNA table in place; columns without an
# entry in the lookup keep their name.
data["RNA"].rename(columns=sample_to_source, inplace=True)

data["RNA"].columns

In [None]:
# Reshape to one row per animal ("Source Name") and one column per gene.
# The gene column becomes the index *before* transposing so that only the
# count columns are transposed; transposing with the text column included
# turns every value into dtype=object.
data["RNA"] = (
    data["RNA"]
    .set_index("Unnamed: 0")
    .T
    # Name the row axis so reset_index() emits a "Source Name" column, and
    # blank the column-axis name as the previous version did.
    .rename_axis(index="Source Name", columns="")
    .reset_index()
)

In [None]:
# Preview the reshaped RNA table (one row per "Source Name").
data["RNA"]

# Intersect Datasets

In [None]:
# Working copy of the per-animal RNA table.
df = data["RNA"].copy()

# Boolean mask over the RNA rows: True where the animal also appears in HNE.
common_source_names = df["Source Name"].isin(data["HNE"]["Source Name"])

# Restrict each table to the animals that are present in the other one.
df_filtered = df.loc[common_source_names].copy()
filtered_HNE = data["HNE"].loc[
    data["HNE"]["Source Name"].isin(df_filtered["Source Name"])
].copy()

# Round-trip "Source Name" through the index: it ends up as the leading
# column and the rows are renumbered from 0.
filtered_HNE = filtered_HNE.set_index("Source Name").reset_index()
df_filtered = df_filtered.set_index("Source Name").reset_index()

# Remove the HNE columns that are not carried into the merged table.
filtered_HNE = filtered_HNE.drop(columns=["Sample Name", "sunarea", "sumEC"])

In [None]:
# Attach the HNE measurements to every RNA row by animal ID, then keep only
# complete rows: animals without an HNE match get NaN from the join and are
# dropped.
# The result is built from data["RNA"] (which `df` was copied from) instead of
# from `df` itself, so the cell can be re-run; joining an already-joined `df`
# a second time fails on the overlapping HNE column names.
df = (
    data["RNA"]
    .join(filtered_HNE.set_index("Source Name"), on="Source Name")
    .dropna()
)

In [None]:
# List the columns of the merged table (RNA columns followed by the joined HNE columns).
df.columns

In [None]:
# Display the merged RNA + HNE table.
df

In [None]:
# Assuming your DataFrame is named df
#df.to_csv('mnt/MyDrive/Colab Notebooks/Capstone/dataframe.csv', index=False)