<a href="https://colab.research.google.com/github/aaronc09/peds_lupus_flare_prediction_ML_gene-expression/blob/main/1_PREPROCESSING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so this Colab runtime can access my Google Drive files
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install GEOparse which is used to parse gene expression data from the NCBI Gene Expression Omnibus database
!pip install GEOparse
import GEOparse




In [3]:
# Quiet GEOparse while it parses the gene-expression data: only CRITICAL-level messages are let through
import logging

geoparse_logger = logging.getLogger("GEOparse")
geoparse_logger.setLevel(logging.CRITICAL)

In [4]:
# Fetch GEO series GSE65391 with GEOparse, using the MyDrive folder as the download directory (destdir)
gse = GEOparse.get_GEO(geo="GSE65391", destdir="/content/drive/MyDrive/")

In [5]:
import pandas as pd


def _flatten_sample_metadata(sample_id, sample):
    """Return one flat dict of metadata for a single GSM sample.

    Every "key: value" entry under "characteristics_ch1" becomes its own field;
    any other metadata field that is a list is joined with "; ", and non-list
    values are stored unchanged.
    """
    record = {"gsm_id": sample_id}
    for field, values in sample.metadata.items():
        if field != "characteristics_ch1":
            record[field] = "; ".join(values) if isinstance(values, list) else values
            continue
        for entry in values:
            # Split on the first ": " only, so a value may itself contain ": "
            label, text = entry.split(": ", 1)
            record[label] = text
    return record


# "Gene expression" dataframe: each sample contributes one column of VALUEs indexed by ID_REF;
#  the columns are concatenated side by side and transposed so that each row is one sample
per_sample_values = [
    sample.table[["ID_REF", "VALUE"]]
    .rename(columns={"VALUE": sample_id})
    .set_index("ID_REF")
    for sample_id, sample in gse.gsms.items()
]
expr_matrix = pd.concat(per_sample_values, axis=1).T
expr_matrix.index.name = "gsm_id"

# "Other characteristics" dataframe: one row of flattened metadata per sample, indexed by gsm_id
other_info_df = pd.DataFrame(
    [_flatten_sample_metadata(sample_id, sample) for sample_id, sample in gse.gsms.items()]
).set_index("gsm_id")

# Merge the "other characteristics" and "gene expression" dataframes on their shared gsm_id index
full_df = other_info_df.merge(expr_matrix, left_index=True, right_index=True, how="inner")


In [6]:
# Shape of the expression matrix: 996 samples (rows), each with 43,799 gene-expression values (columns)
expr_matrix.shape


(996, 43799)

In [7]:
# Shape of the metadata table: 996 samples, each with 118 pertinent non-gene-expression fields
#  (such as subject ID, SLEDAI scores, days since diagnosis, etc.)
other_info_df.shape

(996, 118)

In [8]:
# Gene-expression value columns combined with the other pertinent information:
#  118 metadata columns + 43,799 expression columns = 43,917 columns
full_df.shape

(996, 43917)

In [9]:
# Preview the first rows of the merged dataframe
full_df.head()

Unnamed: 0_level_0,title,geo_accession,status,submission_date,last_update_date,type,channel_count,source_name_ch1,organism_ch1,taxid_ch1,...,ILMN_3311145,ILMN_3311150,ILMN_3311155,ILMN_3311160,ILMN_3311165,ILMN_3311170,ILMN_3311175,ILMN_3311180,ILMN_3311185,ILMN_3311190
gsm_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM1594219,whole blood-BAY-H377-V1-Healthy-2,GSM1594219,Public on Mar 31 2016,Jan 28 2015,Mar 16 2023,RNA,1,"whole blood,BAY-H377,V1,Healthy",Homo sapiens,9606,...,3.321928,3.321928,3.321928,3.321928,3.321928,3.321928,3.321928,3.321928,3.321928,4.076274
GSM1594220,whole blood-BAY-H290-V1-Healthy-2,GSM1594220,Public on Mar 31 2016,Jan 28 2015,Mar 16 2023,RNA,1,"whole blood,BAY-H290,V1,Healthy",Homo sapiens,9606,...,3.321928,3.321928,3.321928,3.321928,3.321928,3.321928,3.321928,3.321928,3.321928,4.076274
GSM1594221,whole blood-BAY-H303-V1-Healthy-2,GSM1594221,Public on Mar 31 2016,Jan 28 2015,Mar 16 2023,RNA,1,"whole blood,BAY-H303,V1,Healthy",Homo sapiens,9606,...,3.321928,3.321928,3.321928,3.321928,4.259919,3.321928,3.321928,3.321928,3.321928,4.076274
GSM1594222,whole blood-BAY-H380-V1-Healthy-2,GSM1594222,Public on Mar 31 2016,Jan 28 2015,Mar 16 2023,RNA,1,"whole blood,BAY-H380,V1,Healthy",Homo sapiens,9606,...,3.321928,3.321928,3.321928,3.321928,3.321928,3.321928,3.321928,3.321928,3.321928,4.076274
GSM1594223,whole blood-BAY-H306-V1-Healthy-2,GSM1594223,Public on Mar 31 2016,Jan 28 2015,Mar 16 2023,RNA,1,"whole blood,BAY-H306,V1,Healthy",Homo sapiens,9606,...,3.321928,3.321928,3.321928,3.321928,3.321928,3.321928,3.321928,3.321928,3.321928,4.076274


In [10]:
# Save the merged dataframe (all samples, before the SLE-only filter) as a pickle file on MyDrive
drive_path = '/content/drive/MyDrive/full_df.pkl'
full_df.to_pickle(drive_path)

In [11]:
# Keep only lupus (SLE) samples, dropping the healthy control samples from the merged dataframe
lup_df = full_df.loc[full_df["disease state"].eq("SLE")].copy()

# Convert the two fields used downstream to numbers; values that cannot be parsed become NaN
for numeric_col in ("days_since_diagnosis", "sledai"):
    lup_df[numeric_col] = pd.to_numeric(lup_df[numeric_col], errors="coerce")

# Number each subject's longitudinal office visits 1, 2, 3, ... in order of days since diagnosis.
# The numbering is computed on a sorted view and assigned back by index (gsm_id),
#  so the row order of lup_df itself is unchanged.
visits_in_order = lup_df.sort_values(["subject", "days_since_diagnosis"])
lup_df["calculated_visit_num"] = visits_in_order.groupby("subject").cumcount().add(1)

In [12]:
# 924 SLE samples remain; the extra column compared with full_df is "calculated_visit_num"
lup_df.shape

(924, 43918)

In [13]:
# Preview the first rows of the SLE-only dataframe, including the new "calculated_visit_num" column
lup_df.head()

Unnamed: 0_level_0,title,geo_accession,status,submission_date,last_update_date,type,channel_count,source_name_ch1,organism_ch1,taxid_ch1,...,ILMN_3311150,ILMN_3311155,ILMN_3311160,ILMN_3311165,ILMN_3311170,ILMN_3311175,ILMN_3311180,ILMN_3311185,ILMN_3311190,calculated_visit_num
gsm_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM1594268,whole blood-SLE-161-V3-SLE-1,GSM1594268,Public on Mar 31 2016,Jan 28 2015,Mar 16 2023,RNA,1,"whole blood,SLE-161,V3,SLE",Homo sapiens,9606,...,3.321928,3.321928,3.321928,3.321928,3.321928,3.321928,3.321928,3.321928,5.847195,3
GSM1594269,whole blood-SLE-144-V9-SLE-1,GSM1594269,Public on Mar 31 2016,Jan 28 2015,Mar 16 2023,RNA,1,"whole blood,SLE-144,V9,SLE",Homo sapiens,9606,...,3.321928,3.321928,3.321928,4.193292,3.321928,3.321928,3.321928,3.321928,4.076274,7
GSM1594270,whole blood-SLE-80-V14-SLE-1,GSM1594270,Public on Mar 31 2016,Jan 28 2015,Mar 16 2023,RNA,1,"whole blood,SLE-80,V14,SLE",Homo sapiens,9606,...,3.321928,3.321928,3.321928,3.321928,3.321928,3.321928,3.321928,3.321928,4.101621,14
GSM1594271,whole blood-SLE-197-V1-SLE-1,GSM1594271,Public on Mar 31 2016,Jan 28 2015,Mar 16 2023,RNA,1,"whole blood,SLE-197,V1,SLE",Homo sapiens,9606,...,3.941341,3.321928,3.321928,4.129184,3.321928,3.321928,3.321928,3.321928,4.152275,1
GSM1594272,whole blood-SLE-254-V1-SLE-1,GSM1594272,Public on Mar 31 2016,Jan 28 2015,Mar 16 2023,RNA,1,"whole blood,SLE-254,V1,SLE",Homo sapiens,9606,...,3.499143,3.321928,3.321928,4.217559,3.321928,3.321928,3.321928,3.321928,4.202764,1


In [14]:
# "current" is a copy of lup_df; each of its rows is treated as a subject's current office visit
current = lup_df.copy()

# "subsequent" holds only the 4 columns needed to describe a subject's subsequent office visit.
# Its visit number is shifted down by 1, so visit N+1 in "subsequent" carries the same key as visit N in "current".
subsequent = lup_df[
    ["subject", "calculated_visit_num", "days_since_diagnosis", "sledai"]
].assign(calculated_visit_num=lambda frame: frame["calculated_visit_num"] - 1)

# Merge "current" with "subsequent" on the same "subject" and "calculated_visit_num" values,
#  which puts the current visit's "sledai" score and the subsequent visit's "sledai" score on the same row.
# The overlapping 'days_since_diagnosis' and 'sledai' columns get the suffix "_current" (from "current")
#  or "_subsequent" (from "subsequent").
joined = pd.merge(
    current,
    subsequent,
    on=["subject", "calculated_visit_num"],
    suffixes=("_current", "_subsequent"),
)

In [15]:
# View the paired current/subsequent columns of "joined" for the first 10 rows, ordered by subject and visit
(
    joined.loc[
        :,
        [
            "subject",
            "calculated_visit_num",
            "sledai_current",
            "sledai_subsequent",
            "days_since_diagnosis_current",
            "days_since_diagnosis_subsequent",
        ],
    ]
    .sort_values(by=["subject", "calculated_visit_num"])
    .head(10)
)


Unnamed: 0,subject,calculated_visit_num,sledai_current,sledai_subsequent,days_since_diagnosis_current,days_since_diagnosis_subsequent
669,SLE-105,1,2,2,2146.0,2237.0
459,SLE-105,2,2,2,2237.0,2321.0
566,SLE-105,3,2,2,2321.0,2419.0
590,SLE-105,4,2,2,2419.0,2507.0
195,SLE-110,1,4,4,1497.0,1583.0
143,SLE-110,2,4,4,1583.0,1824.0
103,SLE-110,3,4,4,1824.0,1940.0
122,SLE-110,4,4,4,1940.0,2118.0
136,SLE-110,5,4,8,2118.0,2206.0
88,SLE-110,6,8,4,2206.0,2386.0


In [16]:
# "joined" has 766 rows, down from the 924 rows in "lup_df",
#  because each pediatric lupus subject's final office visit has been removed (the final visit has no "subsequent" sledai value to pair with)
# There are a total of 158 lupus subjects, so 158 final-visit rows were removed: 924 - 158 = 766 rows.
joined.shape

(766, 43920)

In [17]:
# Keep only visit pairs whose subsequent visit came more than 0 and at most 90 days after the current visit
delta = joined["days_since_diagnosis_subsequent"].sub(joined["days_since_diagnosis_current"])
within_90_days = delta.gt(0) & delta.le(90)
lupus_final_df = joined.loc[within_90_days].copy()


In [18]:
# Mean number of days between a current visit and its paired subsequent visit
average_day_difference = (
    lupus_final_df["days_since_diagnosis_subsequent"]
    .sub(lupus_final_df["days_since_diagnosis_current"])
    .abs()
    .mean()
)
print(f"average difference in days between lupus office visits: {average_day_difference:.1f}")

average difference in days between lupus office visits: 52.2


In [19]:
# Shortest and longest gap (in days) between a current visit and its paired subsequent visit
days_difference = (
    lupus_final_df["days_since_diagnosis_subsequent"]
    .sub(lupus_final_df["days_since_diagnosis_current"])
)
min_days_diff, max_days_diff = days_difference.min(), days_difference.max()

print(f"The range of the difference in days since diagnosis is from {min_days_diff} to {max_days_diff}.")

The range of the difference in days since diagnosis is from 7.0 to 90.0.


In [20]:
# Median gap (in days) between a current visit and its paired subsequent visit
days_difference = (
    lupus_final_df["days_since_diagnosis_subsequent"]
    .sub(lupus_final_df["days_since_diagnosis_current"])
)
median_days_diff = days_difference.median()

print(f"The median difference in days since diagnosis is {median_days_diff:.1f}.")

The median difference in days since diagnosis is 54.0.


In [21]:
# Add a new column "preflare_bool": True if SLEDAI increases by >= 4 at the subsequent visit, otherwise False
lupus_final_df["preflare_bool"] = (
    lupus_final_df["sledai_subsequent"]
    .sub(lupus_final_df["sledai_current"])
    .ge(4)
)

In [22]:
# View the paired visit columns together with the new "preflare_bool" label for the first 10 rows
(
    lupus_final_df.loc[
        :,
        [
            "subject",
            "calculated_visit_num",
            "sledai_current",
            "sledai_subsequent",
            "days_since_diagnosis_current",
            "days_since_diagnosis_subsequent",
            "preflare_bool",
        ],
    ]
    .sort_values(by=["subject", "calculated_visit_num"])
    .head(10)
)

Unnamed: 0,subject,calculated_visit_num,sledai_current,sledai_subsequent,days_since_diagnosis_current,days_since_diagnosis_subsequent,preflare_bool
459,SLE-105,2,2,2,2237.0,2321.0,False
590,SLE-105,4,2,2,2419.0,2507.0,False
195,SLE-110,1,4,4,1497.0,1583.0,False
136,SLE-110,5,4,8,2118.0,2206.0,True
582,SLE-121,5,0,12,1512.0,1521.0,True
341,SLE-121,6,12,10,1521.0,1528.0,False
405,SLE-121,7,10,2,1528.0,1556.0,False
515,SLE-121,8,2,6,1556.0,1645.0,True
434,SLE-121,10,8,0,1808.0,1864.0,False
438,SLE-121,11,0,4,1864.0,1892.0,True


In [23]:
# "lupus_final_df" has 440 rows, down from the 766 rows in "joined",
#  because every visit without a subsequent office visit within 90 days has been removed
# The column count went up by one because the pre-flare Boolean column "preflare_bool" ("True" or "False") was added
lupus_final_df.shape

(440, 43921)

In [24]:
# Of the 440 office visits with a subsequent visit occurring within 90 days or less,
#  367 lupus subject visits were NOT in a pre-flare state and 73 were in a pre-flare state
lupus_final_df["preflare_bool"].value_counts()

Unnamed: 0_level_0,count
preflare_bool,Unnamed: 1_level_1
False,367
True,73


In [25]:
# Final dataframe: 440 samples (rows) by 43,921 columns
lupus_final_df.shape

(440, 43921)

In [26]:
# Save the final dataframe "lupus_final_df" as a pickle file on MyDrive
drive_path = '/content/drive/MyDrive/lupus_final_df.pkl'
lupus_final_df.to_pickle(drive_path)