In [1]:
import pandas as pd

## Get Information from Metadata
https://www.synapse.org/Synapse:syn65941775

In [None]:
# Load the three metadata tables, discarding rows that are entirely NaN
biospecimen = pd.read_csv("UCI_Clu-rs2279590_h2kb_biospecimen_metadata.csv").dropna(how='all')
individual = pd.read_csv("UCI_Clu-rs2279590_h2kb_IndividualID.csv").dropna(how='all')
rnaseqmetadata = pd.read_csv("UCI_Clu-rs2279590_h2kb-AssayRnaSeqMetadata.csv").dropna(how='all')

# Summarise each table: dimensions, column names and individualID coverage
tables = {
    "biospecimen": biospecimen,
    "individual": individual,
    "rnaseqmetadata": rnaseqmetadata,
}
for name, df in tables.items():
    print(f"\n=== {name.upper()} DATAFRAME ===")
    print(f"Shape: {df.shape[0]} rows x {df.shape[1]} columns")
    print(f"Columns: {list(df.columns)}")
    # Not every table is keyed by individualID; report that and move on
    if 'individualID' not in df.columns:
        print("'individualID' column not found.")
        continue
    print(f"Number of unique individualIDs: {df['individualID'].nunique()}")
    print(f"Number of rows with missing individualID: {df['individualID'].isna().sum()}")



=== BIOSPECIMEN DATAFRAME ===
Shape: 114 rows x 17 columns
Columns: ['individualID', 'specimenID', 'specimenIdSource', 'organ', 'tissue', 'BrodmannArea', 'sampleStatus', 'tissueWeight', 'tissueVolume', 'nucleicAcidSource', 'cellType', 'fastingState', 'isPostMortem', 'samplingAge', 'samplingAgeUnits', 'visitNumber', 'assay']
Number of unique individualIDs: 77
Number of rows with missing individualID: 0

=== INDIVIDUAL DATAFRAME ===
Shape: 77 rows x 27 columns
Columns: ['individualID', 'climbID', 'microchipID', 'birthID', 'matingID', 'individualIdSource', 'materialOrigin', 'sex', 'species', 'generation', 'dateBirth', 'ageDeath', 'ageDeathUnits', 'brainWeight', 'rodentWeight', 'rodentDiet', 'bedding', 'room', 'waterpH', 'treatmentDose', 'treatmentType', 'stockNumber', 'genotype', 'genotypeBackground', 'individualCommonGenotype', 'modelSystemName', 'officialName']
Number of unique individualIDs: 77
Number of rows with missing individualID: 0

=== RNASEQMETADATA DATAFRAME ===
Shape: 114 ro

In [3]:
# Merge the dataframes
#
# validate= makes pandas raise MergeError when the join keys do not have the
# expected cardinality, instead of silently multiplying rows (which would
# inflate every per-group count computed from merged_df):
#   * each biospecimen row must map to at most one individual row
#   * each specimenID must appear at most once on either side
# Both biospecimen and rnaseqmetadata carry an 'assay' column, so the result
# holds them as 'assay_x' (biospecimen) and 'assay_y' (rnaseqmetadata).
merged1_df = pd.merge(
    biospecimen, individual,
    on='individualID', how='outer', validate='many_to_one',
)
merged_df = pd.merge(
    merged1_df, rnaseqmetadata,
    on='specimenID', how='outer', validate='one_to_one',
)

print("\n=== MERGED DATAFRAME INFO ===")
print(f"Shape: {merged_df.shape[0]} rows x {merged_df.shape[1]} columns")
print(f"Columns: {list(merged_df.columns)}")

if 'individualID' in merged_df.columns:
    print(f"Number of unique individualIDs: {merged_df['individualID'].nunique()}")
    print(f"Number of rows with missing individualID: {merged_df['individualID'].isna().sum()}")
else:
    print("'individualID' column not found in merged_df.")

if 'specimenID' in merged_df.columns:
    print(f"Number of unique specimenIDs: {merged_df['specimenID'].nunique()}")
    print(f"Number of rows with missing specimenID: {merged_df['specimenID'].isna().sum()}")
else:
    print("'specimenID' column not found in merged_df.")

print("\nFirst 5 rows of merged_df:")
print(merged_df.head())



=== MERGED DATAFRAME INFO ===
Shape: 114 rows x 67 columns
Columns: ['individualID', 'specimenID', 'specimenIdSource', 'organ', 'tissue', 'BrodmannArea', 'sampleStatus', 'tissueWeight', 'tissueVolume', 'nucleicAcidSource', 'cellType', 'fastingState', 'isPostMortem', 'samplingAge', 'samplingAgeUnits', 'visitNumber', 'assay_x', 'climbID', 'microchipID', 'birthID', 'matingID', 'individualIdSource', 'materialOrigin', 'sex', 'species', 'generation', 'dateBirth', 'ageDeath', 'ageDeathUnits', 'brainWeight', 'rodentWeight', 'rodentDiet', 'bedding', 'room', 'waterpH', 'treatmentDose', 'treatmentType', 'stockNumber', 'genotype', 'genotypeBackground', 'individualCommonGenotype', 'modelSystemName', 'officialName', 'Component', 'libraryID', 'assay_y', 'platform', 'RIN', 'referenceSet', 'rnaBatch', 'libraryBatch', 'sequencingBatch', 'libraryPrep', 'libraryPreparationMethod', 'libraryVersion', 'isStranded', 'readStrandOrigin', 'readLength', 'runType', 'totalReads', 'validBarcodeReads', 'DV200', 'inp

In [7]:
# Just looking at some stuff, not necessary to run
def _print_unique_values(frame, column, plural):
    """Print the distinct non-null values of ``frame[column]`` and return them.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to inspect.
    column : str
        Name of the column whose unique values are listed.
    plural : str
        Plural noun used in the "Total unique ..." summary line.

    Returns
    -------
    numpy.ndarray or None
        The unique non-null values, or None when ``column`` is absent.
    """
    if column not in frame.columns:
        print(f"'{column}' column not found in merged_df.")
        return None
    values = frame[column].dropna().unique()
    print(f"\nUnique entries in '{column}' column:")
    for value in values:
        print(f"- {value}")
    print(f"\nTotal unique {plural}: {len(values)}")
    return values


# Print unique entries for the "tissue" and "genotype" columns in merged_df
unique_tissues = _print_unique_values(merged_df, "tissue", "tissues")
unique_genotypes = _print_unique_values(merged_df, "genotype", "genotypes")


Unique entries in 'tissue' column:
- hippocampus
- cerebral cortex

Total unique tissues: 2

Unique entries in 'genotype' column:
- 5XFAD_carrier
- 5XFAD_noncarrier
- 5XFAD_carrier, Clu-rs2279590_KI_homozygous
- Clu-rs2279590_KI_homozygous

Total unique genotypes: 4


In [4]:
# GET THE DATA FOR THE SPREADSHEET (https://docs.google.com/spreadsheets/d/11vmntFrno9ubNMOE8bbMejEROPygmeJZ/edit?pli=1&gid=757405125#gid=757405125)
# Group merged_df by "ageDeath", "sex", and "genotype" and count the number of rows in each group.
# dropna=False keeps rows whose ageDeath/sex/genotype is missing (possible after
# the outer merges) as their own NaN group; by default groupby would drop them
# and the counts would no longer add up to the number of specimens.
grouped = (
    merged_df
    .groupby(["ageDeath", "sex", "genotype"], dropna=False)
    .size()
    .reset_index(name='count')
)
# Every row of merged_df must be accounted for in exactly one group
assert grouped['count'].sum() == len(merged_df)

print("\n=== GROUPED COUNTS BY ageDeath, sex, genotype ===")
print(grouped)

print(f"\nNumber of unique groups: {grouped.shape[0]}")



=== GROUPED COUNTS BY ageDeath, sex, genotype ===
    ageDeath     sex                                    genotype  count
0        4.0  female                               5XFAD_carrier      5
1        4.0  female  5XFAD_carrier, Clu-rs2279590_KI_homozygous      4
2        4.0  female                            5XFAD_noncarrier      3
3        4.0  female                 Clu-rs2279590_KI_homozygous      5
4        4.0    male                               5XFAD_carrier      5
5        4.0    male  5XFAD_carrier, Clu-rs2279590_KI_homozygous      5
6        4.0    male                            5XFAD_noncarrier      5
7        4.0    male                 Clu-rs2279590_KI_homozygous      5
8       12.0  female                               5XFAD_carrier     10
9       12.0  female  5XFAD_carrier, Clu-rs2279590_KI_homozygous      9
10      12.0  female                            5XFAD_noncarrier     10
11      12.0  female                 Clu-rs2279590_KI_homozygous      9
12      12.0 

## Validate Metadata
Look at the actual RNAseq data and make sure that all samples exist and align with what is stated in the metadata.

https://www.synapse.org/Synapse:syn65941772

In [None]:
# In terminal, run:
# synapse list syn65941772 > UCI_Clu-h2kbKI_rnaseqdata_list.csv
#
# The listing is read without a header row; the two column names are supplied here.
# NOTE(review): read_csv splits on commas by default - confirm the listing file
# really is comma-separated, otherwise each whole line ends up in "syn_id" and
# "file_name" is NaN.
df = pd.read_csv(
    "UCI_Clu-h2kbKI_rnaseqdata_list.csv",
    names=["syn_id", "file_name"],
    header=None,
)

In [None]:
# Parse the filename to get the metadata
def parse_filename(fname):
    """Split an underscore-delimited file name into its metadata fields.

    The layout assumed by this parser is
    ``<lane>_<tissue>_..._<sex>_<genotype tokens>_<age>_<individualID>_..._<read>.<ext>``:

    * ``lane`` and ``tissue`` are the first two tokens.
    * ``sex`` is the first token equal to ``"M"`` or ``"F"``; if there is none,
      the token at index 2 is used (a guess that may be wrong).
    * ``age`` is the first token ending in ``"mo"``; if there is none, the token
      two positions after ``sex`` is used (also a guess).
    * ``genotype`` is every token between ``sex`` and ``age``, re-joined with
      underscores, so multi-token genotypes are kept intact.
    * ``individualID`` is the token right after ``age``.
    * ``read`` is the last token up to its first ``"."`` (``"1.fq.gz"`` -> ``"1"``).

    Parameters
    ----------
    fname : object
        File name; it is converted with ``str()`` first, so NaN is tolerated.

    Returns
    -------
    dict
        Keys ``lane, tissue, sex, genotype, age, individualID, read`` in that
        order. All values are None when the name has fewer than 7 tokens.
        ``age`` / ``individualID`` are None when their position lies past the
        end of the name.
    """
    fields = ("lane", "tissue", "sex", "genotype", "age", "individualID", "read")
    parts = str(fname).split("_")

    # Defensive: too few tokens to hold every field
    if len(parts) < 7:
        return dict.fromkeys(fields)

    n_parts = len(parts)
    # First "M"/"F" token; index 2 is a fallback that may be wrong
    sex_idx = next((i for i, p in enumerate(parts) if p in ("M", "F")), 2)
    # First "...mo" token; otherwise guess two positions after the sex token
    age_idx = next((i for i, p in enumerate(parts) if p.endswith("mo")), sex_idx + 2)
    indiv_idx = age_idx + 1

    # The fallback age index can point past the end of the name. Report the
    # field as missing instead of raising IndexError, and keep the trailing
    # read token out of the genotype in that case.
    age = parts[age_idx] if age_idx < n_parts else None
    individualID = parts[indiv_idx] if indiv_idx < n_parts else None
    genotype_end = min(age_idx, n_parts - 1)

    return {
        "lane": parts[0],
        "tissue": parts[1],
        "sex": parts[sex_idx],
        # Genotype is everything between sex and age
        "genotype": "_".join(parts[sex_idx + 1:genotype_end]),
        "age": age,
        "individualID": individualID,
        # Read is the last token with the extension stripped
        "read": parts[-1].split(".")[0],
    }

# Run the parser over every listed file name, then expand the resulting
# dicts into one column per field
parsed = df["file_name"].map(parse_filename)
parsed_df = pd.DataFrame(list(parsed))

In [None]:
# Make sure all samples have both read1 and read2
# For each unique sample (excluding 'read'), check if both read=1 and read=2 exist
group_cols = [col for col in parsed_df.columns if col != "read"]
# dropna=False: file names that parse_filename could not parse have None in
# every field; by default groupby would drop them and this check would report
# success without ever looking at them. Keeping them puts them in a NaN group
# whose read count is 0, so they show up in missing_reads.
read_counts = parsed_df.groupby(group_cols, dropna=False)["read"].nunique().reset_index()
# Entries where the number of unique reads is not 2 (i.e., missing a read)
missing_reads = read_counts[read_counts["read"] != 2]
if not missing_reads.empty:
    print("Entries missing either read=1 or read=2:")
    display(missing_reads)
else:
    print("All entries have both read=1 and read=2.")

In [None]:
# Collapse the read-1 / read-2 pair of every sample into a single row.
# First place the raw listing columns and the parsed fields side by side.
listing_part = df.reset_index(drop=True)
parsed_part = parsed_df.reset_index(drop=True)
merged_df_validation = pd.concat([listing_part, parsed_part], axis=1)

# Two rows describe the same sample when every parsed field except 'read' matches
compare_cols = [col for col in parsed_df if col != "read"]

# Order rows so that, within a sample, read=1 precedes read=2
merged_df_sorted = merged_df_validation.sort_values(by=[*compare_cols, "read"])

# Retain only the first row per sample (read=1 whenever both reads exist)
dedup_df = merged_df_sorted.drop_duplicates(subset=compare_cols, keep="first")

# Keep just the parsed fields and renumber the rows
new_df = dedup_df.loc[:, parsed_df.columns].reset_index(drop=True)

In [None]:
# Get metadata to compare
# Group new_df (one row per sample, parsed from the file names) by "age", "sex",
# and "genotype" and count the number of rows in each group.
# dropna=False keeps samples whose fields could not be parsed (None) as their
# own group instead of silently dropping them from the comparison.
grouped = (
    new_df
    .groupby(["age", "sex", "genotype"], dropna=False)
    .size()
    .reset_index(name='count')
)

print("\n=== GROUPED COUNTS BY age, sex, genotype ===")
print(grouped)

print(f"\nNumber of unique groups: {grouped.shape[0]}")

## Explore what the differences are

In [None]:
# Samples found in the file listing for one group: female, 12 months,
# 5xFAD hemizygous + CLU h2kb knock-in homozygous
file_is_female = new_df["sex"] == "F"
file_is_12mo = new_df["age"] == "12mo"
file_is_5xfad_clu = new_df["genotype"] == "5xFADHEMI_CLU-h2kbKI_HO"
filtered_df = new_df[file_is_female & file_is_12mo & file_is_5xfad_clu]
print("Rows with sex=F, age=12mo, genotype=5xFADHEMI_CLU-h2kbKI_HO:")
print(filtered_df)

In [6]:
# Rows of the merged metadata for one group: ageDeath 12, female,
# 5XFAD carrier + Clu-rs2279590 knock-in homozygous
meta_is_12mo = merged_df["ageDeath"] == 12.0
meta_is_female = merged_df["sex"] == "female"
meta_is_5xfad_clu = merged_df["genotype"] == "5XFAD_carrier, Clu-rs2279590_KI_homozygous"
filtered_df = merged_df[meta_is_12mo & meta_is_female & meta_is_5xfad_clu]

print("\nRows where ageDeath=12mo, sex=female, and genotype=5XFAD_carrier, Clu-rs2279590_KI_homozygous:")
print(filtered_df)



Rows where ageDeath=12mo, sex=female, and genotype=5XFAD_carrier, Clu-rs2279590_KI_homozygous:
     individualID specimenID specimenIdSource  organ           tissue  \
37        12682.0    12682lh          UCI_TMF  brain      hippocampus   
38        12682.0    12682lc          UCI_TMF  brain  cerebral cortex   
39        12683.0    12683lh          UCI_TMF  brain      hippocampus   
40        12683.0    12683lc          UCI_TMF  brain  cerebral cortex   
41        12688.0    12688lh          UCI_TMF  brain      hippocampus   
42        12688.0    12688lc          UCI_TMF  brain  cerebral cortex   
43        12689.0    12689lh          UCI_TMF  brain      hippocampus   
44        12689.0    12689lc          UCI_TMF  brain  cerebral cortex   
112       12680.0    12680lc          UCI_TMF  brain  cerebral cortex   

     BrodmannArea sampleStatus  tissueWeight  tissueVolume nucleicAcidSource  \
37            NaN       frozen           NaN           NaN         bulk cell   
38           