In [1]:
import pandas as pd

## Get Information from Metadata
https://www.synapse.org/Synapse:syn50944327

In [18]:
# Load the three metadata tables (Synapse ID noted next to each file) and
# discard rows in which every column is NaN.
biospecimen = pd.read_csv("UCI_BIN1_biospecimen_metadata.csv").dropna(how='all')  # syn51747924
individual = pd.read_csv("UCI_BIN1_individualID_metadata.csv").dropna(how='all')  # syn51747927
assay_metadata = pd.read_csv("UCI_BIN1_RNAseq_assay_template.csv").dropna(how='all')  # syn51747930

# Summarise each table: dimensions, column names and ID coverage.
tables = {
    "biospecimen": biospecimen,
    "individual": individual,
    "assay_metadata": assay_metadata,
}
# (ID column, label used in the "missing" message)
id_columns = (("individualID", "individualID"), ("specimenID", "specimenIDs"))

for label, table in tables.items():
    n_rows, n_cols = table.shape
    print(f"\n=== {label.upper()} DATAFRAME ===")
    print(f"Shape: {n_rows} rows x {n_cols} columns")
    print(f"Columns: {list(table.columns)}")
    for id_col, missing_label in id_columns:
        if id_col not in table.columns:
            print(f"'{id_col}' column not found.")
            continue
        ids = table[id_col]
        print(f"Number of unique {id_col}s: {ids.nunique()}")
        print(f"Number of rows with missing {missing_label}: {ids.isna().sum()}")



=== BIOSPECIMEN DATAFRAME ===
Shape: 653 rows x 17 columns
Columns: ['individualID', 'specimenID', 'specimenIdSource', 'organ', 'tissue', 'BrodmannArea', 'sampleStatus', 'tissueWeight', 'tissueVolume', 'nucleicAcidSource', 'cellType', 'fastingState', 'isPostMortem', 'samplingAge', 'samplingAgeUnits', 'visitNumber', 'assay']
Number of unique individualIDs: 112
Number of rows with missing individualID: 0
Number of unique specimenIDs: 499
Number of rows with missing specimenIDs: 0

=== INDIVIDUAL DATAFRAME ===
Shape: 112 rows x 27 columns
Columns: ['individualID', 'climbID', 'microchipID', 'birthID', 'matingID', 'individualIdSource', 'materialOrigin', 'sex', 'species', 'generation', 'dateBirth', 'ageDeath', 'ageDeathUnits', 'brainWeight', 'rodentWeight', 'rodentDiet', 'bedding', 'room', 'waterpH', 'treatmentDose', 'treatmentType', 'stockNumber', 'genotype', 'genotypeBackground', 'individualCommonGenotype', 'modelSystemName', 'officialName']
Number of unique individualIDs: 112
Number of r

In [None]:
# Combine the three tables: biospecimen + individual (outer join on individualID),
# then keep only the specimens that also appear in the assay metadata (inner join
# on specimenID). Columns present on both sides of a join get pandas' default
# _x / _y suffixes.
merged1_df = biospecimen.merge(individual, how='outer', on='individualID')
merged_df = merged1_df.merge(assay_metadata, how='inner', on='specimenID')

n_rows, n_cols = merged_df.shape
print("\n=== MERGED DATAFRAME INFO ===")
print(f"Shape: {n_rows} rows x {n_cols} columns")
print(f"Columns: {list(merged_df.columns)}")

# ID coverage after the merge
for id_col in ("individualID", "specimenID"):
    if id_col not in merged_df.columns:
        print(f"'{id_col}' column not found in merged_df.")
        continue
    print(f"Number of unique {id_col}s: {merged_df[id_col].nunique()}")
    print(f"Number of rows with missing {id_col}: {merged_df[id_col].isna().sum()}")

print("\nFirst 5 rows of merged_df:")
print(merged_df.head(5))

# How often does each specimenID occur? (NaN is counted as its own value.)
print(f"Total number of unique specimenIDs in merged_df: {merged_df['specimenID'].nunique()}")
specimenid_counts = merged_df['specimenID'].value_counts(dropna=False)

num_singleton_specimenids = specimenid_counts.eq(1).sum()
print(f"Number of specimenIDs that only appear once in merged_df: {num_singleton_specimenids}")

num_multi_specimenids = specimenid_counts.gt(1).sum()
print(f"Number of specimenIDs that appear more than once in merged_df: {num_multi_specimenids}")


=== MERGED DATAFRAME INFO ===
Shape: 74 rows x 78 columns
Columns: ['individualID', 'specimenID', 'specimenIdSource', 'organ', 'tissue', 'BrodmannArea', 'sampleStatus', 'tissueWeight', 'tissueVolume', 'nucleicAcidSource', 'cellType', 'fastingState', 'isPostMortem', 'samplingAge', 'samplingAgeUnits', 'visitNumber', 'assay_x', 'climbID', 'microchipID', 'birthID', 'matingID', 'individualIdSource', 'materialOrigin', 'sex', 'species', 'generation', 'dateBirth', 'ageDeath', 'ageDeathUnits', 'brainWeight', 'rodentWeight', 'rodentDiet', 'bedding', 'room', 'waterpH', 'treatmentDose', 'treatmentType', 'stockNumber', 'genotype', 'genotypeBackground', 'individualCommonGenotype', 'modelSystemName', 'officialName', 'libraryID', 'assay_y', 'platform', 'RIN', 'referenceSet', 'rnaBatch', 'libraryBatch', 'sequencingBatch', 'libraryPrep', 'libraryPreparationMethod', 'libraryVersion', 'isStranded', 'readStrandOrigin', 'readLength', 'runType', 'totalReads', 'validBarcodeReads', 'DV200', 'Unnamed: 19', 'Un

In [21]:
# Just looking at some stuff, not necessary to run

def _print_unique_values(column):
    """Print the distinct non-null values of merged_df[column] and return them.

    Returns None (after printing a notice) when the column is absent.
    """
    if column not in merged_df.columns:
        print(f"'{column}' column not found in merged_df.")
        return None
    values = merged_df[column].dropna().unique()
    print(f"\nUnique entries in '{column}' column:")
    for value in values:
        print(f"- {value}")
    print(f"\nTotal unique {column}s: {len(values)}")
    return values


# Distinct tissues and genotypes present in merged_df
unique_tissues = _print_unique_values("tissue")
unique_genotypes = _print_unique_values("genotype")

# Number of rows for every (genotype, tissue) combination
if not {"genotype", "tissue"}.issubset(merged_df.columns):
    print("Either 'genotype' or 'tissue' column not found in merged_df.")
else:
    counts = merged_df.groupby(["genotype", "tissue"]).size().reset_index(name="count")
    print("\nNumber of entries for each tissue for each genotype:")
    if counts.empty:
        print("No data found for genotype and tissue combinations.")
    # sort=False keeps the genotypes in the order they already have in `counts`
    for genotype_label, per_genotype in counts.groupby("genotype", sort=False):
        print(f"- {genotype_label}:")
        for tissue_label, n_entries in zip(per_genotype["tissue"], per_genotype["count"]):
            print(f"    - {tissue_label}: {n_entries}")

# Number of distinct (specimenID, individualID) pairs in merged_df
pair_cols = ['specimenID', 'individualID']
if all(col in merged_df.columns for col in pair_cols):
    unique_pairs = merged_df[pair_cols].drop_duplicates()
    print(f"\nNumber of unique (specimenID, individualID) pairs: {len(unique_pairs)}")
else:
    print("Either 'specimenID' or 'individualID' column not found in merged_df.")






Unique entries in 'tissue' column:
- hippocampus

Total unique tissues: 1

Unique entries in 'genotype' column:
- 5XFAD_carrier, BIN1-K358R_homozygous
- 5XFAD_carrier
- BIN1-K358R_homozygous
- 5XFAD_noncarrier

Total unique genotypes: 4

Number of entries for each tissue for each genotype:
- 5XFAD_carrier:
    - hippocampus: 19
- 5XFAD_carrier, BIN1-K358R_homozygous:
    - hippocampus: 19
- 5XFAD_noncarrier:
    - hippocampus: 17
- BIN1-K358R_homozygous:
    - hippocampus: 19

Number of unique (specimenID, individualID) pairs: 74


In [22]:
# GET THE DATA FOR THE SPREADSHEET (https://docs.google.com/spreadsheets/d/11vmntFrno9ubNMOE8bbMejEROPygmeJZ/edit?pli=1&gid=757405125#gid=757405125)
# Count the rows of merged_df that fall into each (ageDeath, sex, genotype) combination.
group_keys = ["ageDeath", "sex", "genotype"]
group_sizes = merged_df.groupby(group_keys).size()
grouped = group_sizes.rename("count").reset_index()

print("\n=== GROUPED COUNTS BY ageDeath, sex, genotype ===")
print(grouped)

print(f"\nNumber of unique groups: {len(grouped)}")



=== GROUPED COUNTS BY ageDeath, sex, genotype ===
    ageDeath     sex                              genotype  count
0          4  female                         5XFAD_carrier      5
1          4  female  5XFAD_carrier, BIN1-K358R_homozygous      4
2          4  female                      5XFAD_noncarrier      2
3          4  female                 BIN1-K358R_homozygous      4
4          4    male                         5XFAD_carrier      5
5          4    male  5XFAD_carrier, BIN1-K358R_homozygous      5
6          4    male                      5XFAD_noncarrier      5
7          4    male                 BIN1-K358R_homozygous      5
8         12  female                         5XFAD_carrier      5
9         12  female  5XFAD_carrier, BIN1-K358R_homozygous      5
10        12  female                      5XFAD_noncarrier      5
11        12  female                 BIN1-K358R_homozygous      5
12        12    male                         5XFAD_carrier      4
13        12    male  5XF

In [12]:
# For each (ageDeath, sex, genotype) group in `grouped`, list the individualIDs and
# specimenIDs found in merged_df. The individualIDs are collected in the same pass
# so every group mask is computed only once.
print("\n=== IndividualIDs and specimenIDs for each (ageDeath, sex, genotype) group ===")
all_individualIDs = []
for _, row in grouped.iterrows():
    age = row["ageDeath"]
    sex = row["sex"]
    genotype = row["genotype"]
    # Rows of merged_df that belong to this group
    mask = (
        (merged_df["ageDeath"] == age) &
        (merged_df["sex"] == sex) &
        (merged_df["genotype"] == genotype)
    )
    ind_ids = merged_df.loc[mask, "individualID"].dropna().unique()
    spec_ids = merged_df.loc[mask, "specimenID"].dropna().unique()
    print(f"- ageDeath: {age}, sex: {sex}, genotype: {genotype}")
    print(f"  individualIDs ({len(ind_ids)}): {', '.join(map(str, ind_ids))}")
    print(f"  specimenIDs ({len(spec_ids)}): {', '.join(map(str, spec_ids))}")
    all_individualIDs.extend(ind_ids)

# Remove duplicates while preserving first-seen order (dict keys keep insertion order).
all_individualIDs_unique = list(dict.fromkeys(all_individualIDs))

print("\nAll individualIDs from all groups (unique):")
print(all_individualIDs_unique)
print(f"\nNumber of unique individualIDs: {len(all_individualIDs_unique)}")


=== IndividualIDs for each (ageDeath, sex, genotype) group ===
- ageDeath: 4, sex: female, genotype: 5XFAD_carrier
  individualIDs (5): 11626, 11625, 11617, 11616, 11615
  specimenIDs (30): 11626p, 11626lcif, 11626lhif, 11626lhsf, Pool-E, Pool-U, 11626lh, 11625p, 11625lcif, 11625lhif, 11625lcsf, 11625lhsf, 11625lh, 11617p, 11617lcif, 11617lhif, 11617lhsf, 11617lh, 11616p, 11616lcif, 11616lhif, 11616lcsf, 11616lhsf, 11616lh, 11615p, 11615lcif, 11615lhif, 11615lcsf, 11615lhsf, 11615lh
- ageDeath: 4, sex: female, genotype: 5XFAD_carrier, BIN1-K358R_homozygous
  individualIDs (7): 13053, 13052, 13048, 13022, 13020, 13019, 13143
  specimenIDs (30): 13053p, 13053lcif, 13053lhif, 13053lcsf, 13053lhsf, Pool-G, Pool-W, 13052p, 13052lhsf, 13048p, 13048lcif, 13048lhif, 13048lcsf, 13048lhsf, 13048lh, 13022p, 13022lcif, 13022lhif, 13022lcsf, 13022lhsf, 13022lh, 13020p, 13020lcif, 13020lhif, 13020lcsf, 13020lhsf, 13020lh, 13019lcif, 13019lhif, 13019lh
- ageDeath: 4, sex: female, genotype: 5XFAD_non

## Validate Metadata
Look at the actual RNAseq data and make sure that all samples exist and align with what is stated in the metadata.
https://www.synapse.org/Synapse:syn50944329

In [10]:
# The file listing is produced outside the notebook, in a terminal:
#   synapse list syn50944329 > UCI_Bin1K358R_rnaseqdata_list.csv
#
# It is read with header=None, so the first line is treated as data and the
# two columns are named here.
df = pd.read_csv(
    "UCI_Bin1K358R_rnaseqdata_list.csv",
    names=["syn_id", "file_name"],
    header=None,
)

In [11]:
# Parse the filename to get the metadata
def parse_filename(fname):
    """Split an underscore-delimited file name into its metadata fields.

    The name is split on "_" and the fields are located relative to two anchor
    tokens: the first token equal to "M" or "F" (sex) and the first token ending
    in "mo" (age). The layout this implies is

        <tissue>_..._<sex>_<genotype...>_<age>_<lane>_<individualID>_..._<read>.<ext>

    where the genotype is everything between the sex and age tokens (it may
    itself contain underscores) and the read label is the last token up to its
    first ".".

    Parameters
    ----------
    fname : object
        File name; converted with ``str()`` before parsing.

    Returns
    -------
    dict
        Keys, in this order: ``lane``, ``tissue``, ``sex``, ``genotype``,
        ``age``, ``individualID``, ``read``. Every value is ``None`` when the
        name has fewer than 7 underscore-separated parts. For longer names, a
        positional field (``age``, ``lane``, ``individualID``) whose expected
        index lies beyond the end of the name is ``None`` instead of raising.
    """
    fields = ("lane", "tissue", "sex", "genotype", "age", "individualID", "read")

    parts = str(fname).split("_")
    # Defensive: too few parts to hold all fields -> every field is None
    if len(parts) < 7:
        return dict.fromkeys(fields)

    def part_at(index):
        """Return parts[index], or None when the index is out of range."""
        return parts[index] if 0 <= index < len(parts) else None

    # Sex anchor: first token that is exactly "M" or "F" (fallback 1 may be wrong)
    sex_idx = next((i for i, p in enumerate(parts) if p in ("M", "F")), 1)
    # Age anchor: first token ending in "mo"; fallback assumes a one-token genotype
    age_idx = next((i for i, p in enumerate(parts) if p.endswith("mo")), sex_idx + 2)

    # The read label is the last token with its extension removed,
    # e.g. "R1.fastq.gz" -> "R1"
    read = parts[-1].split(".")[0]

    return {
        "lane": part_at(age_idx + 1),          # lane directly follows the age
        "tissue": parts[0],
        "sex": parts[sex_idx],
        "genotype": "_".join(parts[sex_idx + 1:age_idx]),
        # part_at() guards the fallback age index, which can point past the end
        "age": part_at(age_idx),
        "individualID": part_at(age_idx + 2),  # individualID follows the lane
        "read": read,
    }

# Parse every file name into a dict of fields, then expand the dicts into columns.
parsed = df["file_name"].map(parse_filename)
parsed_df = pd.DataFrame(list(parsed))
print(parsed_df)

    lane tissue sex   genotype   age individualID read
0     B1   hipp   F  5XFADHEMI  12mo        12433   R1
1     B1   hipp   F  5XFADHEMI  12mo        12433   R2
2     B1   hipp   F  5XFADHEMI  12mo        12440   R1
3     B1   hipp   F  5XFADHEMI  12mo        12440   R2
4     B1   hipp   F  5XFADHEMI  12mo        12450   R1
..   ...    ...  ..        ...   ...          ...  ...
143   B1   hipp   M    Bin1_HO   4mo        13037   R2
144   B1   hipp   M    Bin1_HO   4mo        13041   R1
145   B1   hipp   M    Bin1_HO   4mo        13041   R2
146   B1   hipp   M    Bin1_HO   4mo        13045   R1
147   B1   hipp   M    Bin1_HO   4mo        13045   R2

[148 rows x 7 columns]


In [12]:
# Make sure every sample has both reads.
# A sample is identified by all parsed fields except 'read'; each sample should
# have exactly two distinct read labels.
group_cols = [col for col in parsed_df.columns if col != "read"]
# dropna=False keeps groups whose key fields are None (file names that
# parse_filename could not parse). By default groupby drops them, so such files
# would never be reported. NOTE: the `dropna` argument needs pandas >= 1.1.
read_counts = (
    parsed_df.groupby(group_cols, dropna=False)["read"]
    .nunique()
    .reset_index()
)
# Entries where the number of unique reads is not 2 (i.e., missing a read)
missing_reads = read_counts[read_counts["read"] != 2]
if not missing_reads.empty:
    print("Entries missing either read=1 or read=2:")
    display(missing_reads)
else:
    print("All entries have both read=1 and read=2.")

All entries have both read=1 and read=2.


In [13]:
# Collapse the two reads of each sample into a single row (the first read is kept).

# Put the parsed fields next to the original listing columns, aligned by position.
listing_part = df.reset_index(drop=True)
parsed_part = parsed_df.reset_index(drop=True)
merged_df_validation = pd.concat([listing_part, parsed_part], axis=1)

# Columns that identify a sample: every parsed field except 'read'
compare_cols = [name for name in parsed_df.columns if name != "read"]

# Order rows so that, within a sample, the lowest read label comes first
merged_df_sorted = merged_df_validation.sort_values(by=[*compare_cols, "read"])

# One row per sample; the default keep="first" retains the lowest read label
dedup_df = merged_df_sorted.drop_duplicates(subset=compare_cols)

# Keep only the parsed columns and renumber the rows
new_df = dedup_df.loc[:, list(parsed_df.columns)].reset_index(drop=True)

In [14]:
# Get metadata to compare
# Group merged_df by "age", "sex", and "genotype" and count the number of rows in each group
grouped_validation = new_df.groupby(["age", "sex", "genotype"]).size().reset_index(name='count')

print("\n=== GROUPED COUNTS BY age, sex, genotype ===")
print(grouped_validation)

print(f"\nNumber of unique groups: {grouped_validation.shape[0]}")


=== GROUPED COUNTS BY age, sex, genotype ===
     age sex           genotype  count
0   12mo   F          5XFADHEMI      5
1   12mo   F  5xFADHEMI_Bin1_HO      5
2   12mo   F            5xFADWT      5
3   12mo   F             BIN1HO      1
4   12mo   F            Bin1_HO      4
5   12mo   M          5XFADHEMI      4
6   12mo   M  5xFADHEMI_Bin1_HO      5
7   12mo   M            5xFADWT      5
8   12mo   M            Bin1_HO      5
9    4mo   F          5xFADHEMI      5
10   4mo   F  5xFADHEMI_Bin1_HO      4
11   4mo   F            5xFADWT      2
12   4mo   F            Bin1_HO      4
13   4mo   M          5xFADHEMI      5
14   4mo   M  5xFADHEMI_Bin1_HO      5
15   4mo   M            5xFADWT      5
16   4mo   M            Bin1_HO      5

Number of unique groups: 17


In [23]:
# For each group in grouped_validation, print the individualIDs in that group
print("\n=== individualIDs per group (age, sex, genotype) ===")
for idx, row in grouped_validation.iterrows():
    age = row['age']
    sex = row['sex']
    genotype = row['genotype']
    # Filter new_df for this group
    group_df = new_df[(new_df['age'] == age) & (new_df['sex'] == sex) & (new_df['genotype'] == genotype)]
    # dropna(): parse_filename yields None for IDs it cannot locate, and int(None) raises
    individual_ids = group_df['individualID'].dropna().unique()
    print(f"Group (age={age}, sex={sex}, genotype={genotype}):")
    print(f"  individualIDs: {[int(x) for x in individual_ids]}")

# All individualIDs found in the file names, as ints (the parser returns strings)
all_individual_ids_validation = [int(x) for x in new_df['individualID'].dropna().unique()]
print("\nAll individualIDs from all groups:")
print(list(all_individual_ids_validation))
print(f"\nNumber of unique individualIDs: {len(all_individual_ids_validation)}")

# Cross-check against the metadata in both directions.
# all_individualIDs_unique is built in the metadata section earlier in this
# notebook; that cell must have been run before this one.
file_ids = set(all_individual_ids_validation)
metadata_ids = {int(x) for x in all_individualIDs_unique}

# 1) individualIDs that have files but are absent from the metadata
missing_ids = file_ids - metadata_ids
if missing_ids:
    print("\nindividualIDs in validation not in all_individualIDs_unique:")
    print(sorted(missing_ids))
else:
    print("\nAll individualIDs in validation are present in all_individualIDs_unique.")

# 2) individualIDs listed in the metadata for which no file was found
ids_without_files = metadata_ids - file_ids
if ids_without_files:
    print("\nindividualIDs in all_individualIDs_unique not in validation:")
    print(sorted(ids_without_files))
else:
    print("\nAll individualIDs in all_individualIDs_unique are present in validation.")




=== individualIDs per group (age, sex, genotype) ===
Group (age=12mo, sex=F, genotype=5XFADHEMI):
  individualIDs: [12433, 12440, 12450, 12443, 12452]
Group (age=12mo, sex=F, genotype=5xFADHEMI_Bin1_HO):
  individualIDs: [12576, 12841, 13035, 12839, 12981]
Group (age=12mo, sex=F, genotype=5xFADWT):
  individualIDs: [12426, 12441, 12442, 12432, 12487]
Group (age=12mo, sex=F, genotype=BIN1HO):
  individualIDs: [13029]
Group (age=12mo, sex=F, genotype=Bin1_HO):
  individualIDs: [13021, 13030, 13033, 13032]
Group (age=12mo, sex=M, genotype=5XFADHEMI):
  individualIDs: [12420, 12424, 12422, 12802]
Group (age=12mo, sex=M, genotype=5xFADHEMI_Bin1_HO):
  individualIDs: [12977, 12980, 13025, 12979, 13026]
Group (age=12mo, sex=M, genotype=5xFADWT):
  individualIDs: [12419, 12429, 12434, 12421, 12435]
Group (age=12mo, sex=M, genotype=Bin1_HO):
  individualIDs: [12976, 13013, 13012, 13017, 13018]
Group (age=4mo, sex=F, genotype=5xFADHEMI):
  individualIDs: [11615, 11616, 11625, 11626, 11617]
Grou

In [11]:
# Create a list of all unique specimenIDs from assay_metadata.
# NaN is dropped first: it is a float and has no .replace() method.
unique_specimenIDs = list(assay_metadata['specimenID'].dropna().unique())

# Remove "lh" from each specimenID and read the remainder as an integer
# individualID (presumably specimenIDs look like "<individualID>lh" - TODO confirm).
# IDs that do not reduce to an integer are collected and reported instead of
# aborting the cell.
unique_specimenIDs_no_lh = []
unparseable_specimenIDs = []
for sid in unique_specimenIDs:
    try:
        unique_specimenIDs_no_lh.append(int(str(sid).replace("lh", "")))
    except ValueError:
        unparseable_specimenIDs.append(sid)

if unparseable_specimenIDs:
    print("\nspecimenIDs that could not be converted to an individualID:")
    print(unparseable_specimenIDs)

# Are there any individualIDs derived from the assay specimenIDs that are not in
# all_individualIDs_unique (built in the metadata section earlier in this notebook)?
missing_ids = set(unique_specimenIDs_no_lh) - set(all_individualIDs_unique)
if missing_ids:
    print("\nindividualIDs in unique_specimenIDs_no_lh not in all_individualIDs_unique:")
    print(list(missing_ids))
else:
    print("\nAll individualIDs in unique_specimenIDs_no_lh are present in all_individualIDs_unique.")




All individualIDs in unique_specimenIDs_no_lh are present in all_individualIDs_unique.
