# Metadata Processing

## <font color='#000000'>Table of contents</font><a class='anchor' id='top'></a>

- [1. View and streamline available meta data files](#1.-view-and-streamline-available-meta-data-files)
  - [A. Assay metadata](#a.-assay-metadata)
  - [B. Biospec metadata](#b.-biospec-metadata)
  - [C. Individual metadata](#c.-individual-metadata)
  - [D. Merging the metadata files](#d.-merging-the-metadata-files)
    - [D1. Make sure relevant columns do not have NaN values](#d1.-make-sure-relevant-columns-do-not-have-nan-values)
    - [D2. Rename 'Control' to 'Pathology Control' in 'Consensus clinical diagnosis'](#d2.-rename-'control'-to-'pathology-control'-in-'consensus-clinical-diagnosis')
  - [E. RIN Score](#e.-rin-score)
  - [F. Load Continuous Pseudo-progression score (CPS) from Gabitto et al.](#f.-load-continuous-pseudo-progression-score-(cps)-from-gabitto-et-al.)
- [2. View metadata attributes](#2.-view-metadata-attributes)
  - [A. Unfiltered metadata](#a.-unfiltered-metadata)
  - [B. Filter metadata by diagnosis of interest](#b.-filter-metadata-by-diagnosis-of-interest)
  - [C. Save meta data file](#c.-save-meta-data-file)
- [3. Switch for brain regions (Start here once metadata file is made)](#3.-switch-for-brain-regions-(start-here-once-metadata-file-is-made))
  - [A. Choose desired brain region](#a.-choose-desired-brain-region)
    - [A1. Please choose one of the below:](#a1.-please-choose-one-of-the-below:)
  - [B. Create '10X_ID' Column for folder recognition in snRNAseq_processing](#b.-create-'10x_id'-column-for-folder-recognition-in-snrnaseq_processing)
    - [B1. Merge and save brain region specific metadata](#b1.-merge-and-save-brain-region-specific-metadata)
- [4. Test](#4.-test)

In [None]:
import snRNAseq_processing_FUNCTIONS as myfunc
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
from IPython.display import display, Markdown



In [None]:
# Lift pandas' display truncation so whole tables are shown:
# a limit of None means no cap on the number of columns or rows rendered.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)


## 1. View and streamline available meta data files

### A. Assay metadata

In [None]:
# Load the assay-level metadata table and display it.
# Change this path for ATAC (1/2).
assay_metadata_path = '/tscc/nfs/home/aopatel/SEA_AD_SNRNASEQ_MTG/SEA-AD_assay_snRNAseq_metadata.csv'
assay_df = pd.read_csv(assay_metadata_path)
assay_df

In [None]:
# Keep only the assay-level fields that are carried into the combined metadata.
relevant_columns = [
    'assay',
    'specimenID',
    'platform',
    'RIN',
    'rnaBatch',
    'libraryBatch',
    'sequencingBatch',
]
assay_df = assay_df.loc[:, relevant_columns]
assay_df

### B. Biospec metadata 

In [None]:
# Load the biospecimen metadata table and display it.
biospecimen_metadata_path = '/tscc/nfs/home/aopatel/SEA_AD_SNRNASEQ_MTG/SEA-AD_biospecimen_metadata.csv'
biospec_df = pd.read_csv(biospecimen_metadata_path)
biospec_df

In [None]:
# Restrict the biospecimen table to the fields needed for joining and annotation.
relevant_columns = [
    'individualID',
    'specimenID',
    'tissue',
    'samplingAge',
    'assay',
    'nucleicAcidSource',
]
biospec_df = biospec_df.loc[:, relevant_columns]

# NOTE: in this table the snRNA-seq samples are labelled 'scrnaSeq' (confusing, but intended).
is_target_assay = biospec_df['assay'] == 'scrnaSeq'  # Change for ATAC (2/2)
biospec_df = biospec_df[is_target_assay]
print(biospec_df)

### C. Individual metadata

In [None]:
# Load the per-individual (donor) metadata from its CSV file and display it.
individual_metadata_path = '/tscc/nfs/home/aopatel/SEA_AD_SNRNASEQ_MTG/SEA-AD_individual_metadata.csv'
indiv_df = pd.read_csv(individual_metadata_path)
indiv_df

In [None]:
# Keep the donor identifier plus the clinical/neuropathology annotations used downstream.
relevant_columns = [
    'individualID',
    'sex',
    'ADNC',
    'Braak',
    'CERAD',
    'Thal phase',
    'Cognitive status',
    'Consensus clinical diagnosis',
    'dataset',
]
indiv_df = indiv_df.loc[:, relevant_columns]

In [None]:
# Count distinct donors in the individual table.
# Author's note: 95 unique individuals in the SEA-AD cohort (snRNA-seq).
donor_ids = indiv_df['individualID']
unique_individuals = donor_ids.nunique()
print(unique_individuals)

### D. Merging the metadata files


In [None]:
# Annotate every biospecimen row with its donor's metadata.
# A left join keeps all biospecimen rows even if the donor is absent from indiv_df.
merged_df = biospec_df.merge(indiv_df, how='left', on='individualID')

In [None]:
# Print the merged biospecimen + individual table as plain text.
print(merged_df)


In [None]:
# Report how many rows the merged table contains.
print(len(merged_df))

In [None]:
# Display the merged table as the cell output.
merged_df

In [None]:
# Drop the biospecimen copy of 'assay' so the join below keeps a single
# 'assay' column (from assay_df) instead of producing assay_x / assay_y.
merged_df_subset = merged_df.drop(columns='assay')

# Left-join the specimen/donor annotations onto the assay table by specimenID.
meta_data = assay_df.merge(merged_df_subset, how='left', on='specimenID')

meta_data  # Must have 790 rows

In [None]:
# List every column name in the combined metadata table.
print(meta_data.columns.tolist())



<div class="alert alert-block alert-info">
<b>Make sure relevant columns do not have NaN values</b>
</div>

In [None]:
# Flag each column that contains at least one NaN, then collect the flagged names.
column_has_nan = meta_data.isna().any()
nan_columns = column_has_nan[column_has_nan].index.tolist()

# Report which columns still need attention.
print("Columns with NaN values:", nan_columns)

In [None]:
# Inspect the rows that have no consensus diagnosis. The expectation is that
# these NaNs exist only because Neurotypical Reference donors are not annotated.
missing_diagnosis = meta_data['Consensus clinical diagnosis'].isna()
meta_test = meta_data.loc[missing_diagnosis]
meta_test

In [None]:
# Per-column replacement for NaN. Every NaN in these columns is filled, on the
# assumption (checked in the previous cell) that such rows are neurotypical donors.
fill_values = {
    'ADNC': 'Not AD',
    'Braak': '0.0',
    'CERAD': '0.0',
    'Thal phase': 'Thal 0',
    'Cognitive status': 'No dementia',
    'Consensus clinical diagnosis': 'Neurotypical',
}

# Fill in place; columns not listed in fill_values are left untouched.
meta_data.fillna(value=fill_values, inplace=True)

# Count distinct donors now labelled 'Neurotypical'.
is_neurotypical = meta_data['Consensus clinical diagnosis'] == 'Neurotypical'
neurotypical_count = meta_data.loc[is_neurotypical, 'individualID'].nunique()

print(f"Number of unique individualIDs with 'Consensus clinical diagnosis' as 'Neurotypical': {neurotypical_count}")
# Must be 6, only 5 in MTG though



<div class="alert alert-block alert-info">
<b>Rename 'Control' to 'Pathology Control' in 'Consensus clinical diagnosis'</b>
</div>


In [None]:
# Relabel the exact value 'Control' as 'Pathology Control'; all other diagnoses are unchanged.
diagnosis_relabel = {'Control': 'Pathology Control'}
meta_data['Consensus clinical diagnosis'] = (
    meta_data['Consensus clinical diagnosis'].replace(diagnosis_relabel)
)

In [None]:
# Display the table to confirm the relabelling above took effect.
meta_data # sanity check!

### E. RIN Score 

In [None]:
# Project helper from snRNAseq_processing_FUNCTIONS; presumably summarises or
# plots the 'RIN' column of meta_data — confirm against the helper's source.
myfunc.rin_viewer(meta_data)

### F. Load Continuous Pseudo-progression score (CPS) from Gabitto et al.

In [None]:
# Load the donor-level scores and rename the donor key to 'individualID'
# so it can be joined onto meta_data.
cps = pd.read_csv('donor_scores.csv').rename(columns={'Donor ID': 'individualID'})
cps

In [None]:
# Attach the donor scores (cps) to every metadata row via a left join on
# individualID; rows whose donor has no score keep NaN in the score columns.
meta_data = pd.merge(meta_data, cps, how='left', on='individualID')
meta_data

## 2. View metadata attributes

### A. Unfiltered metadata 

In [None]:
# Project helper from snRNAseq_processing_FUNCTIONS, run here on the unfiltered
# metadata; presumably summarises sex by diagnosis — confirm in the helper.
myfunc.view_sex_diagnostics(meta_data)

### B. Filter metadata by diagnosis of interest

In [None]:
# Diagnoses of interest; rows with any other consensus diagnosis are removed.
diagnoses = ["Alzheimers disease", "Pathology Control", "Neurotypical"]

# Keep only the rows whose diagnosis is in the list above.
has_wanted_diagnosis = meta_data['Consensus clinical diagnosis'].isin(diagnoses)
meta_data = meta_data.loc[has_wanted_diagnosis]

In [None]:
# Same project helper as before, now run on the diagnosis-filtered metadata
# so the two outputs can be compared.
myfunc.view_sex_diagnostics(meta_data)

In [None]:
# Project helper from snRNAseq_processing_FUNCTIONS; presumably breaks the
# diagnoses down by brain region ('tissue') — confirm in the helper.
myfunc.view_diagnosis_region(meta_data)

### C. Save meta data file

In [None]:
# Write the filtered metadata to meta_data.csv in the working directory,
# without the row index. Section 3 reloads this file.
meta_data.to_csv('meta_data.csv', index=False)

## 3. Switch for brain regions (Start here once metadata file is made) 

### A. Choose desired brain region

#### A1. Please choose one of the below:
##### 'middle temporal gyrus'
##### 'dorsolateral prefrontal cortex'
##### 'medial entorhinal cortex'
##### 'primary visual cortex'
##### 'inferior temporal gyrus'
##### 'superior temporal gyrus'

In [None]:
# Load metadata saved in section 2C (meta_data.csv in the working directory)
# and display it. This lets section 3 run without re-running sections 1-2.
meta_data=pd.read_csv('meta_data.csv')
meta_data

In [None]:
# Brain region to process; must match a value in the 'tissue' column exactly
# (see the list of options above).
tissue = 'middle temporal gyrus'

# Keep only the samples taken from that region.
in_selected_region = meta_data['tissue'] == tissue
specific_brain_region_meta = meta_data.loc[in_selected_region]
specific_brain_region_meta

In [None]:
# Distinct donors and distinct specimens in the selected region.
# `spec` is reused later to verify the 10X_ID merge.
region_ids = specific_brain_region_meta[['individualID', 'specimenID']]
indiv = region_ids['individualID'].nunique()
spec = region_ids['specimenID'].nunique()

print("Number of donors in this brain region (", tissue, "):", indiv)
print("Number of samples in this brain region (", tissue, "):", spec)

In [None]:
# Load the Synapse table query export (one row per file) and display it.
synapse_query_path = '/tscc/lustre/ddn/scratch/aopatel/mtg_fastq/SYNAPSE_TABLE_QUERY_161185670.csv'
synapse_query_df = pd.read_csv(synapse_query_path)
synapse_query_df

### B. Create '10X_ID' Column for folder recognition in snRNAseq_processing

In [None]:
# Derive the 10X_ID from each file name: everything before the last "_S01".
# expand=False makes str.extract return a Series (one capture group), so a
# plain column is assigned rather than a one-column DataFrame.
synapse_query_df["10X_ID"] = synapse_query_df["name"].str.extract(r"^(.*)_S01", expand=False)

# Reduce to one specimenID -> 10X_ID row per library.
# File names without "_S01" yield NaN. Those rows are dropped first: otherwise
# one NaN row survives drop_duplicates and later joins onto the metadata as a
# spurious extra row for its specimen, with no 10X_ID.
synapse_query_df = (
    synapse_query_df[['specimenID', '10X_ID']]
    .dropna(subset=['10X_ID'])
    .drop_duplicates(subset="10X_ID")
)
synapse_query_df

In [None]:
# Count distinct 10X_IDs in the Synapse query. Author's note: this exceeds the
# number of samples kept in the metadata because the query covers all diagnoses.
synapse_query_df['10X_ID'].nunique() #Number of files is greater since its all diagnoses

#### B1. Merge and save brain region specific metadata

In [None]:
# Attach each specimen's 10X_ID. A '10X_ID' column left by a previous run of
# this cell is dropped first, so re-running does not create 10X_ID_x / 10X_ID_y
# (which would make the lookup below raise KeyError).
specific_brain_region_meta = (
    specific_brain_region_meta
    .drop(columns='10X_ID', errors='ignore')
    .merge(synapse_query_df, on='specimenID', how='left')
)

# Sanity check: the number of distinct 10X_IDs must equal the number of
# specimens counted for this region (`spec`). nunique() ignores NaN, so any
# specimen without a match lowers the count.
if specific_brain_region_meta['10X_ID'].nunique() == spec:
    print("Performed correctly, all samples downloaded accounted for.")
else:
    print('Performed incorrectly, all samples not accounted for.')
    # Show which specimens have no 10X_ID so the gap can be traced.
    lacks_10x_id = specific_brain_region_meta['10X_ID'].isna()
    unmatched_specimens = specific_brain_region_meta.loc[lacks_10x_id, 'specimenID'].unique().tolist()
    print("specimenIDs without a 10X_ID:", unmatched_specimens)

In [None]:
# Write the region-specific metadata (with 10X_ID) without the row index.
# The file name is hard-coded and does not follow `tissue`; rename it by hand
# when a different region is selected.
specific_brain_region_meta.to_csv('mtg_meta_data.csv', index=False) # Change file name for correct brain region

## 4. Test

In [None]:
# Reload the saved region-specific metadata from disk, to check the file as
# written rather than the in-memory table, and display it.
mtg_meta_data=pd.read_csv('mtg_meta_data.csv')
mtg_meta_data

In [None]:
# Distinct donors in the saved file; must equal the number of unique
# individuals reported for this brain region.
saved_donor_ids = mtg_meta_data['individualID']
unique_individuals = saved_donor_ids.nunique()
print(unique_individuals)

In [None]:
# Must equal the number of unique samples/specimens in this brain region.
unique_x_ids = mtg_meta_data['10X_ID'].nunique()
unique_x_ids