In [6]:
# Final function: saves results to CSV files, limits the analysis to a fixed DIV range, raises the qualification threshold to 90% of lines, and adds include_trials
def find_and_process_activity_data(base_path, include_trials=None, max_low_activity_pct=10,
                                   div_range=(14, 28), criteria=50):
    """
    Recursively find and process 'Compiled_ActivityScan.csv' in each 'Activity' subfolder
    within the given base directory, limiting processing to included trials if specified.
    Outputs two CSV files ('Eligible_Cohorts.csv' and 'Ineligible_Cohorts.csv', written into
    base_path) listing eligible and ineligible subfolders with their corresponding
    percentages of qualified lines.

    A "line" is one (Chip_ID, Well, DIV) group of WT rows inside the DIV window. A line is
    low-activity when more than half of its rows have Active_area below `criteria`. A dataset
    is qualified when at most `max_low_activity_pct` percent of its lines are low-activity.

    Parameters:
    - base_path (str): The base directory to start the search from.
    - include_trials (list, optional): List of trial subfolder names to include in the processing.
      Folders whose name is not in the list are neither processed nor reported.
    - max_low_activity_pct (float, optional): Largest percentage of low-activity lines a
      qualified dataset may have. Defaults to 10.
    - div_range (tuple, optional): Inclusive (min, max) DIV window to analyse. Defaults to (14, 28).
    - criteria (float, optional): Active_area value below which a row counts as low activity.
      Defaults to 50.

    Returns:
    - None: Results are printed directly and saved into CSV files.
    """
    import os
    import pandas as pd

    eligible = []
    ineligible = []
    div_min, div_max = div_range
    line_keys = ['Chip_ID', 'Well', 'DIV']

    def process_single_file(file_path):
        """Return (message, qualified-percentage text, is_eligible) for one CSV file."""
        try:
            data = pd.read_csv(file_path)
            # Copy the DIV window so the NeuronType clean-up below writes to an independent
            # frame rather than to a slice of `data` (avoids SettingWithCopyWarning).
            data_subset = data[(data['DIV'] >= div_min) & (data['DIV'] <= div_max)].copy()
            data_subset['NeuronType'] = data_subset['NeuronType'].str.strip()

            WT = data_subset[data_subset['NeuronType'] == 'WT']
            if WT.empty:
                return ("Missing WT data.", "nan", False)

            WT = WT.dropna(subset=['Active_area'])
            if WT.empty:
                # Without this guard the percentage below is NaN; `NaN > threshold` is False,
                # so a dataset with no usable measurement would be reported as qualified.
                return ("No WT rows with a valid Active_area.", "nan", False)

            # Fraction of low-activity rows within each line.
            low_fraction_per_line = (
                WT.assign(is_low=WT['Active_area'] < criteria)
                .groupby(line_keys)['is_low']
                .mean()
            )
            if low_fraction_per_line.empty:
                # Rows whose grouping keys are missing are dropped by groupby.
                return ("No WT lines with complete Chip_ID/Well/DIV values.", "nan", False)

            percentage_low_activity_lines = (low_fraction_per_line > 0.5).mean() * 100
            percentage_qualified_lines = 100 - percentage_low_activity_lines
            qualified_text = f"{percentage_qualified_lines:.2f}%"

            if percentage_low_activity_lines > max_low_activity_pct:
                result_message = (
                    f"The dataset is not qualified. More than {max_low_activity_pct:g}% "
                    f"of lines show low activity (<{criteria:g}%)."
                )
                return (result_message, qualified_text, False)

            result_message = (
                f"The dataset is qualified. Less than or equal to {max_low_activity_pct:g}% "
                f"of lines show low activity (<{criteria:g}%)."
            )
            return (result_message, qualified_text, True)
        except Exception as e:
            # Report unreadable or malformed files as ineligible instead of aborting the batch.
            return (f"Error processing file: {str(e)}", "nan", False)

    for root, dirs, files in os.walk(base_path):
        # normpath drops a trailing separator, so the walk root still yields its folder name.
        subfolder_name = os.path.basename(os.path.normpath(root))
        # Process only specified trials if include_trials is not None
        if include_trials is not None and subfolder_name not in include_trials:
            continue

        if 'Activity' in dirs:
            csv_file = os.path.join(root, 'Activity', 'Compiled_ActivityScan.csv')
            if os.path.exists(csv_file):
                message, qualified_text, is_eligible = process_single_file(csv_file)
                print(f"{subfolder_name}: {message} Percentage of qualified lines: {qualified_text}")
                if is_eligible:
                    eligible.append((subfolder_name, qualified_text))
                else:
                    ineligible.append((subfolder_name, qualified_text))
            else:
                print(f"{subfolder_name}: Missing 'Compiled_ActivityScan.csv'")
                ineligible.append((subfolder_name, "nan"))
        else:
            # Only report missing in direct subfolders of base_path. The relative path is
            # used instead of counting separators so a trailing separator on base_path
            # does not shift the depth by one.
            relative = os.path.relpath(root, base_path)
            if relative != os.curdir and os.sep not in relative:
                print(f"{subfolder_name}: Missing 'Activity' folder")
                ineligible.append((subfolder_name, "nan"))

    # Save results to CSV files
    columns = ['Trial', 'Percentage of Qualified Lines']
    pd.DataFrame(eligible, columns=columns).to_csv(os.path.join(base_path, 'Eligible_Cohorts.csv'), index=False)
    pd.DataFrame(ineligible, columns=columns).to_csv(os.path.join(base_path, 'Ineligible_Cohorts.csv'), index=False)


In [7]:
# new usage
base_path = "/Users/liufanling/Library/CloudStorage/OneDrive-Personal/1 UC DAVIS/CSRA/QualityCheck/QualityCheck"
# Trials to process; 'KCNT1_T1_08082023' and 'SPTAN1_T1_07192023' are not included.
include_trials = [
    'CDKL5-E6D_T1_C1_05152024',
    'SYNGAP1_T1_C1_03212024',
    'ADNP_T2_10262023',
    'KCNT1_T3_C1_03122024',
    'ADNP_T3_11072023',
    'SHANK3_T1_11222023',
    'ADNP_T4_C1_06282024',
    'CHD8_T2_C1_08252023',
]
find_and_process_activity_data(base_path, include_trials=include_trials)

CDKL5-E6D_T1_C1_05152024: The dataset is qualified. Less than or equal to 10% of lines show low activity (<50%). Percentage of qualified lines: 100.00%
SYNGAP1_T1_C1_03212024: The dataset is qualified. Less than or equal to 10% of lines show low activity (<50%). Percentage of qualified lines: 93.75%
ADNP_T2_10262023: The dataset is qualified. Less than or equal to 10% of lines show low activity (<50%). Percentage of qualified lines: 100.00%
KCNT1_T3_C1_03122024: The dataset is not qualified. More than 10% of lines show low activity (<50%). Percentage of qualified lines: 10.00%
ADNP_T3_11072023: The dataset is qualified. Less than or equal to 10% of lines show low activity (<50%). Percentage of qualified lines: 100.00%
SHANK3_T1_11222023: The dataset is qualified. Less than or equal to 10% of lines show low activity (<50%). Percentage of qualified lines: 100.00%
ADNP_T4_C1_06282024: The dataset is qualified. Less than or equal to 10% of lines show low activity (<50%). Percentage of qual

In [None]:
# Final function: saves results to CSV files and limits the analysis to a fixed DIV range
def find_and_process_activity_data(base_path, max_low_activity_pct=50, div_range=(14, 28), criteria=50):
    """
    Recursively find and process 'Compiled_ActivityScan.csv' in each 'Activity' subfolder
    within the given base directory. Outputs two CSV files ('Eligible_Cohorts.csv' and
    'Ineligible_Cohorts.csv', written into base_path) listing eligible and ineligible
    subfolders with their corresponding percentages of qualified lines.

    A "line" is one (Chip_ID, Well, DIV) group of WT rows inside the DIV window. A line is
    low-activity when more than half of its rows have Active_area below `criteria`. A dataset
    is qualified when at most `max_low_activity_pct` percent of its lines are low-activity.

    Parameters:
    - base_path (str): The base directory to start the search from.
    - max_low_activity_pct (float, optional): Largest percentage of low-activity lines a
      qualified dataset may have. Defaults to 50.
    - div_range (tuple, optional): Inclusive (min, max) DIV window to analyse. Defaults to (14, 28).
    - criteria (float, optional): Active_area value below which a row counts as low activity.
      Defaults to 50.

    Returns:
    - None: Results are printed directly and saved into CSV files.
    """
    import os
    import pandas as pd

    eligible = []
    ineligible = []
    div_min, div_max = div_range
    line_keys = ['Chip_ID', 'Well', 'DIV']

    def process_single_file(file_path):
        """Return (message, qualified-percentage text, is_eligible) for one CSV file."""
        try:
            data = pd.read_csv(file_path)
            # Copy the DIV window so the NeuronType clean-up below writes to an independent
            # frame rather than to a slice of `data` (avoids SettingWithCopyWarning).
            data_subset = data[(data['DIV'] >= div_min) & (data['DIV'] <= div_max)].copy()
            data_subset['NeuronType'] = data_subset['NeuronType'].str.strip()

            WT = data_subset[data_subset['NeuronType'] == 'WT']
            if WT.empty:
                return ("Missing WT data.", "nan", False)

            WT = WT.dropna(subset=['Active_area'])
            if WT.empty:
                # Without this guard the percentage below is NaN; `NaN > threshold` is False,
                # so a dataset with no usable measurement would be reported as qualified.
                return ("No WT rows with a valid Active_area.", "nan", False)

            # Fraction of low-activity rows within each line.
            low_fraction_per_line = (
                WT.assign(is_low=WT['Active_area'] < criteria)
                .groupby(line_keys)['is_low']
                .mean()
            )
            if low_fraction_per_line.empty:
                # Rows whose grouping keys are missing are dropped by groupby.
                return ("No WT lines with complete Chip_ID/Well/DIV values.", "nan", False)

            percentage_low_activity_lines = (low_fraction_per_line > 0.5).mean() * 100
            percentage_qualified_lines = 100 - percentage_low_activity_lines
            qualified_text = f"{percentage_qualified_lines:.2f}%"

            if percentage_low_activity_lines > max_low_activity_pct:
                result_message = (
                    f"The dataset is not qualified. More than {max_low_activity_pct:g}% "
                    f"of lines show low activity (<{criteria:g}%)."
                )
                return (result_message, qualified_text, False)

            result_message = (
                f"The dataset is qualified. Less than or equal to {max_low_activity_pct:g}% "
                f"of lines show low activity (<{criteria:g}%)."
            )
            return (result_message, qualified_text, True)
        except Exception as e:
            # Report unreadable or malformed files as ineligible instead of aborting the batch.
            return (f"Error processing file: {str(e)}", "nan", False)

    for root, dirs, files in os.walk(base_path):
        # normpath drops a trailing separator, so the walk root still yields its folder name.
        subfolder_name = os.path.basename(os.path.normpath(root))

        if 'Activity' in dirs:
            csv_file = os.path.join(root, 'Activity', 'Compiled_ActivityScan.csv')
            if os.path.exists(csv_file):
                message, qualified_text, is_eligible = process_single_file(csv_file)
                print(f"{subfolder_name}: {message} Percentage of qualified lines: {qualified_text}")
                if is_eligible:
                    eligible.append((subfolder_name, qualified_text))
                else:
                    ineligible.append((subfolder_name, qualified_text))
            else:
                print(f"{subfolder_name}: Missing 'Compiled_ActivityScan.csv'")
                ineligible.append((subfolder_name, "nan"))
        else:
            # Only report missing in direct subfolders of base_path. The relative path is
            # used instead of counting separators so a trailing separator on base_path
            # does not shift the depth by one.
            relative = os.path.relpath(root, base_path)
            if relative != os.curdir and os.sep not in relative:
                print(f"{subfolder_name}: Missing 'Activity' folder")
                ineligible.append((subfolder_name, "nan"))

    # Save results to CSV files
    columns = ['Trial', 'Percentage of Qualified Lines']
    pd.DataFrame(eligible, columns=columns).to_csv(os.path.join(base_path, 'Eligible_Cohorts.csv'), index=False)
    pd.DataFrame(ineligible, columns=columns).to_csv(os.path.join(base_path, 'Ineligible_Cohorts.csv'), index=False)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Compiled activity scan of the SYNGAP1_T3_C1_08092024 trial, loaded for step-by-step inspection.
syngap_path = '/Users/liufanling/Library/CloudStorage/OneDrive-Personal/1 UC DAVIS/2024 Summer/CSRA/Qualitycheck/SYNGAP1_T3_C1_08092024/Activity/Compiled_ActivityScan.csv'
data = pd.read_csv(syngap_path)

In [None]:
# Final function: saves results to CSV files and limits the analysis to a fixed DIV range
def find_and_process_activity_data(base_path, max_low_activity_pct=50, div_range=(14, 28), criteria=50):
    """
    Recursively find and process 'Compiled_ActivityScan.csv' in each 'Activity' subfolder
    within the given base directory. Outputs two CSV files ('Eligible_Cohorts.csv' and
    'Ineligible_Cohorts.csv', written into base_path) listing eligible and ineligible
    subfolders with their corresponding percentages of qualified lines.

    A "line" is one (Chip_ID, Well, DIV) group of WT rows inside the DIV window. A line is
    low-activity when more than half of its rows have Active_area below `criteria`. A dataset
    is qualified when at most `max_low_activity_pct` percent of its lines are low-activity.

    Parameters:
    - base_path (str): The base directory to start the search from.
    - max_low_activity_pct (float, optional): Largest percentage of low-activity lines a
      qualified dataset may have. Defaults to 50.
    - div_range (tuple, optional): Inclusive (min, max) DIV window to analyse. Defaults to (14, 28).
    - criteria (float, optional): Active_area value below which a row counts as low activity.
      Defaults to 50.

    Returns:
    - None: Results are printed directly and saved into CSV files.
    """
    import os
    import pandas as pd

    eligible = []
    ineligible = []
    div_min, div_max = div_range
    line_keys = ['Chip_ID', 'Well', 'DIV']

    def process_single_file(file_path):
        """Return (message, qualified-percentage text, is_eligible) for one CSV file."""
        try:
            data = pd.read_csv(file_path)
            # Copy the DIV window so the NeuronType clean-up below writes to an independent
            # frame rather than to a slice of `data` (avoids SettingWithCopyWarning).
            data_subset = data[(data['DIV'] >= div_min) & (data['DIV'] <= div_max)].copy()
            data_subset['NeuronType'] = data_subset['NeuronType'].str.strip()

            WT = data_subset[data_subset['NeuronType'] == 'WT']
            if WT.empty:
                return ("Missing WT data.", "nan", False)

            WT = WT.dropna(subset=['Active_area'])
            if WT.empty:
                # Without this guard the percentage below is NaN; `NaN > threshold` is False,
                # so a dataset with no usable measurement would be reported as qualified.
                return ("No WT rows with a valid Active_area.", "nan", False)

            # Fraction of low-activity rows within each line.
            low_fraction_per_line = (
                WT.assign(is_low=WT['Active_area'] < criteria)
                .groupby(line_keys)['is_low']
                .mean()
            )
            if low_fraction_per_line.empty:
                # Rows whose grouping keys are missing are dropped by groupby.
                return ("No WT lines with complete Chip_ID/Well/DIV values.", "nan", False)

            percentage_low_activity_lines = (low_fraction_per_line > 0.5).mean() * 100
            percentage_qualified_lines = 100 - percentage_low_activity_lines
            qualified_text = f"{percentage_qualified_lines:.2f}%"

            if percentage_low_activity_lines > max_low_activity_pct:
                result_message = (
                    f"The dataset is not qualified. More than {max_low_activity_pct:g}% "
                    f"of lines show low activity (<{criteria:g}%)."
                )
                return (result_message, qualified_text, False)

            result_message = (
                f"The dataset is qualified. Less than or equal to {max_low_activity_pct:g}% "
                f"of lines show low activity (<{criteria:g}%)."
            )
            return (result_message, qualified_text, True)
        except Exception as e:
            # Report unreadable or malformed files as ineligible instead of aborting the batch.
            return (f"Error processing file: {str(e)}", "nan", False)

    for root, dirs, files in os.walk(base_path):
        # normpath drops a trailing separator, so the walk root still yields its folder name.
        subfolder_name = os.path.basename(os.path.normpath(root))

        if 'Activity' in dirs:
            csv_file = os.path.join(root, 'Activity', 'Compiled_ActivityScan.csv')
            if os.path.exists(csv_file):
                message, qualified_text, is_eligible = process_single_file(csv_file)
                print(f"{subfolder_name}: {message} Percentage of qualified lines: {qualified_text}")
                if is_eligible:
                    eligible.append((subfolder_name, qualified_text))
                else:
                    ineligible.append((subfolder_name, qualified_text))
            else:
                print(f"{subfolder_name}: Missing 'Compiled_ActivityScan.csv'")
                ineligible.append((subfolder_name, "nan"))
        else:
            # Only report missing in direct subfolders of base_path. The relative path is
            # used instead of counting separators so a trailing separator on base_path
            # does not shift the depth by one.
            relative = os.path.relpath(root, base_path)
            if relative != os.curdir and os.sep not in relative:
                print(f"{subfolder_name}: Missing 'Activity' folder")
                ineligible.append((subfolder_name, "nan"))

    # Save results to CSV files
    columns = ['Trial', 'Percentage of Qualified Lines']
    pd.DataFrame(eligible, columns=columns).to_csv(os.path.join(base_path, 'Eligible_Cohorts.csv'), index=False)
    pd.DataFrame(ineligible, columns=columns).to_csv(os.path.join(base_path, 'Ineligible_Cohorts.csv'), index=False)

In [None]:
# Example usage
# Root folder whose trial subfolders are scanned.
base_path = "/Users/liufanling/Library/CloudStorage/OneDrive-Personal/1 UC DAVIS/2024 Summer/CSRA/QualityCheck"
find_and_process_activity_data(base_path=base_path)

SPTAN_1: The dataset is qualified. Less than or equal to 50% of lines show low activity (<50%). Percentage of qualified lines: 100.00%
CDKL5-E6D_T1_C1_05152024: The dataset is qualified. Less than or equal to 50% of lines show low activity (<50%). Percentage of qualified lines: 100.00%
SYNGAP1_T1_C1_03212024: The dataset is qualified. Less than or equal to 50% of lines show low activity (<50%). Percentage of qualified lines: 93.75%
B6J Hyb: The dataset is qualified. Less than or equal to 50% of lines show low activity (<50%). Percentage of qualified lines: 58.33%
SYNGAP1_T3_C1_08092024: The dataset is not qualified. More than 50% of lines show low activity (<50%). Percentage of qualified lines: 0.00%
B6J_T1_02232024_PS: The dataset is qualified. Less than or equal to 50% of lines show low activity (<50%). Percentage of qualified lines: 58.33%
KCNT_T1: The dataset is qualified. Less than or equal to 50% of lines show low activity (<50%). Percentage of qualified lines: 100.00%
KCNT1_T3_C

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [26]:
# Display the loaded activity table.
data

Unnamed: 0,DIV,Chip_ID,Well,NeuronType,Active_area
0,4,M05506,1,WT,0.43
1,6,M05506,1,WT,0.24
2,7,M05506,1,WT,0.22
3,11,M05506,1,WT,0.00
4,14,M05506,1,WT,0.00
...,...,...,...,...,...
163,7,M06844,6,HET,12.23
164,11,M06844,6,HET,6.24
165,14,M06844,6,HET,5.33
166,18,M06844,6,HET,2.32


In [27]:
# Keep only recordings taken between DIV 14 and DIV 28 (both inclusive).
data_subset = data[data['DIV'].between(14, 28)]

In [28]:
# Display the rows kept by the DIV filter.
data_subset

Unnamed: 0,DIV,Chip_ID,Well,NeuronType,Active_area
4,14,M05506,1,WT,0.00
5,18,M05506,1,WT,0.00
6,21,M05506,1,WT,0.00
11,14,M05506,2,WT,0.00
12,18,M05506,2,WT,0.00
...,...,...,...,...,...
159,18,M06844,5,HET,3.51
160,21,M06844,5,HET,
165,14,M06844,6,HET,5.33
166,18,M06844,6,HET,2.32


In [34]:
# Extract the WT rows and drop those whose Active_area is NaN.
WT = data_subset.loc[data_subset['NeuronType'] == 'WT'].dropna(subset=['Active_area'])

In [30]:
# Distinct chips that contribute WT rows.
chip_ids = pd.unique(WT['Chip_ID'])
chip_ids

array(['M05506', 'M08024', 'M08034', 'M06844'], dtype=object)

In [36]:
# Qualification rule: a line (one Chip_ID/Well/DIV group of WT rows) has low activity when more
# than 50% of its rows have Active_area < 50; the dataset is not qualified when more than 50%
# of its lines have low activity.

criteria = 50

# Group by 'Chip_ID','Well','DIV' and calculate the fraction of rows with 'Active_area' < 50 for each combination
grouped_by_run = WT.groupby(['Chip_ID','Well','DIV'])['Active_area'].apply(lambda x: (x < criteria).mean())

# Calculate the percentage of lines with low activity
percentage_low_activity_run_ids = (grouped_by_run > 0.5).mean() * 100

# Calculate the percentage of qualified lines
percentage_qualified_run_ids = 100 - percentage_low_activity_run_ids

# Determine if the dataset is qualified or not based on the criteria
if grouped_by_run.empty:
    # With no lines the percentages are NaN and `NaN > 50` is False, which would
    # otherwise print "qualified" for a dataset that has nothing to evaluate.
    print("Missing WT data.")
elif percentage_low_activity_run_ids > 50:
    print("The dataset is not qualified. More than 50% of lines show low activity (<50%).")
else:
    print("The dataset is qualified. Less than or equal to 50% of lines show low activity (<50%).")

# Print the percentage of qualified lines
print(f"Percentage of qualified lines: {percentage_qualified_run_ids:.2f}%")

The dataset is not qualified. More than 50% of lines show low activity (<50%).
Percentage of qualified lines: 0.00%


In [32]:
# Display, per (Chip_ID, Well, DIV) group, the fraction of WT rows with Active_area below the criteria.
grouped_by_run

Chip_ID  Well  DIV
M05506   1     14     1.0
               18     1.0
               21     1.0
         2     14     1.0
               18     1.0
               21     1.0
         3     14     1.0
               18     1.0
               21     1.0
M06844   1     14     1.0
               18     1.0
               21     0.0
         2     14     1.0
               18     1.0
               21     0.0
         3     14     1.0
               18     1.0
               21     0.0
M08024   1     14     1.0
               18     1.0
               21     1.0
         2     14     1.0
               18     1.0
               21     1.0
         3     14     1.0
               18     1.0
               21     1.0
M08034   1     14     1.0
               18     1.0
               21     1.0
         2     14     1.0
               18     1.0
               21     1.0
         3     14     1.0
               18     1.0
               21     1.0
Name: Active_area, dtype: float64

In [71]:
def batch_process_datasets(file_paths):
    """
    Batch process multiple datasets to check their qualification based on the criteria
    that less than or equal to 50% of unique lines exhibit more than 50% of rows with low activity (<50).

    A "line" here is one 'Run_ID' group of WT rows. Rows with a missing 'Active_area' are
    ignored, and surrounding whitespace in 'NeuronType' is stripped before matching 'WT'.

    Parameters:
    - file_paths (list of str): List of paths to CSV files.

    Returns:
    - dict: A dictionary with file paths as keys and tuples as values containing the qualification message
            and the percentage of qualified lines for each dataset. The percentage is None when the
            file could not be processed or contains no usable WT rows.
    """
    import pandas as pd

    criteria = 50

    # Function to process each file and determine qualification
    def process_single_file(file_path):
        data = pd.read_csv(file_path)
        # astype(str) keeps .str usable even when the column holds non-string values.
        neuron_type = data['NeuronType'].astype(str).str.strip()
        WT = data[neuron_type == 'WT'].dropna(subset=['Active_area'])
        if WT.empty:
            # Without this guard the percentage is NaN and `NaN > 50` is False, so a file
            # with no WT measurements would be reported as qualified.
            return "Missing WT data.", None

        low_fraction_per_run = (
            WT.assign(is_low=WT['Active_area'] < criteria)
            .groupby('Run_ID')['is_low']
            .mean()
        )
        if low_fraction_per_run.empty:
            # Rows with a missing Run_ID are dropped by groupby.
            return "Missing WT data.", None

        percentage_low_activity_run_ids = (low_fraction_per_run > 0.5).mean() * 100
        percentage_qualified_run_ids = 100 - percentage_low_activity_run_ids

        if percentage_low_activity_run_ids > 50:
            result_message = "The dataset is not qualified. More than 50% of lines show low activity (<50)."
        else:
            result_message = "The dataset is qualified. Less than or equal to 50% of lines show low activity (<50)."

        return result_message, percentage_qualified_run_ids

    # Dictionary to store results
    results = {}

    # Process each file and store the result; a failing file is recorded, not raised,
    # so the remaining files are still processed.
    for path in file_paths:
        try:
            results[path] = process_single_file(path)
        except Exception as e:
            results[path] = (f"Error processing file: {str(e)}", None)

    return results

In [72]:
# Example usage with a list of file paths
file_paths = [
    '/Users/liufanling/Library/CloudStorage/OneDrive-Personal/1 UC DAVIS/2024 Summer/CSRA/QualityCheck/SYNGAP_T1_ALL/Activity/Compiled_ActivityScan.csv',
    '/Users/liufanling/Library/CloudStorage/OneDrive-Personal/1 UC DAVIS/2024 Summer/CSRA/QualityCheck/SYNGAP_T1/Activity/Compiled_ActivityScan.csv',
]  # Add more file paths as needed
batch_results = batch_process_datasets(file_paths)

# Display results for each file processed
for path in batch_results:
    message, percentage = batch_results[path]
    print(f"File: {path}")
    print(message)
    # The percentage is None when the file could not be processed.
    if percentage is not None:
        print(f"Percentage of qualified lines: {percentage:.2f}%")
    print("---")

File: /Users/liufanling/Library/CloudStorage/OneDrive-Personal/1 UC DAVIS/2024 Summer/CSRA/Outputs/SYNGAP_T1_ALL/Activity/Compiled_ActivityScan.csv
The dataset is not qualified. More than 50% of lines show low activity (<50).
Percentage of qualified lines: 39.29%
---
File: /Users/liufanling/Library/CloudStorage/OneDrive-Personal/1 UC DAVIS/2024 Summer/CSRA/Outputs/SYNGAP_T1/Activity/Compiled_ActivityScan.csv
The dataset is not qualified. More than 50% of lines show low activity (<50).
Percentage of qualified lines: 43.75%
---


In [73]:
def find_and_process_activity_data(base_path):
    """
    Recursively find and process 'Compiled_ActivityScan.csv' in each 'Activity' subfolder
    within the given base directory.

    A "line" is one (Chip_ID, Well, DIV) group of WT rows. A line is low-activity when more
    than half of its rows have Active_area below 50; a dataset is qualified when at most 50%
    of its lines are low-activity. Rows with a missing Active_area are ignored.

    Parameters:
    - base_path (str): The base directory to start the search from.

    Returns:
    - None: Results are printed directly.
    """
    import os
    import pandas as pd

    criteria = 50

    def process_single_file(file_path):
        """Return (message, qualified-percentage text) for one CSV file."""
        try:
            data = pd.read_csv(file_path)
            # Strip whitespace so values such as 'WT ' still match; astype(str) keeps .str
            # usable even when the column holds non-string values.
            neuron_type = data['NeuronType'].astype(str).str.strip()
            WT = data[neuron_type == 'WT']
            if WT.empty:
                return ("Missing WT data.", "nan%")

            # NaN compares as "not below criteria", so unmeasured rows would count as active.
            WT = WT.dropna(subset=['Active_area'])
            if WT.empty:
                return ("No WT rows with a valid Active_area.", "nan%")

            low_fraction_per_line = (
                WT.assign(is_low=WT['Active_area'] < criteria)
                .groupby(['Chip_ID', 'Well', 'DIV'])['is_low']
                .mean()
            )
            if low_fraction_per_line.empty:
                # Rows whose grouping keys are missing are dropped by groupby.
                return ("No WT lines with complete Chip_ID/Well/DIV values.", "nan%")

            percentage_low_activity_run_ids = (low_fraction_per_line > 0.5).mean() * 100
            percentage_qualified_run_ids = 100 - percentage_low_activity_run_ids

            if percentage_low_activity_run_ids > 50:
                result_message = "The dataset is not qualified. More than 50% of lines show low activity (<50%)."
            else:
                result_message = "The dataset is qualified. Less than or equal to 50% of lines show low activity (<50%)."

            return (result_message, f"{percentage_qualified_run_ids:.2f}%")
        except Exception as e:
            # "nan%" keeps the printed line consistent with the other no-result cases
            # instead of printing "None" as the percentage.
            return (f"Error processing file: {str(e)}", "nan%")

    for root, dirs, files in os.walk(base_path):
        # normpath drops a trailing separator, so the walk root still yields its folder name.
        subfolder_name = os.path.basename(os.path.normpath(root))

        if 'Activity' in dirs:
            csv_file = os.path.join(root, 'Activity', 'Compiled_ActivityScan.csv')
            if os.path.exists(csv_file):
                message, qualified_text = process_single_file(csv_file)
                print(f"{subfolder_name}: {message} Percentage of qualified lines: {qualified_text}")
            else:
                print(f"{subfolder_name}: Missing 'Compiled_ActivityScan.csv'")
        else:
            # Only report missing in direct subfolders of base_path. The relative path is
            # used instead of counting separators so a trailing separator on base_path
            # does not shift the depth by one.
            relative = os.path.relpath(root, base_path)
            if relative != os.curdir and os.sep not in relative:
                print(f"{subfolder_name}: Missing 'Activity' folder")

In [74]:
# Updated example usage
# Root folder whose trial subfolders are scanned.
base_path = "/Users/liufanling/Library/CloudStorage/OneDrive-Personal/1 UC DAVIS/2024 Summer/CSRA/QualityCheck"
find_and_process_activity_data(base_path=base_path)

ADNP_Therapy_T2: Missing WT data. Percentage of qualified lines: nan%
SYNGAP_T1_ALL: The dataset is not qualified. More than 50% of lines show low activity (<50%). Percentage of qualified lines: 34.38%
TEST: Missing 'Activity' folder
SYNGAP_T1: The dataset is not qualified. More than 50% of lines show low activity (<50%). Percentage of qualified lines: 43.75%
SYNGAP_Therapy_T1: The dataset is not qualified. More than 50% of lines show low activity (<50%). Percentage of qualified lines: 25.00%
SYNGAP_T2: Missing WT data. Percentage of qualified lines: nan%
TEST_2: Missing 'Activity' folder
whatever: Missing WT data. Percentage of qualified lines: nan%


In [None]:
# Tuning parameters

In [3]:
# Print the installed versions of pandas and its dependencies.
import pandas as pd
pd.show_versions()




INSTALLED VERSIONS
------------------
commit                : d9cdd2ee5a58015ef6f4d15c7226110c9aab8140
python                : 3.9.18.final.0
python-bits           : 64
OS                    : Darwin
OS-release            : 23.6.0
Version               : Darwin Kernel Version 23.6.0: Mon Jul 29 21:14:30 PDT 2024; root:xnu-10063.141.2~1/RELEASE_ARM64_T6000
machine               : arm64
processor             : arm
byteorder             : little
LC_ALL                : None
LANG                  : None
LOCALE                : None.UTF-8

pandas                : 2.2.2
numpy                 : 1.26.4
pytz                  : 2024.1
dateutil              : 2.9.0
setuptools            : 72.1.0
pip                   : 24.0
Cython                : None
pytest                : None
hypothesis            : None
sphinx                : None
blosc                 : None
feather               : None
xlsxwriter            : None
lxml.etree            : None
html5lib              : None
pymysql        