<a href="https://colab.research.google.com/github/bschelske/MMP9/blob/master/PET_detective.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PET Detective
*Phycoerythrin (PE) Tag Detective*

Using ImageJ macro outputs, this code background-subtracts the cell-containing chambers and creates a new CSV that contains only the cell-containing chambers.



Mount Ben's Google Drive to access the input files.

In [None]:
# Load the Colab Drive helper so Google Drive can be mounted in this runtime
from google.colab import drive

# Mount Drive at /content/drive. This will prompt for authorization.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Compile functions and import packages

In [None]:
import re
import os
import pandas as pd
#Functions for entire process


#Find files:

def find_files(parent_dir="/content/drive/My Drive/MMP9 Before Petdet/"):
  """Return the full paths of the regular files directly inside *parent_dir*.

  Args:
    parent_dir: Directory to list. Defaults to the MMP9 input folder on the
      mounted Google Drive, so existing ``find_files()`` calls are unchanged.

  Returns:
    A list of path strings (directory joined with file name) in the order
    ``os.listdir`` reports them. Sub-directories are skipped.
  """
  # os.path.join copes with parent_dir given with or without a trailing slash
  candidates = (os.path.join(parent_dir, name) for name in os.listdir(parent_dir))
  return [path for path in candidates if os.path.isfile(path)]

#Sort drive files into individual lists
def sort_drive(files):
  """Split file paths into IntDen and petdet lists, each in sorted order.

  Paths are classified by their file-name suffix (``_IntDen.csv`` or
  ``_petdet.csv``) instead of by even/odd position, so a stray file in the
  folder cannot shift every later IntDen/petdet pair out of alignment.
  Paths with neither suffix are ignored. The caller's list is not modified.

  Args:
    files: Iterable of path strings.

  Returns:
    (intden_files, petdet_files): two lists of paths. Both lists are also
    printed.
  """
  #Sort a copy so the caller's list keeps its order
  ordered = sorted(files)

  #Pick each kind of file out of the sorted paths by its suffix
  intden_files = [path for path in ordered if path.endswith('_IntDen.csv')]
  petdet_files = [path for path in ordered if path.endswith('_petdet.csv')]

  print(f'Inteden_files: {intden_files}')
  print(f'petdet_files: {petdet_files}')
  return intden_files, petdet_files



#Check dates of inputted paths. The file name starts with an 8 char date (e.g. 20220527)
def get_date(data, pet):
  """Return the date prefixes of an IntDen path and a petdet path.

  The date is read from the first 8 characters of each file name, so the
  result does not depend on how long the directory part of the path is.

  Args:
    data: Path of the IntDen csv.
    pet: Path of the petdet csv.

  Returns:
    (date_data, date_pet). The pair is returned even when the two dates
    differ (after printing an error), so callers can unpack the result and
    compare the two values themselves.
  """
  data_name = os.path.basename(data)
  pet_name = os.path.basename(pet)
  date_data = data_name[:8]
  date_pet = pet_name[:8]
  if date_data != date_pet:
    print(f'Error: mismatch between {data_name} and {pet_name}')
  return date_data, date_pet


def extract_sample_name(input_string):
    """Pull the sample name out of an IntDen/petdet csv path.

    The path has to end in ``/<8 digits>_<name>_IntDen.csv`` or
    ``/<8 digits>_<name>_petdet.csv``. ``<name>`` is returned; any other
    path gives None.
    """
    found = re.search(r'\/\d{8}_(\w+)_(?:IntDen|petdet)\.csv$', input_string)
    return found.group(1) if found else None


def create_cells_df_and_background_df(data, pet):
  """Split an IntDen table into cell-chamber and empty-chamber columns.

  Args:
    data: Path of the IntDen csv. Its first column is discarded as an index
      column.
    pet: Path of the petdet csv. Row i (0-based) of its 'Cell' column decides
      where the column 'Chamber <i+1> RawIntDen' goes.

  Returns:
    (cells_df, background_df): cells_df has the empty chambers' columns
    removed (chambers whose 'Cell' value is not greater than 0);
    background_df has the cell-containing chambers' columns removed.
  """
  #Load both tables; the leading index column of the IntDen csv is not data
  intden_df = pd.read_csv(data).iloc[:, 1:]
  pet_df = pd.read_csv(pet)

  #Report the total of the 'Cell' column
  cells = pet_df['Cell'].sum()
  print(f'{pet[43:]} contains {cells} cells')

  #Work out which chamber columns hold a cell and which do not
  occupied_names = []
  empty_names = []
  for position, has_cell in enumerate((pet_df['Cell'] > 0).tolist()):
    name = 'Chamber '+ str(position+1) + ' RawIntDen'
    if has_cell:
      occupied_names.append(name)
    else:
      empty_names.append(name)

  #Each result is the full table minus the other group's columns
  cells_df = intden_df.drop(columns=empty_names)
  background_df = intden_df.drop(columns=occupied_names)
  return cells_df, background_df

def background_subtraction(cells_df, background_df):
    """Subtract the per-row mean background from every cell column.

    Returns:
        (cells_df, average_background_df): the corrected cell table and a
        one-column frame ('Average Background') holding the row-wise mean of
        background_df.
    """
    #Row-wise mean over the background chambers, kept as a one-column frame
    average_background_df = background_df.mean(axis=1).to_frame(name='Average Background')
    #Take that mean away from each cell column, lining rows up by index
    corrected = cells_df.sub(average_background_df['Average Background'], axis=0)
    return corrected, average_background_df

def background_division(cells_df, background_df):
  """Divide every cell column by the per-row mean background.

  Returns:
    (cells_df, average_background_df): the divided cell table and a
    one-column frame ('Average Background') holding the row-wise mean of
    background_df.
  """
  #Row-wise mean over the background chambers, kept as a one-column frame
  average_background_df = background_df.mean(axis=1).to_frame(name='Average Background')
  #Divide each cell column by that mean, lining rows up by index
  ratio = cells_df.div(average_background_df['Average Background'], axis=0)
  return ratio, average_background_df

def dicarlo_norm(cells_df, background_df):
  """Divide all cell intensities by the mean of the first background row.

  A single scalar (the mean across background_df's first row) scales the
  whole cells_df table; the scaled table is returned.
  """
  first_row_background = background_df.iloc[0]
  return cells_df / first_row_background.mean()

def binning_correction(cells_df):
  """Convert each column to fold change relative to its own first row.

  Every value is divided by the value in the first row of the same column.
  The first row is taken by position, so the table does not need an index
  label 0. The input frame is left unmodified; a new frame is returned.

  Args:
    cells_df: Table of intensities, one column per chamber.

  Returns:
    A new DataFrame with the same index and columns holding the fold changes.
    A table with no rows or no columns is returned as an unchanged copy.
  """
  #Nothing to normalise against when there is no first row (or no columns)
  if cells_df.empty:
    return cells_df.copy()
  #axis=1 matches the first-row values to the columns they came from
  return cells_df.div(cells_df.iloc[0], axis=1)

def format_background_df(average_background_df, cell_type, intden_file):
  """Add 'Time', 'Cell Type' and 'File' columns to the background table.

  The frame is modified in place and the same object is returned. Its index
  is moved into a leading 'Time' column; cell_type and intden_file are
  written to every row of 'Cell Type' and 'File'.
  """
  table = average_background_df

  #Turn the index into a 'Time' column at the front of the table
  table.index.name = 'Time'
  table.reset_index(inplace=True)
  table.rename(columns={'index': 'Time'}, inplace=True)

  #Label every row with the sample and the file it came from
  table['Cell Type'] = cell_type
  table['File'] = intden_file
  return table



Use the functions to create csv files of cell intensities and background intensities

In [None]:
#Utilizing all functions:
#Pair every IntDen csv with its petdet csv, keep only the cell containing
#chambers and convert their intensities to fold change vs t=0.

files = find_files()
save_cells_path = '/content/drive/My Drive/MMP9 Strip Plot/'
save_background_path = '/content/drive/My Drive/MMP9 Strip Plot/Background/'

#Sort the google drive and separate files
intden_files, petdet_files = sort_drive(files)

#Loop through each file within the lists, zip allows iteration of two lists in parallel
for intden_file, petdet_file in zip(intden_files, petdet_files):

  #Get the dates of each file. get_date may hand back None instead of a pair
  #when the dates differ, so check the result before unpacking it.
  dates = get_date(intden_file, petdet_file)

  #Stop loop if the dates of files do not match
  if dates is None or dates[0] != dates[1]:
    break
  date_data, date_pet = dates

  #Get cell type from file name
  cell_type = extract_sample_name(intden_file)

  #Rename Cells if necessary
  cell_type = {'MDA': 'MDA MB 231', 'PMA': 'MDA + PMA'}.get(cell_type, cell_type)

  #Create dataframes
  cells_df, background_df = create_cells_df_and_background_df(intden_file, petdet_file)

  #Alternative normalisations, currently switched off:
  #Normalize df by dividing each value by t0 value
  # dicarlo_df = dicarlo_norm(cells_df, background_df)

  # #Background subtract the average background chamber at each time point to cell containing chambers, return avg background
  # cells_df, average_background_df = background_subtraction(cells_df, background_df)

  # cells_df, average_background_df = background_division(cells_df, background_df)

  #Binning correction: change intensity to fold change vs chamber intensity at t=0
  cells_df = binning_correction(cells_df)

  # #Format average background (Add time column, cell type, filename)
  # average_background_df = format_background_df(average_background_df, cell_type, intden_file)

  #Declare paths for processed files
  cells_formatted_name = f"{save_cells_path}{date_data}_{cell_type}_cells_only.csv"
  background_formatted_name = f"{save_background_path}{date_data}_{cell_type}_background.csv"

  #Save processed files
  #NOTE: every to_csv call below is commented out, so nothing is written to
  #Drive at present even though the message further down says "saved".
  # cells_df.to_csv(cells_formatted_name)

  # dicarlo_df.to_csv(cells_formatted_name)
  # average_background_df.to_csv(background_formatted_name)

  print(f"{cells_formatted_name} saved")
  # print(f"{background_formatted_name} saved \n")

print('we done')

Inteden_files: ['/content/drive/My Drive/MMP9 Before Petdet/20220527_A375_IntDen.csv', '/content/drive/My Drive/MMP9 Before Petdet/20220607_A375_IntDen.csv', '/content/drive/My Drive/MMP9 Before Petdet/20220706_MDA_IntDen.csv', '/content/drive/My Drive/MMP9 Before Petdet/20220803_A375_IntDen.csv', '/content/drive/My Drive/MMP9 Before Petdet/20221012_PMA_IntDen.csv', '/content/drive/My Drive/MMP9 Before Petdet/20221102_PMA_IntDen.csv', '/content/drive/My Drive/MMP9 Before Petdet/20221109_PMA_IntDen.csv', '/content/drive/My Drive/MMP9 Before Petdet/20221116_MDA_IntDen.csv', '/content/drive/My Drive/MMP9 Before Petdet/20221121_MDA_IntDen.csv']
petdet_files: ['/content/drive/My Drive/MMP9 Before Petdet/20220527_A375_petdet.csv', '/content/drive/My Drive/MMP9 Before Petdet/20220607_A375_petdet.csv', '/content/drive/My Drive/MMP9 Before Petdet/20220706_MDA_petdet.csv', '/content/drive/My Drive/MMP9 Before Petdet/20220803_A375_petdet.csv', '/content/drive/My Drive/MMP9 Before Petdet/20221012_

In [None]:
def make_detective(intden_files, petdet_files):
  """Combine every petdet csv into one table annotated with file details.

  For each IntDen/petdet pair whose dates match, the petdet csv is read, its
  first column and its 'Max Intensity' column are removed, and the columns
  'Petdet File', 'Matching File', 'Date' and 'Cell Type' are added. The
  first pair whose dates do not match stops the loop.

  Args:
    intden_files: IntDen csv paths, in the same order as petdet_files.
    petdet_files: petdet csv paths.

  Returns:
    A DataFrame with a fresh 0..n-1 index and every column whose name does
    not start with 'Unnamed'. If no pair is processed the table is empty but
    still has the header columns.

  Reads the module-level ``save_cells_path`` to build 'Matching File'.
  """
  headers = {'Chamber #': [], 'Cell':[], 'Petdet File': [], 'Matching File':[]}
  #The empty header frame goes first so its columns lead the combined table
  frames = [pd.DataFrame(data=headers)]

  for intden_file, petdet_file in zip(intden_files, petdet_files):
    #Get the dates of each file. get_date may hand back None instead of a
    #pair when the dates differ, so check the result before unpacking it.
    dates = get_date(intden_file, petdet_file)

    #Stop loop if the dates of files do not match
    if dates is None or dates[0] != dates[1]:
      break
    date_data, date_pet = dates

    #Get cell type from file name, renaming cells if necessary
    cell_type = extract_sample_name(intden_file)
    cell_type = {'MDA': 'MDA MB 231', 'PMA': 'MDA + PMA'}.get(cell_type, cell_type)

    matching_file = f"{save_cells_path}{date_data}_{cell_type}_cells_only.csv"

    temp_df = pd.read_csv(petdet_file)
    # temp_df = temp_df[temp_df['Cell']>0]
    temp_df = temp_df.iloc[:, 1:] #remove first unnamed column
    temp_df = temp_df.drop('Max Intensity', axis=1)
    temp_df['Petdet File'] = petdet_file
    temp_df['Matching File'] = matching_file
    temp_df['Date'] = date_pet
    temp_df['Cell Type'] = cell_type
    frames.append(temp_df)

  #Join everything once; ignore_index gives the combined table a fresh index
  unfiltered_df = pd.concat(frames, axis=0, ignore_index=True)
  return unfiltered_df.filter(regex='^(?!Unnamed).*')

the_detective = make_detective(intden_files, petdet_files)

In [None]:
#Destination of the combined detective table
detective_save_path = '/content/drive/My Drive/the_detective.csv'

#Background path, built from the date_data / cell_type values left over from the processing loop
background_formatted_name = f"{save_background_path}{date_data}_{cell_type}_background.csv"

#Write the detective table to Drive
the_detective.to_csv(detective_save_path)

In [None]:
#Capture statistics over every chamber in the detective table
count = the_detective['Chamber #'].count()
cell_sum = the_detective['Cell'].sum()
#Per-column counts of non-missing values for the chambers holding more than one cell
multi_capture = the_detective[the_detective['Cell'] > 1].count()
#Read the count by its 'Cell' label; an integer key on this string-labelled
#Series relies on a deprecated positional fallback
multi_count = multi_capture['Cell']

print(cell_sum)
print(count)
print(multi_count)
#This is 1 - (multi capture chambers / all chambers), i.e. the share of chambers WITHOUT multi capture
percent_multi = 1- multi_count/ count
print(f'{percent_multi:.1%}')


568.0
1440
37
97.4%


In [None]:
# Use this block if the detective includes 0 captures

total_chambers = the_detective['Chamber #'].count()
cell_sum = the_detective['Cell'].sum()

#Each of these is a Series of per-column counts of non-missing values
double_capture = the_detective[the_detective['Cell'] == 2].count()
multi_capture = the_detective[the_detective['Cell'] > 1].count()

#Column-wise sum over the chambers holding at least one cell (not used below)
total_capture = the_detective[the_detective['Cell'] > 0].sum()
single_capture = the_detective[the_detective['Cell'] == 1].count()

#Read the counts by their 'Cell' label; an integer key on these
#string-labelled Series relies on a deprecated positional fallback
double_count = double_capture['Cell']
multi_count = multi_capture['Cell']
single_count = single_capture['Cell']

print(f'Number of chambers: {total_chambers} ({total_chambers} / 160 = {total_chambers / 160:.0f} experiments)')
print(f'Number of double capture: {double_count}')
print(f'Number of multi capture (>= 2): {multi_count}')

print(f'Number of Single capture: {single_count}')

percent_double = double_count/ total_chambers
print(f'Double Capture / Total chambers: {percent_double:.1%}')

percent_single = single_count/ total_chambers
print(f'Single capture / Total chambers: {percent_single:.1%}')

#percent_single is reused: from here on it is the ratio to the total cell count
percent_single = single_count/ cell_sum
print(f'Single capture / cell_sum: {percent_single:.1%}')


Chamber #        492
Cell             492
Petdet File      492
Matching File    492
Date             492
Cell Type        492
dtype: int64