In [3]:
import sys
import pathlib
import pandas as pd

In [8]:
# Input locations. Resolving with strict=True raises FileNotFoundError right away
# if either directory is missing, instead of failing later during loading.
data_dir = pathlib.Path("../../data").resolve(strict=True)
mitocheck_data_path = data_dir.joinpath("raw/mitocheck_data/extracted_features").resolve(strict=True)


# Output location for processed files; created here if it does not exist yet.
processed_dir = data_dir.joinpath("processed").resolve()
processed_dir.mkdir(exist_ok=True)

## Dataset Loading Procedure

In this section, we load three essential datasets: negative controls, positive controls, and training data.

- **Negative Controls:** These cells are treated with scrambled siRNA, which does not inhibit any specific gene. Hence, no significant effects are expected.
  
- **Positive Controls:** These cells are treated with siRNA targeting specific genes, providing a basis for comparison and validation.

    - **COPB1 siRNA:** COPB1 is involved in intracellular protein trafficking. When targeted by siRNA, reduced COPB1 levels disrupt protein transport, potentially affecting secretion, membrane integrity, and organelle homeostasis. (This control is labeled `COPB` in the `Metadata_Gene` column.)

    - **INCENP siRNA:** INCENP regulates cell division. siRNA against INCENP leads to disrupted mitosis, causing defects in chromosome alignment, kinetochore-microtubule attachment, and cytokinesis, potentially resulting in cell death or genomic instability. (This control is labeled with its Ensembl gene ID, `ENSG00000149503`, in the `Metadata_Gene` column.)

    - **KIF11 siRNA:** KIF11 mediates mitotic spindle formation. Targeting KIF11 with siRNA impairs spindle assembly and chromosome segregation, leading to mitotic arrest, aberrant chromosome segregation, and potential cell cycle defects or cell death.

- **Training Data:** These are labeled datasets meticulously curated from the Mitocheck consortium, serving as the foundation for our analysis.

The shapes of the dataframes are reported in (number of cells, number of columns) format, where the columns comprise both metadata and morphological features.

In [12]:
def load_merged_features(features_dir):
    """Read every ``*.csv.gz`` file in ``features_dir`` into a single dataframe.

    Parameters
    ----------
    features_dir : pathlib.Path
        Directory containing the gzip-compressed CSV files. It must exist.

    Returns
    -------
    pd.DataFrame
        Row-wise concatenation of all files; each file keeps its own index.

    Raises
    ------
    FileNotFoundError
        If ``features_dir`` does not exist or holds no ``*.csv.gz`` files.
    """
    features_dir = features_dir.resolve(strict=True)
    # glob() yields paths in a filesystem-dependent order; sorting makes the row
    # order of the concatenated dataframe reproducible across runs and machines.
    csv_paths = sorted(features_dir.glob("*.csv.gz"))
    if not csv_paths:
        # Without this guard pd.concat([]) fails with "No objects to concatenate",
        # which does not say which directory was empty.
        raise FileNotFoundError(f"No '*.csv.gz' files found in {features_dir}")
    return pd.concat([pd.read_csv(csv_path) for csv_path in csv_paths])


# loading in dataset
neg_df = load_merged_features(mitocheck_data_path / "negative_control_data/merged_features")
pos_df = load_merged_features(mitocheck_data_path / "positive_control_data/merged_features")
training_df = load_merged_features(mitocheck_data_path / "training_data/merged_features")

In [19]:
# report the (rows, columns) shape of each loaded dataset
for label, frame in (
    ("Negative control dataset is:", neg_df),
    ("Positive control dataset is:", pos_df),
    ("training dataset is:", training_df),
):
    print(label, frame.shape)

Negative control dataset is: (779993, 1457)
Positive control dataset is: (612059, 1457)
training dataset is: (64513, 1458)


In [24]:
# distinct gene labels found in the "Metadata_Gene" column of each control set
neg_genes = neg_df["Metadata_Gene"].unique()
pos_genes = pos_df["Metadata_Gene"].unique()
print("siRNA's used for negative controls:", neg_genes)
print("siRNA's used for positive controls:", pos_genes)

siRNA's used for negative controls: ['negative control']
siRNA's used for positive controls: ['COPB' 'ENSG00000149503' 'KIF11']


In [29]:
# count the distinct plates (unique "Metadata_Plate" values) in each dataset
print("Number of plates in the neg dataset", len(neg_df["Metadata_Plate"].unique()))
print("Number of plates in the pos dataset", len(pos_df["Metadata_Plate"].unique()))
print("Number of plates in the training dataset", len(training_df["Metadata_Plate"].unique()))

Number of plates in the neg dataset 510
Number of plates in the pos dataset 510
Number of plates in the training dataset 67


## Exploring the Training Dataset

['Unnamed: 0',
 'Cell_UUID',
 'Location_Center_X',
 'Location_Center_Y',
 'Metadata_Plate',
 'Metadata_Well',
 'Metadata_Site',
 'Metadata_Plate_Map_Name',
 'Metadata_DNA',
 'Metadata_Gene',
 'Metadata_Gene_Replicate',
 'CP__AreaShape_Area',
 'CP__AreaShape_BoundingBoxArea',
 'CP__AreaShape_BoundingBoxMaximum_X',
 'CP__AreaShape_BoundingBoxMaximum_Y',
 'CP__AreaShape_BoundingBoxMinimum_X',
 'CP__AreaShape_BoundingBoxMinimum_Y',
 'CP__AreaShape_Center_X',
 'CP__AreaShape_Center_Y',
 'CP__AreaShape_Compactness',
 'CP__AreaShape_ConvexArea',
 'CP__AreaShape_Eccentricity',
 'CP__AreaShape_EquivalentDiameter',
 'CP__AreaShape_EulerNumber',
 'CP__AreaShape_Extent',
 'CP__AreaShape_FormFactor',
 'CP__AreaShape_MajorAxisLength',
 'CP__AreaShape_MaxFeretDiameter',
 'CP__AreaShape_MaximumRadius',
 'CP__AreaShape_MeanRadius',
 'CP__AreaShape_MedianRadius',
 'CP__AreaShape_MinFeretDiameter',
 'CP__AreaShape_MinorAxisLength',
 'CP__AreaShape_Orientation',
 'CP__AreaShape_Perimeter',
 'CP__AreaShape