# Select cells from database

This notebook is intended to retrieve single-cells from a database file.

Requirements:
- The segmented objects must be named exactly `Nuclei`, `Cells` and `Cytoplasm`;
- Output database .db file from ExportToDatabase (CellProfiler);
- Each database file is named after the Metadata_Plate name, e.g. 220607_092050_Plate_1;
    - Provide a plate_list with all plate names: `plate_list = ['plate1', 'plate2', ...]`
- All database files must be organized as follows:
    ```
    |- backend 
    |   |- <project_name>
    |   |   |- <plate_name>
    |   |   |   |- <plate1.db>
    |   |   |- <plate_name>
    |   |   |   |- <plate2.db>
    ...
    ```


# Import libraries

In [1]:
import pandas as pd
import numpy as np
from functools import reduce
from sklearn.cluster import KMeans
# from openpyxl import Workbook, load_workbook
import os
import easygui as eg
from numpy import random
import sqlite3
import pycytominer
from pycytominer.cyto_utils.cells import SingleCells

## Import database file

### Import every plate and then join them

In [2]:
# Plates to load; each entry is used as the stem of a "<plate>.db" file.
plate_list = [
    '190905_182437_Plate_1',
    '190906_172229_Plate_1',
]

# Object tables to read from each database (queried as Per_<compartment>).
compartments = [
    'Cells',
    'Nuclei',
    'Cytoplasm',
]

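- `common_path` is the location on your computer of the directory that contains the plate `.db` files (up to and including `<project_name>`). It is passed directly to `sqlite3.connect`, so no `sqlite:///` prefix is needed.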

In [3]:
# Directory holding the per-plate .db files; a plain raw string is enough because
# nothing is interpolated into it.
common_path = r"G://My Drive//Fernanda Mestrado//Paper Mestrado//Redo_Analysis_Paper//analysis//Vimentin"

## Open database and list tables

In [4]:
# Build one single-cell DataFrame per plate: the Per_<compartment> tables placed
# side by side, annotated with plate/well/site metadata taken from Per_Image.
df_join = []
for plate in plate_list:
    db_path = fr"{common_path}//{plate}.db"
    # sqlite3.connect() silently creates an empty database when the file is missing,
    # which would only surface later as "no such table"; fail early instead.
    if not os.path.isfile(db_path):
        raise FileNotFoundError(f"Database file not found: {db_path}")

    conn = sqlite3.connect(db_path)
    try:
        df_image = pd.read_sql_query("SELECT * FROM Per_Image", conn)
        df_list = []
        for eachcompartment in compartments:
            df_temp = pd.read_sql_query(f"SELECT * FROM Per_{eachcompartment}", conn)
            print(df_temp)
            df_list.append(df_temp)
    finally:
        # Always release the database handle, even if one of the queries fails.
        conn.close()

    # Expose the image-level metadata under the Metadata_* names used downstream.
    df_image['Metadata_Site'] = df_image['Image_Metadata_Site']
    df_image['Metadata_Plate'] = df_image['Image_Metadata_Plate']
    df_image['Metadata_Well'] = df_image['Image_Metadata_Well']
    metadata_cols = df_image[['ImageNumber', 'Metadata_Plate', 'Metadata_Well', 'Metadata_Site']]

    # NOTE(review): the column-wise concat lines rows up by position, so it assumes
    # every Per_<compartment> table lists the same objects in the same order -- confirm.
    df = pd.concat(df_list, axis='columns')
    df = df.loc[:, ~df.columns.duplicated()].copy()  # keep the first copy of repeated columns (e.g. ImageNumber)

    # Right join keeps every object row and attaches its image's metadata.
    df = metadata_cols.merge(df, on='ImageNumber', how="right")
    df_join.append(df)

       ImageNumber  Cells_Number_Object_Number  Cells_AreaShape_Area  \
0                5                           1                2154.0   
1                5                           2                2633.0   
2                5                           3                2701.0   
3                5                           4                4065.0   
4                5                           5               11825.0   
...            ...                         ...                   ...   
27868          540                          86                4204.0   
27869          540                          87                5420.0   
27870          540                          88                4307.0   
27871          540                          89                5593.0   
27872          540                          90                4096.0   

       Cells_AreaShape_BoundingBoxArea  Cells_AreaShape_BoundingBoxMaximum_X  \
0                               3450.0                 

# make the number of columns equal

In [5]:
# Columns present in every plate, listed in the column order of the plate with the
# fewest columns. Picking that plate by size (instead of a fixed index) also works
# for a single plate or when a different plate is the narrowest one.
narrowest_plate = min(df_join, key=lambda plate_df: plate_df.shape[1])
cols_keep = [
    col for col in narrowest_plate.columns
    if all(col in plate_df.columns for plate_df in df_join)
]

In [6]:
# Restrict every plate (not only the first) to the columns listed in cols_keep so
# that all frames line up when they are stacked; columns a frame lacks are skipped.
df_join = [plate_df.filter(items=cols_keep) for plate_df in df_join]

In [7]:
# Stack the per-plate frames row-wise and renumber the rows from 0.
df_final = pd.concat(df_join, axis=0, ignore_index=True)

In [8]:
# Show the combined single-cell table (the notebook renders a truncated view).
df_final

Unnamed: 0,ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Site,Cells_Number_Object_Number,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,Cells_AreaShape_BoundingBoxMaximum_X,Cells_AreaShape_BoundingBoxMaximum_Y,Cells_AreaShape_BoundingBoxMinimum_X,...,Cytoplasm_Texture_Variance_Zeb1_10_02_256,Cytoplasm_Texture_Variance_Zeb1_10_03_256,Cytoplasm_Texture_Variance_Zeb1_20_00_256,Cytoplasm_Texture_Variance_Zeb1_20_01_256,Cytoplasm_Texture_Variance_Zeb1_20_02_256,Cytoplasm_Texture_Variance_Zeb1_20_03_256,Cytoplasm_Texture_Variance_Zeb1_5_00_256,Cytoplasm_Texture_Variance_Zeb1_5_01_256,Cytoplasm_Texture_Variance_Zeb1_5_02_256,Cytoplasm_Texture_Variance_Zeb1_5_03_256
0,5,190905_182437_Plate_1,A1,13,1,2154.0,3450.0,330.0,98.0,255.0,...,25.917463,23.087755,14.232591,9.850569,34.637864,21.355556,18.770700,17.053989,19.852323,18.616000
1,5,190905_182437_Plate_1,A1,13,2,2633.0,4485.0,257.0,128.0,192.0,...,23.538267,33.245228,18.514566,34.826559,23.511136,42.694252,32.714971,23.897312,34.017966,39.090269
2,5,190905_182437_Plate_1,A1,13,3,2701.0,5412.0,193.0,132.0,111.0,...,21.032543,23.152448,14.358481,18.127809,14.088817,25.856383,29.640196,28.610458,24.189052,25.081683
3,5,190905_182437_Plate_1,A1,13,4,4065.0,7276.0,335.0,161.0,228.0,...,25.137710,23.079472,33.735991,55.690911,31.030905,28.997807,37.207646,36.875093,28.959807,27.124813
4,5,190905_182437_Plate_1,A1,13,5,11825.0,22464.0,597.0,208.0,453.0,...,9.736053,10.015848,9.089779,10.062169,10.056093,11.397098,9.866344,9.868549,9.865196,9.994464
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52126,450,190906_172229_Plate_1,C6,9,67,4446.0,6240.0,468.0,788.0,388.0,...,34.265428,34.007666,38.916259,34.572268,23.981329,24.465994,36.118573,37.524208,35.676574,36.436240
52127,450,190906_172229_Plate_1,C6,9,68,2499.0,3844.0,526.0,780.0,464.0,...,26.621473,23.923010,38.094048,47.133320,21.211441,42.561451,31.400296,34.151226,24.980342,26.436576
52128,450,190906_172229_Plate_1,C6,9,69,4001.0,7719.0,955.0,838.0,872.0,...,12.638243,14.223370,15.289232,13.434715,15.331813,12.827750,13.051002,13.433531,12.581271,12.560913
52129,450,190906_172229_Plate_1,C6,9,70,3702.0,5688.0,522.0,837.0,443.0,...,10.846957,11.869543,16.365183,17.928186,12.368661,14.335752,13.096302,11.336473,11.028587,11.261117


In [33]:
# NOTE(review): this cell is disabled scratch code; nothing in it runs.
# If re-enabled it would rename the Per_Image table to "image", copy
# Image_Metadata_Site into Metadata_Site, and write the frame back as table "image".
# renameTable = "ALTER TABLE Per_Image RENAME TO image"
# conn_cursor.execute(renameTable)
# df['Metadata_Site'] = df['Image_Metadata_Site']
# df.to_sql('image', conn, if_exists='replace', index=False)

534

# Export

In [11]:
# Ask where to write the export and which project name to prefix the file with.
output_path = eg.diropenbox(msg="Choose an output folder", default=r"F:")
if output_path is None:
    # diropenbox returns None when the dialog is cancelled; stop here with a clear
    # message instead of a TypeError when the path is concatenated later on.
    raise ValueError("No output folder selected; re-run this cell and choose a folder.")
print('Path to save the single cell file', output_path)
project_name = input('Provide project name: ')

Path to save the single cell file G:\My Drive\Fernanda Mestrado\Paper Mestrado\Redo_Analysis_Paper\profiles\Vimentin


In [12]:
# Build the destination once so the written file and the reported path always match.
# Note: project_name is joined directly to 'single_cells.csv' with no separator.
export_file = output_path + r'/' + project_name + 'single_cells.csv'
df_final.to_csv(export_file)
print('Successfully exported to:', export_file)

Successfully exported to: G:\My Drive\Fernanda Mestrado\Paper Mestrado\Redo_Analysis_Paper\profiles\Vimentin/Vimentinsingle_cells.csv
