In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Show floats with a single decimal place in DataFrame output.
# The fully qualified option key is used on purpose: the abbreviated
# 'precision' pattern is ambiguous on newer pandas releases (it also matches
# 'styler.format.precision') and raises OptionError there.
pd.set_option('display.precision', 1)

# Directory that contains the Excel workbook(s) read by this notebook.
folder = './032018_48hrs/'

# Full path of the cytoplasm workbook loaded below.
path_cyt = folder + 'Cytoplasm_Cancer_032018.xlsx'

# Column names assigned (by position) to every worksheet that is read.
markers = [
    'Cell ID',
    # Cycle 1
    'Nucleus (Cycle 1 - DAPI)',
    'BT474-H2BeGFP (Cycle 1 - FITC)',
    # Cycle 2
    'MCL1 (Cycle 2 - Cy3)',
    'p-p65NFkB (Cycle 2 - Cy5)',
    # Cycle 3
    'p-Akt (Cycle 3 - FITC)',
    'aSMA (Cycle 3 - Cy3)',
    'p-gH2AX (Cycle 3 - Cy5)',
    # Cycle 4
    'GRP78 (Cycle 4 - FITC)',
    'Vimentin (Cycle 4 - Cy3)',
]

# Worksheet names to load, listed in the order they are concatenated.
# Each group of three holds the Cntrl / 30nM / 300nM sheets of one culture.
tabs = [
    # BT474
    'BT474,Cntrl(rb,c2-4)',
    'BT474,30nM(rb,c5-7)',
    'BT474,300nM(rb,c8-10)',
    # BT474+C3H-scr
    'BT474+C3H-scr,Cntrl(rc,c2-4)',
    'BT474+C3H-scr,30nM(rc,c5-7)',
    'BT474+C3H-scr,300nM(rc,c8-10)',
    # BT474+C3H-1
    'BT474+C3H-1,Cntrl(rd,c2-4)',
    'BT474+C3H-1,30nM(rd,c5-7)',
    'BT474+C3H-1,300nM(rd,c8-10)',
    # BT474+C3H-3
    'BT474+C3H-3,Cntrl(re,c2-4)',
    'BT474+C3H-3,30nM(re,c5-7)',
    'BT474+C3H-3,300nM(re,c8-10)',
]

def concat_df(path, sheets=None, names=None):
    """Read several worksheets of one Excel workbook into a single DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the Excel workbook.
    sheets : sequence of str, optional
        Worksheet names to load, in the order their rows are stacked.
        Defaults to the module-level ``tabs``.
    names : sequence of str, optional
        Column names applied to every worksheet. Defaults to the
        module-level ``markers``.

    Returns
    -------
    pandas.DataFrame
        Rows of all requested worksheets stacked in order, re-indexed
        from 0 (``ignore_index=True``).
    """
    sheets = tabs if sheets is None else sheets
    names = markers if names is None else names

    # Open the workbook once and let the context manager close it, instead of
    # opening a new (never closed) file handle for every worksheet.
    with pd.ExcelFile(path) as workbook:
        frames = [
            pd.read_excel(workbook, sheet_name=sheet, names=names)
            for sheet in sheets
        ]
    return pd.concat(frames, ignore_index=True)

In [2]:
df_cyt = concat_df(path_cyt)
# Derive the group index from the billions digit of 'Cell ID':
# digit 1-3 -> 0, 4-6 -> 1, 7-9 -> 2 (a digit of 0 would give -1).
# Vectorized integer arithmetic replaces the per-row Python lambda and yields
# the same int64 values.
# NOTE(review): presumably the three groups are the Cntrl / 30nM / 300nM
# conditions named in `tabs` — confirm against the Cell ID encoding.
df_cyt['Group'] = ((df_cyt['Cell ID'] // 10**9) % 10 - 1) // 3
print('*** Cytoplasm Dataset:')
df_cyt

*** Cytoplasm Dataset:


Unnamed: 0,Cell ID,Nucleus (Cycle 1 - DAPI),BT474-H2BeGFP (Cycle 1 - FITC),MCL1 (Cycle 2 - Cy3),p-p65NFkB (Cycle 2 - Cy5),p-Akt (Cycle 3 - FITC),aSMA (Cycle 3 - Cy3),p-gH2AX (Cycle 3 - Cy5),GRP78 (Cycle 4 - FITC),Vimentin (Cycle 4 - Cy3),Group
0,11100020013,1.4e+06,311103.3,101851.6,43709.0,73056.7,30051.5,6120.7,16078.1,12079.2,0
1,11100030022,1.9e+06,162176.2,183881.8,104492.1,206216.4,67026.0,23716.0,23564.9,31449.9,0
2,11100040016,1.3e+06,538824.0,84439.7,45970.9,70549.5,30217.9,1426.4,20359.0,16116.3,0
3,11100050019,1.3e+06,217196.1,58911.0,26316.6,27367.7,29239.0,2993.7,17980.0,21591.7,0
4,11100060017,6.8e+05,55856.2,80966.3,32906.0,21788.7,23781.4,1625.9,7795.0,15434.6,0
...,...,...,...,...,...,...,...,...,...,...,...
343663,49911711034,2.2e+05,16640.2,44580.2,4733.4,27072.3,15436.3,8104.0,7305.9,17668.4,2
343664,49911721034,2.5e+05,96468.8,40339.2,16100.2,26939.8,24199.1,10418.5,12620.9,15018.5,2
343665,49911731035,2.9e+05,162932.7,30150.8,46467.5,29489.4,16019.5,9917.5,12035.9,19004.7,2
343666,49911741036,2.3e+05,13803.6,35212.8,7303.4,17433.2,14706.9,2545.4,6376.2,16880.5,2


In [3]:
# Summarize the Cytoplasm dataset: column names, non-null counts and dtypes
df_cyt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343668 entries, 0 to 343667
Data columns (total 11 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   Cell ID                         343668 non-null  int64  
 1   Nucleus (Cycle 1 - DAPI)        343668 non-null  float64
 2   BT474-H2BeGFP (Cycle 1 - FITC)  343668 non-null  float64
 3   MCL1 (Cycle 2 - Cy3)            343668 non-null  float64
 4   p-p65NFkB (Cycle 2 - Cy5)       343668 non-null  float64
 5   p-Akt (Cycle 3 - FITC)          343668 non-null  float64
 6   aSMA (Cycle 3 - Cy3)            343668 non-null  float64
 7   p-gH2AX (Cycle 3 - Cy5)         343668 non-null  float64
 8   GRP78 (Cycle 4 - FITC)          343668 non-null  float64
 9   Vimentin (Cycle 4 - Cy3)        343668 non-null  float64
 10  Group                           343668 non-null  int64  
dtypes: float64(9), int64(2)
memory usage: 28.8 MB


In [4]:
# Check whether any Cell ID occurs more than once in the Cytoplasm dataset.
# value_counts() lists the most frequent values first, so a top count of 1
# means every Cell ID is unique.
df_cyt['Cell ID'].value_counts()

34402740219    1
47805560560    1
27408570854    1
27507130853    1
41304830435    1
              ..
37710570681    1
27903960382    1
29902720323    1
32902180167    1
48907010612    1
Name: Cell ID, Length: 343668, dtype: int64

In [5]:
# Sanity-check the derived 'Group' column: number of cells in each group
df_cyt['Group'].value_counts()

0    118161
1    113238
2    112269
Name: Group, dtype: int64

In [6]:
# Descriptive statistics of the measurement columns, i.e. every column except
# the first ('Cell ID') and the last ('Group')
df_cyt.iloc[:, 1:-1].describe()

Unnamed: 0,Nucleus (Cycle 1 - DAPI),BT474-H2BeGFP (Cycle 1 - FITC),MCL1 (Cycle 2 - Cy3),p-p65NFkB (Cycle 2 - Cy5),p-Akt (Cycle 3 - FITC),aSMA (Cycle 3 - Cy3),p-gH2AX (Cycle 3 - Cy5),GRP78 (Cycle 4 - FITC),Vimentin (Cycle 4 - Cy3)
count,340000.0,340000.0,340000.0,340000.0,340000.0,340000.0,340000.0,340000.0,340000.0
mean,540000.0,53000.0,86000.0,37000.0,40000.0,40000.0,9200.0,19000.0,33000.0
std,370000.0,99000.0,74000.0,58000.0,35000.0,62000.0,16000.0,27000.0,59000.0
min,39000.0,1300.0,0.0,7.6,0.0,0.0,0.0,0.0,0.0
25%,310000.0,14000.0,45000.0,11000.0,20000.0,19000.0,3600.0,12000.0,20000.0
50%,440000.0,21000.0,66000.0,19000.0,29000.0,27000.0,5900.0,16000.0,26000.0
75%,640000.0,41000.0,100000.0,40000.0,47000.0,42000.0,11000.0,23000.0,37000.0
max,13000000.0,2700000.0,8800000.0,4700000.0,3600000.0,5500000.0,1200000.0,4200000.0,8800000.0


In [None]:
# Pairwise plots of the measurement columns (every column except the first
# and the last), coloured by 'Group'.
# NOTE(review): every row of df_cyt is drawn in each panel, which may be slow
# for a frame of this size — consider plotting a random sample instead.
measurement_cols = df_cyt.columns[1:-1]
sns.pairplot(df_cyt, vars=measurement_cols, hue='Group')
plt.show()