In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the three tab-separated split files in one pass. header=None keeps the
# first line of each file as data, so columns are labelled 0, 1, 2, ...
train, val, test = (
    pd.read_csv(split_path, sep='\t', header=None)
    for split_path in (
        '../data/train.txt',
        '../data/val.txt',
        '../data/test.txt',
    )
)

# Train data

In [3]:
# Size of the training split as (number of rows, number of columns).
train.shape

(1181, 3)

In [4]:
# Report how many distinct values the training split holds in column 0
# (cell line) and column 1 (SMILES).
print(f'unique cell line: {len(set(train[0]))}')
print(f'unique SMILES: {len(set(train[1]))}')

unique cell line: 55
unique SMILES: 37


# Validation data

In [5]:
# Size of the validation split as (number of rows, number of columns).
val.shape

(393, 3)

In [6]:
# Report how many distinct values the validation split holds in column 0
# (cell line) and column 1 (SMILES).
print(f'unique cell line: {len(set(val[0]))}')
print(f'unique SMILES: {len(set(val[1]))}')

unique cell line: 55
unique SMILES: 37


In [7]:
# Number of validation rows per distinct column-0 value (cell line);
# value_counts lists the most frequent value first.
val[0].value_counts()

MDAMB231_BREAST                                12
RXF393_KIDNEY                                  11
HCT15_LARGE_INTESTINE                          11
CAKI1_KIDNEY                                   11
SKMEL5_SKIN                                    11
T47D_BREAST                                    10
IGROV1_OVARY                                   10
SW620_LARGE_INTESTINE                          10
OVCAR5_OVARY                                   10
NCIH226_LUNG                                    9
786O_KIDNEY                                     9
K562_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE         9
SKMEL28_SKIN                                    9
HL60_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE         9
TK10_KIDNEY                                     9
SKMEL2_SKIN                                     8
CCRFCEM_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE      8
HOP92_LUNG                                      8
SR786_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE        8
SKOV3_OVARY                                     8


# Test data

In [8]:
# Reload the test split from its tab-separated file (no header row), using
# the same path and options as the initial load cell.
test = pd.read_csv('../data/test.txt', header=None, sep='\t')

In [9]:
# Size of the test split as (number of rows, number of columns).
test.shape

(393, 3)

In [10]:
# Report how many distinct values the test split holds in column 0
# (cell line) and column 1 (SMILES).
print(f'unique cell line: {len(set(test[0]))}')
print(f'unique SMILES: {len(set(test[1]))}')

unique cell line: 55
unique SMILES: 37


In [11]:
# Number of test rows per distinct column-0 value (cell line);
# value_counts lists the most frequent value first.
test[0].value_counts()

M14_SKIN                                       14
DU145_PROSTATE                                 14
HCC2998_LARGE_INTESTINE                        13
SKMEL2_SKIN                                    11
SR786_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE       11
SKMEL28_SKIN                                   10
HS578T_BREAST                                  10
LOXIMVI_SKIN                                   10
MCF7_BREAST                                    10
OVCAR4_OVARY                                   10
CAKI1_KIDNEY                                   10
HL60_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE        10
NCIH23_LUNG                                    10
NCIH322_LUNG                                    9
HCT15_LARGE_INTESTINE                           9
MALME3M_SKIN                                    9
IGROV1_OVARY                                    8
UACC62_SKIN                                     8
EKVX_LUNG                                       8
NCIH226_LUNG                                    8


In [12]:
# Build one table with the number of rows per cell line (column 0) in each
# split, plus a final 'total' row holding the column sums.
#
# The three count Series are aligned on their index with an outer join
# (pd.concat along axis=1), so a cell line that is missing from one split is
# kept with a count of 0. Chained index merges default to an inner join, which
# silently drops such cell lines and makes the 'total' row undercount.
df = (
    pd.concat(
        [
            train[0].value_counts(),
            val[0].value_counts(),
            test[0].value_counts(),
        ],
        axis=1,
        keys=['train', 'val', 'test'],  # explicit column names, in this order
    )
    .fillna(0)    # absent from a split -> zero rows in that split
    .astype(int)  # NaN placeholders upcast to float; restore integer counts
)
df.loc['total'] = df.sum(axis=0)

In [13]:
# Display the per-cell-line count table (columns: train, val, test).
df

Unnamed: 0,train,val,test
SF268_CENTRAL_NERVOUS_SYSTEM,29,4,4
KM12_LARGE_INTESTINE,28,4,4
HCT116_LARGE_INTESTINE,28,6,3
RPMI8226_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,27,3,7
OVCAR8_OVARY,27,4,6
A498_KIDNEY,25,4,6
NCIH322_LUNG,25,2,9
SKOV3_OVARY,24,8,3
SNB75_CENTRAL_NERVOUS_SYSTEM,24,7,5
SF295_CENTRAL_NERVOUS_SYSTEM,24,6,7


In [14]:
# Print the count table as a Markdown-formatted text table.
print(df.to_markdown())

|                                             |   train |   val |   test |
|:--------------------------------------------|--------:|------:|-------:|
| SF268_CENTRAL_NERVOUS_SYSTEM                |      29 |     4 |      4 |
| KM12_LARGE_INTESTINE                        |      28 |     4 |      4 |
| HCT116_LARGE_INTESTINE                      |      28 |     6 |      3 |
| RPMI8226_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE |      27 |     3 |      7 |
| OVCAR8_OVARY                                |      27 |     4 |      6 |
| A498_KIDNEY                                 |      25 |     4 |      6 |
| NCIH322_LUNG                                |      25 |     2 |      9 |
| SKOV3_OVARY                                 |      24 |     8 |      3 |
| SNB75_CENTRAL_NERVOUS_SYSTEM                |      24 |     7 |      5 |
| SF295_CENTRAL_NERVOUS_SYSTEM                |      24 |     6 |      7 |
| A549_LUNG                                   |      24 |     7 |      6 |
| NCIH522_LUNG           