In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm

In [3]:
root = Path('/home/user/data/rxrx1/images/')
train = pd.read_csv(root / "train.csv")
test = pd.read_csv(root / "test.csv")

train_controls = pd.read_csv(root / "train_controls.csv")
test_controls = pd.read_csv(root / "test_controls.csv")

rxrx_config = pd.read_csv(root.parent / "rxrx1.csv")

In [4]:
def replace_with_rxrx_id(kaggle_df, rxrx_df):
    for i in kaggle_df['id_code'].unique():
        rxrx_sirna_id = rxrx_df.loc[rxrx_df['well_id']==i, 'sirna_id']
        if not rxrx_sirna_id.empty and len(rxrx_sirna_id)==2:
            kaggle_df.loc[kaggle_df['id_code']==i, 'sirna'] = list(rxrx_sirna_id)[0]
        else:
            print(f'No match for experiment {i}')
            print(f'From shape {kaggle_df.shape} dropping {kaggle_df[kaggle_df["id_code"]==i]}')
            kaggle_df.drop(kaggle_df[kaggle_df['id_code']==i].index, axis=0, inplace=True)
            print(f'New shape {kaggle_df.shape}')
    return kaggle_df

def get_ids_dict(kaggle_df, rxrx_df):
    ids = {}
    for i in kaggle_df['id_code'].unique():
        rxrx_sirna_id = rxrx_df.loc[rxrx_df['well_id']==i, 'sirna_id']
        if not rxrx_sirna_id.empty and len(rxrx_sirna_id)==2:
            print(kaggle_df.loc[kaggle_df['id_code']==i, 'sirna'])
            print(list(rxrx_sirna_id)[0])
            break
            ids[kaggle_df.loc[kaggle_df['id_code']==i, 'sirna']] = list(rxrx_sirna_id)[0]
        else:
            print(f'No match for experiment {i}')
    return ids

### Check sizes of Kaggle and RxRx labels 

In [5]:
train['well_type'] = 'unknown'
test['well_type'] = 'unknown'
kaggle_all_train = pd.concat([train, train_controls])
kaggle_all_test = pd.concat([test, test_controls])
print(f'All Kaggle train ids: {kaggle_all_train["id_code"].nunique()}')
print(f'All Kaggle test ids: {kaggle_all_test["id_code"].nunique()}')

All Kaggle train ids: 40614
All Kaggle test ids: 22145


In [6]:
rxrx_train = rxrx_config.loc[rxrx_config['dataset']=='train', :]
rxrx_test = rxrx_config.loc[rxrx_config['dataset']=='test', :]

print(f'All train ids: {rxrx_train["well_id"].nunique()}')
print(f'All test ids: {rxrx_test["well_id"].nunique()}')

All train ids: 40612
All test ids: 22143


In [7]:
# experiments missing from the RxRx dataset
print(f'Missing from RxRx train: \
{set(kaggle_all_train["id_code"]).difference(set(rxrx_train["well_id"]))}')
print(f'Missing from RxRx test: \
{set(kaggle_all_test["id_code"]).difference(set(rxrx_test["well_id"]))}')

Missing from RxRx train: {'HUVEC-06_1_B18', 'RPE-04_3_E04'}
Missing from RxRx test: {'HUVEC-18_3_D23', 'RPE-09_2_J16'}


### Get dictionary of correspondence

### Replace Kaggle well IDs with RxRx

In [7]:
kaggle_all_train = replace_with_rxrx_id(kaggle_all_train, rxrx_train)
kaggle_train = kaggle_all_train.loc[kaggle_all_train['well_type']=='unknown', :]
kaggle_train.drop(['well_type'], axis=1, inplace=True)
kaggle_train.to_csv(root / "kaggle_train.csv", index=False)
kaggle_train_controls = kaggle_all_train[kaggle_all_train['well_type']!='unknown']
kaggle_train_controls.to_csv(root / "kaggle_train_controls.csv", index=False)

No match for experiment HUVEC-06_1_B18
From shape (40614, 6) dropping               id_code experiment  plate well      sirna well_type
13305  HUVEC-06_1_B18   HUVEC-06      1  B18  sirna_777   unknown
New shape (40613, 6)
No match for experiment RPE-04_3_E04
From shape (40613, 6) dropping             id_code experiment  plate well      sirna well_type
29378  RPE-04_3_E04     RPE-04      3  E04  sirna_612   unknown
New shape (40612, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


NameError: name 'kaggle' is not defined

In [9]:
kaggle_all_test = replace_with_rxrx_id(kaggle_all_test, rxrx_test)
kaggle_test = kaggle_all_test.loc[kaggle_all_test['well_type']=='unknown', :]
kaggle_test.drop(['well_type'], axis=1, inplace=True)
kaggle_test.to_csv(root / "kaggle_test.csv", index=False)
kaggle_test_controls = kaggle_all_test[kaggle_all_test['well_type']!='unknown']
kaggle_test_controls_cols = kaggle_test_controls.columns.to_list()
kaggle_test_controls = kaggle_test_controls[kaggle_test_controls_cols[:-2] +
                                            kaggle_test_controls_cols[-2:]]
kaggle_test_controls.to_csv(root / "kaggle_test_controls.csv", index=False)

No match for experiment HUVEC-18_3_D23
From shape (22145, 6) dropping              id_code experiment  plate well well_type sirna
6149  HUVEC-18_3_D23   HUVEC-18      3  D23   unknown   NaN
New shape (22144, 6)
No match for experiment RPE-09_2_J16
From shape (22144, 6) dropping             id_code experiment  plate well well_type sirna
14828  RPE-09_2_J16     RPE-09      2  J16   unknown   NaN
New shape (22143, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
