In [1]:
import os, gc, pickle, scipy.sparse, lightgbm
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from matplotlib.ticker import MaxNLocator

from sklearn.model_selection import GroupKFold
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error

# Root directory holding the competition files.
# Set the SCMO_DATA_DIR environment variable to point somewhere else
# (e.g. "/kaggle/input/open-problems-multimodal/" on Kaggle) instead of
# editing this cell; the fallback is the original local path.
DATA_DIR = os.environ.get(
    "SCMO_DATA_DIR",
    "/home/wuxinchao/data/project/kaggle_comp_scmo_data/data",
)
FP_CELL_METADATA = os.path.join(DATA_DIR, "metadata.csv")

# CITE-seq input/target files
FP_CITE_TRAIN_INPUTS = os.path.join(DATA_DIR, "train_cite_inputs.h5")
FP_CITE_TRAIN_TARGETS = os.path.join(DATA_DIR, "train_cite_targets.h5")
FP_CITE_TEST_INPUTS = os.path.join(DATA_DIR, "test_cite_inputs.h5")

# Multiome input/target files
FP_MULTIOME_TRAIN_INPUTS = os.path.join(DATA_DIR, "train_multi_inputs.h5")
FP_MULTIOME_TRAIN_TARGETS = os.path.join(DATA_DIR, "train_multi_targets.h5")
FP_MULTIOME_TEST_INPUTS = os.path.join(DATA_DIR, "test_multi_inputs.h5")

# Submission template and evaluation id mapping
FP_SUBMISSION = os.path.join(DATA_DIR, "sample_submission.csv")
FP_EVALUATION_IDS = os.path.join(DATA_DIR, "evaluation_ids.csv")

# CROSS_VALIDATE gates the cross-validation cell further down.
CROSS_VALIDATE = True
# NOTE(review): SUBMIT is not read by any cell visible in this part of the notebook.
SUBMIT = False

In [2]:
def correlation_score(y_true, y_pred):
    """Scores the predictions according to the competition rules.

    Computes the Pearson correlation coefficient between each row of
    ``y_true`` and the corresponding row of ``y_pred`` and returns the mean
    of these per-row coefficients.

    It is assumed that no row is constant; a constant row has zero variance
    and makes the result NaN.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples, n_targets)
        Ground truth and predictions. DataFrames (including subclasses),
        ndarrays and nested sequences are accepted.

    Returns
    -------
    numpy.float64
        The average of each sample's Pearson correlation coefficient.

    Raises
    ------
    ValueError
        If the inputs are not 2-D, have different shapes, or have no rows.
    """
    # np.asarray handles DataFrames and their subclasses alike, so rows are
    # always addressed positionally rather than by column label.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.ndim != 2 or y_pred.ndim != 2:
        raise ValueError("y_true and y_pred must be 2-dimensional")
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"shape mismatch: y_true {y_true.shape} vs y_pred {y_pred.shape}"
        )
    if y_true.shape[0] == 0:
        raise ValueError("cannot score an empty set of samples")

    # Row-wise Pearson correlation, vectorised over all samples.
    true_centered = y_true - y_true.mean(axis=1, keepdims=True)
    pred_centered = y_pred - y_pred.mean(axis=1, keepdims=True)
    covariance = (true_centered * pred_centered).sum(axis=1)
    scale = np.sqrt(
        (true_centered ** 2).sum(axis=1) * (pred_centered ** 2).sum(axis=1)
    )
    return (covariance / scale).mean()

In [3]:
# Load the cell metadata (indexed by cell_id) and keep only the rows whose
# `technology` column equals "citeseq"; the cell displays the resulting shape.
metadata_df = (
    pd.read_csv(FP_CELL_METADATA, index_col='cell_id')
    .loc[lambda frame: frame.technology == "citeseq"]
)
metadata_df.shape

(119651, 4)

In [4]:
# Display the resolved path stored in FP_CITE_TRAIN_INPUTS.
FP_CITE_TRAIN_INPUTS

'/home/wuxinchao/data/project/kaggle_comp_scmo_data/data/train_cite_inputs.h5'

In [5]:
# Read the train and test inputs from disk exactly once each.
X = pd.read_hdf(FP_CITE_TRAIN_INPUTS)
X_test = pd.read_hdf(FP_CITE_TEST_INPUTS)

# Columns that are all-zero in the training set or all-zero in the test set
# are dropped from both matrices.
constant_cols = (
    list(X.columns[(X == 0).all(axis=0).values])
    + list(X_test.columns[(X_test == 0).all(axis=0).values])
)
X = X.drop(constant_cols, axis=1)
cell_idx = X.index
meta = metadata_df.reindex(cell_idx)  # metadata rows aligned with the training cells

# "Important" input columns: those whose name contains the name of a target
# column. Their raw values are kept aside (X0) before X is turned sparse.
important_cols = []
Y = pd.read_hdf(FP_CITE_TRAIN_TARGETS)
for y_col in Y.columns:
    important_cols += [x_col for x_col in X.columns if y_col in x_col]
X0 = X[important_cols].values
print(f"Original X shape: {str(X.shape):14} {X.size*4/1024/1024/1024:2.3f} GByte")
gc.collect()
X = scipy.sparse.csr_matrix(X.values)
gc.collect()

# Convert the test inputs to a sparse matrix, reusing the frame loaded above
# instead of reading the test file from disk a second time.
# NOTE(review): X_test stays bound, as it did before; `del X_test` after this
# line would release the raw test frame if no later cell needs it.
Xt = X_test.drop(columns=constant_cols)
cell_index_test = Xt.index
meta_test = metadata_df.reindex(cell_index_test)
X0t = Xt[important_cols].values
print(f"Original Xt shape: {str(Xt.shape):14} {Xt.size*4/1024/1024/1024:2.3f} GByte")
gc.collect()
Xt = scipy.sparse.csr_matrix(Xt.values)

Original X shape: (70988, 20856) 5.515 GByte
Original Xt shape: (48663, 20856) 3.781 GByte


In [6]:
# Apply the singular value decomposition
# This cell overwrites X and Xt with dense, reduced arrays, so it must only be
# run on the sparse matrices produced by the loading cell. Running it a second
# time without reloading would silently build features from the wrong input.
if not (scipy.sparse.issparse(X) and scipy.sparse.issparse(Xt)):
    raise RuntimeError(
        "X and Xt must be the sparse matrices from the loading cell; "
        "re-run that cell before running the SVD cell again."
    )

# Row counts are taken from the data instead of being hard-coded.
n_train, n_test = X.shape[0], Xt.shape[0]
both = scipy.sparse.vstack([X, Xt])
assert both.shape[0] == n_train + n_test
print(f"Shape of both before SVD: {both.shape}")
svd = TruncatedSVD(n_components=512, random_state=1) # 512
both = svd.fit_transform(both)
print(f"Shape of both after SVD:  {both.shape}")

# Hstack the svd output with the important features
# (the first n_train rows of `both` are the training cells, the rest are test cells)
X = both[:n_train]
Xt = both[n_train:]
del both
X = np.hstack([X, X0])
Xt = np.hstack([Xt, X0t])
print(f"Reduced X shape:  {str(X.shape):14} {X.size*4/1024/1024/1024:2.3f} GByte")
print(f"Reduced Xt shape: {str(Xt.shape):14} {Xt.size*4/1024/1024/1024:2.3f} GByte")

Shape of both before SVD: (119651, 20856)
Shape of both after SVD:  (119651, 512)
Reduced X shape:  (70988, 656)   0.173 GByte
Reduced Xt shape: (48663, 656)   0.119 GByte


In [7]:
# Read the CITE-seq training targets, remember their column names, and keep
# only the underlying array in Y for model fitting.
Y = pd.read_hdf(FP_CITE_TRAIN_TARGETS)
y_columns = Y.columns.tolist()
Y = Y.to_numpy()

print(f"Y shape: {str(Y.shape):14} {Y.size*4/1024/1024/1024:2.3f} GByte")

Y shape: (70988, 140)   0.037 GByte


In [8]:
# Keyword arguments passed to every lightgbm.LGBMRegressor built in this notebook.
# NOTE(review): LightGBM's documentation states that bagging (`subsample`) is
# only performed when `subsample_freq` / `bagging_freq` is non-zero; no such key
# is set here, so `subsample=0.6` may have no effect — confirm the intent.
lightgbm_params = dict(
    learning_rate=0.1,
    max_depth=10,
    num_leaves=200,
    min_child_samples=250,
    colsample_bytree=0.8,
    subsample=0.6,
    seed=1,
)

In [9]:
# Cross-validation: one LightGBM regressor per target column, with folds
# grouped by donor so that a donor never appears in both train and validation.
if CROSS_VALIDATE:
    y_cols = Y.shape[1] # set this to a small number for a quick test
    n_estimators = 300
    # Number of folds that are actually trained. The default of 1 evaluates only
    # the first fold; raise it (up to n_splits) to also get the averaged summary.
    n_folds_to_run = 1

    kf = GroupKFold(n_splits=3)
    score_list = []
    for fold, (idx_tr, idx_va) in enumerate(kf.split(X, groups=meta.donor)):
        # Drop the reference to the model left over from a previous run/fold
        # before collecting garbage.
        model = None
        gc.collect()
        X_tr = X[idx_tr]
        y_tr = Y[:,:y_cols][idx_tr]
        X_va = X[idx_va]
        y_va = Y[:,:y_cols][idx_va]

        va_preds = []
        for i in range(y_cols):
            #print(f"Training column {i:3} for validation")
            model = lightgbm.LGBMRegressor(n_estimators=n_estimators, **lightgbm_params)
            model.fit(X_tr, y_tr[:,i].copy())
            va_preds.append(model.predict(X_va))
        y_va_pred = np.column_stack(va_preds) # concatenate the per-column predictions
        del va_preds

        del X_tr, y_tr, X_va
        gc.collect()

        # We validate the model (mse and correlation over all evaluated columns)
        mse = mean_squared_error(y_va, y_va_pred)
        corrscore = correlation_score(y_va, y_va_pred)

        del y_va

        print(f"Fold {fold} {X.shape[1]:4}: mse = {mse:.5f}, corr =  {corrscore:.5f}")
        score_list.append((mse, corrscore))
        if fold + 1 >= n_folds_to_run:
            break # stop once the requested number of folds has been evaluated

    if len(score_list) > 1:
        # Show overall score
        result_df = pd.DataFrame(score_list, columns=['mse', 'corrscore'])
        print(f"Average LGBM mse = {result_df.mse.mean():.5f}; corr = {result_df.corrscore.mean():.5f}")

In [None]:
# Final fit: train one LightGBM regressor per target column on the full
# training matrix and predict that column for every test cell.
n_estimators = 300
y_cols = Y.shape[1]
test_column_preds = []
for i in range(y_cols):
    print(f"Training column {i:3} for test")
    model = lightgbm.LGBMRegressor(n_estimators=n_estimators, **lightgbm_params)
    target_column = Y[:, i].copy()
    model.fit(X, target_column)
    test_column_preds.append(model.predict(Xt))

# Assemble the per-column predictions into one (n_test_cells, y_cols) array
y_te_pred = np.stack(test_column_preds, axis=1)
del test_column_preds, target_column

print(f"Test_pred shape: {str(y_te_pred.shape):14}")