# Tutorial for running KinaseNet
😎 Welcome to the KinaseNet tutorial! In this notebook, we demonstrate how to run **KinaseNet** and how to obtain:

- the inferred kinase-substrate regulatory relationships, and  
- the sample-specific kinase activity profiles.


In [None]:
import warnings
# Ignore every warning category from here on (presumably to keep the tutorial
# output compact). Comment this line out when debugging.
warnings.filterwarnings('ignore')

In [None]:
from kinasenet import *

## Preprocess data

In [None]:
ct = 'example'

# Gather the constructor options in one place so the input/output paths and
# the scaling settings can be scanned (and tweaked) at a glance.
processor_options = dict(
    exp_path=f"example_data/{ct}.feather",
    ksr_path=f"example_data/prior_{ct}.tsv",
    output_path=f"example_data/preprocessed_data/{ct}",
    with_centering=False,
    quantile_range=(1, 99),
    unit_variance=False,
)
data_pp = DataProcessor(**processor_options)

# process_all() returns the preprocessed data table and the prior table.
data, prior = data_pp.process_all()

Loading data...
Totally 11639 phosphosites and 291 samples

Executing RobustMinScaler...

Processing KSR...
Total number of merged kinases: 192

Saving data...
All preprocessed files are saved to example_data/preprocessed_data/example

Done!


In [None]:
# Display the dimensions of the preprocessed data table and the prior table.
data.shape, prior.shape

((291, 11639), (192, 11639))

## Run KinaseNet

In [None]:
# Reload the preprocessed tables from disk so this section can be run
# without re-executing the preprocessing step above.
data, prior = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "example_data/preprocessed_data/example/data.parquet",
        "example_data/preprocessed_data/example/prior.parquet",
    )
)

(data.shape, prior.shape)

((291, 11639), (192, 11639))

In [None]:
dp1 = dp2 = 0

# Pick the training device at runtime instead of hard-coding 'cuda:1', which
# fails on machines with fewer than two GPUs. The second GPU is still
# preferred when it exists, so the behaviour on a multi-GPU machine is
# unchanged.
# NOTE(review): the CPU fallback assumes fit_model accepts device='cpu' — confirm.
if torch.cuda.device_count() > 1:
    train_device = 'cuda:1'
elif torch.cuda.is_available():
    train_device = 'cuda:0'
else:
    train_device = 'cpu'

# Results are written below example_results/, in a folder named after the
# two dropout settings.
fit_model(data, prior, output_path=f"example_results/dp1_{dp1}_dp2_{dp2}/", 
         data_val_size=0.3, batch_size=16, fraction_gs=0.2, 
         num_epochs=200, cvs=5, num_epochs_refit=50, refit_iters=10, refit_resample=True, 
         weight_decays=[1e-10], lr=1e-4, 
         scheduler_class=torch.optim.lr_scheduler.CosineAnnealingLR, scheduler_kwargs={'T_max': 10}, 
         optimizer_class=torch.optim.Adam, optimizerkw={}, optimizer_paramskw={},
         dropout_rate1=dp1, dropout_rate2=dp2, dropout_rate3=dp2, activation=ReLU0(),
         eps=torch.finfo(torch.float).eps, eps_factor=10, fill_zeroed=True, device=train_device)

Total number of KSRs: 121108, number of KSRs used to train: 96912, number of KSRs used to test: 24196

cv: 0, weight_decay: 1e-10
Epoch [200/200], Train Loss: 0.0212, Val Loss: 0.0197, Val R2: 0.72829

cv: 0, weight_decay: 1e-10, refit: 0
Epoch [50/50], Train Loss: 0.0208, Val Loss: 0.0199, Val R2: 0.7170

cv: 0, weight_decay: 1e-10, refit: 1
Epoch [50/50], Train Loss: 0.0183, Val Loss: 0.0180, Val R2: 0.7589

cv: 0, weight_decay: 1e-10, refit: 2
Epoch [50/50], Train Loss: 0.0176, Val Loss: 0.0169, Val R2: 0.7450

cv: 0, weight_decay: 1e-10, refit: 3
Epoch [50/50], Train Loss: 0.0166, Val Loss: 0.0168, Val R2: 0.7573

cv: 0, weight_decay: 1e-10, refit: 4
Epoch [50/50], Train Loss: 0.0165, Val Loss: 0.0162, Val R2: 0.7703

cv: 0, weight_decay: 1e-10, refit: 5
Epoch [50/50], Train Loss: 0.0161, Val Loss: 0.0161, Val R2: 0.7666

cv: 0, weight_decay: 1e-10, refit: 6
Epoch [50/50], Train Loss: 0.0159, Val Loss: 0.0166, Val R2: 0.7743

cv: 0, weight_decay: 1e-10, refit: 7
Epoch [50/50], Trai

## Remove temporary models (optional)
⚠️ **Note:** This step is optional. For each hyperparameter combination (here, each weight-decay value), we keep only the refit model with the highest validation R² in each cross-validation fold and delete the other refit checkpoints from disk.

In [None]:
# Map each weight-decay value (as parsed from performance.csv) to the text
# used for it in the checkpoint filenames. Values missing from this map become
# NaN, and pandas' groupby drops NaN keys by default, so checkpoints for
# unlisted weight decays are left untouched.
l2_map = {
    1e-10: '1e-10',
    1e-07: '1e-07',
    1e-04: '0.0001',
    1e-01: '0.1',
}

path = 'example_results/'
for run_name in sorted(os.listdir(path)):
    run_dir = os.path.join(path, run_name)
    perf_file = os.path.join(run_dir, 'performance.csv')

    # Skip stray files and runs without a performance table. Listing a
    # non-directory entry (as the previous version did) raises
    # NotADirectoryError.
    if not os.path.isfile(perf_file):
        continue

    perf = pd.read_csv(perf_file, index_col=0)
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of the frame that was read from disk.
    perf = perf[(perf['network_type'] == 'cpd') & (perf['refit'].notna())].copy()
    perf['weight_decay'] = perf['weight_decay'].map(l2_map)
    perf['filename'] = (
        'cv' + perf['cv'].astype(str)
        + '_' + 'wd' + perf['weight_decay']
        + '_' + 'refit' + perf['refit'].astype(int).astype('str')
        + '.pth'
    )

    # Within every (cv, weight_decay) group keep the refit checkpoint with
    # the highest val_r2 and schedule all the others for deletion.
    to_del_file = []
    for _, per_cv in perf.groupby('cv'):
        for _, per_wd in per_cv.groupby('weight_decay'):
            best_row = per_wd['val_r2'].idxmax()
            keep_file = per_wd.loc[best_row, 'filename']

            del_file = per_wd['filename'].tolist()
            del_file.remove(keep_file)

            to_del_file += del_file

    for filename in to_del_file:
        try:
            os.remove(os.path.join(run_dir, 'model', filename))
        except FileNotFoundError as e:
            # Already removed (e.g. the cell was run before): report and go on.
            print(e)

## Downstream applications

We recommend selecting the model checkpoint with the highest validation R² for practical applications. In this example, we use a representative checkpoint, **example_results/dp1_0_dp2_0/model/cv1_wd1e-10_refit6.pth**, to demonstrate how to extract the kinase-substrate relationships inferred by KinaseNet and how to obtain sample-specific kinase activity profiles.

In [None]:
# An auxiliary function used to standardize the output format.
def split_cols(df):
    """Expand every column whose label contains ';' into one column per name.

    A column labelled ``"A;B"`` is replaced by two columns ``"A"`` and ``"B"``
    that both carry the values of the original column. Columns without ';'
    keep their position; the expanded columns are appended after them in
    order of appearance. If an expanded name equals the label of an existing
    column, that column's values are overwritten.

    Unlike the earlier version, the input frame is left untouched and
    non-string column labels are accepted (they are never split).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels may contain ';'-joined names.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and no ';' in any string column label.
    """
    # Only string labels can hold the ';' separator; anything else is kept.
    merged_flags = [isinstance(col, str) and ';' in col for col in df.columns]

    # Start from a copy of the unmerged columns so that adding the expanded
    # columns never writes into the caller's frame.
    result = df.loc[:, [not flag for flag in merged_flags]].copy()

    for position, (col, is_merged) in enumerate(zip(df.columns, merged_flags)):
        if not is_merged:
            continue
        # Read by position so duplicated labels cannot return several columns.
        values = df.iloc[:, position].to_list()
        for new_col in col.split(';'):
            result[new_col] = values

    return result

In [None]:
# Global settings.
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the downstream cells also run on machines without a GPU.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# Machine epsilon of torch.float; passed to threshold_cpd in extract_ksr.
EPS = torch.finfo(torch.float).eps

### Extract kinase-substrate relationships (KSR)

In [None]:
def extract_ksr(preprocessed_data_path, preprocessed_prior_path, model_path, batch_size):
    """Compute the score table labelled by the prior for one saved model.

    Parameters
    ----------
    preprocessed_data_path : str
        Parquet file with the preprocessed data table.
    preprocessed_prior_path : str
        Parquet file with the preprocessed prior table. Its index and columns
        are used as the row and column labels of the result.
    model_path : str
        Checkpoint file to load with ``torch.load``.
    batch_size : int
        Batch size handed to ``setup_dataloader_from_df``.

    Returns
    -------
    pandas.DataFrame
        Output of ``cal_cpd`` after ``threshold_cpd``, labelled like the
        prior. Index labels containing ';' are expanded into one row per name.

    Notes
    -----
    Uses the notebook-level ``DEVICE`` and ``EPS`` settings.
    """
    data = pd.read_parquet(preprocessed_data_path)
    prior = pd.read_parquet(preprocessed_prior_path)

    # map_location places the stored tensors directly on DEVICE. Without it,
    # torch.load tries to restore them to the device they were saved from and
    # fails when that device does not exist on this machine.
    # SECURITY: torch.load unpickles the stored object, so only open
    # checkpoints you trust.
    # NOTE(review): recent PyTorch releases may additionally require
    # weights_only=False to load a whole pickled model — confirm for your version.
    model = torch.load(model_path, map_location=DEVICE).to(DEVICE)
    data_loader = setup_dataloader_from_df(df=data, batch_size=batch_size, shuffle=False)
    node_indices = list(range(model.hidden_dim))
    kin_cpd, _ = cal_cpd(model, data_loader, node_indices, device=DEVICE)
    kin_cpd, _ = threshold_cpd(kin_cpd, EPS)

    kin_cpd_df = pd.DataFrame(kin_cpd.cpu().numpy(), index=prior.index, columns=prior.columns)
    # split_cols works on column labels, so transpose to put the prior's index
    # labels on the columns, expand the ';'-joined names, and transpose back.
    kin_cpd_df = kin_cpd_df.T
    kin_cpd_df = split_cols(kin_cpd_df)
    kin_cpd_df = kin_cpd_df.T

    return kin_cpd_df

In [None]:
# Inputs: the two preprocessed tables and the checkpoint to interpret.
preprocessed_data_path = 'example_data/preprocessed_data/example/data.parquet'
preprocessed_prior_path = 'example_data/preprocessed_data/example/prior.parquet'
model_path = 'example_results/dp1_0_dp2_0/model/cv1_wd1e-10_refit6.pth'

ksr_erv_df = extract_ksr(
    preprocessed_data_path=preprocessed_data_path,
    preprocessed_prior_path=preprocessed_prior_path,
    model_path=model_path,
    batch_size=32,
)

Delete meta kinase 191 in layer 3.

In [None]:
# Display the table returned by extract_ksr.
ksr_erv_df

Unnamed: 0,A0A068F7M9:S367,A0A068F7M9:S524,A0A068F7M9:S536,A0A068F7M9:S549,A0A087WVQ6:S1498,A0A087WVQ6:T109,A0A087WVQ6:T242,A0A087WVQ6:T398,A0A087WX45:S590,A0A087WX97:S312,...,Q9Y6X8:T37,Q9Y6X9:S615,Q9Y6X9:S739,Q9Y6X9:S743,Q9Y6X9:S777,Q9Y6X9:S779,Q9Y6Y8:S737,Q9Y6Y8:S926,U5Y3L1:S155,X6R2W0:S134
Q05655,0.560783,0.002952,0.019174,0.814604,0.288820,0.239341,0.433433,0.154425,0.000000,0.746670,...,0.349618,0.303151,0.445013,0.531324,0.690931,0.682163,0.158852,0.420259,0.791586,0.331931
Q9UQM7,0.040898,0.060702,0.000000,0.002258,0.029067,0.003609,0.076392,0.018265,0.041211,0.000000,...,0.136592,0.008567,0.091247,0.198652,0.066138,0.034857,0.000000,0.002927,0.006671,0.000126
P68400,0.131817,0.524378,0.574328,0.261528,0.284119,0.112574,0.458967,0.050323,0.589214,0.055542,...,0.279168,0.529356,0.080569,0.162649,0.474530,0.277000,0.186413,0.000000,0.106563,0.011026
O14965,0.072127,0.126282,0.044531,0.076322,0.015847,0.025479,0.006221,0.017482,0.000000,0.154513,...,0.117350,0.004515,0.009881,0.063228,0.059244,0.011802,0.000000,0.103338,0.327346,0.024799
P35269,0.000132,0.000110,0.000000,0.000000,0.000000,0.000000,0.000005,0.000024,0.000000,0.000095,...,0.000041,0.000103,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000009,0.000051
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q8TD08,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Q9UBS0,0.000040,0.000142,0.000132,0.000001,0.000110,0.000000,0.000031,0.000000,0.000162,0.000000,...,0.000000,0.000000,0.000081,0.000000,0.000000,0.000000,0.000197,0.000097,0.000052,0.000000
Q16644,0.000040,0.000142,0.000132,0.000001,0.000110,0.000000,0.000031,0.000000,0.000162,0.000000,...,0.000000,0.000000,0.000081,0.000000,0.000000,0.000000,0.000197,0.000097,0.000052,0.000000
Q9H093,0.000203,0.000153,0.000275,0.000059,0.000010,0.000000,0.000054,0.000000,0.000013,0.000344,...,0.000013,0.000000,0.000000,0.000000,0.000091,0.000000,0.000000,0.000000,0.000034,0.000002


### Obtain sample-specific kinase activities

In [None]:
# Reload the preprocessed tables so this section does not depend on the
# variables left over from the training section.
data, prior = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "example_data/preprocessed_data/example/data.parquet",
        "example_data/preprocessed_data/example/prior.parquet",
    )
)

(data.shape, prior.shape)

((291, 11639), (192, 11639))

In [None]:
model_path = 'example_results/dp1_0_dp2_0/model/cv1_wd1e-10_refit6.pth'
# Load straight onto the CPU: the kinase activities below are computed with
# device='cpu', and without map_location torch.load would try to restore the
# tensors to the GPU the checkpoint was saved from, failing on machines that
# lack that device.
# SECURITY: torch.load unpickles the stored object, so only open checkpoints
# you trust.
model = torch.load(model_path, map_location='cpu')

In [None]:
# Compute the activities, expand ';'-joined column labels into one column per
# name, then transpose the table for display.
kin_act_df = split_cols(
    cal_kin_act(
        data=data,
        prior=prior,
        model=model,
        batch_size=32,
        meta_kin=False,
        device='cpu',
    )
).T

In [None]:
# Display the kinase activity table computed in the previous cell.
kin_act_df

Unnamed: 0,PDAC023 Log Ratio_249,PDAC132 Log Ratio_249,PDAC117 Log Ratio_249,PDAC173 Log Ratio_249,PDAC024 Log Ratio_249,PDAC078 Log Ratio_249,PDAC151 Log Ratio_249,PDAC016 Log Ratio_249,PDAC087 Log Ratio_249,PDAC280 Log Ratio_249,...,CPT0088180004 Log Ratio_271,CPT0094130004 Log Ratio_271,CPT0019300004 Log Ratio_271,CPT0109300004 Log Ratio_271,CPT0078250003 Log Ratio_271,CPT0124610004 Log Ratio_271,CPT0078160004 Log Ratio_271,CPT0197420004 Log Ratio_271,CPT0226530004 Log Ratio_271,CPT0236340004 Log Ratio_271
Q05655,4.442262,10.968646,8.013958,8.570335,9.267240,9.807008,10.371140,8.397958,9.100194,7.774693,...,9.220232,9.798079,7.951744,11.023209,9.777719,9.984776,8.489233,9.361906,10.151527,5.243673
Q9UQM7,0.513608,4.526225,3.198536,3.234443,2.712388,3.461047,4.104052,1.925871,2.846691,3.083754,...,3.580673,4.432765,2.445521,4.880162,4.211061,3.853444,3.386337,3.578969,4.609526,0.244718
P68400,4.302485,10.802397,8.877197,8.442153,9.205274,9.741819,10.302524,8.123404,8.710963,8.960730,...,10.328449,8.963333,7.253847,11.260491,10.485667,9.097406,8.954916,9.988289,11.276884,5.522637
O14965,2.600602,6.481169,4.842426,5.036903,5.312742,5.832384,5.958508,4.595213,5.344114,4.525845,...,5.610330,5.460195,4.531932,6.427862,6.028811,5.973714,4.775837,5.168546,6.191411,2.643049
P35269,0.004691,0.009109,0.007143,0.007684,0.011728,0.009466,0.011543,0.007782,0.009268,0.006352,...,0.009456,0.002337,0.006906,0.010040,0.010021,0.013096,0.006439,0.009766,0.009499,0.003759
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q8TD08,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Q9UBS0,0.011166,0.018575,0.018252,0.014688,0.023525,0.017524,0.021745,0.017042,0.016591,0.016960,...,0.021279,0.011859,0.011765,0.023493,0.022338,0.018713,0.019916,0.023337,0.021990,0.017857
Q16644,0.011166,0.018575,0.018252,0.014688,0.023525,0.017524,0.021745,0.017042,0.016591,0.016960,...,0.021279,0.011859,0.011765,0.023493,0.022338,0.018713,0.019916,0.023337,0.021990,0.017857
Q9H093,0.008115,0.029250,0.021345,0.019324,0.019081,0.022258,0.023689,0.014952,0.019702,0.020117,...,0.024999,0.014116,0.023313,0.027222,0.023965,0.024251,0.016848,0.024198,0.026590,0.015633
