In [1]:
import pandas as pd
import numpy as np
import random
from itertools import combinations
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from torch_geometric.nn import HANConv
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score, 
    roc_auc_score, classification_report
)
from sklearn.svm import SVC

In [2]:
# Load the regulatory edge list; the first CSV column is used as the row index.
edges_df = pd.read_csv('Net_final.csv', index_col=0)
print(edges_df)

# Show how often each distinct value occurs, one column at a time.
for col, column_values in edges_df.items():
    print(f"distribution - {col}:")
    print(column_values.value_counts())
    print("\n")


       Regulator       Target RegulatorType TargetType regulatory_Mechanism
1          NEAT1   miR-194-5p        lncRNA      miRNA      ceRNA or sponge
2      LINC00460      miR-206        lncRNA      miRNA      ceRNA or sponge
3         MALAT1      miR-497        lncRNA      miRNA      ceRNA or sponge
4           MIAT      miR-29b        lncRNA      miRNA      ceRNA or sponge
5     CDKN2B-AS1  miR-181a-5p        lncRNA      miRNA      ceRNA or sponge
...          ...          ...           ...        ...                  ...
9513       WWTR1         YAP1           PCG        PCG            interacts
9514        YAP1        YWHAH           PCG        PCG            interacts
9515        YAP1        YWHAZ           PCG        PCG            interacts
9516        YAP1         YES1           PCG        PCG            interacts
9517       YWHAH        YWHAZ           PCG        PCG            interacts

[9517 rows x 5 columns]
distribution - Regulator:
Regulator
MALAT1       258
HOTAIR    

In [3]:
# Replace the five original headers (Regulator, Target, RegulatorType,
# TargetType, regulatory_Mechanism) with short positional names.
edge_column_names = ['A', 'B', 'A_type', 'B_type', 'relation']
edges_df.columns = edge_column_names
print(edges_df)

               A            B  A_type B_type         relation
1          NEAT1   miR-194-5p  lncRNA  miRNA  ceRNA or sponge
2      LINC00460      miR-206  lncRNA  miRNA  ceRNA or sponge
3         MALAT1      miR-497  lncRNA  miRNA  ceRNA or sponge
4           MIAT      miR-29b  lncRNA  miRNA  ceRNA or sponge
5     CDKN2B-AS1  miR-181a-5p  lncRNA  miRNA  ceRNA or sponge
...          ...          ...     ...    ...              ...
9513       WWTR1         YAP1     PCG    PCG        interacts
9514        YAP1        YWHAH     PCG    PCG        interacts
9515        YAP1        YWHAZ     PCG    PCG        interacts
9516        YAP1         YES1     PCG    PCG        interacts
9517       YWHAH        YWHAZ     PCG    PCG        interacts

[9517 rows x 5 columns]


In [4]:
# Per-lncRNA table: a 'Regulator' name column, the phenotype label columns and
# the sequence-derived features. The first CSV column becomes the row index.
label_csv_path = 'Label_final.csv'
lnc_feat_df = pd.read_csv(label_csv_path, index_col=0)
print(lnc_feat_df)

      Regulator  cell.proliferation  cell.invasion  cell.migration  \
1       A2M-AS1                   1              1               1   
2         AATBC                   1              0               0   
3        ABALON                   0              0               0   
4     ACTA2-AS1                   1              1               0   
5      ACTG1P25                   1              0               0   
..          ...                 ...            ...             ...   
662       ZFAS1                   1              1               1   
663   ZFHX4-AS1                   0              1               1   
664   ZFPM2-AS1                   0              0               0   
665  ZNF407-AS1                   1              1               0   
666  ZNF667-AS1                   1              0               0   

     apoptosis.process  length  GC_content    A_freq    T_freq    G_freq  ...  \
1                    1    2930    0.429693  0.282253  0.288055  0.200000  ... 

In [5]:
def _read_table_keyed_by_regulator(csv_path):
    """Read a CSV and rename its first column to 'Regulator'.

    The first column's original header is replaced so the table shares the
    'Regulator' key column with the label table.
    """
    table = pd.read_csv(csv_path)
    first_column = table.columns[0]
    return table.rename(columns={first_column: 'Regulator'})


# Cluster-assignment features (cluster_k2 ... cluster_k11), one row per lncRNA.
lncrna_cluster_features = _read_table_keyed_by_regulator('lncRNA_cluster_features.csv')
print(lncrna_cluster_features.head())

# Embedding features (feature_1, feature_2, ...), one row per lncRNA.
lncRNA_emb = _read_table_keyed_by_regulator('lncRNA_emb_features.csv')
print(lncRNA_emb.head())

    Regulator  cluster_k2  cluster_k3  cluster_k4  cluster_k5  cluster_k6  \
0       NEAT1           0           2           1           4           2   
1   LINC00460           0           2           1           4           2   
2      MALAT1           0           2           1           4           2   
3        MIAT           0           2           3           4           3   
4  CDKN2B-AS1           0           2           1           4           2   

   cluster_k7  cluster_k8  cluster_k9  cluster_k10  cluster_k11  
0           1           1           7            2            5  
1           1           1           7            6            5  
2           1           1           7            6            5  
3           1           1           7            2            0  
4           1           1           7            6            5  
    Regulator  feature_1  feature_2  feature_3  feature_4  feature_5  \
0       NEAT1  -0.089118  -0.015128   0.064141  -0.057461  -0.054880 

In [6]:

# Inner-join the label table with the cluster features and then the embedding
# features on the shared 'Regulator' column; only lncRNAs present in all three
# tables are kept.
merged_df = (
    lnc_feat_df
    .merge(lncrna_cluster_features, on='Regulator', how='inner')
    .merge(lncRNA_emb, on='Regulator', how='inner')
)
print(merged_df.head())

# From here on lnc_feat_df refers to the merged table.
lnc_feat_df = merged_df

   Regulator  cell.proliferation  cell.invasion  cell.migration  \
0    A2M-AS1                   1              1               1   
1      AATBC                   1              0               0   
2     ABALON                   0              0               0   
3  ACTA2-AS1                   1              1               0   
4   ACTG1P25                   1              0               0   

   apoptosis.process  length  GC_content    A_freq    T_freq    G_freq  ...  \
0                  1    2930    0.429693  0.282253  0.288055  0.200000  ...   
1                  1    4598    0.614180  0.221618  0.164202  0.316007  ...   
2                  1    1903    0.565423  0.216500  0.218077  0.262218  ...   
3                  1    2450    0.483265  0.267347  0.249388  0.266122  ...   
4                  0    2321    0.506247  0.234382  0.259371  0.231366  ...   

   feature_119  feature_120  feature_121  feature_122  feature_123  \
0    -0.075707     0.131054     0.030071    -0.13012

In [7]:
# Build the modelling table: identifier, one target label, then every feature.
#
# The table is derived from `merged_df` (the merge result of the previous cell)
# instead of from `lnc_feat_df` itself. The original version overwrote
# `lnc_feat_df` and then renamed its columns, so executing the cell a second
# time raised a KeyError on 'Regulator'. Reading from `merged_df` makes the
# cell safe to re-run and gives the same result on a fresh run.
id_col = 'Regulator'
target_col = 'cell.proliferation'
# The remaining phenotype labels are dropped so they cannot be used as features.
other_label_cols = ['cell.invasion', 'cell.migration', 'apoptosis.process']

excluded_cols = {id_col, target_col, *other_label_cols}
new_order = [id_col, target_col] + [col for col in merged_df.columns if col not in excluded_cols]

# .copy() gives an independent frame, so the rename below leaves merged_df untouched.
lnc_feat_df = merged_df[new_order].copy()
print(lnc_feat_df.head())

# Positional names: column 0 -> 'lncRNA', column 1 -> 'label', the rest feat_1..feat_n.
cols = lnc_feat_df.columns.tolist()
new_cols = ['lncRNA', 'label'] + [f'feat_{i}' for i in range(1, len(cols) - 1)]
lnc_feat_df.columns = new_cols

print(lnc_feat_df.head())

   Regulator  cell.proliferation  length  GC_content    A_freq    T_freq  \
0    A2M-AS1                   1    2930    0.429693  0.282253  0.288055   
1      AATBC                   1    4598    0.614180  0.221618  0.164202   
2     ABALON                   0    1903    0.565423  0.216500  0.218077   
3  ACTA2-AS1                   1    2450    0.483265  0.267347  0.249388   
4   ACTG1P25                   1    2321    0.506247  0.234382  0.259371   

     G_freq    C_freq  dimer_AA  dimer_AC  ...  feature_119  feature_120  \
0  0.200000  0.229693       259       156  ...    -0.075707     0.131054   
1  0.316007  0.298173       237       279  ...    -0.088064     0.085754   
2  0.262218  0.303205       122        88  ...     0.110509    -0.117067   
3  0.266122  0.217143       166       125  ...    -0.016995     0.037305   
4  0.231366  0.274882       127       136  ...     0.023095    -0.093766   

   feature_121  feature_122  feature_123  feature_124  feature_125  \
0     0.030071  

In [8]:
# Column 0 holds the lncRNA name, column 1 the label, the remaining columns the features.
y = lnc_feat_df.iloc[:, 1].to_numpy()
X = lnc_feat_df.iloc[:, 2:].to_numpy()

# Two-stage stratified split on row indices: 20% is held out for test, then
# 25% of the remainder (20% of the total) for validation -> 60/20/20.
idx = np.arange(len(X))
trainval_idx, test_idx = train_test_split(
    idx, test_size=0.2, random_state=42, stratify=y
)
train_idx, val_idx = train_test_split(
    trainval_idx, test_size=0.25, random_state=42, stratify=y[trainval_idx]
)

X_train, X_val, X_test = X[train_idx], X[val_idx], X[test_idx]
y_train, y_val, y_test = y[train_idx], y[val_idx], y[test_idx]

# Standardization: statistics are learned from the training rows only and then
# applied unchanged to the validation and test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_val, X_test)
)

# Train model (probability=True is needed for predict_proba below).
# NOTE(review): the validation split is prepared above but not used in this cell.
clf = SVC(probability=True, random_state=42).fit(X_train_scaled, y_train)

# Predict on test set: hard labels plus the probability of the second class.
y_pred = clf.predict(X_test_scaled)
y_prob = clf.predict_proba(X_test_scaled)[:, 1]

# Compute metrics (threshold-based ones use y_pred, ROC-AUC uses y_prob).
bal_acc = balanced_accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
roc_auc = roc_auc_score(y_test, y_prob)

print(f"\nFinal Test balanced accuracy: {bal_acc:.4f}")
print(f"Final Test precision:         {prec:.4f}")
print(f"Final Test recall:            {rec:.4f}")
print(f"Final Test F1-score:          {f1:.4f}")
print(f"Final Test ROC-AUC:           {roc_auc:.4f}")



Final Test balanced accuracy: 0.5323
Final Test precision:         0.7803
Final Test recall:            1.0000
Final Test F1-score:          0.8766
Final Test ROC-AUC:           0.6899
