In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
import copy

In [2]:
cell_line_feature=pd.read_csv("final_cell_line_feature.csv",index_col=0)

In [4]:
cell_line_feature.isnull().sum().sum()

np.int64(0)

In [5]:
len(cell_line_feature['cell_line_name'].unique())

717

In [7]:
cell_line_feature.shape

(717, 19226)

In [8]:
X = cell_line_feature.drop(columns=['cell_line_name']).values.astype(np.float32)

In [9]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_tensor = torch.tensor(X_scaled)

In [10]:
input_dim = X_tensor.shape[1]  
encoding_dim = 512
batch_size = 64
epochs = 50
patience = 5 

In [11]:
class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 16384),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(16384, 8196),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(8196, 4096),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(4096, 2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Linear(1024, encoding_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, 1024),
            nn.ReLU(),
            nn.Linear(1024, 2048),
            nn.ReLU(),
            nn.Linear(2048, 4096),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(4096, 8196),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(8196, 16384),
            nn.ReLU(),
            nn.Linear(16384, input_dim),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [12]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = Autoencoder().to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

dataset = torch.utils.data.TensorDataset(X_tensor, X_tensor)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

best_loss = float('inf')
patience_counter = 0

for epoch in range(10):
    model.train()
    running_loss = 0.0
    for inputs, targets in dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * inputs.size(0)
    
    epoch_loss = running_loss / len(dataloader.dataset)
    print(f'Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.6f}')
    
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        best_model_wts = copy.deepcopy(model.state_dict())  
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping triggered at epoch {epoch+1}")
            break

model.load_state_dict(best_model_wts)

model.eval()


Epoch 1/50, Loss: 0.114559
Epoch 2/50, Loss: 0.029970
Epoch 3/50, Loss: 0.030271
Epoch 4/50, Loss: 0.030411
Epoch 5/50, Loss: 0.030450
Epoch 6/50, Loss: 0.030465
Epoch 7/50, Loss: 0.030463
Early stopping triggered at epoch 7


Autoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=19225, out_features=16384, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=16384, out_features=8196, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=8196, out_features=4096, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.2, inplace=False)
    (9): Linear(in_features=4096, out_features=2048, bias=True)
    (10): ReLU()
    (11): Linear(in_features=2048, out_features=1024, bias=True)
    (12): ReLU()
    (13): Linear(in_features=1024, out_features=512, bias=True)
    (14): ReLU()
  )
  (decoder): Sequential(
    (0): Linear(in_features=512, out_features=1024, bias=True)
    (1): ReLU()
    (2): Linear(in_features=1024, out_features=2048, bias=True)
    (3): ReLU()
    (4): Linear(in_features=2048, out_features=4096, bias=True)
    (5): ReLU()
    (6): Dropout(p=0.2, inplace=False)
    (7): Linear(in_features=4096, out_features=8196

In [13]:
model.eval()

with torch.no_grad():
    encoded_features = model.encoder(X_tensor)

In [14]:
a=pd.DataFrame(encoded_features)

In [16]:
model.eval()

with torch.no_grad():
    encoded_features = model.encoder(X_tensor)

In [17]:
cell_feature=pd.DataFrame(encoded_features)

In [18]:
cell_feature.isnull().sum().sum()

np.int64(0)

In [19]:
cell_feature.shape

(717, 512)

In [21]:
cell_feature = cell_feature.reset_index(drop=True)
cell_line_feature = cell_line_feature.reset_index(drop=True)
cell_feature['cell'] = cell_line_feature['cell_line_name']

In [22]:
cell_feature.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,503,504,505,506,507,508,509,510,511,cell
0,37.814968,0.0,41.079426,0.0,25.073776,0.0,0.0,0.0,0.0,0.0,...,0.462274,54.36364,0.0,0.0,39.329762,38.971588,0.0,0.0,11.634261,A2058
1,36.460117,0.0,39.609081,0.0,24.177588,0.0,0.0,0.0,0.0,0.0,...,0.446537,52.41684,0.0,0.0,37.918495,37.576443,0.0,0.0,11.214973,A2780
2,32.583645,0.0,35.39822,0.0,21.608105,0.0,0.0,0.0,0.0,0.0,...,0.398011,46.841835,0.0,0.0,33.881237,33.579411,0.0,0.0,10.021283,A427
3,29.480316,0.0,32.031872,0.0,19.554235,0.0,0.0,0.0,0.0,0.0,...,0.359413,42.383743,0.0,0.0,30.654093,30.386757,0.0,0.0,9.06605,NCI-H460
4,29.205639,0.0,31.732414,0.0,19.370293,0.0,0.0,0.0,0.0,0.0,...,0.356257,41.986923,0.0,0.0,30.369259,30.102051,0.0,0.0,8.980433,RKO


In [23]:
drug_interaction=pd.read_csv("drug_interaction_drugcomb.csv",low_memory=False)

In [24]:
df_zip = drug_interaction.copy()
df_zip['label'] = (df_zip['synergy_zip'] > 0).astype(int)

In [25]:
df_interaction=df_zip[['drug_row','drug_col','cell_line_name','label']].dropna()

In [27]:
df_interaction.label.value_counts()

label
0    258510
1    207523
Name: count, dtype: int64

In [30]:
df_interaction['cell_line_name'] = df_interaction['cell_line_name'].str.strip()
cell_feature['cell_line_name'] = cell_feature['cell'].str.strip()

# Set index
cell_feature_indexed = cell_feature.set_index('cell_line_name')

# Create the dictionary
cell_feature_dict = cell_feature_indexed.to_dict(orient='index')

# Map it
cell_features_expanded = df_interaction['cell_line_name'].map(cell_feature_dict)

# Drop rows where mapping failed (optional, or you can raise an error)
cell_features_expanded = cell_features_expanded.dropna()

# Only keep valid rows in original dataframe
interaction_df_valid = df_interaction.loc[cell_features_expanded.index]

# Convert to DataFrame
cell_features_df = pd.DataFrame(cell_features_expanded.tolist())

# Concatenate with valid interaction data
interaction_with_features = pd.concat([interaction_df_valid.reset_index(drop=True),
                                       cell_features_df.reset_index(drop=True)], axis=1)


In [31]:
interaction_with_features.head()

Unnamed: 0,drug_row,drug_col,cell_line_name,label,0,1,2,3,4,5,...,503,504,505,506,507,508,509,510,511,cell
0,5-FU,ABT-888,A2058,1,37.814968,0.0,41.079426,0.0,25.073776,0.0,...,0.462274,54.36364,0.0,0.0,39.329762,38.971588,0.0,0.0,11.634261,A2058
1,5-FU,ABT-888,A2058,1,37.814968,0.0,41.079426,0.0,25.073776,0.0,...,0.462274,54.36364,0.0,0.0,39.329762,38.971588,0.0,0.0,11.634261,A2058
2,5-FU,ABT-888,A2058,1,37.814968,0.0,41.079426,0.0,25.073776,0.0,...,0.462274,54.36364,0.0,0.0,39.329762,38.971588,0.0,0.0,11.634261,A2058
3,5-FU,ABT-888,A2058,0,37.814968,0.0,41.079426,0.0,25.073776,0.0,...,0.462274,54.36364,0.0,0.0,39.329762,38.971588,0.0,0.0,11.634261,A2058
4,5-FU,AZD1775,A2058,1,37.814968,0.0,41.079426,0.0,25.073776,0.0,...,0.462274,54.36364,0.0,0.0,39.329762,38.971588,0.0,0.0,11.634261,A2058


In [32]:
interaction_with_features.shape

(166532, 517)

In [33]:
len(interaction_with_features.cell_line_name.unique())

40

In [34]:
drug_feature=pd.read_csv("drug_feature.csv",index_col=0)

In [36]:
drug_feature['drug'] = drug_feature['drug'].str.strip()
drug_feature_indexed = drug_feature.set_index('drug')
drug_feature_dict = drug_feature_indexed.to_dict(orient='index')


In [37]:
drug_row_features = interaction_with_features['drug_row'].str.strip().map(drug_feature_dict)
drug_col_features = interaction_with_features['drug_col'].str.strip().map(drug_feature_dict)

valid_indices = drug_row_features.dropna().index.intersection(drug_col_features.dropna().index)
interaction_with_features = interaction_with_features.loc[valid_indices].reset_index(drop=True)
drug_row_features = pd.DataFrame(drug_row_features.loc[valid_indices].tolist()).reset_index(drop=True)
drug_col_features = pd.DataFrame(drug_col_features.loc[valid_indices].tolist()).reset_index(drop=True)


In [38]:
drug_row_features.columns = [f'drug_row_feat_{i}' for i in range(drug_row_features.shape[1])]
drug_col_features.columns = [f'drug_col_feat_{i}' for i in range(drug_col_features.shape[1])]

final_interaction = pd.concat([
    interaction_with_features.reset_index(drop=True),
    drug_row_features,
    drug_col_features
], axis=1)


In [39]:
final_interaction.isnull().sum().sum()

np.int64(0)

In [40]:
final_interaction.shape

(166227, 4613)

In [41]:
final_interaction.head()

Unnamed: 0,drug_row,drug_col,cell_line_name,label,0,1,2,3,4,5,...,drug_col_feat_2038,drug_col_feat_2039,drug_col_feat_2040,drug_col_feat_2041,drug_col_feat_2042,drug_col_feat_2043,drug_col_feat_2044,drug_col_feat_2045,drug_col_feat_2046,drug_col_feat_2047
0,5-FU,ABT-888,A2058,1,37.814968,0.0,41.079426,0.0,25.073776,0.0,...,0,0,0,0,0,0,0,0,0,0
1,5-FU,ABT-888,A2058,1,37.814968,0.0,41.079426,0.0,25.073776,0.0,...,0,0,0,0,0,0,0,0,0,0
2,5-FU,ABT-888,A2058,1,37.814968,0.0,41.079426,0.0,25.073776,0.0,...,0,0,0,0,0,0,0,0,0,0
3,5-FU,ABT-888,A2058,0,37.814968,0.0,41.079426,0.0,25.073776,0.0,...,0,0,0,0,0,0,0,0,0,0
4,5-FU,AZD1775,A2058,1,37.814968,0.0,41.079426,0.0,25.073776,0.0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
final_interaction.drop(columns=['drug_row','drug_col','cell_line_name'],inplace=True)

In [43]:
final_interaction.head()

Unnamed: 0,label,0,1,2,3,4,5,6,7,8,...,drug_col_feat_2038,drug_col_feat_2039,drug_col_feat_2040,drug_col_feat_2041,drug_col_feat_2042,drug_col_feat_2043,drug_col_feat_2044,drug_col_feat_2045,drug_col_feat_2046,drug_col_feat_2047
0,1,37.814968,0.0,41.079426,0.0,25.073776,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1,37.814968,0.0,41.079426,0.0,25.073776,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,1,37.814968,0.0,41.079426,0.0,25.073776,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0,37.814968,0.0,41.079426,0.0,25.073776,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,1,37.814968,0.0,41.079426,0.0,25.073776,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
final_interaction.shape

(166227, 4610)

In [45]:
final_interaction.drop(columns='cell',inplace=True)

In [46]:
final_interaction.shape

(166227, 4609)

In [47]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import xgboost as xgb

In [48]:
y=final_interaction.label

X = final_interaction.drop(columns='label')

In [49]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [50]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,drug_col_feat_2038,drug_col_feat_2039,drug_col_feat_2040,drug_col_feat_2041,drug_col_feat_2042,drug_col_feat_2043,drug_col_feat_2044,drug_col_feat_2045,drug_col_feat_2046,drug_col_feat_2047
0,37.814968,0.0,41.079426,0.0,25.073776,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,37.814968,0.0,41.079426,0.0,25.073776,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,37.814968,0.0,41.079426,0.0,25.073776,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,37.814968,0.0,41.079426,0.0,25.073776,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,37.814968,0.0,41.079426,0.0,25.073776,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166222,45.293999,0.0,49.200001,0.0,30.029352,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
166223,45.293999,0.0,49.200001,0.0,30.029352,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
166224,45.293999,0.0,49.200001,0.0,30.029352,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
166225,45.293999,0.0,49.200001,0.0,30.029352,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

# Train the model
model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [52]:
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f"Accuracy: {accuracy:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")


Accuracy: 0.7032
ROC AUC: 0.7628


In [53]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.71      0.83      0.76     19184
           1       0.69      0.54      0.60     14062

    accuracy                           0.70     33246
   macro avg       0.70      0.68      0.68     33246
weighted avg       0.70      0.70      0.70     33246

