# Data Preprocessing and Matrix Generation

This notebook preprocesses the aBiofilm data and generates the matrices that relate drugs (peptides) and microbes. The following steps are performed:

1. Loading data files.
2. Constructing node types and saving them.
3. Generating positive and negative pairs.
4. Creating adjacency matrices for the relations built below (drug/peptide-microbe, drug-drug, microbe-microbe).
5. Saving the processed data.
### Step 1: Load Data Files

In [27]:
import pandas as pd
import numpy as np
import os
import scipy.sparse as sp
import pickle

# Directory that holds the raw input files
prefix = "./aBiofilm"


def _read_edge_list(filename, columns):
    """Read one headerless, comma-separated file from `prefix`.

    `columns` gives the three column names: two index columns followed by one
    value column. The source files use 1-based indices, so the two index
    columns are shifted down by one to make them 0-based.
    """
    frame = pd.read_csv(
        os.path.join(prefix, filename),
        encoding='utf-8',
        delimiter=',',
        names=columns,
    ).reset_index(drop=True)
    index_cols = columns[:2]
    frame[index_cols] = frame[index_cols] - 1
    return frame


# (pid, mid, rating) triples, pairwise 'p' weights and pairwise 'm' weights
pm = _read_edge_list("adj.dat", ['pid', 'mid', 'rating'])
pp = _read_edge_list("drugsimilarity.dat", ['p1', 'p2', 'weight'])
mm = _read_edge_list("microbesimilarity.dat", ['m1', 'm2', 'weight'])

print('==========Step 1 complete==========')



### Step 2: Construct Node Types

In [28]:
# Node counts. Indices are 0-based after Step 1, so count = largest index + 1.
# Both counts are taken from the `pm` table only.
max_pid = pm['pid'].max() + 1
print(max_pid)
max_mid = pm['mid'].max() + 1
print(max_mid)

# Global node numbering: 'pid' nodes occupy [0, offsets['p']) and 'mid' nodes
# occupy [offsets['p'], offsets['m']).
offsets = {'p': max_pid, 'm': max_pid + max_mid}

# Node-type vector over the global numbering: 0 for 'pid' nodes, 1 for 'mid' nodes
node_types = np.zeros((offsets['m'],), dtype=np.int32)
node_types[offsets['p']:] = 1

# Create the output directory up front so that this and every later save
# works on a fresh checkout.
os.makedirs("../preprocessed", exist_ok=True)

# Always rewrite the file: every other output of this notebook is overwritten
# on each run, and a leftover node_types.npy from a previous dataset would no
# longer match them.
np.save("../preprocessed/node_types", node_types)
print('==========Step 2 complete==========')

1720
140


### Step 3: Generate Positive and Negative Pairs

In [29]:
# Fixed seed so the sampled negatives and both shuffles are identical on
# every Restart & Run All.
SEED = 42
# Number of negative pairs kept per positive pair
NEG_PER_POS = 1
rng = np.random.default_rng(SEED)

id_cols = ['pid', 'mid']

# Positive pairs (rating == 1). The ids are extracted as an explicit integer
# array so they stay usable as array indices whatever dtype `rating` was
# parsed as. Column 1 is shifted into the global node numbering.
pm_pos = pm.loc[pm['rating'] == 1, id_cols].to_numpy(dtype=np.int64, copy=True)
pm_pos[:, 1] += offsets['p']

# Negative pairs (rating == 0), shifted the same way
neg_ratings = pm.loc[pm['rating'] == 0, id_cols].to_numpy(dtype=np.int64, copy=True)
neg_ratings[:, 1] += offsets['p']

# Every row must be rated exactly 0 or 1
assert pm_pos.shape[0] + neg_ratings.shape[0] == pm.shape[0], \
    "pm contains ratings other than 0 and 1"

# Randomly keep NEG_PER_POS negatives per positive. If fewer negatives are
# available than requested, all of them are kept.
indices_neg = rng.permutation(neg_ratings.shape[0])[:pm_pos.shape[0] * NEG_PER_POS]
neg_data = neg_ratings[indices_neg]
np.savez("../preprocessed/neg_pairs_offset", neg_data=neg_data)

# Shuffle the positive pairs. `pm_pos` itself keeps its original order
# because later cells index with it.
indices = rng.permutation(pm_pos.shape[0])
pos_data = pm_pos[indices]
np.savez("../preprocessed/pos_pairs_offset", pos_data=pos_data)
print('==========Step 3 complete==========')



### Step 4: Create Adjacency Matrices

#### Drug-Microbe Relation

In [30]:
# Drug-microbe relation: dense square matrix over the global node numbering
# with a 1 at every positive (pid, offset mid) pair. Only that direction is
# filled; the transposed entries stay 0.
n_nodes = node_types.shape[0]
adj_offset = np.zeros((n_nodes, n_nodes), dtype=np.float32)
src_nodes, dst_nodes = pm_pos[:, 0], pm_pos[:, 1]
adj_offset[src_nodes, dst_nodes] = 1

# Relation matrices are collected in a dict keyed by a string id, stored sparse
adjs_offset = {}
adjs_offset['0'] = sp.coo_matrix(adj_offset)


#### Drug-Drug Relation

In [31]:
# Drug-drug (peptide-peptide) relation.
# Two outputs are filled in lock-step:
#   * adj_offset - square matrix over the global node numbering
#   * pp_matrix  - (max_pid x max_pid) matrix over drug indices only
n_nodes = node_types.shape[0]
pp_npy = pp.to_numpy(int)[:, :2]
pp_score = pp['weight'].tolist()
pp_matrix = np.zeros((max_pid, max_pid), dtype=float)
adj_offset = np.zeros((n_nodes, n_nodes), dtype=np.float32)

# Each record is written in both directions, so the result is symmetric.
# The loop is kept sequential on purpose: when a pair occurs more than once,
# the last record in file order wins.
for (a, b), weight in zip(pp_npy, pp_score):
    for row, col in ((a, b), (b, a)):
        adj_offset[row, col] = weight
        pp_matrix[row, col] = weight

adjs_offset['2'] = sp.coo_matrix(adj_offset)


#### Microbe-Microbe Relation

In [32]:
# Microbe-microbe relation.
# Two outputs are filled in lock-step:
#   * adj_offset - square matrix over the global node numbering, where
#                  microbe indices are shifted by offsets['p']
#   * mm_matrix  - (max_mid x max_mid) matrix over microbe indices only
n_nodes = node_types.shape[0]
shift = offsets['p']
mm_npy = mm.to_numpy(int)[:, :2]
mm_score = mm['weight'].tolist()
mm_matrix = np.zeros((max_mid, max_mid), dtype=float)
adj_offset = np.zeros((n_nodes, n_nodes), dtype=np.float32)

# NOTE(review): unlike the drug-drug cell, entries are NOT mirrored here, so
# the result is symmetric only if the input file lists both directions.
# TODO confirm this is intended.
for (m_a, m_b), weight in zip(mm_npy, mm_score):
    adj_offset[m_a + shift, m_b + shift] = weight
    mm_matrix[m_a, m_b] = weight

adjs_offset['3'] = sp.coo_matrix(adj_offset)
print('==========Step 4 complete==========')



### Step 5: Save the Processed Data

In [33]:
# Bundle the two dense similarity matrices into a single .npz archive.
# NOTE(review): the archive keys are 'dp_matrix' / 'pd_matrix' although they
# hold pp_matrix / mm_matrix; presumably downstream loaders expect these
# names, so they are left unchanged. Verify against the consumer.
matrices = {'dp_matrix': pp_matrix, 'pd_matrix': mm_matrix}
np.savez('../preprocessed/combined_matrices.npz', **matrices)

# Pickle the dict of sparse relation matrices built in Step 4
with open("../preprocessed/adjs_offset.pkl", "wb") as handle:
    pickle.dump(adjs_offset, handle)
print('==========Step 5 complete==========')

