In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle
import gseapy as gp
import sys
import time
import requests

# read data

In [3]:
#### string gene_id map
local_stringdb = os.path.join('/itf-fi-ml/shared/users/ziyuzh/svm/data/stringdb','2023')

ppidf = pd.read_csv(os.path.join(local_stringdb,'9606.protein.info.v12.0.txt'), sep='\t', header=0, usecols=['#string_protein_id', 'preferred_name'])
ppidf['preferred_name'] = ppidf['preferred_name'].str.upper()
stringId2name = ppidf.set_index('#string_protein_id')['preferred_name'].to_dict()
name2stringId = ppidf.set_index('preferred_name')['#string_protein_id'].to_dict()
ppidf = pd.read_csv(os.path.join(local_stringdb,'9606.protein.aliases.v12.0.txt'), sep='\t', header=0, usecols=['#string_protein_id', 'alias']).drop_duplicates(['alias'], keep='first')
ppidf['alias'] = ppidf['alias'].str.upper()
aliases2stringId = ppidf.set_index('alias')['#string_protein_id'].to_dict()

def string_convert(gene):
    if gene in name2stringId.keys():
        return name2stringId[gene]
    elif gene in aliases2stringId.keys():
        return aliases2stringId[gene]
    else:
        return None

In [5]:

with open('/itf-fi-ml/shared/users/ziyuzh/svm/data/stringdb/2023/name_convert.pkl', 'wb') as file:
    pickle.dump([stringId2name,name2stringId,aliases2stringId], file)

In [3]:
# # ############################ string id map with uniport ######################################################### don't delete
# ppi_features = pd.read_csv('/itf-fi-ml/shared/users/ziyuzh/svm/data/ppi_full_emb.csv')
# ppi_features['gene_name'] = ppi_features['gene_id'].map(stringId2name)
# ### string2uniport generated from uniport id map, stringid to uniport id
# string2uniport = pd.read_csv('/itf-fi-ml/shared/users/ziyuzh/svm/data/uniport_id/idmapping_string.tsv',sep='\t')
# string_dict = dict(zip(string2uniport['From'], string2uniport['Entry']))
# ppi_features['uniport_id'] = ppi_features['gene_id'].map(string_dict)
# ppi_features.to_csv('/itf-fi-ml/shared/users/ziyuzh/svm/data/ppi_full_emb_uniport.csv',index=False)

# ## 512 unmapped string id
# ### unipor_id_maps generated from uniport id map, gene name to uniport id
# unipor_id_maps = pd.read_csv('/itf-fi-ml/shared/users/ziyuzh/svm/data/uniport_id/idmapping.tsv',sep='\t')
# ### returns multiple uniport id for each gene name, take the first human species
# first_to_items = unipor_id_maps.groupby('From').first().reset_index()
# uniport_dict = dict(zip(first_to_items['From'], first_to_items['To']))
# ppi_features.loc[ppi_features['uniport_id'].isna(), 'uniport_id'] = ppi_features.loc[ppi_features['uniport_id'].isna(), 'gene_name'].map(uniport_dict)
# ppi_features = ppi_features.dropna(subset=['uniport_id'])
# len(ppi_features)


all_df = pd.read_csv('/itf-fi-ml/shared/users/ziyuzh/svm/data/disgent_2020/disgenet_string.csv')
ppi_features = pd.read_csv('/itf-fi-ml/shared/users/ziyuzh/svm/data/ppi_full_emb_uniport.csv')

In [4]:
def read_data(disease):
    pos_genes_list = all_df[all_df['disease_id']==disease]['string_id']
    df = ppi_features.iloc[:, 1:]
    df = df.rename(columns={'gene_id': 'string_id'})
    df['label'] = df['string_id'].isin(pos_genes_list).astype(int)
    df.head(3)

    X = df.loc[:, df.columns.str.startswith("network_")].to_numpy()
    y = df['label'].to_numpy()
    return df, X, y

In [5]:
# selected_diseases = ['ICD10_G20', 'ICD10_C50']
selected_diseases = ['ICD10_G20']
for disease in selected_diseases:
    methods = ['ooc', 'random_negative', 'pseudo_labeling']
    df, X, y = read_data(disease)

In [6]:
df[df['label']==1]['string_id']

107      9606.ENSP00000389103
172      9606.ENSP00000299138
200      9606.ENSP00000360108
224      9606.ENSP00000270142
591      9606.ENSP00000298910
                 ...         
16871    9606.ENSP00000496339
17092    9606.ENSP00000369822
17135    9606.ENSP00000480474
17985    9606.ENSP00000373627
18056    9606.ENSP00000376776
Name: string_id, Length: 146, dtype: object

### KEGG features (no need run, use prepared data)

In [None]:
from collections import defaultdict
def invert_dict(original_dict):
    inverted_dict = defaultdict(list)

    for key, values in original_dict.items():
        for value in values:
            inverted_dict[string_convert(value)].append(key)  # Swap key and value

    return dict(inverted_dict) 

keggs = gp.get_library(name='KEGG_2021_Human',organism='human')
kegg_features = invert_dict(keggs)
# Save dictionary to a Pickle file
with open("/itf-fi-ml/shared/users/ziyuzh/svm/data/kegg/kegg_features.pkl", "wb") as f:
    pickle.dump(kegg_features, f)

### CD features (no need run, use prepared data)

In [104]:
import requests
import time
import pandas as pd

def fetch_cd_search_results(query):
    """
    Performs a batch CD-Search for a given query and returns the results as a DataFrame.
    
    Parameters:
        query (str): The query sequence for CD-Search.
    
    Returns:
        pd.DataFrame: The search results as a DataFrame.
    """
    BWRPSB_URL = "https://www.ncbi.nlm.nih.gov/Structure/bwrpsb/bwrpsb.cgi"
    
    # Default parameters
    params = {
        "useid1": "true",
        "maxhit": 250,
        "filter": "true",
        "db": "cdd",
        "evalue": 0.01,
        "cddefl": "false",
        "qdefl": "false",
        "dmode": "rep",
        "clonly": "false",
        "tdata": "hits",
        "queries": query,
    }
    
    # Submit search request
    response = requests.post(BWRPSB_URL, data=params)
    if response.status_code != 200:
        raise Exception(f"Error: {response.status_code} - {response.reason}")
    
    # Extract request ID
    rid = None
    for line in response.text.split("\n"):
        if line.startswith("#cdsid"):
            rid = line.split()[1]
            break
    
    if not rid:
        raise Exception("Failed to retrieve request ID")
    
    # Check search status
    while True:
        time.sleep(5)
        status_response = requests.post(BWRPSB_URL, data={"tdata": "hits", "cdsid": rid})
        if status_response.status_code != 200:
            raise Exception(f"Error: {status_response.status_code} - {status_response.reason}")

        for line in status_response.text.split("\n"):
            if line.startswith("#status"):
                status = int(line.split()[1])
                if status == 0:
                    break
                elif status == 3:
                    continue
                else:
                    error_messages = {
                        1: "Invalid request ID",
                        2: "Invalid input (missing query or ID)",
                        4: "Queue Manager Service error",
                        5: "Data corrupted or unavailable",
                    }
                    raise Exception(error_messages.get(status, "Unknown error"))
        else:
            continue
        break
    
    # Retrieve results
    result_response = requests.post(BWRPSB_URL, data={"tdata": params["tdata"], "cdsid": rid})
    if result_response.status_code != 200:
        raise Exception(f"Error retrieving results: {result_response.status_code}")
    
    # Process results
    filtered_data = [line for line in result_response.text.split('\n') if not line.startswith('#') and line.strip() != '']
    if not filtered_data:
        return pd.DataFrame()  # Return an empty DataFrame if no data is found
    
    header = filtered_data[0].split('\t')
    data_rows = [line.split('\t') for line in filtered_data[1:]]
    
    return pd.DataFrame(data_rows, columns=header)


In [None]:
def batch(iterable, size):
    """Splits an iterable into batches of given size."""
    for i in range(0, len(iterable), size):
        yield iterable[i:i + size]

def process_in_batches(data, func, batch_size=1000):
    """Applies func to each batch of data and stacks results into a single DataFrame."""
    results = [func(batch) for batch in batch(data, batch_size)]
    return pd.concat(results, ignore_index=True) if results else pd.DataFrame()


cd_results = process_in_batches(df['uniport_id'].dropna().tolist(), fetch_cd_search_results)
cd_results['uniport_id'] = cd_results['Query'].str.split('-').str[-1].str.strip()
cd_results = cd_results.merge(df.dropna(subset=['uniport_id'])[['uniport_id', 'string_id']], how='left', on='uniport_id')

cd_dict = dict()
for string_id, subdf in cd_results.groupby('string_id'):
    cd_dict[string_id] = subdf['Accession'].tolist()

# Save dictionary to a Pickle file
with open("/itf-fi-ml/shared/users/ziyuzh/svm/data/cd/cd_features.pkl", "wb") as f:
    pickle.dump(cd_dict, f)

### read kegg and cd features

In [3]:

# Read dictionary from a Pickle file
with open("/itf-fi-ml/shared/users/ziyuzh/svm/data/kegg/kegg_features.pkl", "rb") as f:
    kegg_features = pickle.load(f)

# Read dictionary from a Pickle file
with open("/itf-fi-ml/shared/users/ziyuzh/svm/data/cd/cd_features.pkl", "rb") as f:
    cd_features = pickle.load(f)


In [6]:
def merge_values(query_dict):
    all_items = []
    for key in query_dict:
        all_items.extend(query_dict[key])
    return all_items

def sparsity(query_dict):
    items = merge_values(query_dict)
    return 1-(len(items)/(len(query_dict)*len(set(items))))

print(sparsity(kegg_features),sparsity(cd_features))

0.9869735533743905 0.9998526575554529


In [7]:
len(set(merge_values(kegg_features))),len(kegg_features)

(320, 7794)

In [8]:
len(set(merge_values(cd_features))),len(cd_features)

(15218, 17791)

### calculate similarility score

In [9]:
from itertools import combinations

def unique_pairs(items):
    return list(combinations(set(items), 2))

def jaccard_similarity(set1, set2):
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union != 0 else 0

def feature_similarity(gene1,gene2,query_dict):
    jaccard_score = jaccard_similarity(set(query_dict[gene1]), set(query_dict[gene2]))
    return jaccard_score

def pair_similarity(query_dict):
    pairs = unique_pairs(list(query_dict.keys()))
    score_dict = dict()
    for pair in pairs:
        score = feature_similarity(pair[0],pair[1],query_dict)
        score_dict[pair] = score
    return score_dict


In [10]:
kegg_dict = pair_similarity(kegg_features)
print(1 - (list(kegg_dict.values()).count(0)/len(kegg_dict)))

0.05289568377244913


In [11]:
cd_dict = pair_similarity(cd_features)
print(1 - (list(cd_dict.values()).count(0)/len(cd_dict)))

0.0030410244943560594


In [12]:
# Save dictionary to a Pickle file
with open("/itf-fi-ml/shared/users/ziyuzh/svm/data/kegg/kegg_sim.pkl", "wb") as f:
    pickle.dump(kegg_dict, f)
# Save dictionary to a Pickle file
with open("/itf-fi-ml/shared/users/ziyuzh/svm/data/cd/cd_sim.pkl", "wb") as f:
    pickle.dump(cd_dict, f)

# svm

In [7]:
from sklearn import svm
from sklearn import metrics
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold,ParameterGrid,GridSearchCV, train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.metrics import precision_score, recall_score, roc_curve, auc
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report, confusion_matrix
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import fcluster

import matplotlib.pyplot as plt



### mask high positive samples (based on train set)

In [None]:
def mask_enrich(train_pos):
    enr = gp.enrichr(gene_list=train_pos.map(stringId2name),
                    gene_sets=['KEGG_2021_Human'],
                    organism='human', 
                    outdir=None, 
                    )
    enr_df = enr.results
    gene_lists = enr_df[enr_df['Adjusted P-value']<0.01]['Genes'].to_list()
    mask_gene_pool = set()
    for items in gene_lists:
        genes = items.split(';')
        mask_gene_pool.update(set(genes))
    mask_index = df[df['string_id'].isin([string_convert(gene) for gene in list(mask_gene_pool)])].index
    return mask_index

def mask_ppi_loop(train_pos, threshold):
    ppi_connection = pd.read_csv(os.path.join(local_stringdb,'9606.protein.links.v12.0.txt'), sep=' ', header=0).convert_dtypes().replace(0, float('nan'))
    ppi_connection_med = ppi_connection[ppi_connection['combined_score']>threshold]
    first_loop_genes = set(ppi_connection_med[ppi_connection_med['protein1'].isin(list(train_pos))]['protein2'].tolist())
    mask_index = df[df['string_id'].isin(first_loop_genes)].index
    return mask_index

In [None]:
## all 17013
## med confidence (400) 8549
## high confidence (700) 3692
## highest confidence (900) 1950

In [8]:
def average_rank_ratio(y_scores, y_test):
    """
    Calculate the average predicted rank of true positives.

    Parameters:
    y_scores (array-like): Decision function scores from the classifier.
    y_test (array-like): True binary labels (0 for negative, 1 for positive).

    Returns:
    float: The average rank of true positives.
    """
    
    # Convert inputs to numpy arrays for consistency
    y_scores = np.array(y_scores)
    y_test = np.array(y_test)

    # Step 1: Sort scores in descending order and assign ranks
    sorted_indices = np.argsort(-y_scores)  # Negative for descending sort
    ranks = np.empty_like(sorted_indices)
    ranks[sorted_indices] = np.arange(1, len(y_scores) + 1)  # Rank starts from 1

    # Step 2: Identify true positives
    true_positive_indices = np.where(y_test == 1)[0]

    # Step 3: Extract ranks of true positives
    true_positive_ranks = ranks[true_positive_indices]

    # Step 4: Calculate the average rank of true positives
    average_rank = np.mean(true_positive_ranks)

    rank_ratio = average_rank/y_test.shape[0]

    return round(rank_ratio,4)

def eval(clf, X_train, y_train, X_test, y_test, print_result = False):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    # Check if predict_proba or decision_function is available for AUROC
    if hasattr(clf, "predict_proba"):
        y_scores = clf.predict_proba(X_test)
        if y_scores.shape[1] == 2:
            y_scores = y_scores[:, 1]  # Use positive class probabilities for binary classification
    elif hasattr(clf, "decision_function"):
        y_scores = clf.decision_function(X_test)
    else:
        y_scores = None  # AUROC cannot be computed without scores

    rank_ratio = average_rank_ratio(y_scores, y_test)

    # Compute AUROC if scores are available
    if y_scores is not None:
        try:
            auroc = roc_auc_score(y_test, y_scores, multi_class='ovr' if len(set(y_test)) > 2 else 'raise')
        except:
            auroc = "AUROC computation failed (possibly due to label issues)"
    else:
        auroc = "AUROC not available (no predict_proba or decision_function)"
    if print_result == True:
        # Print classification metrics
        print(classification_report(y_test, y_pred))
        
        print(f"""
        Accuracy: {accuracy_score(y_test, y_pred)}
        Recall: {recall_score(y_test, y_pred, average="weighted")}
        Precision: {precision_score(y_test, y_pred, average="weighted")}
        F1-score: {f1_score(y_test, y_pred, average="weighted")}
        AUROC: {auroc}
        Rank ratio: {rank_ratio * 100:.2f}%
        """)
        
        print("Confusion Matrix:")
        print(confusion_matrix(y_test, y_pred))

    return (
        accuracy_score(y_test, y_pred), 
        recall_score(y_test, y_pred, average="weighted"), 
        precision_score(y_test, y_pred, average="weighted"), 
        f1_score(y_test, y_pred, average="weighted"),
        auroc,
        rank_ratio
    )


def build_normalized_linear_kernel():
    """
    Returns a function that computes a normalized linear kernel (cosine similarity).
    The scaling factor C is optional and can adjust the magnitude of the kernel.
    """
    def normalized_linear_kernel(X, Y):
        # Step 1: Manually compute the linear kernel (dot product)
        K = np.dot(X, Y.T)
        
        # Step 2: Compute norms of X and Y for normalization
        X_norms = np.linalg.norm(X, axis=1)
        Y_norms = np.linalg.norm(Y, axis=1)
        
        # Avoid division by zero
        X_norms[X_norms == 0] = 1e-10
        Y_norms[Y_norms == 0] = 1e-10
        
        # Normalize the kernel (cosine similarity)
        normalization_matrix = np.outer(X_norms, Y_norms)
        K_normalized = K / normalization_matrix
        
        return K_normalized
    
    return normalized_linear_kernel



def build_normalized_poly_kernel(coef0=1.0, degree=2):
    """
    Returns a function that computes a normalized polynomial kernel with dynamic parameters.
    
    Parameters:
    - coef0: Independent term added to the dot product (default=1.0).
    - degree: Degree of the polynomial kernel (default=2).
    """
    def custom_poly_kernel(X, Y):
        # Step 1: Compute the polynomial kernel
        K = (np.dot(X, Y.T) + coef0) ** degree
        
        # Step 2: Compute self-similarities (for normalization)
        K_self_X = (np.sum(X * X, axis=1) + coef0) ** degree  # Diagonal for X
        K_self_Y = (np.sum(Y * Y, axis=1) + coef0) ** degree  # Diagonal for Y
        
        # Step 3: Avoid division by zero (replace zeros with a small number)
        K_self_X[K_self_X == 0] = 1e-10
        K_self_Y[K_self_Y == 0] = 1e-10
        
        # Step 4: Normalize the kernel
        normalization_matrix = np.outer(np.sqrt(K_self_X), np.sqrt(K_self_Y))
        K_normalized = K / normalization_matrix
        
        return K_normalized
    
    return custom_poly_kernel

In [9]:
def getLinkageMat(model):
    children = model.children_
    cs = np.zeros(len(children))
    N = len(model.labels_)
    for i,child in enumerate(children):
        count = 0
        for idx in child:
            count += 1 if idx < N else cs[idx - N]
        cs[i] = count
    return np.column_stack([children, model.distances_, cs])

def calculate_proportion(hierarchical_labels,y_train):
    proportion = []
    unique_clusters = np.unique(hierarchical_labels)
    # Total number of samples with label 1
    total_y1 = np.sum(y_train == 1)
    print('total positive',total_y1)
    for c in unique_clusters:
        # Indices of samples in the current cluster
        cluster_indices = np.where(hierarchical_labels == c)[0]
        
        # Number of samples with label 1 in this cluster
        cluster_y1 = np.sum(y_train[cluster_indices] == 1)
        
        # Calculate A_i using the corrected formula
        A_i = len(cluster_indices) * (1 - (cluster_y1 / total_y1))
        proportion.append(A_i)
    return proportion

def cluster_negative_sampling(hierarchical_labels,y_train,proportion,neg_size):
    neg_cluster = []
    all_sampled_negatives = []
    unique_clusters = np.unique(hierarchical_labels)
    total_proportion = sum(proportion)
    if total_proportion == 0:
        raise ValueError("Sum of proportions is zero. Check the 'proportion' calculation.")

    for index, c in enumerate(unique_clusters):
        A_i = proportion[index]
        neg_i = int((A_i / total_proportion) * neg_size)
        cluster_indices = np.where(hierarchical_labels == c)[0]
        negative_indices = np.array(cluster_indices)[y_train[cluster_indices] == 0]
        # Check if there are enough negative samples to draw from
        if len(negative_indices) == 0:
            continue  
        elif neg_i > len(negative_indices):
            neg_i = len(negative_indices)  # Adjust to the maximum available
        # Sample negative indices without replacement
        sampled_indices = np.random.choice(negative_indices, size=neg_i, replace=False)
        # Store the sampled indices and the count
        all_sampled_negatives.extend(sampled_indices)
        neg_cluster.append(neg_i)

    return all_sampled_negatives

def generate_pesudo_labels(negative_ratio,X_train,y_train):
    neg_size = negative_ratio * np.sum(y_train == 1)
    # clustering
    model = AgglomerativeClustering(distance_threshold=0, n_clusters=None)
    model = model.fit(X_train)
    mat = getLinkageMat(model)
    # get first level labels
    n_clusters = 5
    hierarchical_labels = fcluster(mat, n_clusters, criterion='maxclust')
    # sample negatives
    proportion = calculate_proportion(hierarchical_labels,y_train)
    all_sampled_negatives = cluster_negative_sampling(hierarchical_labels,y_train,proportion,neg_size)
    # return selected train set
    positive_indices = np.where(y_train == 1)[0]
    balanced_indices = np.concatenate([positive_indices, all_sampled_negatives])

    return balanced_indices

In [10]:
# def per_disease_val(df,X,y,methods):
#     result_df = pd.DataFrame(columns=['method',"fold","kernel", "accuracy", "recall", "precision", "f1-score",'auroc',"rank_ratio"])
#     kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

#     for fold, (train_idx, test_idx) in enumerate(kf.split(df, df['label'])):
#         X_train = X[df.index.get_indexer(train_idx)]
#         y_train = y[df.index.get_indexer(train_idx)]

#         X_test = X[df.index.get_indexer(test_idx)]
#         y_test = y[df.index.get_indexer(test_idx)]
#         for method in methods:
#             if method == 'ooc':
#                 X_pos_train = X_train[np.where(y_train == 1)]
#                 y_pos_train = y_train[np.where(y_train == 1)]
#                 y_test_ooc = np.where(y_test == 0, -1, y_test)
                
#                 clf = svm.OneClassSVM(nu=0.8,kernel=build_normalized_linear_kernel())
#                 result_df.loc[len(result_df.index)] = ["ooc_svm",fold,"norm_linear", *eval(clf, X_pos_train, y_pos_train, X_test, y_test_ooc)]
#                 clf = svm.OneClassSVM(nu= 0.8, kernel=build_normalized_poly_kernel(degree = 3))
#                 result_df.loc[len(result_df.index)] = ["ooc_svm",fold,"norm_poly", *eval(clf, X_pos_train, y_pos_train, X_test, y_test_ooc)]
#                 clf = svm.OneClassSVM(nu= 0.8, kernel='rbf')
#                 result_df.loc[len(result_df.index)] = ["ooc_svm",fold,"rbf", *eval(clf, X_pos_train, y_pos_train, X_test, y_test_ooc)]
#             elif method == 'random_negative':
#                 negative_indices = np.where(y_train == 0)[0]
#                 positive_indices = np.where(y_train == 1)[0]
#                 # Randomly select the same number of negative samples as positive samples
#                 random_neg_indices = np.random.choice(negative_indices, size=5*len(positive_indices), replace=False)
#                 # Combine positive and selected negative samples
#                 balanced_indices = np.concatenate([positive_indices, random_neg_indices])
#                 # Extract the balanced subset
#                 X_train_balance = X_train[balanced_indices]
#                 y_train_balance = y_train[balanced_indices] 
#                 clf = SVC(C=0.8,kernel=build_normalized_linear_kernel())
#                 result_df.loc[len(result_df.index)] = ["random_svm",fold,"norm_linear", *eval(clf, X_train_balance, y_train_balance, X_test, y_test)]
#                 clf = SVC(C=0.8,kernel=build_normalized_poly_kernel(degree = 3))
#                 result_df.loc[len(result_df.index)] = ["random_svm",fold,"norm_poly", *eval(clf, X_train_balance, y_train_balance, X_test, y_test)]
#                 clf = SVC(C=0.8,kernel='rbf')
#                 result_df.loc[len(result_df.index)] = ["random_svm",fold,"rbf", *eval(clf, X_train_balance, y_train_balance, X_test, y_test)]
#             elif method == 'pseudo_labeling':
#                 negative_ratio = 5
#                 balanced_indices = generate_pesudo_labels(negative_ratio,X_train,y_train)
#                 X_train_balance = X_train[balanced_indices]
#                 y_train_balance = y_train[balanced_indices]
                
#                 clf = SVC(C=0.8,kernel=build_normalized_linear_kernel())
#                 result_df.loc[len(result_df.index)] = ["pseudo_svm",fold,"norm_linear", *eval(clf, X_train_balance, y_train_balance, X_test, y_test)]
#                 clf = SVC(C=0.8,kernel=build_normalized_poly_kernel(degree = 3))
#                 result_df.loc[len(result_df.index)] = ["pseudo_svm",fold,"norm_poly", *eval(clf, X_train_balance, y_train_balance, X_test, y_test)]
#                 clf = SVC(C=0.8,kernel='rbf')
#                 result_df.loc[len(result_df.index)] = ["pseudo_svm",fold,"rbf", *eval(clf, X_train_balance, y_train_balance, X_test, y_test)] 
#     return result_df

In [11]:
def per_disease_val(df,X,y,methods):
    result_df = pd.DataFrame(columns=['method',"fold","kernel", "accuracy", "recall", "precision", "f1-score",'auroc',"rank_ratio"])
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for fold, (train_idx, test_idx) in enumerate(kf.split(df, df['label'])):
        X_train = X[df.index.get_indexer(train_idx)]
        y_train = y[df.index.get_indexer(train_idx)]

        X_test = X[df.index.get_indexer(test_idx)]
        y_test = y[df.index.get_indexer(test_idx)]
        for method in methods:
            if method == 'ooc':
                X_pos_train = X_train[np.where(y_train == 1)]
                y_pos_train = y_train[np.where(y_train == 1)]
                X_all_test = np.vstack((X_train[np.where(y_train != 1)], X_test))
                y_all_test = np.concatenate((y_train[np.where(y_train != 1)], y_test))
                y_test_ooc = np.where(y_all_test == 0, -1, y_all_test)
                
                clf = svm.OneClassSVM(nu=0.8,kernel=build_normalized_linear_kernel())
                result_df.loc[len(result_df.index)] = ["ooc_svm",fold,"norm_linear", *eval(clf, X_pos_train, y_pos_train, X_all_test, y_test_ooc)]
                clf = svm.OneClassSVM(nu= 0.8, kernel=build_normalized_poly_kernel(degree = 3))
                result_df.loc[len(result_df.index)] = ["ooc_svm",fold,"norm_poly", *eval(clf, X_pos_train, y_pos_train, X_all_test, y_test_ooc)]
                clf = svm.OneClassSVM(nu= 0.8, kernel='rbf')
                result_df.loc[len(result_df.index)] = ["ooc_svm",fold,"rbf", *eval(clf, X_pos_train, y_pos_train, X_all_test, y_test_ooc)]
            elif method == 'random_negative':
                negative_indices = np.where(y_train == 0)[0]
                positive_indices = np.where(y_train == 1)[0]
                # Randomly select the same number of negative samples as positive samples
                random_neg_indices = np.random.choice(negative_indices, size=5*len(positive_indices), replace=False)
                # Combine positive and selected negative samples
                balanced_indices = np.concatenate([positive_indices, random_neg_indices])
                # Extract the balanced subset
                X_train_balance = X_train[balanced_indices]
                y_train_balance = y_train[balanced_indices] 
                unused_neg_indices = np.setdiff1d(negative_indices, random_neg_indices)
                X_new_test = np.vstack((X_train[unused_neg_indices], X_test))
                y_new_test = np.concatenate((y_train[unused_neg_indices], y_test))

                clf = SVC(C=0.8,kernel=build_normalized_linear_kernel())
                result_df.loc[len(result_df.index)] = ["random_svm",fold,"norm_linear", *eval(clf, X_train_balance, y_train_balance, X_new_test, y_new_test)]
                clf = SVC(C=0.8,kernel=build_normalized_poly_kernel(degree = 3))
                result_df.loc[len(result_df.index)] = ["random_svm",fold,"norm_poly", *eval(clf, X_train_balance, y_train_balance, X_new_test, y_new_test)]
                clf = SVC(C=0.8,kernel='rbf')
                result_df.loc[len(result_df.index)] = ["random_svm",fold,"rbf", *eval(clf, X_train_balance, y_train_balance, X_new_test, y_new_test)]
            elif method == 'pseudo_labeling':
                negative_ratio = 5
                balanced_indices = generate_pesudo_labels(negative_ratio,X_train,y_train)

                all_train_indices = np.arange(len(y_train))
                non_selected_indices = np.setdiff1d(all_train_indices, balanced_indices)

                # Step 4: Create the new test set with non-selected training data + original test data
                X_new_test = np.vstack((X_train[non_selected_indices], X_test))
                y_new_test = np.concatenate((y_train[non_selected_indices], y_test))
                clf = SVC(C=0.8,kernel=build_normalized_linear_kernel())
                result_df.loc[len(result_df.index)] = ["pseudo_svm",fold,"norm_linear", *eval(clf, X_train_balance, y_train_balance, X_new_test, y_new_test)]
                clf = SVC(C=0.8,kernel=build_normalized_poly_kernel(degree = 3))
                result_df.loc[len(result_df.index)] = ["pseudo_svm",fold,"norm_poly", *eval(clf, X_train_balance, y_train_balance, X_new_test, y_new_test)]
                clf = SVC(C=0.8,kernel='rbf')
                result_df.loc[len(result_df.index)] = ["pseudo_svm",fold,"rbf", *eval(clf, X_train_balance, y_train_balance, X_new_test, y_new_test)] 
    return result_df

In [15]:
selected_diseases = (
    all_df.groupby('disease_id')
    .filter(lambda x: len(x) > 15)
    ['disease_id']
    .unique()
    .tolist()
)
# selected_diseases = ['ICD10_G20', 'ICD10_C50']
all_results = []

for disease in selected_diseases:
    methods = ['ooc', 'random_negative', 'pseudo_labeling']
    df, X, y = read_data(disease)
    result_df = per_disease_val(df, X, y, methods)

    # Calculate mean metrics
    mean_df = result_df.groupby(['method', 'kernel'])[['accuracy', 'recall', 'precision', 
                                                       'f1-score', 'auroc', 'rank_ratio']].mean().reset_index()
    
    # Keep the row with the lowest rank_ratio for each method
    mean_df = mean_df.loc[mean_df.groupby('method')['rank_ratio'].idxmin()].reset_index(drop=True)

    # Add disease information
    mean_df['disease'] = disease

    # Append to all_results list
    all_results.append(mean_df)

# Concatenate all results into a single DataFrame
final_result = pd.concat(all_results, ignore_index=True)

total positive 76
total positive 76
total positive 76
total positive 76
total positive 76
total positive 40
total positive 40
total positive 40
total positive 40
total positive 40
total positive 15
total positive 15


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 16


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 15
total positive 15
total positive 41
total positive 41
total positive 42
total positive 42
total positive 42
total positive 34
total positive 34
total positive 35
total positive 35
total positive 34


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 572


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 572


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 572


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 572


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 572


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 22
total positive 22
total positive 23
total positive 23
total positive 22


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 21
total positive 20
total positive 21


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 21
total positive 21
total positive 99
total positive 99
total positive 100
total positive 99
total positive 99


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 452


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 452


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 452


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 452


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 452


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 159
total positive 159
total positive 160
total positive 159
total positive 159
total positive 18
total positive 18
total positive 19
total positive 19
total positive 18
total positive 17
total positive 16
total positive 17


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 17


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 17
total positive 23
total positive 23
total positive 24
total positive 23
total positive 23
total positive 13
total positive 13
total positive 14
total positive 14
total positive 14
total positive 17


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 17


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 18
total positive 18
total positive 18
total positive 16
total positive 16
total positive 16


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 16


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 16


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 21
total positive 21
total positive 22
total positive 22
total positive 22
total positive 17
total positive 16
total positive 17
total positive 17
total positive 17
total positive 90
total positive 90
total positive 91
total positive 91
total positive 90
total positive 230
total positive 230
total positive 231
total positive 231
total positive 230
total positive 227
total positive 227
total positive 228
total positive 227
total positive 227
total positive 38
total positive 38
total positive 39
total positive 39
total positive 38
total positive 33
total positive 32
total positive 33
total positive 33
total positive 33
total positive 751
total positive 751
total positive 752
total positive 751
total positive 751
total positive 375
total positive 375
total positive 376
total positive 375
total positive 375


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 17


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 16
total positive 17


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 17


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 17


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 24


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 24


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 24


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 24


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 24


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 29
total positive 29
total positive 30
total positive 30
total positive 30


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 30


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 30


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 31


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 31


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 30


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 30
total positive 30
total positive 31
total positive 31
total positive 30
total positive 117
total positive 116
total positive 117
total positive 117
total positive 117


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 24


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 24


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 24


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 24


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 24


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 158
total positive 158
total positive 159
total positive 159
total positive 158
total positive 95
total positive 95
total positive 96
total positive 95
total positive 95
total positive 109
total positive 108
total positive 109
total positive 109
total positive 109
total positive 53
total positive 53
total positive 54
total positive 54
total positive 54
total positive 26
total positive 26
total positive 27
total positive 27
total positive 26


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 20


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 20


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 20


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 20


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 20


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 19


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 19


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 20


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 19


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 19


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 29
total positive 29
total positive 30
total positive 30
total positive 30
total positive 133
total positive 132
total positive 133
total positive 133
total positive 133
total positive 13
total positive 13


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 14


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 14
total positive 14
total positive 137
total positive 136
total positive 137
total positive 137
total positive 137
total positive 17
total positive 17
total positive 18
total positive 18


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 18
total positive 128
total positive 128
total positive 128
total positive 128
total positive 128
total positive 22
total positive 22
total positive 23
total positive 23
total positive 22
total positive 39
total positive 39
total positive 40
total positive 39
total positive 39
total positive 55
total positive 55
total positive 56
total positive 55
total positive 55
total positive 44
total positive 44
total positive 44
total positive 44
total positive 44
total positive 164
total positive 164
total positive 164
total positive 164
total positive 164
total positive 13
total positive 12
total positive 13
total positive 13
total positive 13
total positive 42
total positive 42
total positive 43
total positive 43
total positive 42
total positive 14
total positive 14
total positive 15
total positive 15
total positive 14
total positive 51
total positive 51
total positive 52
total positive 51
total positive 51
total positive 60
total positive 60
total positive 60
total positive 60


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 14
total positive 14
total positive 14
total positive 31
total positive 31
total positive 32
total positive 31
total positive 31
total positive 14
total positive 14
total positive 15


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 15


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 14
total positive 90
total positive 90
total positive 91
total positive 91
total positive 90
total positive 33
total positive 32
total positive 33
total positive 33
total positive 33
total positive 33
total positive 32


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 33


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


total positive 33
total positive 33
total positive 21
total positive 20
total positive 21
total positive 21
total positive 21
total positive 64
total positive 64
total positive 64
total positive 64
total positive 64
total positive 75
total positive 75
total positive 76
total positive 75
total positive 75
total positive 81
total positive 80
total positive 81
total positive 81
total positive 81
total positive 74
total positive 74
total positive 75
total positive 75
total positive 74
total positive 131
total positive 131
total positive 132
total positive 131
total positive 131
total positive 26
total positive 26
total positive 27
total positive 27
total positive 26


In [16]:
final_result.to_csv('/itf-fi-ml/shared/users/ziyuzh/svm/results/all_test.csv',index=False)

In [17]:
final_result

Unnamed: 0,method,kernel,accuracy,recall,precision,f1-score,auroc,rank_ratio,disease
0,ooc_svm,norm_poly,0.996275,0.996275,0.998279,0.997242,0.900764,0.09966,ICD10_C16
1,pseudo_svm,norm_linear,0.970253,0.970253,0.998654,0.983953,0.921781,0.07864,ICD10_C16
2,random_svm,norm_linear,0.970166,0.970166,0.998654,0.983907,0.921517,0.07892,ICD10_C16
3,ooc_svm,norm_poly,0.997028,0.997028,0.999050,0.998020,0.896439,0.10380,ICD10_C18
4,pseudo_svm,rbf,0.979396,0.979396,0.999174,0.989086,0.923816,0.07644,ICD10_C18
...,...,...,...,...,...,...,...,...,...
211,pseudo_svm,rbf,0.972024,0.972024,0.997074,0.984153,0.871784,0.12888,ICD10_N80
212,random_svm,rbf,0.971595,0.971595,0.997073,0.983931,0.870868,0.12980,ICD10_N80
213,ooc_svm,norm_linear,0.999153,0.999153,0.999368,0.999256,0.908378,0.09178,ICD10_N97
214,pseudo_svm,rbf,0.985369,0.985369,0.999498,0.992299,0.913964,0.08620,ICD10_N97
