In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer,matthews_corrcoef,recall_score
from sklearn.model_selection import  KFold


In [3]:
def cross_val(model, X, Y, cv=5):
    """Cross-validate ``model`` on ``(X, Y)`` and return the per-key mean over folds.

    Parameters
    ----------
    model : estimator
        Passed unchanged to ``sklearn.model_selection.cross_validate``.
    X, Y :
        Feature matrix and labels. ``Specificity`` is computed as recall with
        ``pos_label=0``, so it is only meaningful when the negative class is
        labelled ``0``.
    cv : int or cross-validation splitter, default 5
        Forwarded to ``cross_validate``. The default reproduces the previously
        hard-coded five folds, so existing three-argument calls are unaffected.

    Returns
    -------
    dict
        One entry per key returned by ``cross_validate`` mapped to the mean of
        its per-fold values. Besides the ``test_<metric>`` entries this also
        includes the timing keys (``fit_time``, ``score_time``).
    """
    scoring = {
        'AUC': 'roc_auc',
        'Accuracy': "accuracy",
        "f1": "f1",
        "Recall": "recall",
        "Precision": "precision",
        "MCC": make_scorer(matthews_corrcoef),
        "Average Precision": "average_precision",
        # Same computation as "Recall"; kept under its own name in the report.
        "Sensitivity": make_scorer(recall_score),
        # Recall of the class labelled 0.
        "Specificity": make_scorer(recall_score, pos_label=0),
    }

    scores = cross_validate(model, X, Y, scoring=scoring, cv=cv)

    return {metric: values.mean() for metric, values in scores.items()}



def model_hyperparameter_tuning(model, param_grid, datasets=None):
    """Grid-search ``model`` on every dataset and append the scores to ``model_results.csv``.

    For each ``(dataframe, name)`` pair, column ``"3"`` (one whitespace-separated
    string of numbers per row) is expanded into a float feature matrix and
    column ``"4"`` is used as the label. A ``GridSearchCV`` (5 folds, MCC
    scoring) is then evaluated with ``cross_val``, so the hyper-parameter search
    is repeated inside every outer cross-validation fold.

    Parameters
    ----------
    model : estimator
        Base estimator handed to ``GridSearchCV``.
    param_grid : dict or list of dict
        Grid handed to ``GridSearchCV``.
    datasets : iterable of (DataFrame, str), optional
        Datasets to evaluate. When omitted, the notebook-level ``dataset_list``
        is used, which is what the function always did before.

    Returns
    -------
    pandas.DataFrame
        One row per dataset, indexed by ``dataset_name``, with a ``model``
        column (class name of ``model``) followed by the mean scores. The same
        rows are appended to ``model_results.csv``.

    Notes
    -----
    The CSV header is written only when the file is new or empty, and a
    ``model`` column identifies each run. A ``model_results.csv`` written by the
    earlier version (header repeated per call, no ``model`` column) should be
    moved aside before appending to it.
    """
    if datasets is None:
        # Fall back to the notebook global, resolved at call time.
        datasets = dataset_list

    mcc_scorer = make_scorer(matthews_corrcoef)
    model_label = type(model).__name__
    results_list = []

    for dataset, name in datasets:
        # Column "3" holds all features of a row as a single space-separated string.
        features = dataset["3"].str.split(expand=True).astype(float)
        labels = dataset["4"].astype('category')

        grid = GridSearchCV(model, param_grid, cv=5, scoring=mcc_scorer, verbose=1)

        # No standalone grid.fit(...) here: cross_validate clones the estimator
        # and refits it in every fold, so a prior fit on the full data does not
        # influence the reported scores and only added an extra full search.
        results = cross_val(grid, features, labels)
        results_list.append({'dataset_name': name, 'model': model_label, **results})

    if not results_list:
        # Nothing was evaluated; avoid a KeyError on the missing index column
        # and leave the results file untouched.
        return pd.DataFrame()

    results_df = pd.DataFrame(results_list).set_index('dataset_name')

    # newline='' leaves line endings to pandas, as to_csv expects for file objects.
    with open('model_results.csv', 'a', newline='') as f:
        # A file opened for appending starts positioned at its end, so tell()
        # is 0 only for a new or empty file: write the header just once.
        results_df.to_csv(f, header=(f.tell() == 0))

    return results_df



In [12]:
# Identifiers listed for exclusion. This cell loads the files without applying
# the filter, so the counts printed below describe the unfiltered data.
values_to_remove = [
    'ENSG00000142599',
    'ENSG00000135636',
    'ENSG00000285508',
]

# Dataset name -> gzip-compressed CSV under gene_lists/ (insertion order is the load order).
dataset_paths = {
    label: f'gene_lists/{label}.csv.gz'
    for label in ('cat_1', 'cat_1_sd', 'cat_1_2', 'cat_1_2_sd', 'cat_1_2_3', 'complete')
}

# (DataFrame, name) pairs in the order of dataset_paths.
dataset_list = [
    (pd.read_csv(csv_path, compression='gzip'), label)
    for label, csv_path in dataset_paths.items()
]

# Per dataset: name, number of rows, and label counts from column "4".
for dataset, name in dataset_list:
    print(name, len(dataset), dataset["4"].value_counts(), sep='\n')

cat_1
1022
0    790
1    232
Name: 4, dtype: int64
cat_1_sd
1116
0    790
1    326
Name: 4, dtype: int64
cat_1_2
1721
1    931
0    790
Name: 4, dtype: int64
cat_1_2_sd
1815
1    1025
0     790
Name: 4, dtype: int64
cat_1_2_3
1851
1    1061
0     790
Name: 4, dtype: int64
complete
1945
1    1155
0     790
Name: 4, dtype: int64


In [15]:
# Rows whose column '1' matches one of these identifiers are dropped from every dataset.
values_to_remove = ['ENSG00000142599', 'ENSG00000135636', 'ENSG00000285508']
dataset_paths = {
        'cat_1': 'gene_lists/cat_1.csv.gz',
        'cat_1_sd': 'gene_lists/cat_1_sd.csv.gz',
        'cat_1_2': 'gene_lists/cat_1_2.csv.gz',
        'cat_1_2_sd': 'gene_lists/cat_1_2_sd.csv.gz',
        'cat_1_2_3': 'gene_lists/cat_1_2_3.csv.gz',
        'complete': 'gene_lists/complete.csv.gz'
}

# Build (filtered DataFrame, name) pairs. Each file is read once and then
# filtered, rather than parsing the same gzip CSV twice to build the mask.
dataset_list = []
for name, path in dataset_paths.items():
    raw = pd.read_csv(path, compression='gzip')
    dataset_list.append((raw[~raw['1'].isin(values_to_remove)], name))

# Per dataset: name, number of rows after filtering, and label counts from column "4".
for dataset,name in dataset_list:
    print(name)
    print(len(dataset))
    print(dataset["4"].value_counts())
display(dataset_list[0][0].head())

cat_1
1019
0    788
1    231
Name: 4, dtype: int64
cat_1_sd
1113
0    788
1    325
Name: 4, dtype: int64
cat_1_2
1718
1    930
0    788
Name: 4, dtype: int64
cat_1_2_sd
1812
1    1024
0     788
Name: 4, dtype: int64
cat_1_2_3
1848
1    1060
0     788
Name: 4, dtype: int64
complete
1942
1    1154
0     788
Name: 4, dtype: int64


Unnamed: 0,0,1,2,3,4
0,RSPO4,ENSG00000101282,ENST00000217260,-0.0378995 0.04202614 0.11390661 -0.15509647 -...,0
1,BMP2,ENSG00000125845,ENST00000378827,-0.013096557 0.097702235 0.0488684 -0.11864715...,0
2,PDYN,ENSG00000101327,ENST00000217305,-0.0493422 0.09433348 0.0359074 -0.112070404 -...,0
3,IRS2,ENSG00000185950,ENST00000375856,0.0009915202 0.086411364 0.04103969 -0.1103346...,0
4,JAG1,ENSG00000101384,ENST00000254958,-0.04605531 0.069122575 0.06510323 -0.14371262...,0


In [16]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import os

# Identifiers to exclude: update_dataset drops rows whose column '1' matches one of them.
values_to_remove = [
    'ENSG00000142599',
    'ENSG00000135636',
    'ENSG00000285508',
]

# Dataset name -> gzip-compressed CSV under gene_lists/ (insertion order is the load order).
dataset_paths = {
    label: f'gene_lists/{label}.csv.gz'
    for label in ('cat_1', 'cat_1_sd', 'cat_1_2', 'cat_1_2_sd', 'cat_1_2_3', 'complete')
}

# Shared stratified 5-fold splitter; shuffling with a fixed seed makes the
# fold assignment reproducible from run to run.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# Function to read dataset, remove specified values, and update 'cat_1' dataset
def update_dataset(dataset_path):
    """Load one gzip CSV, drop excluded rows and, for ``cat_1`` only, hold out one fold.

    Parameters
    ----------
    dataset_path : str
        Path to a gzip-compressed CSV. The dataset name is the file's base name
        up to its first ``.`` (``gene_lists/cat_1.csv.gz`` -> ``cat_1``).

    Returns
    -------
    tuple of (pandas.DataFrame, str)
        The cleaned frame and the dataset name. Rows whose column ``'1'`` is in
        the module-level ``values_to_remove`` are always dropped. For ``cat_1``
        the first test fold produced by the module-level ``skf`` (stratified on
        column ``'4'``) is additionally written to ``IVS_cat_1.csv`` and removed
        from the returned frame.
    """
    dataset_name = os.path.basename(dataset_path).split('.')[0]
    df = pd.read_csv(dataset_path, compression='gzip')
    df_cleaned = df[~df['1'].isin(values_to_remove)]

    if dataset_name != 'cat_1':
        return df_cleaned, dataset_name

    # Only the first split is used, so take it directly instead of walking all
    # folds with a manual counter.
    _, ivs_index = next(skf.split(df_cleaned, df_cleaned['4']))
    df_ivs = df_cleaned.iloc[ivs_index]
    # "IVS" presumably stands for an independent validation set - TODO confirm.
    df_ivs.to_csv(f'IVS_{dataset_name}.csv', index=False)

    # NOTE(review): the held-out rows are removed from 'cat_1' only. If the
    # other datasets contain the same rows, they remain there - confirm that
    # this is intended before scoring models trained on those datasets
    # against the IVS file.
    return df_cleaned.drop(df_ivs.index), dataset_name

# Rebuild dataset_list as (DataFrame, name) pairs, one per configured file;
# update_dataset derives the name from the path itself.
dataset_list = list(map(update_dataset, dataset_paths.values()))
