In [54]:
import pandas as pd
import numpy as np

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer,matthews_corrcoef,recall_score
from sklearn.model_selection import StratifiedKFold


In [55]:
def cross_val(model, X, Y):
    """Run 5-fold cross-validation and average every reported quantity.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``cross_validate``.
    X, Y :
        Feature matrix and binary target passed through unchanged.

    Returns
    -------
    dict
        One entry per key of the ``cross_validate`` result, holding the mean
        of that key's per-fold array.
    """
    # Metric name -> scorer. "Sensitivity" is recall of the default positive
    # label and "Specificity" is recall with label 0 treated as positive.
    metric_table = {
        'AUC': 'roc_auc',
        'Accuracy': "accuracy",
        "f1": "f1",
        "Recall": "recall",
        "Precision": "precision",
        "MCC": make_scorer(matthews_corrcoef),
        "Average Precision": "average_precision",
        "Sensitivity": make_scorer(recall_score),
        "Specificity": make_scorer(recall_score, pos_label=0),
    }

    fold_scores = cross_validate(model, X, Y, scoring=metric_table, cv=5)

    # Collapse each per-fold array to a single mean value.
    averaged = {}
    for key, per_fold in fold_scores.items():
        averaged[key] = per_fold.mean()
    return averaged



def model_hyperparameter_tuning(model, param_grid):
    """Nested cross-validation of a grid-searched model on every dataset.

    For each ``(dataset, name)`` pair in the notebook-level ``dataset_list`` a
    ``GridSearchCV`` (5 folds, scored by Matthews correlation) is wrapped
    around ``model`` and evaluated with ``cross_val``. The averaged scores,
    indexed by dataset name, are appended to ``model_results.csv``.

    Parameters
    ----------
    model : estimator
        Base estimator to tune.
    param_grid : dict or list of dict
        Parameter grid forwarded to ``GridSearchCV``.

    Returns
    -------
    None
    """
    MCC = make_scorer(matthews_corrcoef)
    results_list = []

    for dataset, name in dataset_list:
        # Column "3" holds whitespace-separated numbers; expand to a float matrix.
        x = dataset["3"].str.split(expand=True).astype(float)
        y = dataset["4"].astype('category')

        grid = GridSearchCV(model, param_grid, cv=5, scoring=MCC, verbose=1)

        # cross_validate fits its own clones of `grid` on each outer training
        # split, so fitting `grid` on the full data beforehand was wasted work
        # whose result was never used.
        results = cross_val(grid, x, y)
        results_list.append({'dataset_name': name, **results})

    results_df = pd.DataFrame(results_list).set_index('dataset_name')

    # Append mode: every call adds another block (with its own header row).
    with open('model_results.csv', 'a') as f:
        results_df.to_csv(f)



In [121]:
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, recall_score, precision_score, average_precision_score

# Identifiers (matched against column "1") whose rows are dropped from every dataset.
values_to_remove = ['ENSG00000142599', 'ENSG00000135636', 'ENSG00000285508']

# Dataset name -> gzip-compressed CSV path.
dataset_paths = {
        'cat_1': 'gene_lists/cat_1.csv.gz',
        'cat_1_sd': 'gene_lists/cat_1_sd.csv.gz',
        'cat_1_2': 'gene_lists/cat_1_2.csv.gz',
        'cat_1_2_sd': 'gene_lists/cat_1_2_sd.csv.gz',
        'cat_1_2_3': 'gene_lists/cat_1_2_3.csv.gz',
        'complete': 'gene_lists/complete.csv.gz'
    }

# List of (filtered_frame, name) pairs. Each file is read once; the callable
# passed to .loc builds the row mask from that same frame instead of parsing
# the CSV a second time.
dataset_list = [
    (
        pd.read_csv(path, compression='gzip')
        .loc[lambda frame: ~frame['1'].isin(values_to_remove)],
        name,
    )
    for name, path in dataset_paths.items()
]

def _features_and_labels(frame):
    """Parse a raw dataset frame into a float feature matrix and category labels."""
    # Column "3" holds whitespace-separated numbers; column "4" holds the class label.
    features = frame["3"].str.split(expand=True).astype(float)
    labels = frame["4"].astype('category')
    return features, labels


def _positive_class_scores(estimator, X):
    """Return continuous scores for the greater class label (``classes_[1]``)."""
    # Ranking metrics need probabilities or decision values, not hard labels.
    if hasattr(estimator, "predict_proba"):
        return estimator.predict_proba(X)[:, 1]
    return estimator.decision_function(X)


def model_evaluation(model, param_grid, model_name="Logistic Regression", output_path='test.csv'):
    """Evaluate a grid-searched model on fixed held-out folds of the first dataset.

    The first entry of the notebook-level ``dataset_list`` is split into 5
    stratified folds. For every dataset and every fold, rows whose column
    ``"0"`` value appears in the fold's test set are removed from the dataset,
    a ``GridSearchCV`` (5 folds, scored by Matthews correlation) is fitted on
    the remainder, and the refitted best estimator is scored on the fold's
    test rows. Per-dataset mean scores are written to ``output_path``.

    Parameters
    ----------
    model : estimator
        Base estimator to tune.
    param_grid : dict or list of dict
        Parameter grid forwarded to ``GridSearchCV``.
    model_name : str, optional
        Label stored in the ``model`` column of the output.
    output_path : str, optional
        Destination CSV file.

    Returns
    -------
    None

    Notes
    -----
    AUC and Average Precision are computed from ``predict_proba`` (or
    ``decision_function`` when probabilities are unavailable) rather than from
    hard class predictions.
    """
    sensitivity_scorer = make_scorer(recall_score)
    specificity_scorer = make_scorer(recall_score, pos_label=0)
    MCC = make_scorer(matthews_corrcoef)

    # Metric name -> (input kind, callable). The dict order fixes the column
    # order of the output file.
    #   'score'  : metric(y_true, continuous scores)
    #   'label'  : metric(y_true, predicted labels)
    #   'scorer' : scorer(estimator, X, y)
    scoring = {
        'AUC': ('score', roc_auc_score),
        'Accuracy': ('label', accuracy_score),
        "f1": ('label', f1_score),
        "Recall": ('label', recall_score),
        "Precision": ('label', precision_score),
        "Average Precision": ('score', average_precision_score),
        "Sensitivity": ('scorer', sensitivity_scorer),
        "Specificity": ('scorer', specificity_scorer),
        "MCC": ('scorer', MCC),
    }

    # Create the folds----------------------------------------------------------------------------
    # Use the already-filtered frame for both the split and the row lookup.
    # The fold indices are positional, so applying them to a differently
    # filtered copy of the file would select the wrong rows.
    cat_1 = dataset_list[0][0]
    skf = StratifiedKFold(n_splits=5)

    # The test side of each fold does not depend on the dataset being
    # evaluated, so build it once.
    fold_tests = []
    for _, test_index in skf.split(cat_1["3"], cat_1["4"]):
        test_filter = cat_1["0"].iloc[test_index]
        test_dataset = cat_1[cat_1['0'].isin(test_filter)]
        test_data_x, test_data_y = _features_and_labels(test_dataset)
        fold_tests.append((test_filter, test_data_x, test_data_y))

    #Iterate through the datasets----------------------------------------------------------------
    dataset_results = []
    for dataset, name in dataset_list:
        results_list = []

        for test_filter, test_data_x, test_data_y in fold_tests:
            # Keep every held-out identifier out of the training data.
            dataset_ed = dataset[~dataset['0'].isin(test_filter)]
            X_data, y_data = _features_and_labels(dataset_ed)

            # Tune on the training portion; refit=True retrains the best
            # candidate on all of it.
            grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5,
                                scoring=MCC, verbose=1, refit=True)
            best_model = grid.fit(X_data, y_data).best_estimator_

            y_pred = best_model.predict(test_data_x)
            y_score = _positive_class_scores(best_model, test_data_x)

            scores = {}
            for metric, (kind, scorer) in scoring.items():
                if kind == 'scorer':
                    scores[metric] = scorer(best_model, test_data_x, test_data_y)
                elif kind == 'score':
                    scores[metric] = scorer(test_data_y, y_score)
                else:
                    scores[metric] = scorer(test_data_y, y_pred)

            results_list.append(scores)

        mean_scores = {metric: np.mean([result[metric] for result in results_list]) for metric in scoring}
        mean_scores['model'] = model_name
        mean_scores['dataset_name'] = name

        dataset_results.append(mean_scores)

    results_df = pd.DataFrame(dataset_results).set_index('dataset_name')
    results_df.to_csv(output_path)

# Class-balanced logistic regression evaluated with a single-candidate grid (C = 10).
model = LogisticRegression(
    class_weight="balanced",
    max_iter=1000,
    n_jobs=10,
)
param_grid = dict(C=[10])

model_evaluation(model, param_grid)


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits

In [119]:
# Identifiers (matched against column "1") whose rows are dropped from every dataset.
values_to_remove = ['ENSG00000142599', 'ENSG00000135636', 'ENSG00000285508']

# Dataset name -> gzip-compressed CSV path.
dataset_paths = {
        'cat_1': 'gene_lists/cat_1.csv.gz',
        'cat_1_sd': 'gene_lists/cat_1_sd.csv.gz',
        'cat_1_2': 'gene_lists/cat_1_2.csv.gz',
        'cat_1_2_sd': 'gene_lists/cat_1_2_sd.csv.gz',
        'cat_1_2_3': 'gene_lists/cat_1_2_3.csv.gz',
        'complete': 'gene_lists/complete.csv.gz'
    }

# List of (filtered_frame, name) pairs. Each file is read once; the callable
# passed to .loc builds the row mask from that same frame instead of parsing
# the CSV a second time.
dataset_list = [
    (
        pd.read_csv(path, compression='gzip')
        .loc[lambda frame: ~frame['1'].isin(values_to_remove)],
        name,
    )
    for name, path in dataset_paths.items()
]
print(dataset_list[0][1])

cat_1


In [58]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified splitter; the next cell reuses `skf`.
skf = StratifiedKFold(n_splits=5)

# One (train_frame, test_frame) pair of cat_1 row slices per fold.
# NOTE(review): `cat_1` is not assigned at notebook level in any visible cell;
# presumably it is the first dataset frame. Confirm where it comes from.
splits = [
    (cat_1.iloc[fit_rows], cat_1.iloc[held_out_rows])
    for fit_rows, held_out_rows in skf.split(cat_1["3"], cat_1["4"])
]

# Label counts in the third fold's test frame, then in the whole frame.
print(splits[2][1]["4"].value_counts())
print(cat_1["4"].value_counts())

0    158
1     46
Name: 4, dtype: int64
0    788
1    231
Name: 4, dtype: int64


In [59]:
# Print the label counts of the training portion of every stratified fold.
# NOTE(review): relies on `skf` and `cat_1` from earlier cells; `cat_1` is not
# assigned at notebook level in any visible cell. Confirm where it is defined.
for fold_idx, (train_idx, test_idx) in enumerate(skf.split(cat_1["3"], cat_1["4"])):
    train_set = cat_1.iloc[train_idx]
    test_set = cat_1.iloc[test_idx]  # built but not used below
    
    # Folds are reported 1-based.
    print(f"Fold {fold_idx+1} - Training Set:")
    print(train_set["4"].value_counts())


Fold 1 - Training Set:
0    630
1    185
Name: 4, dtype: int64
Fold 2 - Training Set:
0    630
1    185
Name: 4, dtype: int64
Fold 3 - Training Set:
0    630
1    185
Name: 4, dtype: int64
Fold 4 - Training Set:
0    631
1    184
Name: 4, dtype: int64
Fold 5 - Training Set:
0    631
1    185
Name: 4, dtype: int64
