In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer, KNNImputer


# Notebook-wide pandas display limits: columns, rows, cell width and float precision.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.precision', 3)

In [None]:
# Load the embedding table; the path is relative to the notebook's working directory.
df_embed = pd.read_csv('embeddings.csv')
# Preview the first rows as the cell output.
df_embed.head()

In [None]:
# Column-wise mean of every column except the first.
# NOTE(review): presumably the first column is the site_id key used for the merge — confirm.
mean_vals = df_embed.iloc[:, 1:].mean()  
mean_vals.head()

In [None]:
# Load the metadata table; the path is relative to the notebook's working directory.
df_meta = pd.read_csv('metadata.csv')
# Preview the first rows as the cell output.
df_meta.head()

In [None]:
# Join embeddings and metadata on site_id. pd.merge defaults to an inner join,
# so only site_ids present in both tables are kept.
merged_df = pd.merge(df_embed, df_meta, on='site_id')
# Display the merged frame as the cell output.
merged_df

In [None]:
# Number of distinct values in each of the three categorical descriptor columns.
unique_cell_types, unique_sirnas, unique_well_types = (
    merged_df[column].nunique() for column in ('cell_type', 'sirna', 'well_type')
)

# Descriptive statistics, transposed so each described column becomes a row.
feature_summary = merged_df.describe().T

# Count of missing entries per column.
missing_values = merged_df.isna().sum()

# Show all summaries together as the cell output.
(unique_cell_types, unique_sirnas, unique_well_types, feature_summary.head(), missing_values.head())

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import StandardScaler
import random
from sklearn.model_selection import train_test_split, KFold
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.common.utils.utils import setup_outputdir
from autogluon.core.utils.loaders import load_pkl
from autogluon.core.utils.savers import save_pkl
import os.path
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import r2_score, mean_squared_error,mean_absolute_error
import matplotlib.pyplot as plt

In [None]:
# Standardise every column whose name starts with 'feature_'.
feature_cols = [name for name in merged_df.columns if name.startswith('feature_')]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(merged_df[feature_cols])

# Project the standardised features onto the first two principal components.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X_scaled)
principalDf = pd.DataFrame(
    principalComponents,
    columns=['principal component 1', 'principal component 2'],
)

# Cluster in the 2-D PCA space; the fit runs before the 'cluster' column is added,
# so only the two component columns are clustered.
kmeans = KMeans(n_clusters=5, random_state=42)
principalDf['cluster'] = kmeans.fit(principalDf).labels_

# Scatter plot of the two components, coloured by cluster label.
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=principalDf,
    x='principal component 1',
    y='principal component 2',
    hue='cluster',
    palette=sns.color_palette("hsv", 5),
    legend="full",
    alpha=0.3,
)
plt.title('PCA Clustering Results')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.cluster import KMeans
import pandas as pd

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder

# Keep only the embedding features plus the siRNA label.
all_data = merged_df[feature_cols + ['sirna']]

# Fit the label encoder on the full label column before splitting. The encoder is a
# pure label -> integer mapping, so this leaks nothing, and it guarantees that a
# label appearing only in the test split cannot make transform() raise.
encoder = LabelEncoder()
encoder.fit(all_data['sirna'])

train_df, test_df = train_test_split(all_data, test_size=0.3, random_state=42)

# assign() returns new frames, so nothing is written into a slice of merged_df
# (avoids SettingWithCopyWarning).
train_df = train_df.assign(sirna_encoded=encoder.transform(train_df['sirna']))
test_df = test_df.assign(sirna_encoded=encoder.transform(test_df['sirna']))

X_train = train_df[feature_cols]
y_train = train_df['sirna_encoded']
X_test = test_df[feature_cols]
y_test = test_df['sirna_encoded']

# Standardise using statistics from the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder

# Keep only the embedding features plus the siRNA label.
all_data = merged_df[feature_cols + ['sirna']]

# Fit the label encoder on the full label column before splitting. The encoder is a
# pure label -> integer mapping, so this leaks nothing, and it guarantees that a
# label appearing only in the test split cannot make transform() raise.
encoder = LabelEncoder()
encoder.fit(all_data['sirna'])

train_df, test_df = train_test_split(all_data, test_size=0.3, random_state=42)

# assign() returns new frames, so nothing is written into a slice of merged_df
# (avoids SettingWithCopyWarning).
train_df = train_df.assign(sirna_encoded=encoder.transform(train_df['sirna']))
test_df = test_df.assign(sirna_encoded=encoder.transform(test_df['sirna']))

X_train = train_df[feature_cols]
y_train = train_df['sirna_encoded']
X_test = test_df[feature_cols]
y_test = test_df['sirna_encoded']


In [None]:
# Shuffled 4-fold split of the training data, stored as one array per fold.
num_splits = 4
kf = KFold(n_splits=num_splits, shuffle=True, random_state=2997)

x_train_arr, y_train_arr = X_train.values, y_train.values
xkf_train, xkf_val, ykf_train, ykf_val = [], [], [], []

for train_index, test_index in kf.split(x_train_arr):
    Xtrainfold = x_train_arr[train_index]
    Xvalfold = x_train_arr[test_index]
    # squeeze drops any length-1 axes from the label arrays
    Ytrainfold = np.squeeze(y_train_arr[train_index])
    Yvalfold = np.squeeze(y_train_arr[test_index])
    xkf_train.append(Xtrainfold)
    xkf_val.append(Xvalfold)
    ykf_train.append(Ytrainfold)
    ykf_val.append(Yvalfold)

In [None]:
from sklearn.metrics import roc_auc_score

def printScores(y_val, class_pred, proba=None, average='macro'):
    """Print the confusion matrix plus F1, precision, recall and accuracy.

    Parameters
    ----------
    y_val : true labels.
    class_pred : predicted labels.
    proba : accepted for call compatibility but not used; ROC AUC is not
        computed here. It defaults to None so that the two-argument call
        ``printScores(y, pred)`` no longer raises TypeError.
    average : averaging mode passed to f1_score, precision_score and recall_score.
    """
    f1 = f1_score(y_val, class_pred, average=average)
    precision = precision_score(y_val, class_pred, average=average)
    recall = recall_score(y_val, class_pred, average=average)
    accuracy = accuracy_score(y_val, class_pred)
    print(confusion_matrix(y_val, class_pred))
    print("F1: " + str(f1))
    print("Precision: " + str(precision))
    print("Recall: " + str(recall))
    print("Accuracy: " + str(accuracy))

def returnscores_xv(Yval, predictions, average='macro'):
    """Return ``[accuracy, recall, precision, f1]`` as a float array of length 4.

    ``average`` is passed to recall, precision and F1. All three use
    ``zero_division=0`` so that an undefined score is reported as 0 consistently
    (previously F1 alone was left on the warning default).
    """
    score = np.zeros(4)
    score[0] = accuracy_score(Yval, predictions)
    score[1] = recall_score(Yval, predictions, average=average, zero_division=0)
    score[2] = precision_score(Yval, predictions, average=average, zero_division=0)
    score[3] = f1_score(Yval, predictions, average=average, zero_division=0)
    return score

def printscores_xv(scores):
    """Print positions 0-3 of ``scores`` (accuracy, recall, precision, F1) as percentages."""
    prefixes = ("Accuracy: ", "Recall: ", "Precision: ", "F1: ")
    for position, prefix in enumerate(prefixes):
        print(prefix + str(100 * scores[position]) + "%")

In [None]:
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from tqdm import tqdm
import numpy as np


# NumPy views of the train/test frames, used for positional fold indexing.
x_train_arr, y_train_arr = X_train.values, y_train.values
x_test_arr, y_test_arr = X_test.values, y_test.values

# Shuffled 4-fold cross-validation with a fixed seed.
num_splits = 4
kf = KFold(n_splits=num_splits, shuffle=True, random_state=2997)

# One entry per fold: training features/labels and validation features/labels.
xkf_train, xkf_val, ykf_train, ykf_val = [], [], [], []

for train_index, test_index in kf.split(x_train_arr):
    Xtrainfold = x_train_arr[train_index]
    Xvalfold = x_train_arr[test_index]
    Ytrainfold = y_train_arr[train_index]
    Yvalfold = y_train_arr[test_index]
    xkf_train.append(Xtrainfold)
    xkf_val.append(Xvalfold)
    ykf_train.append(Ytrainfold)
    ykf_val.append(Yvalfold)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from tqdm import tqdm
import numpy as np

# One row per fold; columns are accuracy, recall, precision, F1 (order set by returnscores_xv).
scores_lr = np.zeros((num_splits, 4))
for split, (train_index, test_index) in tqdm(enumerate(kf.split(X_train)), total=num_splits, desc='Logistic Regression Progress'):
    Xtrainfold, Xvalfold = X_train.iloc[train_index], X_train.iloc[test_index]
    Ytrainfold, Yvalfold = y_train.iloc[train_index], y_train.iloc[test_index]

    # Fit the scaler inside the fold so validation rows never influence the statistics.
    fold_scaler = StandardScaler()
    Xtrainfold_scaled = fold_scaler.fit_transform(Xtrainfold)
    Xvalfold_scaled = fold_scaler.transform(Xvalfold)

    fold_model = LogisticRegression(random_state=42, max_iter=500)
    fold_model.fit(Xtrainfold_scaled, Ytrainfold)
    predictions = fold_model.predict(Xvalfold_scaled)
    scores_lr[split, :] = returnscores_xv(Yvalfold, predictions)

average_scores_lr = np.mean(scores_lr, axis=0)
print("Average Scores for Logistic Regression:")
printscores_xv(average_scores_lr)

# Test Set Evaluation
# Refit the scaler and the model on the whole training set. Previously the test set
# was scored with whatever scaler/model the last CV fold left behind, i.e. a model
# trained on only part of the training data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
log_reg = LogisticRegression(random_state=42, max_iter=500)
log_reg.fit(X_train_scaled, y_train)
test_predictions = log_reg.predict(X_test_scaled)
print("Test Set Scores for Logistic Regression:")
# printScores takes a proba argument it does not use; pass None explicitly.
printScores(y_test, test_predictions, None)


In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from tqdm import tqdm
import numpy as np

In [None]:


# Logistic Regression Classifier
# Cross-validate on the pre-materialised folds (xkf_*/ykf_*).
n_scores = 4  # returnscores_xv yields accuracy, recall, precision, F1
scores_lr = np.zeros((num_splits, n_scores))
for split in tqdm(range(num_splits), desc='Logistic Regression Progress'):
    log_reg = LogisticRegression(random_state=42, max_iter=500)
    log_reg.fit(xkf_train[split], ykf_train[split])
    scores_lr[split, :] = returnscores_xv(ykf_val[split], log_reg.predict(xkf_val[split]))
average_scores_lr = np.mean(scores_lr, axis=0)
print("Average Scores for Logistic Regression:")
printscores_xv(average_scores_lr)

print("Test Set Scores for Logistic Regression:")
# The original referenced an undefined `DNN` model here. The header says logistic
# regression, so refit that model on the full training arrays for the test report.
log_reg = LogisticRegression(random_state=42, max_iter=500)
log_reg.fit(x_train_arr, y_train_arr)
# Keep the full probability matrix: the target is multiclass, so slicing column 1
# (a binary-classification idiom) would discard every other class.
proba = log_reg.predict_proba(x_test_arr)
printScores(y_test_arr, log_reg.predict(x_test_arr), proba)

In [None]:
# %pip installs into the environment of the running kernel; `!pip` uses whichever
# pip is first on the shell PATH, which may be a different environment.
%pip install optuna
%pip install pyDOE2
%pip install tensorflow

In [None]:
import optuna
import time
import pyDOE2
from tqdm import tqdm
import tensorflow as tf

In [None]:
'''
Outermost function call for hyperparameter tuning.
Takes in a dataframe of optimization parameters, the datasets, the parameter names, and the algorithm name.
Takes in the reduction parameter for the Generalized Subset Design used for grid search (reduction).
Takes in the number of retrainings for each hyperparameter config (n_trials), the number of cross-validation splits (n_splits),
the number of iterations to run BO (n_bo), and the number of instantiations of the final hyperparameter configuration to test (n_inst).
First, we set up a wrapper function for the call to fit_bo that we can use with Optuna.
Next, we set up the pruner and the study, then perform the initial grid sampling.
We then reset the sampler for our study to the TPESampler and perform BO to find the best hyperparameter configuration.
Finally, we call prepareDF to print a report, and search for an optimal instantiation of a model to return.
'''

def hyperparam_search(df, function, x_train, x_test, y_train, y_test, indexnames, name, reduction=1, n_trials = 10, n_splits = 5, n_bo=100, n_inst=100, sklearn=True):
    """Tune hyperparameters with grid-seeded Bayesian optimisation and return a fitted model.

    Steps: seed an Optuna study with grid configurations (init_gridsample), continue
    with a TPE sampler for ``n_bo`` trials, print the best configuration (prepareDF),
    choose the best of ``n_inst`` model instantiations (test_instantiations), and
    print its test-set scores (score).

    Parameters
    ----------
    df : search-space table, one row per hyperparameter.
    function : model-fitting callback, passed to fit_bo and test_instantiations.
    x_train, y_train : data used for the search and for the final instantiations.
    x_test, y_test : data used only for the final score report.
    indexnames, name : passed through to prepareDF.
    reduction : passed to init_gridsample.
    n_trials, n_splits : passed to fit_bo.
    sklearn : passed to test_instantiations and score as their ``sklearn_like`` flag.
        This parameter shadows the ``sklearn`` module inside this function.

    Returns
    -------
    The model returned by test_instantiations.
    """
    print(np.shape(x_test))

    def wrapper(trial):
        # Objective in the one-argument form Optuna expects.
        return fit_bo(trial, function, df, x_train, y_train, n_trials, n_splits)

    start_time = time.time()  # NOTE(review): recorded but never read in this function

    percentile_pruner = optuna.pruners.PercentilePruner(
        75.0, n_startup_trials=1, n_warmup_steps=3, interval_steps=1
    )
    study = optuna.create_study(direction="maximize", pruner=percentile_pruner)

    # Phase 1: one trial per grid configuration.
    init_gridsample(study, wrapper, df, reduction=reduction)

    # Phase 2: switch to the Tree-structured Parzen Estimator sampler.
    study.sampler = optuna.samplers.TPESampler()
    study.optimize(wrapper, n_trials=n_bo)

    bestparams = study.best_params
    prepareDF(df, bestparams, indexnames, name)

    model = test_instantiations(function, bestparams, x_train, y_train, n_inst, sklearn)
    # score() prints its report; its return value is not needed here.
    score(model, x_test, y_test, sklearn)
    return model

'''
Perform the initialization of the BO using grid sampling.
First call get_configlist to get the list of grid sampling configurations
Then for each configuration, initialize a single point grid sample search in optuna
Run 1 trial for each grid point
'''
def init_gridsample(study, func, df, reduction=5):
    """Seed ``study`` with one trial per grid configuration.

    Each configuration from get_configlist becomes a single-point GridSampler
    search space, keyed by the parameter names in ``df.index``, and is run for
    exactly one trial.

    Returns the number of configurations that were run.
    """
    grid_points = get_configlist(df, reduction)
    parameter_names = df.index
    for point in grid_points:
        # Single-value search space: every parameter is pinned to this grid point.
        single_point_space = {
            parameter_names[position]: [point[position]]
            for position in range(len(parameter_names))
        }
        study.sampler = optuna.samplers.GridSampler(single_point_space)
        study.optimize(func, n_trials=1)
    return len(grid_points)

'''
Calculate gridsearch configurations. If we have a reduction greater than one, use GSD to reduce.
For each configuration, calculate the corresponding parameter values using get_gridval, then return.
'''
def get_configlist(df, reduction=5):
    """Return the list of grid configurations, one list of parameter values each.

    The number of grid levels per parameter is the number of listed values for
    Categorical parameters and the "Gridres" entry otherwise. With
    ``reduction > 1`` the level combinations come from ``pyDOE2.gsd``; otherwise
    from the full factorial ``pyDOE2.fullfact``. Level indices are then turned
    into parameter values with get_gridval.
    """
    configsize = [
        len(df.loc[parameter, "Values/Min-Max"])
        if df.loc[parameter, "Datatype"] == "Categorical"
        else df.loc[parameter, "Gridres"]
        for parameter in df.index
    ]
    print(configsize)

    if reduction > 1:
        DOEconfigs = pyDOE2.gsd(configsize, reduction=reduction)
    else:
        DOEconfigs = pyDOE2.fullfact(configsize).astype(int)
    print("Number of GridSearch configs: " + str(len(DOEconfigs)))

    # Translate each row of level indices into actual parameter values.
    return [
        [get_gridval(df, df.index[column], levels[column]) for column in range(len(DOEconfigs[0]))]
        for levels in DOEconfigs
    ]


'''
Calculate the value of a particular grid sampling point
Categorical, continuous, and discrete variables are handled individually
Log scale our calculations when the variable is log scaled
We assume grid points are evenly spaced between limits
'''
def get_gridval(df, parameter, index):
    """Return the value of grid level ``index`` for ``parameter``.

    Categorical parameters return the ``index``-th listed value. For the others,
    the level sits at the centre of the ``index``-th of "Gridres" equal-width bins
    between the minimum and maximum, interpolated in log space when "Logscaling"
    equals True. Integer parameters are rounded. Any other datatype yields None.
    """
    datatype = df.loc[parameter, "Datatype"]
    limits = df.loc[parameter, "Values/Min-Max"]
    if datatype == "Categorical":
        return limits[index]

    low, high = limits[0], limits[1]
    gridres = df.loc[parameter, "Gridres"]

    # Fractional position of the bin centre within [low, high].
    fraction = index/gridres+1/gridres/2

    # `== True` is kept on purpose so only a value equal to True enables log scaling.
    if df.loc[parameter, "Logscaling"] == True:
        value = np.exp((1-fraction)*np.log(low)+fraction*np.log(high))
    else:
        value = (1-fraction)*low+fraction*high

    if datatype == "Integer":
        return round(value)
    return value if datatype == "Continuous" else None



'''
Sample a particular parameter from the optuna trial.
Automates the sampling call based on the information about the parameter contained in df.

This function doubles as a simple dictionary lookup if trial is a dictionary.
This allows the reuse of fit_XXX functions when selecting an instantiation for a given config.
'''
def sample(trial, df, parameter):
    """Return the value of ``parameter`` from an Optuna trial or a plain dict.

    Parameters
    ----------
    trial : an Optuna trial, or a dict of already-chosen values. Any dict
        subclass is treated as a lookup table; ``df`` is not consulted then.
    df : search-space table; row ``parameter`` supplies "Datatype",
        "Values/Min-Max" and "Logscaling".
    parameter : name of the hyperparameter.

    Returns
    -------
    The looked-up or suggested value (None if a dict lacks the key).

    Raises
    ------
    ValueError : if the row's "Datatype" is not Categorical, Continuous or Integer.
    """
    if isinstance(trial, dict):
        return trial.get(parameter)

    datatype = df.loc[parameter, "Datatype"]
    bounds = df.loc[parameter, "Values/Min-Max"]
    if datatype == "Categorical":
        return trial.suggest_categorical(parameter, bounds)

    minval, maxval = bounds[0], bounds[1]
    log = bool(df.loc[parameter, "Logscaling"])
    if datatype == "Continuous":
        return trial.suggest_float(parameter, minval, maxval, log=log)
    if datatype == "Integer":
        # step is passed by keyword: Optuna has deprecated passing it positionally.
        return trial.suggest_int(parameter, minval, maxval, step=1, log=log)

    # Fail here rather than silently handing None to a model constructor.
    raise ValueError("Unknown Datatype %r for parameter %r" % (datatype, parameter))


'''
General BO fit loop. For each trial in BO we will create n_trials k-fold splits, where k is n_splits.
This yields a total of n_trials*n_splits fitting runs.
Since we apply the hyperparameters to each model differently, we call the func function, which is specified by the caller.
This func function will be a unique function for each type of model, which will assign the hyperparameters.
The func function will then fit the model and return the model back. For each run, we score on the val set.
We pass intermediate scores in a report to Optuna so it can determine if the trial should be pruned.
If the trial is pruned, the trial prematurely exits.
'''
def fit_bo(trial, func, df, xdata, ydata, n_trials, n_splits):
    """Optuna objective: mean macro-F1 over ``n_trials`` repeats of ``n_splits``-fold CV.

    For every fold, ``func(trial, df, x_train, x_val, y_train, y_val)`` must
    return a fitted model exposing ``predict``. After each fold the running mean
    F1 is reported to the trial, and the trial is abandoned with
    ``optuna.exceptions.TrialPruned`` if the pruner asks for it.

    Returns the summed F1 divided by ``n_splits`` and then by ``n_trials``.
    """
    f1_total = 0
    step = 0
    with tqdm(total=n_splits * n_trials) as progress:
        for _repeat in range(n_trials):
            # NOTE(review): with shuffle=False every repeat iterates over the same
            # folds, so repeats differ only through randomness inside func.
            splitter = KFold(n_splits=n_splits, random_state=None, shuffle=False)
            for fit_rows, val_rows in splitter.split(xdata):
                x_fit, y_fit = xdata[fit_rows], ydata[fit_rows]
                x_val, y_val = xdata[val_rows], ydata[val_rows]

                fitted = func(trial, df, x_fit, x_val, y_fit, y_val)

                # Macro averaging so multiclass targets are supported.
                f1_total += sklearn.metrics.f1_score(y_val, fitted.predict(x_val), average='macro')

                # Report the running mean so the pruner can compare trials step by step.
                trial.report(f1_total / (step + 1), step)
                if trial.should_prune():
                    raise optuna.exceptions.TrialPruned()
                progress.update(1)
                step += 1
        return f1_total / n_splits / n_trials



def plot_res(study):
    """Draw Optuna's matplotlib plots for ``study``.

    In order: parameter importances, intermediate values, optimisation history,
    and per-parameter slices.
    """
    backend = optuna.visualization.matplotlib
    plotters = (
        backend.plot_param_importances,
        backend.plot_intermediate_values,
        backend.plot_optimization_history,
        backend.plot_slice,
    )
    for draw in plotters:
        draw(study)


'''
Select an optimal instantiation of a model.
We have defined a constant random seed to ensure train and val sets across all models are consistent
numtest models are tested, all instantiated using bestparams.
'''
def test_instantiations(func, bestparams, xdata, ydata, numtest=100, sklearn_like=True):
    """Fit ``numtest`` models with ``bestparams`` and return the one with the best validation F1.

    A fixed-seed 80/20 split of ``xdata``/``ydata`` keeps the train/validation
    sets the same across candidates. ``func`` is called with ``bestparams`` in
    place of a trial, ``None`` in place of the search-space table, and a trailing
    ``True`` flag marking instantiation testing.

    F1 averaging is chosen from the data: binary (the previous behaviour) for at
    most two distinct labels, macro otherwise. Binary averaging raises on
    multiclass targets, which is what the previous hard-coded default did.

    Parameters
    ----------
    sklearn_like : if False, ``predict`` is assumed to return scores that are
        rounded to class labels; if True its output is used as class labels.

    Raises
    ------
    ValueError : if ``numtest`` is smaller than 1 (no candidate to return).
    """
    if numtest < 1:
        raise ValueError("numtest must be at least 1")

    x_train, x_val, y_train, y_val = train_test_split(xdata, ydata, test_size=0.2, random_state=1)
    average = 'binary' if len(np.unique(ydata)) <= 2 else 'macro'

    bestmodel = None
    bestf1 = float("-inf")
    for _ in tqdm(range(numtest)):
        model = func(bestparams, None, x_train, x_val, y_train, y_val, True)
        predictions = model.predict(x_val)
        if not sklearn_like:
            predictions = np.rint(predictions)
        valf1 = sklearn.metrics.f1_score(y_val, predictions, average=average)
        # `bestmodel is None` guarantees a model is kept even if every F1 is NaN.
        if bestmodel is None or valf1 > bestf1:
            bestf1 = valf1
            bestmodel = model
    return bestmodel

'''
Score the model in the test set
If sklearn-like is true, the class probabilities are given by predict_proba and predict returns classes
Otherwise, predict returns probabilities and we need to round to get the classes.
We handle these cases separately. We report F1, AUC, Precision, Recall, and Accuracy.
'''
def score(model, xdata, ydata, sklearn_like=True):
    """Print and return F1, AUC, precision, recall and accuracy of ``model``.

    Binary targets (at most two distinct labels in ``ydata``) are scored exactly
    as before: binary-averaged metrics, AUC from the positive-class probability,
    and a confusion matrix over labels ``[0, 1]``. Multiclass targets, which
    previously raised inside the binary-only metric calls, use macro averaging
    and one-vs-rest AUC.

    Parameters
    ----------
    sklearn_like : if True, ``predict`` returns class labels and
        ``predict_proba`` returns probabilities. If False, ``predict`` returns
        probabilities that are converted to labels here.

    Returns
    -------
    (f1, auc, precision, recall, accuracy); ``auc`` is NaN when the multiclass
    AUC cannot be computed.
    """
    class_pred = model.predict(xdata)
    multiclass = len(np.unique(ydata)) > 2

    if sklearn_like:
        proba = model.predict_proba(xdata)
    else:
        proba = class_pred
        if multiclass:
            # NOTE(review): assumes column i holds the probability of encoded label i — confirm.
            class_pred = np.argmax(proba, axis=1)
        else:
            class_pred = np.rint(proba)

    if multiclass:
        average, cm_labels = 'macro', None
        try:
            auc = roc_auc_score(ydata, proba, multi_class='ovr',
                                labels=getattr(model, 'classes_', None))
        except ValueError as err:
            # One-vs-rest AUC can be undefined, e.g. when a class has no test samples.
            print("AUC could not be computed: " + str(err))
            auc = float('nan')
    else:
        average, cm_labels = 'binary', [0, 1]
        auc = roc_auc_score(ydata, proba[:, 1] if sklearn_like else proba)

    precision = precision_score(ydata, class_pred, average=average)
    recall = recall_score(ydata, class_pred, average=average)
    accuracy = accuracy_score(ydata, class_pred)
    f1 = f1_score(ydata, class_pred, average=average)
    print(confusion_matrix(ydata, class_pred, labels=cm_labels))
    print("F1: " + str(f1))
    print("AUC: " + str(auc))
    print("Precision: " + str(precision))
    print("Recall: " + str(recall))
    print("Accuracy: " + str(accuracy))
    return f1, auc, precision, recall, accuracy

'''
Print out a clean(ish) report of hyperparameter configurations
'''
def prepareDF(df, bestparams, indexnames, name):
    """Print the search-space table with an added "Best Value" column.

    Parameters
    ----------
    df : search-space table with exactly four columns, one row per
        hyperparameter; it is copied, never modified.
    bestparams : mapping from parameter name to best value. Values are matched
        to rows by name, so the dict order does not matter, and a parameter
        missing from the mapping (one never sampled in the best trial) is shown
        as None.
    indexnames : display names that replace the row index, in row order.
    name : accepted for call compatibility; not used.

    The two-decimal float format applies only while printing, so the notebook's
    global display settings are left untouched.
    """
    df = df.copy()
    df.columns = ["Datatype", "Values/[Min, Max]", "Log Scaling", "Gridpoint Count"]

    best_values = []
    for parameter in df.index:
        value = bestparams.get(parameter)
        if value is not None and df.at[parameter, "Datatype"] == "Integer":
            value = int(value)
        best_values.append(value)
    # Object dtype keeps integers as integers next to floats and None.
    df["Best Value"] = pd.Series(best_values, index=df.index, dtype=object)

    df.index = indexnames
    df.columns.name = "Hyperparameter"
    with pd.option_context("display.float_format", '{:,.2f}'.format):
        print(df)


In [None]:
# Performing Bayesian hyperparameter optimization logistic regression

def fit_logistic_regression(trial_or_dict, df, X_train, X_val, y_train, y_val, inst=False):
    """Sample logistic-regression hyperparameters, fit on the training fold, return the model.

    Parameters
    ----------
    trial_or_dict : Optuna trial (values are suggested) or a dict of fixed values.
    df : search-space table, passed to ``sample``.
    X_train, y_train : data the model is fitted on.
    X_val, y_val, inst : accepted for the common fit-function signature; not used.

    Raises
    ------
    optuna.exceptions.TrialPruned : when a trial samples a solver/penalty pair
        that scikit-learn does not support. Previously such a pair raised inside
        ``fit`` and aborted the whole study.
    ValueError : for the same situation when a fixed dict is supplied.
    """
    # Sampling order is kept unchanged: it determines the key order of best_params.
    C = sample(trial_or_dict, df, "C")
    solver = sample(trial_or_dict, df, "solver")
    penalty = sample(trial_or_dict, df, "penalty")
    max_iter = sample(trial_or_dict, df, "max_iter")
    tol = sample(trial_or_dict, df, "tol")
    class_weight = sample(trial_or_dict, df, "class_weight")
    l1_ratio = None
    if penalty == "elasticnet":
        l1_ratio = sample(trial_or_dict, df, "l1_ratio")

    # Penalties each solver accepts, per the scikit-learn LogisticRegression docs.
    # Both spellings of "no penalty" are listed.
    # NOTE(review): which spelling is accepted depends on the installed version — confirm.
    supported_penalties = {
        "lbfgs": ("l2", "none", None),
        "newton-cg": ("l2", "none", None),
        "newton-cholesky": ("l2", "none", None),
        "sag": ("l2", "none", None),
        "liblinear": ("l1", "l2"),
        "saga": ("l1", "l2", "elasticnet", "none", None),
    }
    if solver in supported_penalties and penalty not in supported_penalties[solver]:
        message = "solver %r does not support penalty %r" % (solver, penalty)
        if isinstance(trial_or_dict, dict):
            raise ValueError(message)
        raise optuna.exceptions.TrialPruned(message)

    # NOTE(review): multi_class is deprecated in recent scikit-learn releases — confirm
    # against the installed version before upgrading.
    model = LogisticRegression(C=C, solver=solver, penalty=penalty, max_iter=max_iter,
                               tol=tol, class_weight=class_weight, l1_ratio=l1_ratio,
                               multi_class='ovr')  # or 'multinomial'

    model.fit(X_train, y_train)
    return model

# Hyperparameter search space. Each entry is
# [datatype, categorical values or [min, max], log-scaled?, grid resolution].
df = pd.DataFrame({
    "C": ["Continuous", [0.001, 10], True, 5],
    "solver": ["Categorical", ["liblinear", "newton-cg", "lbfgs", "sag", "saga"], False, "N/A"],
    "penalty": ["Categorical", ["l1", "l2", "elasticnet", "none"], False, "N/A"],
    "max_iter": ["Integer", [100, 1000], False, 5],
    "tol": ["Continuous", [1e-4, 1e-2], True, 5],
    "class_weight": ["Categorical", ["balanced", None], False, "N/A"],
    "l1_ratio": ["Continuous", [0, 1], False, 5],  # only sampled when penalty is 'elasticnet'
}).transpose()
df.columns = ["Datatype", "Values/Min-Max", "Logscaling", "Gridres"]
lrdf = df

# Display names for the report, in the same order as the rows of the table.
indexnames = [
    "Inverse of Regularization Strength",
    "Optimization Algorithm",
    "Penalty Norm",
    "Maximum Iterations",
    "Tolerance for Stopping",
    "Class Weight",
    "Elastic-Net Mixing Parameter",
]

# The trailing positional 5 is the `reduction` argument.
model = hyperparam_search(
    lrdf, fit_logistic_regression,
    X_train.values, X_test.values,
    y_train.values, y_test.values,
    indexnames, "LogisticRegression", 5,
)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder

# Keep only the embedding features plus the siRNA label.
all_data = merged_df[feature_cols + ['sirna']]

# Fit the label encoder on the full label column before splitting. The encoder is a
# pure label -> integer mapping, so this leaks nothing, and it guarantees that a
# label appearing only in the test split cannot make transform() raise.
encoder = LabelEncoder()
encoder.fit(all_data['sirna'])

train_df, test_df = train_test_split(all_data, test_size=0.3, random_state=42)

# assign() returns new frames, so nothing is written into a slice of merged_df
# (avoids SettingWithCopyWarning).
train_df = train_df.assign(sirna_encoded=encoder.transform(train_df['sirna']))
test_df = test_df.assign(sirna_encoded=encoder.transform(test_df['sirna']))

X_train = train_df[feature_cols]
y_train = train_df['sirna_encoded']
X_test = test_df[feature_cols]
y_test = test_df['sirna_encoded']


In [None]:
# %pip targets the running kernel's environment (`!pip` may hit a different one).
%pip install autogluon

In [None]:
# Print the CMake version available on the shell PATH.
# NOTE(review): presumably a build prerequisite check for the installs around it — confirm.
!cmake --version

In [None]:
# %pip targets the running kernel's environment (`!pip` may hit a different one).
%pip install lightgbm

In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import StandardScaler
import random
from sklearn.model_selection import train_test_split, KFold
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.common.utils.utils import setup_outputdir
from autogluon.core.utils.loaders import load_pkl
from autogluon.core.utils.savers import save_pkl
import os.path
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import r2_score, mean_squared_error,mean_absolute_error
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder

# Standardised copy of every feature_* column over the whole dataset.
scaler = StandardScaler()
feature_cols = [col for col in merged_df.columns if col.startswith('feature_')]
X_scaled = scaler.fit_transform(merged_df[feature_cols])

# Keep only the embedding features plus the siRNA label.
all_data = merged_df[feature_cols + ['sirna']]

# Fit the label encoder on the full label column before splitting. The encoder is a
# pure label -> integer mapping, so this leaks nothing, and it guarantees that a
# label appearing only in the test split cannot make transform() raise.
encoder = LabelEncoder()
encoder.fit(all_data['sirna'])

train_df, test_df = train_test_split(all_data, test_size=0.3, random_state=42)

# Add the encoded label and drop the raw one, leaving feature columns plus
# 'sirna_encoded'. assign()/drop() return new frames, so nothing is written into
# a slice of merged_df (avoids SettingWithCopyWarning).
train_df = train_df.assign(sirna_encoded=encoder.transform(train_df['sirna'])).drop(columns=['sirna'])
test_df = test_df.assign(sirna_encoded=encoder.transform(test_df['sirna'])).drop(columns=['sirna'])

X_train = train_df[feature_cols]
y_train = train_df['sirna_encoded']
X_test = test_df[feature_cols]
y_test = test_df['sirna_encoded']

In [None]:
from autogluon.tabular import TabularPredictor

# Existing hyperparameters
# Model families handed to AutoGluon. A dict value is a single configuration; a list
# value requests one model per entry, and 'ag_args' -> 'name_suffix' sets the suffix
# used in that model's name.
hyperparameters = {
    'NN_TORCH': {},
    'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
    'CAT': {},
    'XGB': {},
    'FASTAI': {},
    'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini'}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr'}}],
    'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini'}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr'}}],
    'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
    # Add Logistic Regression
    'LR': {}  # Using default hyperparameters for Logistic Regression
}

# Fit the predictor
# Multiclass problem with 'sirna_encoded' as the label column; every other column
# of train_df is available to the predictor as a feature.
predictor = TabularPredictor(label='sirna_encoded', problem_type='multiclass').fit(
    train_data=train_df,
    hyperparameters=hyperparameters
)


In [None]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score

def printScores(y_val, class_pred):
    """Print the confusion matrix and macro-averaged F1, precision and recall, plus accuracy."""
    f1 = f1_score(y_val, class_pred, average='macro')
    precision = precision_score(y_val, class_pred, average='macro')
    recall = recall_score(y_val, class_pred, average='macro')
    accuracy = accuracy_score(y_val, class_pred)

    print(confusion_matrix(y_val, class_pred))
    report = (
        ("F1: ", f1),
        ("Precision: ", precision),
        ("Recall: ", recall),
        ("Accuracy: ", accuracy),
    )
    for prefix, value in report:
        print(prefix + str(value))
    
def returnscores_xv(Yval, predictions):
    """Return ``[accuracy, macro recall, macro precision, macro F1]`` as a float array.

    All three averaged metrics use ``zero_division=0`` so that an undefined score
    is reported as 0 consistently (previously F1 alone was left on the warning default).
    """
    score = np.zeros(4)
    score[0] = accuracy_score(Yval, predictions)
    score[1] = recall_score(Yval, predictions, average='macro', zero_division=0)
    score[2] = precision_score(Yval, predictions, average='macro', zero_division=0)
    score[3] = f1_score(Yval, predictions, average='macro', zero_division=0)
    return score

def printscores_xv(scores):
    """Print positions 0-3 of ``scores`` (accuracy, recall, precision, F1) as percentages."""
    prefixes = ("Accuracy: ", "Recall: ", "Precision: ", "F1: ")
    for position, prefix in enumerate(prefixes):
        print(prefix + str(100 * scores[position]) + "%")

# Class predictions from the fitted AutoGluon predictor on the held-out features.
predictions = predictor.predict(X_test)

# Class probabilities for the same rows.
# NOTE(review): proba is not passed to any helper defined in this cell.
proba = predictor.predict_proba(X_test)

# Confusion matrix and macro-averaged metrics on the test split.
printScores(y_test, predictions)

# The same metrics collected into an array and printed as percentages.
scores = returnscores_xv(y_test, predictions)
printscores_xv(scores)


In [None]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score, roc_auc_score, log_loss
import time

def printScores(y_val, class_pred, proba):
    """Print the confusion matrix, macro F1/precision/recall, accuracy, one-vs-rest AUC and log loss.

    ``class_pred`` holds predicted labels; ``proba`` holds the class probabilities
    handed to roc_auc_score and log_loss.
    """
    f1 = f1_score(y_val, class_pred, average='macro')
    precision = precision_score(y_val, class_pred, average='macro')
    recall = recall_score(y_val, class_pred, average='macro')
    accuracy = accuracy_score(y_val, class_pred)
    auc = roc_auc_score(y_val, proba, multi_class='ovr')
    logloss = log_loss(y_val, proba)

    print(confusion_matrix(y_val, class_pred))
    report = (
        ("F1: ", f1),
        ("Precision: ", precision),
        ("Recall: ", recall),
        ("Accuracy: ", accuracy),
        ("AUC-ROC: ", auc),
        ("Log Loss: ", logloss),
    )
    for prefix, value in report:
        print(prefix + str(value))
    
def returnscores_xv(Yval, predictions, proba):
    """Return ``[accuracy, recall, precision, F1, AUC, log loss]`` as a float array of length 6.

    Recall, precision and F1 are macro-averaged and all use ``zero_division=0``
    (previously F1 alone was left on the warning default). AUC is one-vs-rest and,
    like log loss, is computed from the class probabilities in ``proba``.
    """
    score = np.zeros(6)
    score[0] = accuracy_score(Yval, predictions)
    score[1] = recall_score(Yval, predictions, average='macro', zero_division=0)
    score[2] = precision_score(Yval, predictions, average='macro', zero_division=0)
    score[3] = f1_score(Yval, predictions, average='macro', zero_division=0)
    score[4] = roc_auc_score(Yval, proba, multi_class='ovr')
    score[5] = log_loss(Yval, proba)
    return score

def printscores_xv(scores):
    """Print positions 0-4 of ``scores`` as percentages and position 5 (log loss) as a raw value."""
    percent_prefixes = ("Accuracy: ", "Recall: ", "Precision: ", "F1: ", "AUC-ROC: ")
    for position, prefix in enumerate(percent_prefixes):
        print(prefix + str(100 * scores[position]) + "%")
    # Log loss is not a ratio, so it is printed unscaled.
    print("Log Loss: " + str(scores[5]))

# Time inference only: both calls below run the already-fitted predictor, so the
# elapsed time is prediction latency, not training time (the old label was wrong).
start_time = time.time()

predictions = predictor.predict(X_test)
proba = predictor.predict_proba(X_test)

prediction_time = time.time() - start_time

printScores(y_test, predictions, proba)

scores = returnscores_xv(y_test, predictions, proba)
printscores_xv(scores)

print("Prediction time: " + str(prediction_time) + " seconds")


In [None]:
from autogluon.tabular import TabularPredictor

# second round
# Existing hyperparameters
# NOTE(review): this dict and the train_data below are the same as those used for
# `predictor` in the earlier fit cell — confirm what the second round is meant to vary.
hyperparameters = {
    'NN_TORCH': {},
    'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
    'CAT': {},
    'XGB': {},
    'FASTAI': {},
    'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini'}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr'}}],
    'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini'}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr'}}],
    'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
    # Add Logistic Regression
    'LR': {}  # Using default hyperparameters for Logistic Regression
}

# Fit the predictor
# Multiclass problem with 'sirna_encoded' as the label column; the result is stored
# separately as predictor2.
predictor2 = TabularPredictor(label='sirna_encoded', problem_type='multiclass').fit(
    train_data=train_df,
    hyperparameters=hyperparameters
)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, log_loss

# Ground-truth labels and label-free features of the held-out split.
y_true = test_df['sirna_encoded']
_test_features = test_df.drop(columns=['sirna_encoded'])

# Hard predictions and class probabilities from the second predictor.
y_pred = predictor2.predict(_test_features)
proba_pred = predictor2.predict_proba(_test_features)

# Label-based metrics (macro-averaged where averaging applies).
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='macro')
f1 = f1_score(y_true, y_pred, average='macro')

# Probability-based metrics.
auc_roc = roc_auc_score(y_true, proba_pred, multi_class='ovr')
logloss = log_loss(y_true, proba_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"F1 Score: {f1}")
print(f"AUC-ROC: {auc_roc}")
print(f"Log Loss: {logloss}")


In [None]:
# Standardise with statistics from the training features only, then apply the same
# transform to the test features. The model was previously fitted on scaled data but
# asked to predict on the unscaled X_test, so the two were on different scales.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# NOTE(review): max_iter=10 is a very small iteration budget — confirm it is intentional.
log_reg = LogisticRegression(random_state=42, max_iter=10)
log_reg.fit(X_train_scaled, y_train)
y_pred = log_reg.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

In [None]:
# Standardise with statistics from the training features only, then apply the same
# transform to the test features. The model was previously fitted on scaled data but
# asked to predict on the unscaled X_test, so the two were on different scales.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# L1-regularised logistic regression with the saga solver and balanced class weights.
log_reg = LogisticRegression(random_state=42, max_iter=1000, tol=1e-5, solver='saga', penalty='l1', class_weight='balanced', C=1.0)

log_reg.fit(X_train_scaled, y_train)
y_pred = log_reg.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder

# Keep only the embedding features plus the siRNA label.
all_data = merged_df[feature_cols + ['sirna']]

# Fit the label encoder on the full label column before splitting. The encoder is a
# pure label -> integer mapping, so this leaks nothing, and it guarantees that a
# label appearing only in the test split cannot make transform() raise.
encoder = LabelEncoder()
encoder.fit(all_data['sirna'])

train_df, test_df = train_test_split(all_data, test_size=0.3, random_state=42)

# assign() returns new frames, so nothing is written into a slice of merged_df
# (avoids SettingWithCopyWarning).
train_df = train_df.assign(sirna_encoded=encoder.transform(train_df['sirna']))
test_df = test_df.assign(sirna_encoded=encoder.transform(test_df['sirna']))

X_train = train_df[feature_cols]
y_train = train_df['sirna_encoded']
X_test = test_df[feature_cols]
y_test = test_df['sirna_encoded']

# Standardise using statistics from the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:

# Pipeline and GridSearchCV are not imported anywhere earlier in the notebook, so they
# are imported here. `time` is imported as a module: the previous `from time import time`
# rebound the name `time` to a function and broke `time.time()` in other cells on re-run.
import time

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Scaling lives inside the pipeline so each CV fold is standardised from its own training part.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression())
])

# Define the hyperparameters grid
# NOTE(review): not every solver supports every penalty listed here; with GridSearchCV's
# default error handling those combinations are expected to be scored as NaN — confirm.
param_grid = {
    'logreg__C': [0.01, 0.1, 1, 10, 100],
    'logreg__penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'logreg__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')

# Time the search itself. Previously the timer was started and stopped on consecutive
# lines after the fit, so the reported "training time" was always about zero.
start_time = time.perf_counter()
grid_search.fit(X_train, y_train)
training_time = time.perf_counter() - start_time

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validated score: {grid_search.best_score_:.3f}")

best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print(f"Test accuracy: {test_score:.4f}")

# Label-based test metrics for the best pipeline (it scales X_test internally).
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_precision = precision_score(y_test, y_pred, average='macro')
test_f1 = f1_score(y_test, y_pred, average='macro')

# Probability-based metric.
y_proba = best_model.predict_proba(X_test)
test_auc_roc = roc_auc_score(y_test, y_proba, multi_class='ovr', average='macro')

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")
print(f"Test AUC-ROC: {test_auc_roc:.4f}")
print(f"Training Time: {training_time:.4f} seconds")

