<a href="https://colab.research.google.com/github/ahmedhesham47/Bayesian-Network-for-Predicting-ICB-Response/blob/main/Feature_Selection_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Importing Packages**

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from google.colab import drive
drive.mount('/content/drive')

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
from sklearn.feature_selection import f_classif
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.pipeline import make_pipeline
from itertools import combinations
from functools import reduce
from sklearn.model_selection import GridSearchCV
import itertools
from scipy import stats
from sklearn.svm import SVC

Mounted at /content/drive


# **Helper Functions**

In [2]:
def GeneExpressionPreprocessing(mRNAFile, DropList, GeneColumnName, Need_Transpose=True, Percent_To_FilterOut=20):
  for i in DropList: # Dropping columns
    mRNAFile = mRNAFile.drop(i, axis=1, errors='ignore')
  if Need_Transpose == True: # transpose the file such that genes are columns and samples are rows
    mRNAFile = mRNAFile.set_index(GeneColumnName)
    mRNAFile = mRNAFile.T
    mRNAFile = mRNAFile.rename_axis('Sample', axis=1)
  # Filter out genes which are zeroes in certain percentage from the data.
  filtered_data = mRNAFile.loc[:, (mRNAFile == 0).mean() * 100 <= Percent_To_FilterOut]
  return filtered_data

In [3]:
def merge_on_common_column(dataframes, merge_column):
    merged_df = dataframes[0]
    for df in dataframes[1:]:
        merged_df = pd.merge(merged_df, df, on=merge_column)
    return merged_df

In [4]:
def merge_dataframes(dataframeslist, commoncolumn):
  return reduce(lambda left, right: pd.merge(left, right, on=commoncolumn, how='inner'), dataframeslist)

In [34]:
def Best_Combinations(merged):
  column_names = merged.columns
  column_combinations = combinations(column_names, 6)
  intersections = {}
  # Compute the intersection for each combination
  for combination in column_combinations:
      # Drop NaN values and find the intersections from that particular combination
      intersected_genes = set(merged[combination[0]].dropna()) & set(merged[combination[1]].dropna()) & \
                          set(merged[combination[2]].dropna()) & set(merged[combination[3]].dropna()) & \
                          set(merged[combination[4]].dropna())
      intersections[combination] = intersected_genes
  intersections_summary = {combo: len(genes) for combo, genes in intersections.items()} # this is not needed, it can only be used to see what each combination produced
  max_combination = max(intersections, key=lambda x: len(intersections[x])) # the combination leading to the maximum number of genes
  max_genes = intersections[max_combination] # the genes themselves
  Final_Selected_Genes = list(max_genes) # making sures they are in a list
  return Final_Selected_Genes

In [6]:
def get_merged_df(rf, fdr, cor, sfm, gb, rfe):
  #fdr.rename(columns={'Feature': 'Gene'}, inplace=True)

  rf_top_genes = rf['Feature']

  sfm_selected_genes = sfm['Feature']

  correlation_top_genes = cor['Feature']

  gb_top_genes = gb['Feature']

  fdr_top_genes = fdr['Feature']

  rfe_selected_genes = rfe['Feature']

  all_genes = set(rf_top_genes) | set(sfm_selected_genes) | set(correlation_top_genes) | set(gb_top_genes) | set(fdr_top_genes) | set(rfe_selected_genes)
  all_genes_list = list(all_genes)
  # In sets, the "|" means union, so we get all the genes from all the feature selection methods

  col = ['Random_Forest', 'Select_From_Model', 'Gene_Response_Correlation', 'Gradient_Boosting', 'Fisher_Discriminant_Ratio', 'Recursive_Feature_Elimination']
  merged_df = pd.DataFrame(index=all_genes_list, columns=col) # this is an empty dataframe with rows as ALL the genes and columns as the feature selection methods

  # We will now fill the dataframe with either a value (name of the gene) or NaN, which means the gene was not selected via this feature selection method
  # For example...
  # Random_Forest     Select_From_Model"
  # TP53              NaN
  # And so on...

  merged_df.loc[rf_top_genes, "Random_Forest"] = rf_top_genes.values
  merged_df.loc[sfm_selected_genes, "Select_From_Model"] = sfm_selected_genes.values
  merged_df.loc[correlation_top_genes, "Gene_Response_Correlation"] = correlation_top_genes.values
  merged_df.loc[gb_top_genes, "Gradient_Boosting"] = gb_top_genes.values
  merged_df.loc[fdr_top_genes, "Fisher_Discriminant_Ratio"] = fdr_top_genes.values
  merged_df.loc[rfe_selected_genes, "Recursive_Feature_Elimination"] = rfe_selected_genes.values
  return merged_df

# **Importing Files**

In [8]:
clinical_patient = pd.read_csv('/content/drive/MyDrive/Machine Learning Project/data_clinical_patient.txt', sep='\t')
clinical_sample = pd.read_csv('/content/drive/MyDrive/Machine Learning Project/data_clinical_sample.txt', sep='\t')
data_mrna = pd.read_csv('/content/drive/MyDrive/Machine Learning Project/data_mrna_seq_tpm.txt', sep='\t')
clinical_patient = clinical_patient.iloc[4:]
clinical_sample = clinical_sample.iloc[4:]

# **Pre-processing**

In [None]:
merged_clinical = merge_on_common_column([clinical_sample, clinical_patient], '#Patient Identifier')

labels = merged_clinical[['Sample Identifier', 'Best Radiographic Response (RECIST 1.1)']]
response_mapping = {
    'Complete Response': 1,
    'Partial Response': 1,
    'Stable Disease': 0,
    'Progressive Disease': 0
}
labels['Best Radiographic Response (RECIST 1.1)'] = labels['Best Radiographic Response (RECIST 1.1)'].map(response_mapping)
labels = labels.dropna(subset = ['Best Radiographic Response (RECIST 1.1)', 'Sample Identifier'])
labels = labels.rename(columns={'Best Radiographic Response (RECIST 1.1)': 'ICB Response'})
labels['ICB Response'] = labels['ICB Response'].astype('category')
filter_ids = labels['Sample Identifier']

GeneExpressionData = GeneExpressionPreprocessing(data_mrna, ['Entrez_Gene_Id'], 'Hugo_Symbol')
GeneExpressionData = GeneExpressionData.reset_index()
GeneExpressionData = GeneExpressionData.rename(columns={'index': 'Sample Identifier'})
GeneExpressionData = GeneExpressionData[GeneExpressionData['Sample Identifier'].isin(filter_ids)]

In [None]:
GeneExpressionData

# **Feature Selection**

## **Random Forest**

In [13]:
def feature_selection_with_rf(df, label_df, commoncolumn='Sample Identifier', labelID ='ICB Response', n_feature=1000, n_estimators=100, random_state=42):
    merged_df = pd.merge(df, label_df, on=commoncolumn)
    X = merged_df.drop(columns=[commoncolumn, labelID])
    y = merged_df[labelID]
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    rf.fit(X, y)
    rf_importances = rf.feature_importances_
    rf_importances_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': rf_importances
    })
    rf_importances_df = rf_importances_df.sort_values(by='Importance', ascending=False)
    rf_importances_df = rf_importances_df.head(n_feature)
    rf_importances_df = rf_importances_df[rf_importances_df['Importance'] > 0]
    return rf_importances_df

In [None]:
rf_Expression = feature_selection_with_rf(GeneExpressionData, labels, 'Sample Identifier', 'ICB Response', n_estimators=1000)
rf_Expression

## **Recursive Feature Elimination**

In [16]:
def recursive_feature_elimination(df, labels_df, commoncolumn='Sample Identifier', labelID ='ICB Response', n_features_to_select=1000, step=100):
    merged_df = pd.merge(df, labels_df, on=commoncolumn)
    X = merged_df.drop([commoncolumn, labelID], axis=1)
    y = merged_df[labelID]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    logreg = LogisticRegression()
    rfe = RFE(estimator=logreg, n_features_to_select=n_features_to_select, step=step)
    rfe.fit(X_scaled, y)
    selected_features = pd.DataFrame({'Feature': X.columns,
                                      'Selected': rfe.support_,
                                      'Ranking': rfe.ranking_})
    s = selected_features.sort_values(by='Ranking', ascending=True)

    # Selecting genes with ranking = 1, that is the top n_features_to_select genes
    s = s[s['Ranking'] == 1]
    return s

In [None]:
rfe_Expression = recursive_feature_elimination(GeneExpressionData, labels, 'Sample Identifier', 'ICB Response', step=50)
rfe_Expression

## **SelectFromModel**

In [18]:
def select_from_model(df, labels_df, commoncolumn='Sample Identifier', labelID ='ICB Response'):
# Selecting features using Logistic Regression and SelectFromModel
  merged_df = pd.merge(df, labels_df, on=commoncolumn)
  X = merged_df.drop([commoncolumn, labelID], axis=1)
  y = merged_df[labelID]
  log_reg = LogisticRegression()
  select_model = SelectFromModel(log_reg)
  select_model.fit(X, y)
  selection_status = select_model.get_support()
  SelectFromModel_genes_df = pd.DataFrame({
      'Feature': X.columns,
      'Selected': selection_status
  })
  SelectFromModel_genes_df['Selected'] = SelectFromModel_genes_df['Selected'].astype(int)
  SelectFromModel_genes_df = SelectFromModel_genes_df[SelectFromModel_genes_df['Selected'] == 1]

  return SelectFromModel_genes_df

In [None]:
sfm_Expression = select_from_model(GeneExpressionData, labels, 'Sample Identifier', 'ICB Response')
sfm_Expression

## **Correlation**

In [20]:
def correlation(df, labels_df, threshold=0.2, commoncolumn='Sample Identifier', labelID ='ICB Response', is_df_categorical=True):
  merged_df = pd.merge(df, labels_df, on=commoncolumn)
  X = merged_df.drop([commoncolumn, labelID], axis=1)
  y = merged_df[labelID]

  # Was producing an error because Pearson correlation (calculated by corr function) cannot correlate categoricals to floats
  # So we added an else condition to handle the case where the data is not categorical to be calculated via point biserial method
  if is_df_categorical:
    correlation_values = X.apply(lambda gene: gene.corr(y))
  else:
    correlation_values = X.apply(lambda gene: stats.pointbiserialr(gene, y).correlation)


  gene_correlation = pd.DataFrame({
      'Feature': correlation_values.index,
      'Correlation': correlation_values
  })

  gene_correlation = gene_correlation.sort_values(by='Correlation', key=abs, ascending=False)

  selected_genes = gene_correlation[abs(gene_correlation['Correlation']) > threshold]

  return selected_genes

In [None]:
Expression_correlation = correlation(GeneExpressionData, labels, 0.2, commoncolumn='Sample Identifier', labelID ='ICB Response')
Expression_correlation

## **Gradient Boost**

In [23]:
def gradient_boosting(df, labels_df, commoncolumn='Sample Identifier', labelID ='ICB Response', nofeatures=1000, n_estimator = 100, random_state=42):
  merged_df = pd.merge(df, labels_df, on=commoncolumn)
  X = merged_df.drop([commoncolumn, labelID], axis=1)
  y = merged_df[labelID]
  gb_classifier = GradientBoostingClassifier(n_estimators=n_estimator, random_state=random_state)

  gb_classifier.fit(X, y)

  feature_importances_gb = gb_classifier.feature_importances_

  gene_importances_gb = pd.DataFrame({
      'Feature': X.columns,
      'Importance': feature_importances_gb
  })

  sorted_gene_importances_gb = gene_importances_gb.sort_values(by='Importance', ascending=False)
  sorted_gene_importances_gb = sorted_gene_importances_gb.head(nofeatures)
  sorted_gene_importances_gb = sorted_gene_importances_gb[sorted_gene_importances_gb['Importance'] > 0]
  return sorted_gene_importances_gb

In [None]:
gb_Expression = gradient_boosting(GeneExpressionData, labels, commoncolumn='Sample Identifier', labelID ='ICB Response')
gb_Expression

## **Fischer's Discriminant Ratio (FDR)**

In [25]:
def fdr(df, labels_df, commoncolumn='Sample Identifier', n_feature=1000, labelID ='ICB Response'):
  merged_df = pd.merge(df, labels_df, on=commoncolumn)
  X = merged_df.drop([commoncolumn, labelID], axis=1)
  y = merged_df[labelID]

  f_scores, _ = f_classif(X, y)

  fdr_df = pd.DataFrame({'Feature': X.columns, 'Fisher_Score': f_scores})

  fdr_df = fdr_df.sort_values(by='Fisher_Score', ascending=False)

  fdr_df = fdr_df.head(n_feature)

  return fdr_df

In [None]:
fdr_Expression = fdr(GeneExpressionData, labels, commoncolumn='Sample Identifier', labelID ='ICB Response')
fdr_Expression

## **Intersection Selection Methods**

In [None]:
# Code only made to visualize the resulting dataframe, but it will not be used later

fisher_selected = set(fdr_Expression['Feature']) #if we want SNP, we should make it fdr_SNP instead and so on

gradient_boosting_selected = set(gb_Expression['Feature'])

correlation_selected = set(Expression_correlation['Feature'])

sfm_selected = set(sfm_Expression['Feature'])

random_forest_selected = set(rf_Expression['Feature'])

rfe_selected = set(rfe_Expression['Feature'])

all_genes = set.union(fisher_selected, gradient_boosting_selected, correlation_selected, sfm_selected, random_forest_selected, rfe_selected)

all_genes_list = list(all_genes)

final_df = pd.DataFrame(index=all_genes_list, columns=["Fisher", "Gradient Boosting", "Correlation", "Select From Model", "Random Forest", "Recursive Feature Elimination"])

final_df["Fisher"] = final_df.index.isin(fisher_selected).astype(int)
final_df["Gradient Boosting"] = final_df.index.isin(gradient_boosting_selected).astype(int)
final_df["Correlation"] = final_df.index.isin(correlation_selected).astype(int)
final_df["Select From Model"] = final_df.index.isin(sfm_selected).astype(int)
final_df["Random Forest"] = final_df.index.isin(random_forest_selected).astype(int)
final_df["Recursive Feature Elimination"] = final_df.index.isin(rfe_selected).astype(int)
final_df

In [None]:
Merged_Expression = get_merged_df(rf_Expression, fdr_Expression, Expression_correlation, sfm_Expression, gb_Expression, rfe_Expression)
Merged_Expression

In [36]:
Best_Expression_Genes = Best_Combinations(Merged_Expression)
len(Best_Expression_Genes)

18

In [38]:
best_genes_expression = pd.DataFrame(Best_Expression_Genes, columns=['Gene_Name'])
best_genes_expression.to_csv('Best Expression Genes.tsv', sep='\t', index=False)
filtered_Expression_Data = GeneExpressionData[['Sample Identifier'] + Best_Expression_Genes]
filtered_Expression_Data.to_csv('Filtered Expression Genes.tsv', sep='\t', index=False)