In [1]:
import os
import pandas as pd
import numpy as np
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
from sklearn.model_selection import cross_val_predict
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import LeaveOneOut, KFold, cross_val_score

In [3]:
from sklearn.metrics import r2_score
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [4]:
# Root folder holding one sub-directory per drug; all results are written beneath it.
# Set the CCLE_OUTPUT_DIR environment variable to run on another machine without
# editing the notebook; when it is unset the original location is used.
output_dir = os.environ.get(
    'CCLE_OUTPUT_DIR',
    'C:\\Users\\MONSTER\\Desktop\\TEZ SON\\CCLE\\OUTPUT3',
)

In [5]:
# Train and evaluate one XGBoost regressor per drug folder found under `output_dir`.
#
# For every drug the cell:
#   1. reads "<drug> - Expression Data (IC50) - Normalized3.tsv" from the drug folder,
#   2. splits it 70/30 into train/test,
#   3. fits an XGBRegressor on the train split,
#   4. writes metrics and actual-vs-predicted IC50 tables to "<drug>/XGBoost/".
# Finally all per-drug metric rows are stacked into
# `All_Result_XG_CCLE_without_separated_tissue`.

Train_IC50_predict = dict()  # drug -> actual vs. predicted IC50 on the train split
Test_IC50_predict = dict()   # drug -> actual vs. predicted IC50 on the test split
Drug_dict = dict()           # drug -> one-row metrics table

# Column order of each per-drug metrics table and of the combined table.
RESULT_COLUMNS = [
    'Drug',
    'R_Square_Train',
    'R_Square_Test',
    'Pearson_Correlation_Train',
    'Pearson_Correlation_Test',
    'Pearson_Correlation_P_Value_Train',
    'Pearson_Correlation_P_Value_Test',
    'Spearman_Correlation_Train',
    'Spearman_Correlation_Test',
    'Spearman_Correlation_P_Value_Train',
    'Spearman_Correlation_P_Value_Test',
    'RMSE_Train',
    'RMSE_Test',
]


def _regression_metrics(y_true, y_pred):
    """Score one split.

    Returns a tuple ``(r2, pearson_r, pearson_p, spearman_rho, spearman_p, rmse)``
    computed between the observed values ``y_true`` and the predictions ``y_pred``.
    """
    pearson_r, pearson_p = pearsonr(y_pred, y_true)
    spearman_rho, spearman_p = spearmanr(y_pred, y_true)
    rmse = np.sqrt(mean_squared_error(y_pred, y_true))
    return r2_score(y_true, y_pred), pearson_r, pearson_p, spearman_rho, spearman_p, rmse


# Explicit paths are used throughout instead of os.chdir(), so the notebook's
# working directory is left untouched.
Drugs = [item1 for item1 in os.listdir(output_dir)
         if os.path.isdir(os.path.join(output_dir, item1))]
ind = 1

for drug in Drugs:
    drug_dir = os.path.join(output_dir, drug)
    expression_path = os.path.join(drug_dir, drug + " - Expression Data (IC50) - Normalized3.tsv")

    # A folder without the expected input file is skipped rather than aborting the run.
    if not os.path.isfile(expression_path):
        print(f"\nSkipping {drug}: expression data file not found", flush=True)
        continue

    expression_data = pd.read_csv(expression_path, sep="\t")

    # Need at least one feature column and more than 20 distinct samples.
    if not (len(expression_data.columns) > 2 and len(expression_data['Sample_Name'].unique()) > 20):
        continue

    print(f"\nDrug {ind}: XGBoost is running for {drug}", flush=True)
    x1 = expression_data.drop(['Sample_Name', 'IC50'], axis=1)
    y1 = expression_data['IC50']

    # Drugs with fewer than 5 feature columns are not modelled.
    if len(x1.columns) < 5:
        continue

    x_train, x_test, y_train, y_test = train_test_split(x1, y1, test_size=0.30, random_state=42)

    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

    # The former leave-one-out run was removed: the default R^2 scorer is undefined
    # on single-sample folds (every score is NaN), its result was never read, and it
    # cost one model fit per training sample.

    # 2-fold cross-validation on the train split. The averages are diagnostic only;
    # they are not written to the result table.
    kfold = KFold(n_splits=2, random_state=1, shuffle=True)
    cv_results = cross_validate(xgb_model, x_train, y_train, cv=kfold, return_train_score=True)
    average_train_score = cv_results['train_score'].mean()
    average_test_score = cv_results['test_score'].mean()

    # No model selection takes place: the final model is the same estimator
    # refit on the whole train split.
    best_model = xgb_model
    best_model.fit(x_train, y_train)

    # Predict on both splits.
    y_train_pred = best_model.predict(x_train)
    y_test_pred = best_model.predict(x_test)

    # Collect the metrics.
    (r_squared_train, pearson_correlation_train, p_value_train_pearsonr,
     corr_train_spearmanr, p_value_train_spearmanr, rmse_val_train) = _regression_metrics(y_train, y_train_pred)
    (r_squared_test, pearson_correlation_test, p_value_test_pearsonr,
     corr_test_spearmanr, p_value_test_spearmanr, rmse_val_test) = _regression_metrics(y_test, y_test_pred)

    # Values listed in the same order as RESULT_COLUMNS.
    row_values = [
        drug,
        r_squared_train,
        r_squared_test,
        pearson_correlation_train,
        pearson_correlation_test,
        p_value_train_pearsonr,
        p_value_test_pearsonr,
        corr_train_spearmanr,
        corr_test_spearmanr,
        p_value_train_spearmanr,
        p_value_test_spearmanr,
        rmse_val_train,
        rmse_val_test,
    ]
    result_table_drug = pd.DataFrame(dict(zip(RESULT_COLUMNS, row_values)), index=[0])

    Drug_dict[drug] = result_table_drug
    print(Drug_dict[drug])

    # Save the results; exist_ok=True lets the cell be re-run without FileExistsError.
    xgboost_dir = os.path.join(drug_dir, "XGBoost")
    os.makedirs(xgboost_dir, exist_ok=True)

    result_table_drug.to_csv(os.path.join(xgboost_dir, drug + "- Result_XGBoost.tsv"),
                             sep='\t', index=False)

    Train_IC50_predict[drug] = pd.DataFrame({'Actual_IC50_Train_Data': y_train,
                                             'Predicted_IC50_Train_Data': y_train_pred})
    Train_IC50_predict[drug].to_csv(os.path.join(xgboost_dir, drug + "- IC50_Train_Result.tsv"),
                                    sep='\t', index=False)

    Test_IC50_predict[drug] = pd.DataFrame({'Actual_IC50_Test_Data': y_test,
                                            'Predicted_IC50_Test_Data': y_test_pred})
    Test_IC50_predict[drug].to_csv(os.path.join(xgboost_dir, drug + "- IC50_Test_Result.tsv"),
                                   sep='\t', index=False)

    ind = ind + 1

if Drug_dict:
    # ignore_index gives each drug its own row label instead of thirteen-odd rows all labelled 0.
    All_Result_XG_CCLE_without_separated_tissue = pd.concat(Drug_dict.values(), ignore_index=True)
else:
    # Keep the name defined (with the expected columns) so later cells do not hit a NameError.
    All_Result_XG_CCLE_without_separated_tissue = pd.DataFrame(columns=RESULT_COLUMNS)
    


Drug 1: XGBoost is running for 17-AAG
     Drug  R_Square_Train  R_Square_Test  Pearson_Correlation_Train  \
0  17-AAG        0.999999      -0.239188                   0.999999   

   Pearson_Correlation_Test  Pearson_Correlation_P_Value_Train  \
0                  0.227576                                0.0   

   Pearson_Correlation_P_Value_Test  Spearman_Correlation_Train  \
0                          0.008181                    0.999961   

   Spearman_Correlation_Test  Spearman_Correlation_P_Value_Train  \
0                   0.148533                                 0.0   

   Spearman_Correlation_P_Value_Test  RMSE_Train  RMSE_Test  
0                           0.086749    0.000623   0.429305  

Drug 2: XGBoost is running for AEW541
     Drug  R_Square_Train  R_Square_Test  Pearson_Correlation_Train  \
0  AEW541             1.0      -0.221222                        1.0   

   Pearson_Correlation_Test  Pearson_Correlation_P_Value_Train  \
0                  0.208601              

         Drug  R_Square_Train  R_Square_Test  Pearson_Correlation_Train  \
0  PD-0325901             1.0       0.048793                        1.0   

   Pearson_Correlation_Test  Pearson_Correlation_P_Value_Train  \
0                   0.36446                                0.0   

   Pearson_Correlation_P_Value_Test  Spearman_Correlation_Train  \
0                          0.002838                    0.999957   

   Spearman_Correlation_Test  Spearman_Correlation_P_Value_Train  \
0                     0.5441                       5.243499e-303   

   Spearman_Correlation_P_Value_Test  RMSE_Train  RMSE_Test  
0                           0.000003    0.000436   1.824141  

Drug 14: XGBoost is running for PD-0332991
         Drug  R_Square_Train  R_Square_Test  Pearson_Correlation_Train  \
0  PD-0332991             1.0      -0.308145                        1.0   

   Pearson_Correlation_Test  Pearson_Correlation_P_Value_Train  \
0                  0.098325                      6.349080e-

In [6]:
# Rank the drugs by test-split R^2 (best first) and export the combined table.
# The workbook path is derived from `output_dir` so it follows that setting
# instead of repeating the folder literal.
All_XG_Result = All_Result_XG_CCLE_without_separated_tissue.sort_values(by='R_Square_Test', ascending=False)
All_XG_Result.to_excel(
    os.path.join(output_dir, 'All_Result_XG_CCLE_without_separated_tissue.xlsx'),
    index=False,
)