In [1]:
import os
import pandas as pd
import numpy as np
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import LeaveOneOut, KFold, cross_val_score

In [3]:
from sklearn.metrics import r2_score
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error

In [4]:
output_dir_ccle = 'C:\\Users\\MONSTER\\Desktop\\TEZ SON\\CCLE\\OUTPUT - CCLE-GDSC'
output_dir_gdsc = 'C:\\Users\\MONSTER\\Desktop\\TEZ SON\\GDSC\\output'

In [5]:
os.chdir(output_dir_ccle)

In [6]:
Histologies = [item1 for item1 in os.listdir() if os.path.isdir(item1)]

In [7]:
Histologies

['aero_dig_tract',
 'breast',
 'digestive_system',
 'kidney',
 'large_intestine',
 'leukemia',
 'lung',
 'lymphoma',
 'myeloma',
 'nervous_system',
 'pancreas',
 'skin',
 'thyroid',
 'urogenital_system']

In [8]:
Histology_dict = dict()
Drug_dict = dict()
IC50_predict = dict()

for histology in Histologies:
    os.chdir(os.path.join(output_dir_ccle, histology))
    if len(os.listdir()) > 1:
        Drugs_ccle = [item1 for item1 in os.listdir() if os.path.isdir(item1)]
        
        os.chdir(os.path.join(output_dir_gdsc, histology))
        if len(os.listdir()) > 1:
            Drugs_gdsc = [item1 for item1 in os.listdir() if os.path.isdir(item1)]
            
            common_drug_ccle_gdsc = set(Drugs_ccle).intersection(Drugs_gdsc)
            if len(common_drug_ccle_gdsc) > 0:
                
                for drug in common_drug_ccle_gdsc:
                    
                    expression_data = pd.read_csv(os.path.join(output_dir_ccle,histology,drug,drug+" "+histology+" "+"- Expression Data(IC50).tsv"), sep = ",")
                    gdsc_train_data = pd.read_csv(os.path.join(output_dir_gdsc,histology,drug,drug+" "+histology+" "+"- GDSC Histology Project.tsv"), sep = "\t")
                    
                    if len(expression_data.columns) > 3 and len(expression_data['Cell Line']) > 20 and len(gdsc_train_data['Cell Line']) > 20:
                        
                        x_ccle = expression_data.drop(['Unnamed: 0', 'Cell Line', 'LN_IC50'], axis=1)
                        y_ccle = expression_data['LN_IC50']
                        
                        x_gdsc_1 = gdsc_train_data.drop(['Cell Line','GDSC\nTissue descriptor 1','LN_IC50','Drug'], axis=1)
                        y_gdsc = gdsc_train_data['LN_IC50']
                    
                        common_features_ccle_gdsc = set(x_ccle.columns).intersection(x_gdsc_1.columns)
                        if len(common_features_ccle_gdsc) > 0:
                            
                            features_name = common_features_ccle_gdsc
                            sample_number_gdsc_drug = len(gdsc_train_data['Cell Line'])
                        
                            x_ccle = x_ccle[list(features_name)]
                            #y_ccle = boruta_drug_data_ccle['log IC50 (uM)']
                                
                            x_gdsc = x_gdsc_1[list(features_name)]
                            #y_gdsc = boruta_drug_data_gdsc[drug]
                                
                            features_name_number= len((x_gdsc).columns)
                                
                            x_ccle = np.array(x_ccle)
                            y_ccle = np.array(y_ccle)
                            x_gdsc = np.array(x_gdsc)
                            y_gdsc = np.array(y_gdsc)
                        
                        
                            # MODEL OLUŞTURUYORUZ
                                
                            # LOOCV (Leave-One-Out Cross-Validation)
                            loo = LeaveOneOut()
                            rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
                            loo_scores = cross_val_score(rf_model, x_ccle, y_ccle, cv=loo)
                                
                                
                            # k-fold kullanılıyor.
                            kfold = KFold(n_splits=2, random_state=1, shuffle = True)
                            
                            cv_results = cross_validate(rf_model, x_ccle, y_ccle, cv=kfold, return_train_score=True)
                            
                            average_test_score = cv_results['test_score'].mean()
                            average_train_score = cv_results['train_score'].mean()
                            best_model_index = np.argmax(cv_results['test_score'])
                            best_model = rf_model
                            best_model.fit(x_ccle, y_ccle)
                                                    
                            
                                
                            # Tahmin yapılıyor.
                            y_gdsc_pred = best_model.predict(x_gdsc)                        

                            print(histology)
                            print(f"\nDrug : Random Forest is running for {drug}", flush=True)
            
                
                            r_squared = r2_score(y_gdsc, y_gdsc_pred)
                            pearson_correlation, p_value_pearsonr = pearsonr(y_gdsc_pred, y_gdsc)
                            corr_spearmanr, p_value_spearmanr = spearmanr(y_gdsc_pred, y_gdsc)
                            rmse_val = np.sqrt(mean_squared_error(y_gdsc_pred, y_gdsc))
                
                            features_name = list(features_name)
                            result_table_drug = pd.DataFrame({
                            'Tissue': [histology],
                            'Drug': [drug],
                            'R_Square': [r_squared],
                            'Pearson_Correlation_Train': [pearson_correlation],
                            'features_name': [features_name],
                            'features_name_number': [features_name_number],
                            'sample_number_gdsc_drug':[sample_number_gdsc_drug],    
                            'Pearson_Correlation_P_Value_Train': [p_value_pearsonr],
                            'Spearman_Correlation_Train': [corr_spearmanr],
                            'Spearman_Correlation_P_Value_Train': [p_value_spearmanr],
                            'RMSE_Train': [rmse_val]
                             })
            

                            Drug_dict[drug] = result_table_drug
                
                            # Save results to files
                            os.mkdir(os.path.join(output_dir_gdsc, histology, drug, "Random Forest - Regression"))
                            result_table_drug.to_csv(os.path.join(output_dir_gdsc, histology, drug, "Random Forest - Regression", drug + "- Result_RF_Reg.tsv"), 
                                     sep='\t', index=False, quoting=False)
                            IC50_predict[drug] = pd.DataFrame({'Actual_IC50_Data': y_gdsc, 'Predicted_IC50_Train_Data': y_gdsc_pred})
                            IC50_predict[drug].to_csv(os.path.join(output_dir_gdsc, histology, drug, "Random Forest - Regression", drug + "- IC50_Result.tsv"), 
                                            sep='\t', index=False, quoting=False)
                       
                    
                    
                    
    if Drug_dict:
        Histology_dict[histology] = pd.concat(Drug_dict.values())  # Her bir histoloji için ilaç sonuçlarını birleştirin

# Tüm histolojilerin sonuçlarını birleştirin ve dosyalara kaydedin
if Histology_dict:
    All_RF_Result = pd.concat(Histology_dict.values())



                    
                    
    
   


aero_dig_tract

Drug : Random Forest is running for PD0325901
aero_dig_tract

Drug : Random Forest is running for AZD6244
aero_dig_tract

Drug : Random Forest is running for 17-AAG
breast

Drug : Random Forest is running for PD0325901
breast

Drug : Random Forest is running for AZD6244
breast

Drug : Random Forest is running for 17-AAG
digestive_system

Drug : Random Forest is running for Nilotinib
digestive_system

Drug : Random Forest is running for PD0325901
digestive_system

Drug : Random Forest is running for PLX-4720
digestive_system

Drug : Random Forest is running for AZD6244
digestive_system

Drug : Random Forest is running for 17-AAG
kidney

Drug : Random Forest is running for Nilotinib
kidney

Drug : Random Forest is running for PD0325901
kidney

Drug : Random Forest is running for AZD6244
kidney

Drug : Random Forest is running for 17-AAG
large_intestine

Drug : Random Forest is running for Nilotinib
large_intestine

Drug : Random Forest is running for PD0325901
large_intes

In [11]:
All_RF_Result = All_RF_Result.sort_values(by='R_Square', ascending=False)
All_RF_Result.to_excel('C:\\Users\\MONSTER\\Desktop\\TEZ SON\\CCLE\\ML RESULT\\RF Regression_CCLE_GDSC.xlsx', index=False)

In [8]:
from sklearn.model_selection import GridSearchCV


In [None]:
Histology_dict = dict()
Drug_dict = dict()
IC50_predict = dict()

for histology in Histologies:
    os.chdir(os.path.join(output_dir_ccle, histology))
    if len(os.listdir()) > 1:
        Drugs_ccle = [item1 for item1 in os.listdir() if os.path.isdir(item1)]
        
        os.chdir(os.path.join(output_dir_gdsc, histology))
        if len(os.listdir()) > 1:
            Drugs_gdsc = [item1 for item1 in os.listdir() if os.path.isdir(item1)]
            
            common_drug_ccle_gdsc = set(Drugs_ccle).intersection(Drugs_gdsc)
            if len(common_drug_ccle_gdsc) > 0:
                
                for drug in common_drug_ccle_gdsc:
                    
                    os.chdir(os.path.join(output_dir_ccle,histology,drug))
                    if len(os.listdir()) > 1:
                        expression_data = pd.read_csv(os.path.join(output_dir_ccle,histology,drug,drug+" "+histology+" "+"- Expression Data(IC50).tsv"), sep = ",")
                        
                        os.chdir(os.path.join(output_dir_gdsc,histology,drug))
                        if len(os.listdir()) > 1:
                            gdsc_train_data = pd.read_csv(os.path.join(output_dir_gdsc,histology,drug,drug+" "+histology+" "+"- GDSC Histology Project.tsv"), sep = "\t")
                            
                            if len(expression_data.columns) > 3 and len(expression_data['Cell Line']) > 20 and len(gdsc_train_data['Cell Line']) > 20:
                                x_ccle = expression_data.drop(['Unnamed: 0', 'Cell Line', 'LN_IC50'], axis=1)
                                y_ccle = expression_data['LN_IC50']
                        
                                x_gdsc_1 = gdsc_train_data.drop(['Cell Line','GDSC\nTissue descriptor 1','LN_IC50','Drug'], axis=1)
                                y_gdsc = gdsc_train_data['LN_IC50']
                                
                                common_features_ccle_gdsc = set(x_ccle.columns).intersection(x_gdsc_1.columns)
                                if len(common_features_ccle_gdsc) > 0:
                                    
                                    features_name = common_features_ccle_gdsc
                                    sample_number_gdsc_drug = len(gdsc_train_data['Cell Line'])
                        
                                    x_ccle = x_ccle[list(features_name)]
                                    print(x_ccle)
                                
                                    x_gdsc = x_gdsc_1[list(features_name)]
                                    print(x_gdsc)
                                
                                    features_name_number= len((x_gdsc).columns)
                                
                                    x_ccle = np.array(x_ccle)
                                    y_ccle = np.array(y_ccle)
                                    x_gdsc = np.array(x_gdsc)
                                    y_gdsc = np.array(y_gdsc)
                        
                        
                                    # MODEL OLUŞTURUYORUZ
                            
                                    param_grid = {
                                    'n_estimators': [100, 200, 300],
                                    'max_depth': [None, 10, 20, 30],
                                    'min_samples_split': [2, 5, 10],
                                    'min_samples_leaf': [1, 2, 4],
                                    'max_features': ['auto', 'sqrt', 'log2'] }
        
                                    model = RandomForestRegressor(random_state=42)
                                    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
                                    grid_search.fit(x_ccle, y_ccle)
        
                                    best_model = grid_search.best_estimator_
        
                                    y_gdsc_pred = best_model.predict(x_gdsc)
                        

                                    print(histology)
                                    print(f"\nDrug : Random Forest is running for {drug}", flush=True)
            
                            
                                    r_squared = r2_score(y_gdsc, y_gdsc_pred)
                                    print(r_squared)
                                    pearson_correlation, p_value_pearsonr = pearsonr(y_gdsc_pred, y_gdsc)
                                    corr_spearmanr, p_value_spearmanr = spearmanr(y_gdsc_pred, y_gdsc)
                                    rmse_val = np.sqrt(mean_squared_error(y_gdsc_pred, y_gdsc))
                            
                
                                    features_name = list(features_name)
                                    result_table_drug = pd.DataFrame({
                                    'Tissue': [histology],
                                    'Drug': [drug],
                                    'R_Square': [r_squared],
                                    'Pearson_Correlation_Train': [pearson_correlation],
                                    'features_name': [features_name],
                                    'features_name_number': [features_name_number],
                                    'sample_number_gdsc_drug':[sample_number_gdsc_drug],    
                                    'Pearson_Correlation_P_Value_Train': [p_value_pearsonr],
                                    'Spearman_Correlation_Train': [corr_spearmanr],
                                    'Spearman_Correlation_P_Value_Train': [p_value_spearmanr],
                                    'RMSE_Train': [rmse_val]
                                     })
                                    print(result_table_drug)
                        
            

                                    Drug_dict[drug] = result_table_drug
                
                            # Save results to files
                            os.mkdir(os.path.join(output_dir_gdsc, histology, drug, "Random Forest - Regression DENEME hyper "))
                            result_table_drug.to_csv(os.path.join(output_dir_gdsc, histology, drug, "Random Forest - Regression DENEME hyper", drug + "- Result_RF_Reg.tsv"), 
                                     sep='\t', index=False, quoting=False)
                            IC50_predict[drug] = pd.DataFrame({'Actual_IC50_Data': y_gdsc, 'Predicted_IC50_Data': y_gdsc_pred})
                            IC50_predict[drug].to_csv(os.path.join(output_dir_gdsc, histology, drug, "Random Forest - Regression DENEME hyper", drug + "- IC50_Result.tsv"), 
                                            sep='\t', index=False, quoting=False)
                       
                    
                    
                    
    if Drug_dict:
        Histology_dict[histology] = pd.concat(Drug_dict.values())  # Her bir histoloji için ilaç sonuçlarını birleştirin

 #Tüm histolojilerin sonuçlarını birleştirin ve dosyalara kaydedin
if Histology_dict:
    All_RF_Result = pd.concat(Histology_dict.values())



                    
                    
    

      BCLAF3
0   5.397742
1   6.057872
2   6.270824
3   5.163112
4   5.483860
..       ...
68  7.158782
69  6.867650
70  6.164544
71  5.677891
72  6.416633

[73 rows x 1 columns]
       BCLAF3
0    5.277532
1    5.277532
2    5.719696
3    5.719696
4    5.667912
..        ...
133  4.506816
134  4.487679
135  4.487679
136  6.119404
137  6.119404

[138 rows x 1 columns]
Fitting 5 folds for each of 324 candidates, totalling 1620 fits
aero_dig_tract

Drug : Random Forest is running for AZD6244
-2.2600678713739217
           Tissue     Drug  R_Square  Pearson_Correlation_Train features_name  \
0  aero_dig_tract  AZD6244 -2.260068                  -0.040452      [BCLAF3]   

   features_name_number  sample_number_gdsc_drug  \
0                     1                      138   

   Pearson_Correlation_P_Value_Train  Spearman_Correlation_Train  \
0                           0.637591                    0.040148   

   Spearman_Correlation_P_Value_Train  RMSE_Train  
0                           

      BCLAF3
0   5.397742
1   6.057872
2   6.270824
3   5.163112
4   5.483860
..       ...
68  7.158782
69  6.867650
70  6.164544
71  5.677891
72  6.416633

[73 rows x 1 columns]
      BCLAF3
0   4.951954
1   4.797781
2   4.797781
3   4.384341
4   4.384341
..       ...
82  4.723029
83  4.723029
84  6.234461
85  5.113683
86  5.113683

[87 rows x 1 columns]
Fitting 5 folds for each of 324 candidates, totalling 1620 fits
breast

Drug : Random Forest is running for AZD6244
-7.010974315589062
   Tissue     Drug  R_Square  Pearson_Correlation_Train features_name  \
0  breast  AZD6244 -7.010974                   0.039851      [BCLAF3]   

   features_name_number  sample_number_gdsc_drug  \
0                     1                       87   

   Pearson_Correlation_P_Value_Train  Spearman_Correlation_Train  \
0                           0.714009                    0.062675   

   Spearman_Correlation_P_Value_Train  RMSE_Train  
0                            0.564139    3.589914  
        SPNS2 

breast

Drug : Random Forest is running for PD0325901
-0.2877916649376906
   Tissue       Drug  R_Square  Pearson_Correlation_Train  \
0  breast  PD0325901 -0.287792                   0.006768   

                                       features_name  features_name_number  \
0  [SPNS2, NLGN1, PPA1, MIPOL1, LHX9, DZIP1, S1PR...                    14   

   sample_number_gdsc_drug  Pearson_Correlation_P_Value_Train  \
0                       44                            0.96522   

   Spearman_Correlation_Train  Spearman_Correlation_P_Value_Train  RMSE_Train  
0                    0.129105                            0.403585    2.120737  
         CADM1      ALCAM    CEMIP2    RAB27B       NQO1      TUT7   ZFYVE19
0     5.688096   9.960699  6.450936  5.968660   8.980115  7.074857  8.910277
1     5.244614   9.752571  8.562465  5.944833  10.659003  6.846214  8.528091
2     5.194472   8.995110  6.046223  4.513331   9.043453  6.828989  7.349333
3     8.613827  10.072471  9.201432  5.155452  

       RIPK3    ZBTB43     HCFC2
0   5.192765  7.056096  5.810859
1   4.913655  6.844151  5.984405
2   4.816033  7.684860  7.430015
3   4.969280  6.959080  5.783327
4   4.900242  6.705380  6.515928
..       ...       ...       ...
57  5.462321  6.891344  6.525342
58  4.978896  6.634344  6.698177
59  5.945446  7.592135  6.165299
60  4.788979  7.991597  7.145542
61  4.652934  6.887849  5.419427

[62 rows x 3 columns]
       RIPK3    ZBTB43     HCFC2
0   4.032185  6.400123  5.678476
1   4.643459  6.379787  5.842078
2   3.812187  6.847477  5.253698
3   3.925333  6.608641  5.193034
4   3.919797  6.410445  4.846516
5   3.841837  6.734759  5.414720
6   4.329650  6.575926  5.248155
7   5.799116  6.646408  5.657591
8   3.918000  6.391313  5.399773
9   4.026597  6.912543  4.812849
10  4.301138  6.820753  5.629396
11  3.662518  6.873378  5.511187
12  3.761240  6.334593  4.772200
13  3.801509  6.190649  4.749039
14  4.343018  6.636040  5.517495
15  3.628292  6.715111  5.421046
16  3.844108  6.8289

          NQO1   ZFYVE19
0     8.980115  8.910277
1    10.659003  8.528091
2     9.043453  7.349333
3    12.220304  7.224360
4    10.885993  7.463124
..         ...       ...
274  12.159712  9.392620
275  10.624908  7.365871
276  12.401074  7.293325
277  11.999385  9.531237
278  12.493298  8.582176

[279 rows x 2 columns]
         NQO1   ZFYVE19
0    5.427292  6.942616
1    9.367613  7.104171
2    9.834896  7.067992
3    6.901214  6.036957
4   12.425111  7.577318
5    7.079810  7.554936
6   12.537060  6.752409
7    9.297590  7.335983
8    9.762866  6.996202
9   12.401834  8.607574
10   4.697946  6.899431
11  12.828712  7.117015
12   9.507623  7.118767
13  12.939554  9.374066
14  13.065512  6.968704
15  10.194161  6.521071
16  10.139291  6.487860
17   6.553109  6.697452
18   6.966457  6.236853
19   7.896751  7.271438
20   6.305439  6.543334
21   5.678955  6.537740
22  10.696265  7.914537
23  10.386582  7.171432
24   9.438006  6.705170
25   6.742566  6.917844
26   7.780094  7.086455
27  

kidney

Drug : Random Forest is running for PD0325901
-0.07364868585805984
   Tissue       Drug  R_Square  Pearson_Correlation_Train  \
0  kidney  PD0325901 -0.073649                   0.205445   

                                       features_name  features_name_number  \
0  [MDFIC, MIPOL1, VLDLR, LIFR, DZIP1, ANKH, CSRP...                    20   

   sample_number_gdsc_drug  Pearson_Correlation_P_Value_Train  \
0                       31                           0.267542   

   Spearman_Correlation_Train  Spearman_Correlation_P_Value_Train  RMSE_Train  
0                    0.171371                            0.356639    1.383662  
          NQO1   ZFYVE19     ZNF593
0     8.980115  8.910277   9.573155
1    10.659003  8.528091  10.224815
2     9.043453  7.349333   9.893606
3    12.220304  7.224360   8.237236
4    10.885993  7.463124  10.590119
..         ...       ...        ...
276  12.159712  9.392620   8.428959
277  10.624908  7.365871   9.528512
278  12.401074  7.293325   8.1

large_intestine

Drug : Random Forest is running for PD0325901
-0.6401884893687722
            Tissue       Drug  R_Square  Pearson_Correlation_Train  \
0  large_intestine  PD0325901 -0.640188                  -0.022557   

                  features_name  features_name_number  \
0  [PPA1, S1PR3, MTIF3, NBEAL2]                     4   

   sample_number_gdsc_drug  Pearson_Correlation_P_Value_Train  \
0                       41                            0.88867   

   Spearman_Correlation_Train  Spearman_Correlation_P_Value_Train  RMSE_Train  
0                   -0.171097                            0.284804    1.941261  
          NQO1   ZFYVE19
0     8.980115  8.910277
1    10.659003  8.528091
2     9.043453  7.349333
3    12.220304  7.224360
4    10.885993  7.463124
..         ...       ...
274  12.159712  9.392620
275  10.624908  7.365871
276  12.401074  7.293325
277  11.999385  9.531237
278  12.493298  8.582176

[279 rows x 2 columns]
         NQO1   ZFYVE19
0   12.787863  6.86242

leukemia

Drug : Random Forest is running for Nilotinib
-0.10321666181384259
     Tissue       Drug  R_Square  Pearson_Correlation_Train  \
0  leukemia  Nilotinib -0.103217                  -0.125551   

                                       features_name  features_name_number  \
0  [ZNF471, TGS1, ZNF573, ZNF701, BAIAP2, ZNF557,...                    12   

   sample_number_gdsc_drug  Pearson_Correlation_P_Value_Train  \
0                       55                           0.361061   

   Spearman_Correlation_Train  Spearman_Correlation_P_Value_Train  RMSE_Train  
0                    0.040372                            0.769789    2.924821  
      TSPAN4
0   6.727527
1   6.566393
2   7.583448
3   5.034512
4   7.514423
5   9.268096
6   8.957572
7   6.202783
8   5.905148
9   6.202279
10  6.367099
11  5.161425
12  7.320523
13  7.897601
14  7.787433
15  5.961542
16  7.795719
17  6.149131
18  5.236020
19  5.669142
20  9.125879
21  7.050936
22  7.999691
23  6.989780
24  5.863263
25  4.5882

          TGM2     ABCB1      UTRN
0     4.598405  3.670545  4.477096
1    10.757019  3.561069  6.684492
2     6.241013  3.742792  6.942095
3     7.845467  6.428791  8.021787
4     8.996697  8.256236  7.111359
..         ...       ...       ...
272   5.004476  3.791937  6.466693
273   9.237378  4.111179  6.859139
274   6.577700  7.476740  8.678898
275   4.262176  4.044408  5.797850
276   4.575339  3.680461  8.229216

[277 rows x 3 columns]
        TGM2      ABCB1      UTRN
0   4.362837   3.718162  5.184358
1   8.220235   3.422260  6.932686
2   3.948287   3.369688  6.526654
3   3.811004   3.318254  6.018378
4   3.621333   3.246032  5.910157
5   3.868325   3.552156  4.621969
6   5.817258   3.359585  7.205379
7   7.276522   8.357368  7.252342
8   4.412587   3.536583  6.681995
9   3.981586   9.294729  4.117681
10  4.176105   4.105117  4.382617
11  7.195037   4.269135  6.969741
12  4.133035   3.927455  3.965435
13  3.918804   3.284984  6.513204
14  4.759318   4.826607  5.602779
15  5.226619

lung

Drug : Random Forest is running for Crizotinib
-7.148849350497876
  Tissue        Drug  R_Square  Pearson_Correlation_Train features_name  \
0   lung  Crizotinib -7.148849                   0.131383         [MET]   

   features_name_number  sample_number_gdsc_drug  \
0                     1                       56   

   Pearson_Correlation_P_Value_Train  Spearman_Correlation_Train  \
0                           0.334444                    0.087347   

   Spearman_Correlation_P_Value_Train  RMSE_Train  
0                            0.522088    1.974762  
        MSI2
0   6.827147
1   6.609522
2   5.430853
3   6.168119
4   6.533840
..       ...
61  6.753409
62  6.683060
63  5.856400
64  5.488933
65  9.477389

[66 rows x 1 columns]
        MSI2
0   5.973972
1   4.886920
2   6.262326
3   4.549680
4   4.801698
5   7.141543
6   5.666120
7   5.445115
8   4.636489
9   5.628083
10  5.208384
11  8.256789
12  4.804498
13  5.798194
14  4.625274
15  4.614759
16  5.083218
17  5.672089
18  6

        ERBB2       GRB7       P3H1     ZBED4       FES   RAPGEF5      VAMP3  \
0    4.776857   7.051471   5.900716  7.537342  3.998420  7.051592  10.118159   
1    5.594016   7.696131   7.783012  7.610711  4.295826  7.366431  11.428072   
2    4.640193   6.344099   7.993085  7.083499  4.100984  8.061610  11.141334   
3    4.728198   5.520133  10.482048  6.975462  3.899656  5.091089  12.326376   
4    5.218538   5.685018   8.775206  8.053273  4.650884  4.594206  10.989516   
..        ...        ...        ...       ...       ...       ...        ...   
95   4.923872   6.371353   6.577290  6.999826  4.741956  7.835283  11.310866   
96   6.240122   6.671821   6.648261  7.266958  4.235248  8.525056  11.221884   
97   4.554882   5.797784  10.037570  6.897899  4.399291  7.484676  12.015713   
98   4.510614   7.228966   7.924494  6.183771  3.902214  5.818736  11.430349   
99  10.944001  11.343100   5.533547  7.931431  3.823061  9.589207   9.560335   

       MED24      MREG  
0   8.241354  

          NQO1      ALCAM   ZFYVE19
0     8.980115   9.960699  8.910277
1    10.659003   9.752571  8.528091
2     9.043453   8.995110  7.349333
3    12.220304  10.072471  7.224360
4    10.885993   9.730142  7.463124
..         ...        ...       ...
282  12.159712   9.139984  9.392620
283  10.624908   9.895339  7.365871
284  12.401074  10.680587  7.293325
285  11.999385   9.740225  9.531237
286  12.493298   9.477634  8.582176

[287 rows x 3 columns]
          NQO1      ALCAM   ZFYVE19
0    10.447733   6.566468  6.768755
1     9.704761   5.114037  6.784294
2     6.547846   7.243424  7.097658
3     8.495833   6.667502  6.936943
4    10.049348   4.185412  7.285896
..         ...        ...       ...
132   9.397151   6.739085  7.073680
133   9.369461   8.225141  7.095448
134  12.554036   7.925892  7.508867
135  12.315702   6.882795  7.928434
136   8.297023  10.176097  7.818304

[137 rows x 3 columns]
Fitting 5 folds for each of 324 candidates, totalling 1620 fits
lung

Drug : Random Fore

In [3]:
Drug_dict[Nilotinib]

NameError: name 'Drug_dict' is not defined

In [2]:
result_table_drug

NameError: name 'result_table_drug' is not defined

In [None]:
All_RF_Result = All_RF_Result.sort_values(by='R_Square', ascending=False)
All_RF_Result.to_excel('C:\\Users\\MONSTER\\Desktop\\TEZ SON\\CCLE\\ML RESULT\\RF Regression_CCLE_GDSC DENEME hyper.xlsx', index=False)

In [None]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [9]:
Histology_dict = dict()
Drug_dict = dict()
IC50_predict = dict()

for histology in Histologies:
    os.chdir(os.path.join(output_dir_ccle, histology))
    if len(os.listdir()) > 1:
        Drugs_ccle = [item1 for item1 in os.listdir() if os.path.isdir(item1)]
        
        os.chdir(os.path.join(output_dir_gdsc, histology))
        if len(os.listdir()) > 1:
            Drugs_gdsc = [item1 for item1 in os.listdir() if os.path.isdir(item1)]
            
            common_drug_ccle_gdsc = set(Drugs_ccle).intersection(Drugs_gdsc)
            if len(common_drug_ccle_gdsc) > 0:
                
                for drug in common_drug_ccle_gdsc:
                    
                    expression_data = pd.read_csv(os.path.join(output_dir_ccle,histology,drug,drug+" "+histology+" "+"- Expression Data(IC50).tsv"), sep = ",")
                    gdsc_train_data = pd.read_csv(os.path.join(output_dir_gdsc,histology,drug,drug+" "+histology+" "+"- GDSC Histology Project.tsv"), sep = "\t")
                    
                    if len(expression_data.columns) > 3 and len(expression_data['Cell Line']) > 20 and len(gdsc_train_data['Cell Line']) > 20:
                        
                        x_ccle = expression_data.drop(['Unnamed: 0', 'Cell Line', 'LN_IC50'], axis=1)
                        y_ccle = expression_data['LN_IC50']
                        
                        x_gdsc_1 = gdsc_train_data.drop(['Cell Line','GDSC\nTissue descriptor 1','LN_IC50','Drug'], axis=1)
                        y_gdsc = gdsc_train_data['LN_IC50']
                    
                        common_features_ccle_gdsc = set(x_ccle.columns).intersection(x_gdsc_1.columns)
                        if len(common_features_ccle_gdsc) > 0:
                            
                            features_name = common_features_ccle_gdsc
                            sample_number_gdsc_drug = len(gdsc_train_data['Cell Line'])
                        
                            x_ccle = x_ccle[list(features_name)]
                            #y_ccle = boruta_drug_data_ccle['log IC50 (uM)']
                                
                            x_gdsc = x_gdsc_1[list(features_name)]
                            #y_gdsc = boruta_drug_data_gdsc[drug]
                                
                            features_name_number= len((x_gdsc).columns)
                                
                            x_ccle = np.array(x_ccle)
                            y_ccle = np.array(y_ccle)
                            x_gdsc = np.array(x_gdsc)
                            y_gdsc = np.array(y_gdsc)
                        
                        
                            # MODEL OLUŞTURUYORUZ
                            def perform_loocv(model, x_ccle, y_ccle):
                                loo = LeaveOneOut()
                                loo_scores = cross_val_score(model, x_ccle, y_ccle, cv=loo)
                                average_loo_score = loo_scores.mean()
                                print(f"LOOCV Average Score: {average_loo_score}")
                                return average_loo_score

                            def perform_kfold_cv(model, x_ccle, y_ccle, n_splits=2):
                                kfold = KFold(n_splits=n_splits, random_state=1, shuffle=True)
                                cv_results = cross_validate(model, x_ccle, y_ccle, cv=kfold, return_train_score=True)
                                average_test_score = cv_results['test_score'].mean()
                                average_train_score = cv_results['train_score'].mean()
                                best_model_index = np.argmax(cv_results['test_score'])
                                print(f"KFold CV Average Test Score: {average_test_score}")
                                print(f"KFold CV Average Train Score: {average_train_score}")
                                return cv_results, average_test_score, average_train_score, best_model_index

                            def train_and_predict(model, x_ccle, y_ccle, x_gdsc):
                                model.fit(x_ccle, y_ccle)
                                predictions = model.predict(x_gdsc)
                                return model, predictions

                            # Ana kodun çalıştırılması
                            try:
                            # LOOCV (Leave-One-Out Cross-Validation)
                                rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
                                loocv_score = perform_loocv(rf_model, x_ccle, y_ccle)
    
                            # K-Fold Cross-Validation
                                cv_results, avg_test_score, avg_train_score, best_model_index = perform_kfold_cv(rf_model, x_ccle, y_ccle, n_splits=2)
    
                            # En iyi modelin yeniden eğitilmesi
                                best_model, y_gdsc_pred = train_and_predict(rf_model, x_ccle, y_ccle, x_gdsc)
            
                            except Exception as e:
                                 
                                print(f"An error occurred: {e}")
                                
                                
                            # Tahmin yapılıyor.
                            y_gdsc_pred = best_model.predict(x_gdsc)                        

                            print(histology)
                            print(f"\nDrug : Random Forest is running for {drug}", flush=True)
            
                
                            r_squared = r2_score(y_gdsc, y_gdsc_pred)
                            pearson_correlation, p_value_pearsonr = pearsonr(y_gdsc_pred, y_gdsc)
                            corr_spearmanr, p_value_spearmanr = spearmanr(y_gdsc_pred, y_gdsc)
                            rmse_val = np.sqrt(mean_squared_error(y_gdsc_pred, y_gdsc))
                
                            features_name = list(features_name)
                            result_table_drug = pd.DataFrame({
                            'Tissue': [histology],
                            'Drug': [drug],
                            'R_Square': [r_squared],
                            'Pearson_Correlation_Train': [pearson_correlation],
                            'features_name': [features_name],
                            'features_name_number': [features_name_number],
                            'sample_number_gdsc_drug':[sample_number_gdsc_drug],    
                            'Pearson_Correlation_P_Value_Train': [p_value_pearsonr],
                            'Spearman_Correlation_Train': [corr_spearmanr],
                            'Spearman_Correlation_P_Value_Train': [p_value_spearmanr],
                            'RMSE_Train': [rmse_val]
                             })
            

                            Drug_dict[drug] = result_table_drug
                
                            # Save results to files
                            os.mkdir(os.path.join(output_dir_gdsc, histology, drug, "Random Forest - Regression2"))
                            result_table_drug.to_csv(os.path.join(output_dir_gdsc, histology, drug, "Random Forest - Regression2", drug + "- Result_RF_Reg.tsv"), 
                                     sep='\t', index=False, quoting=False)
                            IC50_predict[drug] = pd.DataFrame({'Actual_IC50_Data': y_gdsc, 'Predicted_IC50_Train_Data': y_gdsc_pred})
                            IC50_predict[drug].to_csv(os.path.join(output_dir_gdsc, histology, drug, "Random Forest - Regression2", drug + "- IC50_Result.tsv"), 
                                            sep='\t', index=False, quoting=False)
                       
                    
                    
                    
    if Drug_dict:
        Histology_dict[histology] = pd.concat(Drug_dict.values())  # Her bir histoloji için ilaç sonuçlarını birleştirin

# Tüm histolojilerin sonuçlarını birleştirin ve dosyalara kaydedin
if Histology_dict:
    All_RF_Result_2 = pd.concat(Histology_dict.values())



                    
                    
    
   

LOOCV Average Score: nan
KFold CV Average Test Score: 0.22183643292489397
KFold CV Average Train Score: 0.8647759587317352
aero_dig_tract

Drug : Random Forest is running for AZD6244
LOOCV Average Score: nan
KFold CV Average Test Score: 0.3361814083540588
KFold CV Average Train Score: 0.9098221024025451
aero_dig_tract

Drug : Random Forest is running for 17-AAG
LOOCV Average Score: nan
KFold CV Average Test Score: 0.37224449770352314
KFold CV Average Train Score: 0.9005149120161879
aero_dig_tract

Drug : Random Forest is running for PD0325901
LOOCV Average Score: nan
KFold CV Average Test Score: 0.22183643292489397
KFold CV Average Train Score: 0.8647759587317352
breast

Drug : Random Forest is running for AZD6244
LOOCV Average Score: nan
KFold CV Average Test Score: 0.36388788526305693
KFold CV Average Train Score: 0.9057498314549421
breast

Drug : Random Forest is running for 17-AAG
LOOCV Average Score: nan
KFold CV Average Test Score: 0.32653478617586873
KFold CV Average Train Score

LOOCV Average Score: nan
KFold CV Average Test Score: 0.5050960758606462
KFold CV Average Train Score: 0.9521100164666524
skin

Drug : Random Forest is running for PD0325901
LOOCV Average Score: nan
KFold CV Average Test Score: 0.6010266231395787
KFold CV Average Train Score: 0.8927714906797137
urogenital_system

Drug : Random Forest is running for Nilotinib
LOOCV Average Score: nan
KFold CV Average Test Score: 0.6183898981620746
KFold CV Average Train Score: 0.9510078420500799
urogenital_system

Drug : Random Forest is running for PLX-4720
LOOCV Average Score: nan
KFold CV Average Test Score: 0.24180912153084494
KFold CV Average Train Score: 0.897425383538271
urogenital_system

Drug : Random Forest is running for AZD6244
LOOCV Average Score: nan
KFold CV Average Test Score: 0.15355655110988747
KFold CV Average Train Score: 0.8993580304966937
urogenital_system

Drug : Random Forest is running for 17-AAG
LOOCV Average Score: nan
KFold CV Average Test Score: 0.3413715888158616
KFold CV A

In [10]:
All_RF_Result_2

Unnamed: 0,Tissue,Drug,R_Square,Pearson_Correlation_Train,features_name,features_name_number,sample_number_gdsc_drug,Pearson_Correlation_P_Value_Train,Spearman_Correlation_Train,Spearman_Correlation_P_Value_Train,RMSE_Train
0,aero_dig_tract,AZD6244,-2.659237,-0.148357,[BCLAF3],1,138,0.082463,-0.098217,0.251765,2.621241
0,aero_dig_tract,17-AAG,-0.575868,-0.176440,"[CADM1, MN1, ZFYVE19, NQO1, MOXD1, TUT7, RAP1G...",8,70,0.143984,-0.100831,0.406232,1.865517
0,aero_dig_tract,PD0325901,-0.314562,-0.143724,"[MTIF3, DPYS, S100A5, TRAF5, GPR176, NBEAL2, G...",14,70,0.235225,-0.179179,0.137764,1.799991
0,breast,AZD6244,-7.318106,0.062813,[BCLAF3],1,87,0.563278,0.178855,0.097416,3.658084
0,breast,17-AAG,-0.823174,-0.272263,"[ALCAM, CADM1, RAB27B, ZFYVE19, NQO1, TUT7, CE...",7,45,0.070396,-0.241765,0.109596,2.252534
...,...,...,...,...,...,...,...,...,...,...,...
0,lung,Lapatinib,-3.443860,0.006847,"[VAMP3, FES, RAPGEF5, P3H1, MED24, MREG, ERBB2...",9,57,0.959682,0.043816,0.746216,2.526080
0,lung,Erlotinib,-3.651930,0.130137,[BCO2],1,57,0.334616,-0.015397,0.909496,2.736471
0,lymphoma,Crizotinib,-0.694361,0.221129,[CPA1],1,28,0.258129,0.039378,0.842304,1.263651
0,lung,AZD-0530,-3.314896,0.267668,[VCPIP1],1,55,0.048189,0.133138,0.332533,2.289754


In [12]:
All_RF_Result_2 = All_RF_Result_2.sort_values(by='R_Square', ascending=False)


In [13]:
All_RF_Result_2

Unnamed: 0,Tissue,Drug,R_Square,Pearson_Correlation_Train,features_name,features_name_number,sample_number_gdsc_drug,Pearson_Correlation_P_Value_Train,Spearman_Correlation_Train,Spearman_Correlation_P_Value_Train,RMSE_Train
0,skin,AZD6244,-0.074660,0.080014,[NKX2-3],1,87,0.461297,0.127792,0.238173,2.172315
0,skin,AZD6244,-0.074660,0.080014,[NKX2-3],1,87,0.461297,0.127792,0.238173,2.172315
0,kidney,PD0325901,-0.082097,0.155112,"[POLR1D, VLDLR, ZYG11A, PPA1, MIPOL1, EML1, DZ...",20,31,0.404734,0.092339,0.621270,1.389095
0,nervous_system,17-AAG,-0.099847,-0.183380,[HJV],1,53,0.188720,-0.297028,0.030785,1.450146
0,leukemia,Nilotinib,-0.104508,-0.066327,"[BAIAP2, CSPP1, TGS1, ZNF451, ZNF557, ZNF471, ...",12,55,0.630429,0.044449,0.747277,2.926532
...,...,...,...,...,...,...,...,...,...,...,...
0,lung,Nilotinib,-11.010172,-0.051720,"[ZBTB43, MTMR11, HCFC2, RIPK3]",4,139,0.545398,-0.107439,0.208070,2.476395
0,pancreas,Nilotinib,-15.019855,-0.543933,"[ZBTB43, RIPK3, LRRC66, TMEM54, HCFC2]",5,27,0.003360,-0.451770,0.018000,2.397041
0,pancreas,Nilotinib,-15.019855,-0.543933,"[ZBTB43, RIPK3, LRRC66, TMEM54, HCFC2]",5,27,0.003360,-0.451770,0.018000,2.397041
0,pancreas,Nilotinib,-15.019855,-0.543933,"[ZBTB43, RIPK3, LRRC66, TMEM54, HCFC2]",5,27,0.003360,-0.451770,0.018000,2.397041


In [None]:
def perform_loocv(model, X, y):
    loo = LeaveOneOut()
    loo_scores = cross_val_score(model, X, y, cv=loo)
    average_loo_score = loo_scores.mean()
    print(f"LOOCV Average Score: {average_loo_score}")
    return average_loo_score

def perform_kfold_cv(model, X, y, n_splits=2):
    kfold = KFold(n_splits=n_splits, random_state=1, shuffle=True)
    cv_results = cross_validate(model, X, y, cv=kfold, return_train_score=True)
    average_test_score = cv_results['test_score'].mean()
    average_train_score = cv_results['train_score'].mean()
    best_model_index = np.argmax(cv_results['test_score'])
    print(f"KFold CV Average Test Score: {average_test_score}")
    print(f"KFold CV Average Train Score: {average_train_score}")
    return cv_results, average_test_score, average_train_score, best_model_index

def train_and_predict(model, X_train, y_train, X_test):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return model, predictions

# Ana kodun çalıştırılması
try:
    # LOOCV (Leave-One-Out Cross-Validation)
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    loocv_score = perform_loocv(rf_model, x_ccle, y_ccle)
    
    # K-Fold Cross-Validation
    cv_results, avg_test_score, avg_train_score, best_model_index = perform_kfold_cv(rf_model, x_ccle, y_ccle, n_splits=2)
    
    # En iyi modelin yeniden eğitilmesi
    best_model, y_gdsc_pred = train_and_predict(rf_model, x_ccle, y_ccle, x_gdsc)