# Support Vector Machine for Classification Problem
### *Exploring the association between neoantigen-related variables and immune scores*
This notebook is the continuation of the `support_vector_reg.ipynb` notebook, detailing the testing of SVM application on our neoantigen dataset, converted into a classification problem.

#### **Package and Raw Data Loading**
First, import the necessary packages and load the raw data table into a `pandas` DataFrame.



In [1]:
# first, import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from itables import show
from IPython.display import HTML, display
from warnings import simplefilter, filterwarnings
# Silence pandas' PerformanceWarning so it does not clutter the cell outputs.
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
# NOTE(review): this ignores every warning of category UserWarning, regardless
# of which library raises it; narrow the filter (e.g. with `module=` or
# `message=`) if a specific warning is the actual target.
filterwarnings("ignore", category=UserWarning)
# None removes the column limit, so wide DataFrames are displayed without
# their columns being truncated.
pd.set_option('display.max_columns', None)
%config InlineBackend.figure_format = 'retina'

# load pretty jupyter's magics
%load_ext pretty_jupyter

Load up the cleaned-up dataset wrangled from MH's latest work.

In [2]:
# Load the latest cleaned dataset (tab-separated):
# 202409_new_excludedIHC_batch-duplicate-removed.tsv
tsv_path = "../input-data/SA/202409_new_excludedIHC_batch-duplicate-removed.tsv"
df = pd.read_csv(tsv_path, sep="\t")
print(f"Before trimming columns: {df.shape}")

# Remove 22 columns in total:
#   * 19 of the `C_*` (Cibersort) score columns -- only `C_Bcellsnaive`,
#     `C_TcellsCD4memoryresting` and `C_MacrophagesM2` are kept;
#   * 3 further score columns listed separately below.
cibersort_dropped = [
    'C_Bcellsmemory', 'C_Plasmacells', 'C_TcellsCD8', 'C_TcellsCD4naive',
    'C_TcellsCD4memoryactivated', 'C_Tcellsfollicularhelper',
    'C_Tcellsregulatory(Tregs)', 'C_Tcellsgammadelta', 'C_NKcellsresting',
    'C_NKcellsactivated', 'C_Monocytes', 'C_MacrophagesM0',
    'C_MacrophagesM1', 'C_Dendriticcellsresting',
    'C_Dendriticcellsactivated', 'C_Mastcellsresting',
    'C_Mastcellsactivated', 'C_Eosinophils', 'C_Neutrophils',
]
other_dropped = ['Bindea_full', 'Expanded_IFNg', 'S_PAM100HRD']
df = df.drop(columns=other_dropped + cibersort_dropped)

print(f"After trimming columns: {df.shape}")
df.head()

Before trimming columns: (953, 156)
After trimming columns: (953, 134)


Unnamed: 0,ID,Batch,PAM50,Subtype,HR_status,HER_status,Age,AgeGroup,Stage,TumorGrade,TumourSize,FusionNeo_Count,FusionNeo_bestScore,FusionTransscript_Count,Fusion_T2NeoRate,SNVindelNeo_Count,SNVindelNeo_IC50,SNVindelNeo_IC50Percentile,TotalNeo_Count,ESTIMATE,IMPRES,C_Bcellsnaive,C_TcellsCD4memoryresting,C_MacrophagesM2,S_Attractors_LYM,S_Attractors_IFIT3,S_Attractors_G_GIMAP4,S_Attractors_G_HLA.DPA1,S_Attractors_G_SLAMF6,S_Attractors_G_LILRB4,S_Attractors_G_SIGLEC9,S_Attractors_G_CYTH4,S_Attractors_G_CD3E,S_Lymph_Vessels,S_ICR_SCORE,S_ICR_INHIB_SCORE,S_ICR_ACT_SCORE,S_Angiogenesis,S_APM1,S_APM2,S_ICS5_score,S_LIexpression_score,S_Chemokine12_score,S_NHI_5gene_score,S_CD68,S_CD8A,S_PD1_data,S_PDL1_data,S_PD1_PDL1_score,S_CTLA4_data,S_Bcell_mg_IGJ,S_Bcell_receptors_score,S_STAT1_score,S_CSF1_response,S_TcClassII_score,S_IL12_score_21050467,S_IL4_score_21050467,S_IL2_score_21050467,S_IL13_score_21050467,S_IFNG_score_21050467,S_TGFB_score_21050467,S_TREM1_data,S_DAP12_data,S_Tcell_receptors_score,S_IL8_21978456,S_IFN_21978456,S_MHC1_21978456,S_MHC2_21978456,S_Bcell_21978456,S_Tcell_21978456,S_CD103pos_mean_25446897,S_CD103neg_mean_25446897,S_IgG_19272155,S_Interferon_19272155,S_LCK_19272155,S_MHC.I_19272155,S_MHC.II_19272155,S_STAT1_19272155,S_Troester_WoundSig_19887484,S_MDACC.FNA.1_20805453,S_IGG_Cluster_21214954,S_Minterferon_Cluster_21214954,S_Immune_cell_Cluster_21214954,S_MCD3_CD8_21214954,S_Interferon_Cluster_21214954,S_B_cell_PCA_16704732,S_CD8_PCA_16704732,S_GRANS_PCA_16704732,S_LYMPHS_PCA_16704732,S_T_cell_PCA_16704732,S_TGFB_PCA_17349583,S_Rotterdam_ERneg_PCA_15721472,S_HER2_Immune_PCA_18006808,S_IR7_score,S_Buck14_score,S_TAMsurr_score,S_Immune_NSCLC_score,S_Module3_IFN_score,S_Module4_TcellBcell_score,S_Module5_TcellBcell_score,S_Module11_Prolif_score,S_CD8_CD68_ratio,S_TAMsurr_TcClassII_ratio,S_CHANG_CORE_SERUM_RESPONSE_UP,S_CSR_Activated_15701700,S_B_cells,S_T_cells,S_T_helper,S_Tcm,S_Tem,S_Th1,S_Th2,S_TFH,S_CD8_Tcells,S_Th17,S_Treg,S_Tgd,S_Cytotoxic_cell
s,S_NK_cells,S_NK_cd56dim,S_NK_cd56bright,S_DC,S_iDC,S_aDC,S_pDC,S_Eosinophils,S_Macrophages,S_Mast,S_Neutrophils,S_Bindea_full,S_Expanded_IFNg,S_KEGG_MMR,S_KEGG_TGF_Beta,S_KEGG_Cytosolic_DNA_Sensing
0,SD0012,Batch_1,LumB,HR+/HER2-,HR+,HER2-,50.0,41-50,2.0,2.0,2.3,20.0,5.79,42.0,0.476190476,357.0,1.7,0.0025,377.0,2895.605487,9.0,0.120394,0.117468,0.44845,0.362,0.4216,0.3034,0.4425,0.2749,0.2983,0.2756,0.3121,0.2765,0.3203,0.2463,0.2341,0.2495,0.3571,0.4591,0.3985,0.213,0.2223,0.2766,0.3573,0.1561,0.2942,0.1536,0.253,0.2042,0.2061,0.1942,0.2764,0.3327,0.3795,0.3435,0.2846,0.3797,0.3761,0.3197,0.4008,0.4403,0.2907,0.4011,0.2526,0.2237,0.4256,0.3971,0.4477,0.1852,0.2516,0.2262,0.3854,0.1591,0.4462,0.2912,0.4211,0.4502,0.361,0.2711,0.3414,0.2183,0.3728,0.3512,0.3529,0.4162,0.3612,0.2215,0.3614,0.4294,0.3451,0.4648,0.2937,0.3338,0.2999,0.2094,0.2743,0.3203,0.4136,0.2546,0.273,0.3697,0.2268,0.3335,0.3959,0.4066,0.2199,0.2395,0.4132,0.3633,0.321,0.2599,0.3561,0.3112,0.3645,0.281,0.2348,0.1945,0.2188,0.3281,0.0602,0.2038,0.2445,0.322,0.2212,0.2866,0.303,0.3657,0.2499,0.2531,0.3031,0.3097,0.4053,0.3537,0.253
1,SD0014,Batch_1,LumA,HR+/HER2-,HR+,HER2-,58.0,51-60,2.0,2.0,2.5,10.0,5.28,17.0,0.588235294,85.0,4.1,0.0039,95.0,4257.831526,11.0,0.165023,0.207531,0.124223,0.4126,0.3815,0.3619,0.476,0.3343,0.3001,0.2769,0.3761,0.3525,0.3452,0.3051,0.2701,0.3143,0.3803,0.4819,0.4125,0.3002,0.3106,0.3646,0.3753,0.103,0.3644,0.2726,0.2377,0.2553,0.2513,0.282,0.3079,0.3515,0.3869,0.3868,0.3133,0.4051,0.4031,0.3308,0.412,0.4496,0.2012,0.436,0.329,0.2562,0.3882,0.431,0.4682,0.299,0.3007,0.2859,0.3998,0.2225,0.4068,0.3481,0.4494,0.471,0.4011,0.284,0.3779,0.2727,0.3831,0.3864,0.3511,0.3983,0.3676,0.2761,0.3637,0.4268,0.3555,0.458,0.3266,0.3549,0.3763,0.2157,0.3028,0.3278,0.3896,0.3111,0.3159,0.3126,0.2395,0.3747,0.3838,0.403,0.2731,0.2909,0.4092,0.3404,0.353,0.2808,0.3102,0.3229,0.3713,0.2645,0.3182,0.1977,0.2985,0.3502,0.1191,0.3316,0.2976,0.3445,0.2403,0.345,0.3203,0.3484,0.2768,0.2783,0.32,0.3668,0.3803,0.347,0.2606
2,SD0015,Batch_1,LumA,HR+/HER2-,HR+,HER2-,46.0,41-50,1.0,2.0,1.8,4.0,11.48,16.0,0.25,150.0,2.4,0.0042,154.0,3123.055856,8.0,0.162653,0.235337,0.279972,0.3556,0.3782,0.3363,0.4718,0.3015,0.2659,0.2274,0.318,0.3025,0.3124,0.2622,0.2304,0.2706,0.3761,0.4613,0.3814,0.3055,0.2764,0.3211,0.3784,-0.1955,0.3122,0.1932,0.2485,0.2211,0.2004,0.2478,0.304,0.3316,0.3646,0.3483,0.3012,0.4038,0.3875,0.3158,0.3986,0.4391,0.1509,0.3958,0.2886,0.2535,0.3764,0.4054,0.4601,0.2509,0.283,0.2712,0.3801,0.175,0.3987,0.3063,0.4239,0.4616,0.3587,0.2955,0.3431,0.2278,0.3599,0.3549,0.4011,0.3865,0.3661,0.2671,0.3591,0.4313,0.3489,0.4442,0.2985,0.3383,0.337,0.2609,0.2706,0.3312,0.3792,0.2851,0.2897,0.2984,0.0936,0.337,0.388,0.4038,0.2799,0.2873,0.4159,0.352,0.356,0.26,0.3116,0.3225,0.3762,0.237,0.2251,0.2021,0.272,0.3434,0.0626,0.3025,0.267,0.3212,0.2098,0.2967,0.3236,0.3266,0.2455,0.2522,0.3107,0.3372,0.3793,0.3522,0.2564
3,SD0016,Batch_1,,,,,41.0,41-50,0.0,,11.0,1.0,43.92,15.0,0.066666667,218.0,2.8,0.0027,219.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,SD0017,Batch_1,LumB,HR+/HER2-,HR+,HER2-,54.0,51-60,2.0,2.0,2.5,19.0,15.11,47.0,0.404255319,1369.0,1.7,0.003,1388.0,5275.497847,11.0,0.155523,0.148387,0.128777,0.4314,0.4482,0.408,0.498,0.3912,0.312,0.2934,0.3864,0.3957,0.3131,0.3589,0.3341,0.3654,0.3842,0.4957,0.4328,0.3533,0.3732,0.4177,0.435,0.1401,0.4006,0.3023,0.3429,0.3228,0.2922,0.3591,0.3671,0.3926,0.4113,0.4082,0.3325,0.4092,0.4129,0.3338,0.419,0.4248,0.1319,0.4878,0.3797,0.1889,0.4601,0.4445,0.4859,0.3224,0.3693,0.3003,0.4095,0.2938,0.4707,0.3968,0.4616,0.4893,0.4644,0.2735,0.3955,0.3234,0.4165,0.4337,0.419,0.4511,0.3758,0.3174,0.3679,0.4314,0.3648,0.4439,0.2886,0.3688,0.4143,0.2489,0.3562,0.3498,0.4563,0.3576,0.3706,0.3377,0.2758,0.4006,0.4,0.4062,0.3182,0.346,0.417,0.3275,0.3432,0.2914,0.3307,0.3525,0.3879,0.2212,0.333,0.2251,0.3151,0.3495,0.1939,0.2953,0.301,0.3467,0.2547,0.3313,0.323,0.3597,0.2726,0.2648,0.3309,0.4165,0.3997,0.3468,0.2682


#### **Data Preprocessing**

Decide all the clinical variables and neoantigen-related variables to keep in the X matrix (features).

1. The `Subtype` column already encodes the information held in the `HR_status` and `HER_status` columns, so these two columns can be dropped. ***UPDATE: due to its lesser importance during the default XGBoost modeling, the `PAM50` column was dropped as well.***

2.  `AgeGroup` is just a binned version of the `Age` column, so it is dropped as redundant.

3. Drop `FusionNeo_bestScore`, `FusionTransscript_Count`, `Fusion_T2NeoRate` columns as well as the `SNVindelNeo_IC50` and `SNVindelNeo_IC50Percentile` columns for now to reduce complexity.

4. Drop the `Batch` column (the code cell below also drops the `Stage` column).

> **UPDATE 1: Exclude `TotalNeo_Count`, and include `Fusion_T2NeoRate` and `SNVindelNeo_IC50` columns. Also, rename `Fusion_T2NeoRate` to `FN/FT_Ratio`.**

> **UPDATE 2: put back `FusionNeo_bestScore` into the X variable set and rename it into `FusionNeo_bestIC50`**

In [3]:
# Columns left out of the working table (redundant or deliberately excluded).
excluded_cols = [
    'Batch', 'Stage', 'PAM50', 'HR_status', 'HER_status', 'AgeGroup',
    'TotalNeo_Count', 'FusionTransscript_Count', 'SNVindelNeo_IC50Percentile',
]

# `Fusion_T2NeoRate` -> `FN/FT_Ratio`, `FusionNeo_bestScore` -> `FusionNeo_bestIC50`
renamed_cols = {
    'Fusion_T2NeoRate': 'FN/FT_Ratio',
    'FusionNeo_bestScore': 'FusionNeo_bestIC50',
}

# For now, every row containing a NaN is discarded; `ID` becomes the index.
dfd = (
    df.drop(columns=excluded_cols)
      .dropna()
      .set_index('ID')
      .rename(columns=renamed_cols)
)

print(dfd.shape)
dfd.head()

(674, 124)


Unnamed: 0_level_0,Subtype,Age,TumorGrade,TumourSize,FusionNeo_Count,FusionNeo_bestIC50,FN/FT_Ratio,SNVindelNeo_Count,SNVindelNeo_IC50,ESTIMATE,IMPRES,C_Bcellsnaive,C_TcellsCD4memoryresting,C_MacrophagesM2,S_Attractors_LYM,S_Attractors_IFIT3,S_Attractors_G_GIMAP4,S_Attractors_G_HLA.DPA1,S_Attractors_G_SLAMF6,S_Attractors_G_LILRB4,S_Attractors_G_SIGLEC9,S_Attractors_G_CYTH4,S_Attractors_G_CD3E,S_Lymph_Vessels,S_ICR_SCORE,S_ICR_INHIB_SCORE,S_ICR_ACT_SCORE,S_Angiogenesis,S_APM1,S_APM2,S_ICS5_score,S_LIexpression_score,S_Chemokine12_score,S_NHI_5gene_score,S_CD68,S_CD8A,S_PD1_data,S_PDL1_data,S_PD1_PDL1_score,S_CTLA4_data,S_Bcell_mg_IGJ,S_Bcell_receptors_score,S_STAT1_score,S_CSF1_response,S_TcClassII_score,S_IL12_score_21050467,S_IL4_score_21050467,S_IL2_score_21050467,S_IL13_score_21050467,S_IFNG_score_21050467,S_TGFB_score_21050467,S_TREM1_data,S_DAP12_data,S_Tcell_receptors_score,S_IL8_21978456,S_IFN_21978456,S_MHC1_21978456,S_MHC2_21978456,S_Bcell_21978456,S_Tcell_21978456,S_CD103pos_mean_25446897,S_CD103neg_mean_25446897,S_IgG_19272155,S_Interferon_19272155,S_LCK_19272155,S_MHC.I_19272155,S_MHC.II_19272155,S_STAT1_19272155,S_Troester_WoundSig_19887484,S_MDACC.FNA.1_20805453,S_IGG_Cluster_21214954,S_Minterferon_Cluster_21214954,S_Immune_cell_Cluster_21214954,S_MCD3_CD8_21214954,S_Interferon_Cluster_21214954,S_B_cell_PCA_16704732,S_CD8_PCA_16704732,S_GRANS_PCA_16704732,S_LYMPHS_PCA_16704732,S_T_cell_PCA_16704732,S_TGFB_PCA_17349583,S_Rotterdam_ERneg_PCA_15721472,S_HER2_Immune_PCA_18006808,S_IR7_score,S_Buck14_score,S_TAMsurr_score,S_Immune_NSCLC_score,S_Module3_IFN_score,S_Module4_TcellBcell_score,S_Module5_TcellBcell_score,S_Module11_Prolif_score,S_CD8_CD68_ratio,S_TAMsurr_TcClassII_ratio,S_CHANG_CORE_SERUM_RESPONSE_UP,S_CSR_Activated_15701700,S_B_cells,S_T_cells,S_T_helper,S_Tcm,S_Tem,S_Th1,S_Th2,S_TFH,S_CD8_Tcells,S_Th17,S_Treg,S_Tgd,S_Cytotoxic_cells,S_NK_cells,S_NK_cd56dim,S_NK_cd56bright,S_DC,S_iDC,S_aDC,S_pDC,S_Eosinophils,S_Macrophages,S_Mast,S_Neutrophils,S
_Bindea_full,S_Expanded_IFNg,S_KEGG_MMR,S_KEGG_TGF_Beta,S_KEGG_Cytosolic_DNA_Sensing
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1
SD0012,HR+/HER2-,50.0,2.0,2.3,20.0,5.79,0.476190476,357.0,1.7,2895.605487,9.0,0.120394,0.117468,0.44845,0.362,0.4216,0.3034,0.4425,0.2749,0.2983,0.2756,0.3121,0.2765,0.3203,0.2463,0.2341,0.2495,0.3571,0.4591,0.3985,0.213,0.2223,0.2766,0.3573,0.1561,0.2942,0.1536,0.253,0.2042,0.2061,0.1942,0.2764,0.3327,0.3795,0.3435,0.2846,0.3797,0.3761,0.3197,0.4008,0.4403,0.2907,0.4011,0.2526,0.2237,0.4256,0.3971,0.4477,0.1852,0.2516,0.2262,0.3854,0.1591,0.4462,0.2912,0.4211,0.4502,0.361,0.2711,0.3414,0.2183,0.3728,0.3512,0.3529,0.4162,0.3612,0.2215,0.3614,0.4294,0.3451,0.4648,0.2937,0.3338,0.2999,0.2094,0.2743,0.3203,0.4136,0.2546,0.273,0.3697,0.2268,0.3335,0.3959,0.4066,0.2199,0.2395,0.4132,0.3633,0.321,0.2599,0.3561,0.3112,0.3645,0.281,0.2348,0.1945,0.2188,0.3281,0.0602,0.2038,0.2445,0.322,0.2212,0.2866,0.303,0.3657,0.2499,0.2531,0.3031,0.3097,0.4053,0.3537,0.253
SD0014,HR+/HER2-,58.0,2.0,2.5,10.0,5.28,0.588235294,85.0,4.1,4257.831526,11.0,0.165023,0.207531,0.124223,0.4126,0.3815,0.3619,0.476,0.3343,0.3001,0.2769,0.3761,0.3525,0.3452,0.3051,0.2701,0.3143,0.3803,0.4819,0.4125,0.3002,0.3106,0.3646,0.3753,0.103,0.3644,0.2726,0.2377,0.2553,0.2513,0.282,0.3079,0.3515,0.3869,0.3868,0.3133,0.4051,0.4031,0.3308,0.412,0.4496,0.2012,0.436,0.329,0.2562,0.3882,0.431,0.4682,0.299,0.3007,0.2859,0.3998,0.2225,0.4068,0.3481,0.4494,0.471,0.4011,0.284,0.3779,0.2727,0.3831,0.3864,0.3511,0.3983,0.3676,0.2761,0.3637,0.4268,0.3555,0.458,0.3266,0.3549,0.3763,0.2157,0.3028,0.3278,0.3896,0.3111,0.3159,0.3126,0.2395,0.3747,0.3838,0.403,0.2731,0.2909,0.4092,0.3404,0.353,0.2808,0.3102,0.3229,0.3713,0.2645,0.3182,0.1977,0.2985,0.3502,0.1191,0.3316,0.2976,0.3445,0.2403,0.345,0.3203,0.3484,0.2768,0.2783,0.32,0.3668,0.3803,0.347,0.2606
SD0015,HR+/HER2-,46.0,2.0,1.8,4.0,11.48,0.25,150.0,2.4,3123.055856,8.0,0.162653,0.235337,0.279972,0.3556,0.3782,0.3363,0.4718,0.3015,0.2659,0.2274,0.318,0.3025,0.3124,0.2622,0.2304,0.2706,0.3761,0.4613,0.3814,0.3055,0.2764,0.3211,0.3784,-0.1955,0.3122,0.1932,0.2485,0.2211,0.2004,0.2478,0.304,0.3316,0.3646,0.3483,0.3012,0.4038,0.3875,0.3158,0.3986,0.4391,0.1509,0.3958,0.2886,0.2535,0.3764,0.4054,0.4601,0.2509,0.283,0.2712,0.3801,0.175,0.3987,0.3063,0.4239,0.4616,0.3587,0.2955,0.3431,0.2278,0.3599,0.3549,0.4011,0.3865,0.3661,0.2671,0.3591,0.4313,0.3489,0.4442,0.2985,0.3383,0.337,0.2609,0.2706,0.3312,0.3792,0.2851,0.2897,0.2984,0.0936,0.337,0.388,0.4038,0.2799,0.2873,0.4159,0.352,0.356,0.26,0.3116,0.3225,0.3762,0.237,0.2251,0.2021,0.272,0.3434,0.0626,0.3025,0.267,0.3212,0.2098,0.2967,0.3236,0.3266,0.2455,0.2522,0.3107,0.3372,0.3793,0.3522,0.2564
SD0017,HR+/HER2-,54.0,2.0,2.5,19.0,15.11,0.404255319,1369.0,1.7,5275.497847,11.0,0.155523,0.148387,0.128777,0.4314,0.4482,0.408,0.498,0.3912,0.312,0.2934,0.3864,0.3957,0.3131,0.3589,0.3341,0.3654,0.3842,0.4957,0.4328,0.3533,0.3732,0.4177,0.435,0.1401,0.4006,0.3023,0.3429,0.3228,0.2922,0.3591,0.3671,0.3926,0.4113,0.4082,0.3325,0.4092,0.4129,0.3338,0.419,0.4248,0.1319,0.4878,0.3797,0.1889,0.4601,0.4445,0.4859,0.3224,0.3693,0.3003,0.4095,0.2938,0.4707,0.3968,0.4616,0.4893,0.4644,0.2735,0.3955,0.3234,0.4165,0.4337,0.419,0.4511,0.3758,0.3174,0.3679,0.4314,0.3648,0.4439,0.2886,0.3688,0.4143,0.2489,0.3562,0.3498,0.4563,0.3576,0.3706,0.3377,0.2758,0.4006,0.4,0.4062,0.3182,0.346,0.417,0.3275,0.3432,0.2914,0.3307,0.3525,0.3879,0.2212,0.333,0.2251,0.3151,0.3495,0.1939,0.2953,0.301,0.3467,0.2547,0.3313,0.323,0.3597,0.2726,0.2648,0.3309,0.4165,0.3997,0.3468,0.2682
SD0018,HR+/HER2+,58.0,3.0,3.0,39.0,3.0,0.30952381,382.0,1.6,3548.34822,11.0,0.129397,0.133531,0.304963,0.3701,0.3859,0.3067,0.4858,0.3141,0.298,0.2006,0.3222,0.3017,0.3358,0.287,0.2544,0.2955,0.3292,0.4758,0.42,0.2892,0.2845,0.3767,0.4084,0.1187,0.2786,0.2363,0.2357,0.236,0.231,0.3125,0.3054,0.343,0.3742,0.3709,0.286,0.3689,0.388,0.3172,0.4095,0.4343,0.2824,0.4664,0.2799,0.2613,0.3975,0.4264,0.4805,0.321,0.267,0.2429,0.3952,0.2321,0.4166,0.3061,0.4446,0.4658,0.4181,0.2747,0.391,0.3019,0.3722,0.3716,0.3075,0.3931,0.3605,0.2308,0.3603,0.4258,0.3393,0.4654,0.292,0.3385,0.3668,0.2237,0.3404,0.3264,0.3909,0.2758,0.305,0.377,0.2009,0.3664,0.4026,0.4092,0.2699,0.253,0.3955,0.3185,0.3066,0.2713,0.3333,0.3041,0.3523,0.2401,0.2709,0.1413,0.2636,0.3061,0.0764,0.2639,0.2623,0.3348,0.2124,0.2731,0.302,0.354,0.2233,0.2182,0.2979,0.353,0.4033,0.3444,0.2496


**Sanity Check:** Make sure there are no duplicated index values (IDs) in the dataset.

In [4]:
# Unique index labels that occur more than once; an empty result means the
# index has no duplicates.
dup_ids = dfd.index[dfd.index.duplicated()].unique()
print(dup_ids)

# Keep the duplicated labels as a plain list and display it.
rows_dupe = list(dup_ids)
rows_dupe

Index([], dtype='object', name='ID')


[]

Now we need to convert the `object` columns `Subtype` and `FN/FT_Ratio` into appropriate types: `Subtype` becomes `category` and `FN/FT_Ratio` becomes `float64`. `Age`, `TumorGrade`, `IMPRES`, and all `*_Count` columns are cast to `int64` because they are discrete variables.

In [5]:
# Target dtype for each column: the categorical label, the discrete
# (integer-valued) variables, and the ratio stored as a float.
column_dtypes = {
    'Subtype': 'category',
    'Age': 'int64',
    'TumorGrade': 'int64',
    'IMPRES': 'int64',
    'FusionNeo_Count': 'int64',
    'SNVindelNeo_Count': 'int64',
    'FN/FT_Ratio': 'float64',
}

# Cast column by column, assigning back into the same DataFrame object.
for col_name, target_dtype in column_dtypes.items():
    dfd[col_name] = dfd[col_name].astype(target_dtype)

# Limit how many rows pandas shows when a DataFrame is displayed.
pd.set_option('display.max_rows', 8)

Now we can use Feature_Engine's `OneHotEncoder()` to create a full set of `k` dummy variables (one per category, since `drop_last=False`) for `Subtype`.

**NOTE**: The encoded columns will be appended at the end of the dataFrame. 


In [6]:
from feature_engine.encoding import OneHotEncoder

# One-hot encode `Subtype`; drop_last=False keeps an indicator column for
# every category instead of dropping the last one.
encoder = OneHotEncoder(variables=['Subtype'], drop_last=False)

# Learn the categories from `dfd`, then encode that same frame.
dfd_ = encoder.fit(dfd).transform(dfd)
dfd_.head()

Unnamed: 0_level_0,Age,TumorGrade,TumourSize,FusionNeo_Count,FusionNeo_bestIC50,FN/FT_Ratio,SNVindelNeo_Count,SNVindelNeo_IC50,ESTIMATE,IMPRES,C_Bcellsnaive,C_TcellsCD4memoryresting,C_MacrophagesM2,S_Attractors_LYM,S_Attractors_IFIT3,S_Attractors_G_GIMAP4,S_Attractors_G_HLA.DPA1,S_Attractors_G_SLAMF6,S_Attractors_G_LILRB4,S_Attractors_G_SIGLEC9,S_Attractors_G_CYTH4,S_Attractors_G_CD3E,S_Lymph_Vessels,S_ICR_SCORE,S_ICR_INHIB_SCORE,S_ICR_ACT_SCORE,S_Angiogenesis,S_APM1,S_APM2,S_ICS5_score,S_LIexpression_score,S_Chemokine12_score,S_NHI_5gene_score,S_CD68,S_CD8A,S_PD1_data,S_PDL1_data,S_PD1_PDL1_score,S_CTLA4_data,S_Bcell_mg_IGJ,S_Bcell_receptors_score,S_STAT1_score,S_CSF1_response,S_TcClassII_score,S_IL12_score_21050467,S_IL4_score_21050467,S_IL2_score_21050467,S_IL13_score_21050467,S_IFNG_score_21050467,S_TGFB_score_21050467,S_TREM1_data,S_DAP12_data,S_Tcell_receptors_score,S_IL8_21978456,S_IFN_21978456,S_MHC1_21978456,S_MHC2_21978456,S_Bcell_21978456,S_Tcell_21978456,S_CD103pos_mean_25446897,S_CD103neg_mean_25446897,S_IgG_19272155,S_Interferon_19272155,S_LCK_19272155,S_MHC.I_19272155,S_MHC.II_19272155,S_STAT1_19272155,S_Troester_WoundSig_19887484,S_MDACC.FNA.1_20805453,S_IGG_Cluster_21214954,S_Minterferon_Cluster_21214954,S_Immune_cell_Cluster_21214954,S_MCD3_CD8_21214954,S_Interferon_Cluster_21214954,S_B_cell_PCA_16704732,S_CD8_PCA_16704732,S_GRANS_PCA_16704732,S_LYMPHS_PCA_16704732,S_T_cell_PCA_16704732,S_TGFB_PCA_17349583,S_Rotterdam_ERneg_PCA_15721472,S_HER2_Immune_PCA_18006808,S_IR7_score,S_Buck14_score,S_TAMsurr_score,S_Immune_NSCLC_score,S_Module3_IFN_score,S_Module4_TcellBcell_score,S_Module5_TcellBcell_score,S_Module11_Prolif_score,S_CD8_CD68_ratio,S_TAMsurr_TcClassII_ratio,S_CHANG_CORE_SERUM_RESPONSE_UP,S_CSR_Activated_15701700,S_B_cells,S_T_cells,S_T_helper,S_Tcm,S_Tem,S_Th1,S_Th2,S_TFH,S_CD8_Tcells,S_Th17,S_Treg,S_Tgd,S_Cytotoxic_cells,S_NK_cells,S_NK_cd56dim,S_NK_cd56bright,S_DC,S_iDC,S_aDC,S_pDC,S_Eosinophils,S_Macrophages,S_Mast,S_Neutrophils,S_Bindea_
full,S_Expanded_IFNg,S_KEGG_MMR,S_KEGG_TGF_Beta,S_KEGG_Cytosolic_DNA_Sensing,Subtype_HR+/HER2-,Subtype_HR+/HER2+,Subtype_TNBC,Subtype_HR-/HER2+
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1
SD0012,50,2,2.3,20,5.79,0.47619,357,1.7,2895.605487,9,0.120394,0.117468,0.44845,0.362,0.4216,0.3034,0.4425,0.2749,0.2983,0.2756,0.3121,0.2765,0.3203,0.2463,0.2341,0.2495,0.3571,0.4591,0.3985,0.213,0.2223,0.2766,0.3573,0.1561,0.2942,0.1536,0.253,0.2042,0.2061,0.1942,0.2764,0.3327,0.3795,0.3435,0.2846,0.3797,0.3761,0.3197,0.4008,0.4403,0.2907,0.4011,0.2526,0.2237,0.4256,0.3971,0.4477,0.1852,0.2516,0.2262,0.3854,0.1591,0.4462,0.2912,0.4211,0.4502,0.361,0.2711,0.3414,0.2183,0.3728,0.3512,0.3529,0.4162,0.3612,0.2215,0.3614,0.4294,0.3451,0.4648,0.2937,0.3338,0.2999,0.2094,0.2743,0.3203,0.4136,0.2546,0.273,0.3697,0.2268,0.3335,0.3959,0.4066,0.2199,0.2395,0.4132,0.3633,0.321,0.2599,0.3561,0.3112,0.3645,0.281,0.2348,0.1945,0.2188,0.3281,0.0602,0.2038,0.2445,0.322,0.2212,0.2866,0.303,0.3657,0.2499,0.2531,0.3031,0.3097,0.4053,0.3537,0.253,1,0,0,0
SD0014,58,2,2.5,10,5.28,0.588235,85,4.1,4257.831526,11,0.165023,0.207531,0.124223,0.4126,0.3815,0.3619,0.476,0.3343,0.3001,0.2769,0.3761,0.3525,0.3452,0.3051,0.2701,0.3143,0.3803,0.4819,0.4125,0.3002,0.3106,0.3646,0.3753,0.103,0.3644,0.2726,0.2377,0.2553,0.2513,0.282,0.3079,0.3515,0.3869,0.3868,0.3133,0.4051,0.4031,0.3308,0.412,0.4496,0.2012,0.436,0.329,0.2562,0.3882,0.431,0.4682,0.299,0.3007,0.2859,0.3998,0.2225,0.4068,0.3481,0.4494,0.471,0.4011,0.284,0.3779,0.2727,0.3831,0.3864,0.3511,0.3983,0.3676,0.2761,0.3637,0.4268,0.3555,0.458,0.3266,0.3549,0.3763,0.2157,0.3028,0.3278,0.3896,0.3111,0.3159,0.3126,0.2395,0.3747,0.3838,0.403,0.2731,0.2909,0.4092,0.3404,0.353,0.2808,0.3102,0.3229,0.3713,0.2645,0.3182,0.1977,0.2985,0.3502,0.1191,0.3316,0.2976,0.3445,0.2403,0.345,0.3203,0.3484,0.2768,0.2783,0.32,0.3668,0.3803,0.347,0.2606,1,0,0,0
SD0015,46,2,1.8,4,11.48,0.25,150,2.4,3123.055856,8,0.162653,0.235337,0.279972,0.3556,0.3782,0.3363,0.4718,0.3015,0.2659,0.2274,0.318,0.3025,0.3124,0.2622,0.2304,0.2706,0.3761,0.4613,0.3814,0.3055,0.2764,0.3211,0.3784,-0.1955,0.3122,0.1932,0.2485,0.2211,0.2004,0.2478,0.304,0.3316,0.3646,0.3483,0.3012,0.4038,0.3875,0.3158,0.3986,0.4391,0.1509,0.3958,0.2886,0.2535,0.3764,0.4054,0.4601,0.2509,0.283,0.2712,0.3801,0.175,0.3987,0.3063,0.4239,0.4616,0.3587,0.2955,0.3431,0.2278,0.3599,0.3549,0.4011,0.3865,0.3661,0.2671,0.3591,0.4313,0.3489,0.4442,0.2985,0.3383,0.337,0.2609,0.2706,0.3312,0.3792,0.2851,0.2897,0.2984,0.0936,0.337,0.388,0.4038,0.2799,0.2873,0.4159,0.352,0.356,0.26,0.3116,0.3225,0.3762,0.237,0.2251,0.2021,0.272,0.3434,0.0626,0.3025,0.267,0.3212,0.2098,0.2967,0.3236,0.3266,0.2455,0.2522,0.3107,0.3372,0.3793,0.3522,0.2564,1,0,0,0
SD0017,54,2,2.5,19,15.11,0.404255,1369,1.7,5275.497847,11,0.155523,0.148387,0.128777,0.4314,0.4482,0.408,0.498,0.3912,0.312,0.2934,0.3864,0.3957,0.3131,0.3589,0.3341,0.3654,0.3842,0.4957,0.4328,0.3533,0.3732,0.4177,0.435,0.1401,0.4006,0.3023,0.3429,0.3228,0.2922,0.3591,0.3671,0.3926,0.4113,0.4082,0.3325,0.4092,0.4129,0.3338,0.419,0.4248,0.1319,0.4878,0.3797,0.1889,0.4601,0.4445,0.4859,0.3224,0.3693,0.3003,0.4095,0.2938,0.4707,0.3968,0.4616,0.4893,0.4644,0.2735,0.3955,0.3234,0.4165,0.4337,0.419,0.4511,0.3758,0.3174,0.3679,0.4314,0.3648,0.4439,0.2886,0.3688,0.4143,0.2489,0.3562,0.3498,0.4563,0.3576,0.3706,0.3377,0.2758,0.4006,0.4,0.4062,0.3182,0.346,0.417,0.3275,0.3432,0.2914,0.3307,0.3525,0.3879,0.2212,0.333,0.2251,0.3151,0.3495,0.1939,0.2953,0.301,0.3467,0.2547,0.3313,0.323,0.3597,0.2726,0.2648,0.3309,0.4165,0.3997,0.3468,0.2682,1,0,0,0
SD0018,58,3,3.0,39,3.0,0.309524,382,1.6,3548.34822,11,0.129397,0.133531,0.304963,0.3701,0.3859,0.3067,0.4858,0.3141,0.298,0.2006,0.3222,0.3017,0.3358,0.287,0.2544,0.2955,0.3292,0.4758,0.42,0.2892,0.2845,0.3767,0.4084,0.1187,0.2786,0.2363,0.2357,0.236,0.231,0.3125,0.3054,0.343,0.3742,0.3709,0.286,0.3689,0.388,0.3172,0.4095,0.4343,0.2824,0.4664,0.2799,0.2613,0.3975,0.4264,0.4805,0.321,0.267,0.2429,0.3952,0.2321,0.4166,0.3061,0.4446,0.4658,0.4181,0.2747,0.391,0.3019,0.3722,0.3716,0.3075,0.3931,0.3605,0.2308,0.3603,0.4258,0.3393,0.4654,0.292,0.3385,0.3668,0.2237,0.3404,0.3264,0.3909,0.2758,0.305,0.377,0.2009,0.3664,0.4026,0.4092,0.2699,0.253,0.3955,0.3185,0.3066,0.2713,0.3333,0.3041,0.3523,0.2401,0.2709,0.1413,0.2636,0.3061,0.0764,0.2639,0.2623,0.3348,0.2124,0.2731,0.302,0.354,0.2233,0.2182,0.2979,0.353,0.4033,0.3444,0.2496,0,1,0,0


In [7]:
# Names of the one-hot columns that should be moved to the front
enc_cols = ['Subtype_HR+/HER2-', 'Subtype_HR+/HER2+', 'Subtype_TNBC', 'Subtype_HR-/HER2+']

# Take the encoded columns from the encoder output, and start the new frame
# from the original data without the raw `Subtype` column
encoded_df = dfd_[enc_cols]
dfenc = dfd.drop(columns='Subtype')

# Position of the first encoded column in the new frame (0 = leftmost)
insert_index = 0

# Insert the encoded columns side by side starting at `insert_index`,
# keeping the order given in `enc_cols`
for offset, col in enumerate(enc_cols):
    dfenc.insert(loc=insert_index + offset, column=col, value=encoded_df[col])

Below is the categorically-encoded dataframe.

In [8]:
# Print the (rows, columns) shape of the encoded frame, then preview its first rows.
print(dfenc.shape)
dfenc.head()

(674, 127)


Unnamed: 0_level_0,Subtype_HR+/HER2-,Subtype_HR+/HER2+,Subtype_TNBC,Subtype_HR-/HER2+,Age,TumorGrade,TumourSize,FusionNeo_Count,FusionNeo_bestIC50,FN/FT_Ratio,SNVindelNeo_Count,SNVindelNeo_IC50,ESTIMATE,IMPRES,C_Bcellsnaive,C_TcellsCD4memoryresting,C_MacrophagesM2,S_Attractors_LYM,S_Attractors_IFIT3,S_Attractors_G_GIMAP4,S_Attractors_G_HLA.DPA1,S_Attractors_G_SLAMF6,S_Attractors_G_LILRB4,S_Attractors_G_SIGLEC9,S_Attractors_G_CYTH4,S_Attractors_G_CD3E,S_Lymph_Vessels,S_ICR_SCORE,S_ICR_INHIB_SCORE,S_ICR_ACT_SCORE,S_Angiogenesis,S_APM1,S_APM2,S_ICS5_score,S_LIexpression_score,S_Chemokine12_score,S_NHI_5gene_score,S_CD68,S_CD8A,S_PD1_data,S_PDL1_data,S_PD1_PDL1_score,S_CTLA4_data,S_Bcell_mg_IGJ,S_Bcell_receptors_score,S_STAT1_score,S_CSF1_response,S_TcClassII_score,S_IL12_score_21050467,S_IL4_score_21050467,S_IL2_score_21050467,S_IL13_score_21050467,S_IFNG_score_21050467,S_TGFB_score_21050467,S_TREM1_data,S_DAP12_data,S_Tcell_receptors_score,S_IL8_21978456,S_IFN_21978456,S_MHC1_21978456,S_MHC2_21978456,S_Bcell_21978456,S_Tcell_21978456,S_CD103pos_mean_25446897,S_CD103neg_mean_25446897,S_IgG_19272155,S_Interferon_19272155,S_LCK_19272155,S_MHC.I_19272155,S_MHC.II_19272155,S_STAT1_19272155,S_Troester_WoundSig_19887484,S_MDACC.FNA.1_20805453,S_IGG_Cluster_21214954,S_Minterferon_Cluster_21214954,S_Immune_cell_Cluster_21214954,S_MCD3_CD8_21214954,S_Interferon_Cluster_21214954,S_B_cell_PCA_16704732,S_CD8_PCA_16704732,S_GRANS_PCA_16704732,S_LYMPHS_PCA_16704732,S_T_cell_PCA_16704732,S_TGFB_PCA_17349583,S_Rotterdam_ERneg_PCA_15721472,S_HER2_Immune_PCA_18006808,S_IR7_score,S_Buck14_score,S_TAMsurr_score,S_Immune_NSCLC_score,S_Module3_IFN_score,S_Module4_TcellBcell_score,S_Module5_TcellBcell_score,S_Module11_Prolif_score,S_CD8_CD68_ratio,S_TAMsurr_TcClassII_ratio,S_CHANG_CORE_SERUM_RESPONSE_UP,S_CSR_Activated_15701700,S_B_cells,S_T_cells,S_T_helper,S_Tcm,S_Tem,S_Th1,S_Th2,S_TFH,S_CD8_Tcells,S_Th17,S_Treg,S_Tgd,S_Cytotoxic_cells,S_NK_cells,S_NK_cd56dim,S_NK_cd56bright,S_DC,S_iDC,S_a
DC,S_pDC,S_Eosinophils,S_Macrophages,S_Mast,S_Neutrophils,S_Bindea_full,S_Expanded_IFNg,S_KEGG_MMR,S_KEGG_TGF_Beta,S_KEGG_Cytosolic_DNA_Sensing
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1
SD0012,1,0,0,0,50,2,2.3,20,5.79,0.47619,357,1.7,2895.605487,9,0.120394,0.117468,0.44845,0.362,0.4216,0.3034,0.4425,0.2749,0.2983,0.2756,0.3121,0.2765,0.3203,0.2463,0.2341,0.2495,0.3571,0.4591,0.3985,0.213,0.2223,0.2766,0.3573,0.1561,0.2942,0.1536,0.253,0.2042,0.2061,0.1942,0.2764,0.3327,0.3795,0.3435,0.2846,0.3797,0.3761,0.3197,0.4008,0.4403,0.2907,0.4011,0.2526,0.2237,0.4256,0.3971,0.4477,0.1852,0.2516,0.2262,0.3854,0.1591,0.4462,0.2912,0.4211,0.4502,0.361,0.2711,0.3414,0.2183,0.3728,0.3512,0.3529,0.4162,0.3612,0.2215,0.3614,0.4294,0.3451,0.4648,0.2937,0.3338,0.2999,0.2094,0.2743,0.3203,0.4136,0.2546,0.273,0.3697,0.2268,0.3335,0.3959,0.4066,0.2199,0.2395,0.4132,0.3633,0.321,0.2599,0.3561,0.3112,0.3645,0.281,0.2348,0.1945,0.2188,0.3281,0.0602,0.2038,0.2445,0.322,0.2212,0.2866,0.303,0.3657,0.2499,0.2531,0.3031,0.3097,0.4053,0.3537,0.253
SD0014,1,0,0,0,58,2,2.5,10,5.28,0.588235,85,4.1,4257.831526,11,0.165023,0.207531,0.124223,0.4126,0.3815,0.3619,0.476,0.3343,0.3001,0.2769,0.3761,0.3525,0.3452,0.3051,0.2701,0.3143,0.3803,0.4819,0.4125,0.3002,0.3106,0.3646,0.3753,0.103,0.3644,0.2726,0.2377,0.2553,0.2513,0.282,0.3079,0.3515,0.3869,0.3868,0.3133,0.4051,0.4031,0.3308,0.412,0.4496,0.2012,0.436,0.329,0.2562,0.3882,0.431,0.4682,0.299,0.3007,0.2859,0.3998,0.2225,0.4068,0.3481,0.4494,0.471,0.4011,0.284,0.3779,0.2727,0.3831,0.3864,0.3511,0.3983,0.3676,0.2761,0.3637,0.4268,0.3555,0.458,0.3266,0.3549,0.3763,0.2157,0.3028,0.3278,0.3896,0.3111,0.3159,0.3126,0.2395,0.3747,0.3838,0.403,0.2731,0.2909,0.4092,0.3404,0.353,0.2808,0.3102,0.3229,0.3713,0.2645,0.3182,0.1977,0.2985,0.3502,0.1191,0.3316,0.2976,0.3445,0.2403,0.345,0.3203,0.3484,0.2768,0.2783,0.32,0.3668,0.3803,0.347,0.2606
SD0015,1,0,0,0,46,2,1.8,4,11.48,0.25,150,2.4,3123.055856,8,0.162653,0.235337,0.279972,0.3556,0.3782,0.3363,0.4718,0.3015,0.2659,0.2274,0.318,0.3025,0.3124,0.2622,0.2304,0.2706,0.3761,0.4613,0.3814,0.3055,0.2764,0.3211,0.3784,-0.1955,0.3122,0.1932,0.2485,0.2211,0.2004,0.2478,0.304,0.3316,0.3646,0.3483,0.3012,0.4038,0.3875,0.3158,0.3986,0.4391,0.1509,0.3958,0.2886,0.2535,0.3764,0.4054,0.4601,0.2509,0.283,0.2712,0.3801,0.175,0.3987,0.3063,0.4239,0.4616,0.3587,0.2955,0.3431,0.2278,0.3599,0.3549,0.4011,0.3865,0.3661,0.2671,0.3591,0.4313,0.3489,0.4442,0.2985,0.3383,0.337,0.2609,0.2706,0.3312,0.3792,0.2851,0.2897,0.2984,0.0936,0.337,0.388,0.4038,0.2799,0.2873,0.4159,0.352,0.356,0.26,0.3116,0.3225,0.3762,0.237,0.2251,0.2021,0.272,0.3434,0.0626,0.3025,0.267,0.3212,0.2098,0.2967,0.3236,0.3266,0.2455,0.2522,0.3107,0.3372,0.3793,0.3522,0.2564
SD0017,1,0,0,0,54,2,2.5,19,15.11,0.404255,1369,1.7,5275.497847,11,0.155523,0.148387,0.128777,0.4314,0.4482,0.408,0.498,0.3912,0.312,0.2934,0.3864,0.3957,0.3131,0.3589,0.3341,0.3654,0.3842,0.4957,0.4328,0.3533,0.3732,0.4177,0.435,0.1401,0.4006,0.3023,0.3429,0.3228,0.2922,0.3591,0.3671,0.3926,0.4113,0.4082,0.3325,0.4092,0.4129,0.3338,0.419,0.4248,0.1319,0.4878,0.3797,0.1889,0.4601,0.4445,0.4859,0.3224,0.3693,0.3003,0.4095,0.2938,0.4707,0.3968,0.4616,0.4893,0.4644,0.2735,0.3955,0.3234,0.4165,0.4337,0.419,0.4511,0.3758,0.3174,0.3679,0.4314,0.3648,0.4439,0.2886,0.3688,0.4143,0.2489,0.3562,0.3498,0.4563,0.3576,0.3706,0.3377,0.2758,0.4006,0.4,0.4062,0.3182,0.346,0.417,0.3275,0.3432,0.2914,0.3307,0.3525,0.3879,0.2212,0.333,0.2251,0.3151,0.3495,0.1939,0.2953,0.301,0.3467,0.2547,0.3313,0.323,0.3597,0.2726,0.2648,0.3309,0.4165,0.3997,0.3468,0.2682
SD0018,0,1,0,0,58,3,3.0,39,3.0,0.309524,382,1.6,3548.34822,11,0.129397,0.133531,0.304963,0.3701,0.3859,0.3067,0.4858,0.3141,0.298,0.2006,0.3222,0.3017,0.3358,0.287,0.2544,0.2955,0.3292,0.4758,0.42,0.2892,0.2845,0.3767,0.4084,0.1187,0.2786,0.2363,0.2357,0.236,0.231,0.3125,0.3054,0.343,0.3742,0.3709,0.286,0.3689,0.388,0.3172,0.4095,0.4343,0.2824,0.4664,0.2799,0.2613,0.3975,0.4264,0.4805,0.321,0.267,0.2429,0.3952,0.2321,0.4166,0.3061,0.4446,0.4658,0.4181,0.2747,0.391,0.3019,0.3722,0.3716,0.3075,0.3931,0.3605,0.2308,0.3603,0.4258,0.3393,0.4654,0.292,0.3385,0.3668,0.2237,0.3404,0.3264,0.3909,0.2758,0.305,0.377,0.2009,0.3664,0.4026,0.4092,0.2699,0.253,0.3955,0.3185,0.3066,0.2713,0.3333,0.3041,0.3523,0.2401,0.2709,0.1413,0.2636,0.3061,0.0764,0.2639,0.2623,0.3348,0.2124,0.2731,0.302,0.354,0.2233,0.2182,0.2979,0.353,0.4033,0.3444,0.2496


And below is the original, unencoded dataframe.

In [9]:
# Print the (rows, columns) shape of the unencoded frame, then preview its first rows.
print(dfd.shape)
dfd.head()

(674, 124)


Unnamed: 0_level_0,Subtype,Age,TumorGrade,TumourSize,FusionNeo_Count,FusionNeo_bestIC50,FN/FT_Ratio,SNVindelNeo_Count,SNVindelNeo_IC50,ESTIMATE,IMPRES,C_Bcellsnaive,C_TcellsCD4memoryresting,C_MacrophagesM2,S_Attractors_LYM,S_Attractors_IFIT3,S_Attractors_G_GIMAP4,S_Attractors_G_HLA.DPA1,S_Attractors_G_SLAMF6,S_Attractors_G_LILRB4,S_Attractors_G_SIGLEC9,S_Attractors_G_CYTH4,S_Attractors_G_CD3E,S_Lymph_Vessels,S_ICR_SCORE,S_ICR_INHIB_SCORE,S_ICR_ACT_SCORE,S_Angiogenesis,S_APM1,S_APM2,S_ICS5_score,S_LIexpression_score,S_Chemokine12_score,S_NHI_5gene_score,S_CD68,S_CD8A,S_PD1_data,S_PDL1_data,S_PD1_PDL1_score,S_CTLA4_data,S_Bcell_mg_IGJ,S_Bcell_receptors_score,S_STAT1_score,S_CSF1_response,S_TcClassII_score,S_IL12_score_21050467,S_IL4_score_21050467,S_IL2_score_21050467,S_IL13_score_21050467,S_IFNG_score_21050467,S_TGFB_score_21050467,S_TREM1_data,S_DAP12_data,S_Tcell_receptors_score,S_IL8_21978456,S_IFN_21978456,S_MHC1_21978456,S_MHC2_21978456,S_Bcell_21978456,S_Tcell_21978456,S_CD103pos_mean_25446897,S_CD103neg_mean_25446897,S_IgG_19272155,S_Interferon_19272155,S_LCK_19272155,S_MHC.I_19272155,S_MHC.II_19272155,S_STAT1_19272155,S_Troester_WoundSig_19887484,S_MDACC.FNA.1_20805453,S_IGG_Cluster_21214954,S_Minterferon_Cluster_21214954,S_Immune_cell_Cluster_21214954,S_MCD3_CD8_21214954,S_Interferon_Cluster_21214954,S_B_cell_PCA_16704732,S_CD8_PCA_16704732,S_GRANS_PCA_16704732,S_LYMPHS_PCA_16704732,S_T_cell_PCA_16704732,S_TGFB_PCA_17349583,S_Rotterdam_ERneg_PCA_15721472,S_HER2_Immune_PCA_18006808,S_IR7_score,S_Buck14_score,S_TAMsurr_score,S_Immune_NSCLC_score,S_Module3_IFN_score,S_Module4_TcellBcell_score,S_Module5_TcellBcell_score,S_Module11_Prolif_score,S_CD8_CD68_ratio,S_TAMsurr_TcClassII_ratio,S_CHANG_CORE_SERUM_RESPONSE_UP,S_CSR_Activated_15701700,S_B_cells,S_T_cells,S_T_helper,S_Tcm,S_Tem,S_Th1,S_Th2,S_TFH,S_CD8_Tcells,S_Th17,S_Treg,S_Tgd,S_Cytotoxic_cells,S_NK_cells,S_NK_cd56dim,S_NK_cd56bright,S_DC,S_iDC,S_aDC,S_pDC,S_Eosinophils,S_Macrophages,S_Mast,S_Neutrophils,S
_Bindea_full,S_Expanded_IFNg,S_KEGG_MMR,S_KEGG_TGF_Beta,S_KEGG_Cytosolic_DNA_Sensing
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1
SD0012,HR+/HER2-,50,2,2.3,20,5.79,0.47619,357,1.7,2895.605487,9,0.120394,0.117468,0.44845,0.362,0.4216,0.3034,0.4425,0.2749,0.2983,0.2756,0.3121,0.2765,0.3203,0.2463,0.2341,0.2495,0.3571,0.4591,0.3985,0.213,0.2223,0.2766,0.3573,0.1561,0.2942,0.1536,0.253,0.2042,0.2061,0.1942,0.2764,0.3327,0.3795,0.3435,0.2846,0.3797,0.3761,0.3197,0.4008,0.4403,0.2907,0.4011,0.2526,0.2237,0.4256,0.3971,0.4477,0.1852,0.2516,0.2262,0.3854,0.1591,0.4462,0.2912,0.4211,0.4502,0.361,0.2711,0.3414,0.2183,0.3728,0.3512,0.3529,0.4162,0.3612,0.2215,0.3614,0.4294,0.3451,0.4648,0.2937,0.3338,0.2999,0.2094,0.2743,0.3203,0.4136,0.2546,0.273,0.3697,0.2268,0.3335,0.3959,0.4066,0.2199,0.2395,0.4132,0.3633,0.321,0.2599,0.3561,0.3112,0.3645,0.281,0.2348,0.1945,0.2188,0.3281,0.0602,0.2038,0.2445,0.322,0.2212,0.2866,0.303,0.3657,0.2499,0.2531,0.3031,0.3097,0.4053,0.3537,0.253
SD0014,HR+/HER2-,58,2,2.5,10,5.28,0.588235,85,4.1,4257.831526,11,0.165023,0.207531,0.124223,0.4126,0.3815,0.3619,0.476,0.3343,0.3001,0.2769,0.3761,0.3525,0.3452,0.3051,0.2701,0.3143,0.3803,0.4819,0.4125,0.3002,0.3106,0.3646,0.3753,0.103,0.3644,0.2726,0.2377,0.2553,0.2513,0.282,0.3079,0.3515,0.3869,0.3868,0.3133,0.4051,0.4031,0.3308,0.412,0.4496,0.2012,0.436,0.329,0.2562,0.3882,0.431,0.4682,0.299,0.3007,0.2859,0.3998,0.2225,0.4068,0.3481,0.4494,0.471,0.4011,0.284,0.3779,0.2727,0.3831,0.3864,0.3511,0.3983,0.3676,0.2761,0.3637,0.4268,0.3555,0.458,0.3266,0.3549,0.3763,0.2157,0.3028,0.3278,0.3896,0.3111,0.3159,0.3126,0.2395,0.3747,0.3838,0.403,0.2731,0.2909,0.4092,0.3404,0.353,0.2808,0.3102,0.3229,0.3713,0.2645,0.3182,0.1977,0.2985,0.3502,0.1191,0.3316,0.2976,0.3445,0.2403,0.345,0.3203,0.3484,0.2768,0.2783,0.32,0.3668,0.3803,0.347,0.2606
SD0015,HR+/HER2-,46,2,1.8,4,11.48,0.25,150,2.4,3123.055856,8,0.162653,0.235337,0.279972,0.3556,0.3782,0.3363,0.4718,0.3015,0.2659,0.2274,0.318,0.3025,0.3124,0.2622,0.2304,0.2706,0.3761,0.4613,0.3814,0.3055,0.2764,0.3211,0.3784,-0.1955,0.3122,0.1932,0.2485,0.2211,0.2004,0.2478,0.304,0.3316,0.3646,0.3483,0.3012,0.4038,0.3875,0.3158,0.3986,0.4391,0.1509,0.3958,0.2886,0.2535,0.3764,0.4054,0.4601,0.2509,0.283,0.2712,0.3801,0.175,0.3987,0.3063,0.4239,0.4616,0.3587,0.2955,0.3431,0.2278,0.3599,0.3549,0.4011,0.3865,0.3661,0.2671,0.3591,0.4313,0.3489,0.4442,0.2985,0.3383,0.337,0.2609,0.2706,0.3312,0.3792,0.2851,0.2897,0.2984,0.0936,0.337,0.388,0.4038,0.2799,0.2873,0.4159,0.352,0.356,0.26,0.3116,0.3225,0.3762,0.237,0.2251,0.2021,0.272,0.3434,0.0626,0.3025,0.267,0.3212,0.2098,0.2967,0.3236,0.3266,0.2455,0.2522,0.3107,0.3372,0.3793,0.3522,0.2564
SD0017,HR+/HER2-,54,2,2.5,19,15.11,0.404255,1369,1.7,5275.497847,11,0.155523,0.148387,0.128777,0.4314,0.4482,0.408,0.498,0.3912,0.312,0.2934,0.3864,0.3957,0.3131,0.3589,0.3341,0.3654,0.3842,0.4957,0.4328,0.3533,0.3732,0.4177,0.435,0.1401,0.4006,0.3023,0.3429,0.3228,0.2922,0.3591,0.3671,0.3926,0.4113,0.4082,0.3325,0.4092,0.4129,0.3338,0.419,0.4248,0.1319,0.4878,0.3797,0.1889,0.4601,0.4445,0.4859,0.3224,0.3693,0.3003,0.4095,0.2938,0.4707,0.3968,0.4616,0.4893,0.4644,0.2735,0.3955,0.3234,0.4165,0.4337,0.419,0.4511,0.3758,0.3174,0.3679,0.4314,0.3648,0.4439,0.2886,0.3688,0.4143,0.2489,0.3562,0.3498,0.4563,0.3576,0.3706,0.3377,0.2758,0.4006,0.4,0.4062,0.3182,0.346,0.417,0.3275,0.3432,0.2914,0.3307,0.3525,0.3879,0.2212,0.333,0.2251,0.3151,0.3495,0.1939,0.2953,0.301,0.3467,0.2547,0.3313,0.323,0.3597,0.2726,0.2648,0.3309,0.4165,0.3997,0.3468,0.2682
SD0018,HR+/HER2+,58,3,3.0,39,3.0,0.309524,382,1.6,3548.34822,11,0.129397,0.133531,0.304963,0.3701,0.3859,0.3067,0.4858,0.3141,0.298,0.2006,0.3222,0.3017,0.3358,0.287,0.2544,0.2955,0.3292,0.4758,0.42,0.2892,0.2845,0.3767,0.4084,0.1187,0.2786,0.2363,0.2357,0.236,0.231,0.3125,0.3054,0.343,0.3742,0.3709,0.286,0.3689,0.388,0.3172,0.4095,0.4343,0.2824,0.4664,0.2799,0.2613,0.3975,0.4264,0.4805,0.321,0.267,0.2429,0.3952,0.2321,0.4166,0.3061,0.4446,0.4658,0.4181,0.2747,0.391,0.3019,0.3722,0.3716,0.3075,0.3931,0.3605,0.2308,0.3603,0.4258,0.3393,0.4654,0.292,0.3385,0.3668,0.2237,0.3404,0.3264,0.3909,0.2758,0.305,0.377,0.2009,0.3664,0.4026,0.4092,0.2699,0.253,0.3955,0.3185,0.3066,0.2713,0.3333,0.3041,0.3523,0.2401,0.2709,0.1413,0.2636,0.3061,0.0764,0.2639,0.2623,0.3348,0.2124,0.2731,0.302,0.354,0.2233,0.2182,0.2979,0.353,0.4033,0.3444,0.2496


#### **Subsetting Y Labels**

In the previous exploration, many of the immune scores (Y targets/labels) appeared to show little relationship with the fusion neoantigen variables, so they may not be very informative. We therefore decided to use Caitlin's findings and subset the Y labels into several clinically meaningful groups.

In [10]:
# Start from the unencoded frame (dfd) and remove the categorical `Subtype` column
df_dcat = dfd.drop('Subtype', axis='columns')

# Report the resulting shape and preview the first rows
print(df_dcat.shape)
df_dcat.head()

(674, 123)


Unnamed: 0_level_0,Age,TumorGrade,TumourSize,FusionNeo_Count,FusionNeo_bestIC50,FN/FT_Ratio,SNVindelNeo_Count,SNVindelNeo_IC50,ESTIMATE,IMPRES,C_Bcellsnaive,C_TcellsCD4memoryresting,C_MacrophagesM2,S_Attractors_LYM,S_Attractors_IFIT3,S_Attractors_G_GIMAP4,S_Attractors_G_HLA.DPA1,S_Attractors_G_SLAMF6,S_Attractors_G_LILRB4,S_Attractors_G_SIGLEC9,S_Attractors_G_CYTH4,S_Attractors_G_CD3E,S_Lymph_Vessels,S_ICR_SCORE,S_ICR_INHIB_SCORE,S_ICR_ACT_SCORE,S_Angiogenesis,S_APM1,S_APM2,S_ICS5_score,S_LIexpression_score,S_Chemokine12_score,S_NHI_5gene_score,S_CD68,S_CD8A,S_PD1_data,S_PDL1_data,S_PD1_PDL1_score,S_CTLA4_data,S_Bcell_mg_IGJ,S_Bcell_receptors_score,S_STAT1_score,S_CSF1_response,S_TcClassII_score,S_IL12_score_21050467,S_IL4_score_21050467,S_IL2_score_21050467,S_IL13_score_21050467,S_IFNG_score_21050467,S_TGFB_score_21050467,S_TREM1_data,S_DAP12_data,S_Tcell_receptors_score,S_IL8_21978456,S_IFN_21978456,S_MHC1_21978456,S_MHC2_21978456,S_Bcell_21978456,S_Tcell_21978456,S_CD103pos_mean_25446897,S_CD103neg_mean_25446897,S_IgG_19272155,S_Interferon_19272155,S_LCK_19272155,S_MHC.I_19272155,S_MHC.II_19272155,S_STAT1_19272155,S_Troester_WoundSig_19887484,S_MDACC.FNA.1_20805453,S_IGG_Cluster_21214954,S_Minterferon_Cluster_21214954,S_Immune_cell_Cluster_21214954,S_MCD3_CD8_21214954,S_Interferon_Cluster_21214954,S_B_cell_PCA_16704732,S_CD8_PCA_16704732,S_GRANS_PCA_16704732,S_LYMPHS_PCA_16704732,S_T_cell_PCA_16704732,S_TGFB_PCA_17349583,S_Rotterdam_ERneg_PCA_15721472,S_HER2_Immune_PCA_18006808,S_IR7_score,S_Buck14_score,S_TAMsurr_score,S_Immune_NSCLC_score,S_Module3_IFN_score,S_Module4_TcellBcell_score,S_Module5_TcellBcell_score,S_Module11_Prolif_score,S_CD8_CD68_ratio,S_TAMsurr_TcClassII_ratio,S_CHANG_CORE_SERUM_RESPONSE_UP,S_CSR_Activated_15701700,S_B_cells,S_T_cells,S_T_helper,S_Tcm,S_Tem,S_Th1,S_Th2,S_TFH,S_CD8_Tcells,S_Th17,S_Treg,S_Tgd,S_Cytotoxic_cells,S_NK_cells,S_NK_cd56dim,S_NK_cd56bright,S_DC,S_iDC,S_aDC,S_pDC,S_Eosinophils,S_Macrophages,S_Mast,S_Neutrophils,S_Bindea_
full,S_Expanded_IFNg,S_KEGG_MMR,S_KEGG_TGF_Beta,S_KEGG_Cytosolic_DNA_Sensing
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1
SD0012,50,2,2.3,20,5.79,0.47619,357,1.7,2895.605487,9,0.120394,0.117468,0.44845,0.362,0.4216,0.3034,0.4425,0.2749,0.2983,0.2756,0.3121,0.2765,0.3203,0.2463,0.2341,0.2495,0.3571,0.4591,0.3985,0.213,0.2223,0.2766,0.3573,0.1561,0.2942,0.1536,0.253,0.2042,0.2061,0.1942,0.2764,0.3327,0.3795,0.3435,0.2846,0.3797,0.3761,0.3197,0.4008,0.4403,0.2907,0.4011,0.2526,0.2237,0.4256,0.3971,0.4477,0.1852,0.2516,0.2262,0.3854,0.1591,0.4462,0.2912,0.4211,0.4502,0.361,0.2711,0.3414,0.2183,0.3728,0.3512,0.3529,0.4162,0.3612,0.2215,0.3614,0.4294,0.3451,0.4648,0.2937,0.3338,0.2999,0.2094,0.2743,0.3203,0.4136,0.2546,0.273,0.3697,0.2268,0.3335,0.3959,0.4066,0.2199,0.2395,0.4132,0.3633,0.321,0.2599,0.3561,0.3112,0.3645,0.281,0.2348,0.1945,0.2188,0.3281,0.0602,0.2038,0.2445,0.322,0.2212,0.2866,0.303,0.3657,0.2499,0.2531,0.3031,0.3097,0.4053,0.3537,0.253
SD0014,58,2,2.5,10,5.28,0.588235,85,4.1,4257.831526,11,0.165023,0.207531,0.124223,0.4126,0.3815,0.3619,0.476,0.3343,0.3001,0.2769,0.3761,0.3525,0.3452,0.3051,0.2701,0.3143,0.3803,0.4819,0.4125,0.3002,0.3106,0.3646,0.3753,0.103,0.3644,0.2726,0.2377,0.2553,0.2513,0.282,0.3079,0.3515,0.3869,0.3868,0.3133,0.4051,0.4031,0.3308,0.412,0.4496,0.2012,0.436,0.329,0.2562,0.3882,0.431,0.4682,0.299,0.3007,0.2859,0.3998,0.2225,0.4068,0.3481,0.4494,0.471,0.4011,0.284,0.3779,0.2727,0.3831,0.3864,0.3511,0.3983,0.3676,0.2761,0.3637,0.4268,0.3555,0.458,0.3266,0.3549,0.3763,0.2157,0.3028,0.3278,0.3896,0.3111,0.3159,0.3126,0.2395,0.3747,0.3838,0.403,0.2731,0.2909,0.4092,0.3404,0.353,0.2808,0.3102,0.3229,0.3713,0.2645,0.3182,0.1977,0.2985,0.3502,0.1191,0.3316,0.2976,0.3445,0.2403,0.345,0.3203,0.3484,0.2768,0.2783,0.32,0.3668,0.3803,0.347,0.2606
SD0015,46,2,1.8,4,11.48,0.25,150,2.4,3123.055856,8,0.162653,0.235337,0.279972,0.3556,0.3782,0.3363,0.4718,0.3015,0.2659,0.2274,0.318,0.3025,0.3124,0.2622,0.2304,0.2706,0.3761,0.4613,0.3814,0.3055,0.2764,0.3211,0.3784,-0.1955,0.3122,0.1932,0.2485,0.2211,0.2004,0.2478,0.304,0.3316,0.3646,0.3483,0.3012,0.4038,0.3875,0.3158,0.3986,0.4391,0.1509,0.3958,0.2886,0.2535,0.3764,0.4054,0.4601,0.2509,0.283,0.2712,0.3801,0.175,0.3987,0.3063,0.4239,0.4616,0.3587,0.2955,0.3431,0.2278,0.3599,0.3549,0.4011,0.3865,0.3661,0.2671,0.3591,0.4313,0.3489,0.4442,0.2985,0.3383,0.337,0.2609,0.2706,0.3312,0.3792,0.2851,0.2897,0.2984,0.0936,0.337,0.388,0.4038,0.2799,0.2873,0.4159,0.352,0.356,0.26,0.3116,0.3225,0.3762,0.237,0.2251,0.2021,0.272,0.3434,0.0626,0.3025,0.267,0.3212,0.2098,0.2967,0.3236,0.3266,0.2455,0.2522,0.3107,0.3372,0.3793,0.3522,0.2564
SD0017,54,2,2.5,19,15.11,0.404255,1369,1.7,5275.497847,11,0.155523,0.148387,0.128777,0.4314,0.4482,0.408,0.498,0.3912,0.312,0.2934,0.3864,0.3957,0.3131,0.3589,0.3341,0.3654,0.3842,0.4957,0.4328,0.3533,0.3732,0.4177,0.435,0.1401,0.4006,0.3023,0.3429,0.3228,0.2922,0.3591,0.3671,0.3926,0.4113,0.4082,0.3325,0.4092,0.4129,0.3338,0.419,0.4248,0.1319,0.4878,0.3797,0.1889,0.4601,0.4445,0.4859,0.3224,0.3693,0.3003,0.4095,0.2938,0.4707,0.3968,0.4616,0.4893,0.4644,0.2735,0.3955,0.3234,0.4165,0.4337,0.419,0.4511,0.3758,0.3174,0.3679,0.4314,0.3648,0.4439,0.2886,0.3688,0.4143,0.2489,0.3562,0.3498,0.4563,0.3576,0.3706,0.3377,0.2758,0.4006,0.4,0.4062,0.3182,0.346,0.417,0.3275,0.3432,0.2914,0.3307,0.3525,0.3879,0.2212,0.333,0.2251,0.3151,0.3495,0.1939,0.2953,0.301,0.3467,0.2547,0.3313,0.323,0.3597,0.2726,0.2648,0.3309,0.4165,0.3997,0.3468,0.2682
SD0018,58,3,3.0,39,3.0,0.309524,382,1.6,3548.34822,11,0.129397,0.133531,0.304963,0.3701,0.3859,0.3067,0.4858,0.3141,0.298,0.2006,0.3222,0.3017,0.3358,0.287,0.2544,0.2955,0.3292,0.4758,0.42,0.2892,0.2845,0.3767,0.4084,0.1187,0.2786,0.2363,0.2357,0.236,0.231,0.3125,0.3054,0.343,0.3742,0.3709,0.286,0.3689,0.388,0.3172,0.4095,0.4343,0.2824,0.4664,0.2799,0.2613,0.3975,0.4264,0.4805,0.321,0.267,0.2429,0.3952,0.2321,0.4166,0.3061,0.4446,0.4658,0.4181,0.2747,0.391,0.3019,0.3722,0.3716,0.3075,0.3931,0.3605,0.2308,0.3603,0.4258,0.3393,0.4654,0.292,0.3385,0.3668,0.2237,0.3404,0.3264,0.3909,0.2758,0.305,0.377,0.2009,0.3664,0.4026,0.4092,0.2699,0.253,0.3955,0.3185,0.3066,0.2713,0.3333,0.3041,0.3523,0.2401,0.2709,0.1413,0.2636,0.3061,0.0764,0.2639,0.2623,0.3348,0.2124,0.2731,0.302,0.354,0.2233,0.2182,0.2979,0.353,0.4033,0.3444,0.2496


First list all the clinical variables that would be the X feature set.

In [11]:
# Full candidate X feature set: the four Subtype_* indicator columns (presumably
# one-hot encoded upstream -- TODO confirm) followed by the numeric clinical and
# neoantigen variables.
X_features = ['Subtype_HR+/HER2-', 'Subtype_HR+/HER2+', 'Subtype_TNBC', 'Subtype_HR-/HER2+', 'Age', 'TumorGrade', 'TumourSize', 'FusionNeo_Count', 'FusionNeo_bestIC50', 'FN/FT_Ratio', 'SNVindelNeo_Count', 'SNVindelNeo_IC50']

In [12]:
# Same list as X_features without the four Subtype_* indicator columns,
# i.e. only the numeric clinical and neoantigen variables.
X_features_nocat = ['Age', 'TumorGrade', 'TumourSize', 'FusionNeo_Count', 'FusionNeo_bestIC50', 'FN/FT_Ratio', 'SNVindelNeo_Count', 'SNVindelNeo_IC50']

In [13]:
# Y labels = every column of dfd that is neither 'Subtype' nor one of the X features
Y_labels_all = [
    label
    for label in dfd.columns.drop('Subtype')
    if label not in X_features
]
print(Y_labels_all[:5])
len(Y_labels_all)

['ESTIMATE', 'IMPRES', 'C_Bcellsnaive', 'C_TcellsCD4memoryresting', 'C_MacrophagesM2']


115

In [14]:
# load up the tsv containing the groupings of the different immune scores
# (tab-separated; each column is one grouping and lists the immune-score names in it --
# groupings have different lengths, so shorter columns show up with missing values)
df_imscores = pd.read_csv('../input-data/SA/immune_score_groupings.tsv', sep='\t')
df_imscores.head()

Unnamed: 0,HR>1,HR>1_worst_10_prog,HR<1,HR<1_best_10_prog,cytokine_chemokine_activator,activator_T,suppressor_T,B_cell_all,innate_cell_all,general
0,S_TREM1_data,S_TGFB_score_21050467,S_Buck14_score,S_Buck14_score,S_Expanded_IFNg,S_T_cells,S_Treg,S_CSR_Activated_15701700,S_pDC,S_Buck14_score
1,S_CHANG_CORE_SERUM_RESPONSE_UP,S_TGFB_PCA_17349583,S_Bcell_receptors_score,S_Bcell_receptors_score,S_IL2_score_21050467,S_Tcell_receptors_score,S_TGFB_score_21050467,S_Bcell_receptors_score,,S_Rotterdam_ERneg_PCA_15721472
2,S_IL8_21978456,S_Lymph_Vessels,S_TFH,S_TFH,S_IL12_score_21050467,S_Tcell_21978456,S_CTLA4_data,S_B_cell_PCA_16704732,,S_KEGG_Cytosolic_DNA_Sensing
3,,S_Rotterdam_ERneg_PCA_15721472,S_CD103pos_mean_25446897,S_CD103pos_mean_25446897,S_IFNG_score_21050467,S_CD8_Tcells,S_PD1_data,,,
4,,S_IFNG_score_21050467,S_T_helper,S_T_helper,S_IR7_score,S_CD8A,S_PDL1_data,,,


In [15]:
df_imscores.columns

Index(['HR>1', 'HR>1_worst_10_prog', 'HR<1', 'HR<1_best_10_prog',
       'cytokine_chemokine_activator', 'activator_T', 'suppressor_T',
       'B_cell_all', 'innate_cell_all', 'general'],
      dtype='object')

In [16]:
# Map every grouping (column name of df_imscores) to a plain list of the
# immune-score names it contains; the NaN padding of shorter columns is dropped.
imscore_series_dict = {
    group_name: group_values.dropna().tolist()
    for group_name, group_values in df_imscores.items()
}


#### **Split Dataset with `train_test_split`**

Split the dataset before modeling to avoid information leakage, then preprocess the data through the preprocessing pipeline set up below before fitting the SVR models.

In [None]:
# # subset X features; use the list generated before
# X = dfenc[X_features]
# X

In [None]:
# Keep only the non-categorical X features, then leave out the two IC50 columns
X = dfenc[X_features_nocat].drop(columns=["FusionNeo_bestIC50", "SNVindelNeo_IC50"])
X

Now grab the Y targets (do this as a whole, but we will train on each column individually later).

In [None]:
# Now get the Y variable set
# (all candidate target columns are selected together here; a single column is
# picked out of this frame each time a model is trained further down)
Y = dfenc[Y_labels_all]
Y

Now we perform train test split on the X and Y variables.

In [None]:
# Perform train-test split
# X and Y are split in one call so the rows of both frames stay paired.
# 20% of the rows are held out for testing; random_state is fixed so the split
# is the same on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
X_train.head()

In [None]:
X_test.head()

In [None]:
Y_train.head()

In [None]:
Y_train.head()

In [None]:
Y_test.head()

As we don't want to transform all the X columns (because some of them are discrete numerical data and some of them are one-hot encoded categorical variables), we need to specify the columns to transform.

In [None]:
X.info()

#### **Create a Data Transformation Pipeline from `feature_engine` Package**
The pipeline first applies the Yeo-Johnson transformation to selected X features and to all Y labels, and then standardizes selected X features and all Y labels with `StandardScaler` (wrapped in `feature_engine`'s `SklearnTransformerWrapper`). Each pipeline is fitted on the training split only and then applied to both the training and test splits.

This pipeline would enable easy inverse transform steps for both X and Y datasets later.

In [None]:
# Columns that receive the Yeo-Johnson transform in the X pipeline.
# Earlier version of the list, kept for reference (it also contained the two IC50 columns):
# X_vars_to_transform = ['TumourSize', 'FusionNeo_Count', 'FusionNeo_bestIC50', 'FN/FT_Ratio', 'SNVindelNeo_Count', 'SNVindelNeo_IC50']
X_vars_to_transform = ['TumourSize', 'FusionNeo_Count', 'FN/FT_Ratio', 'SNVindelNeo_Count']

Do for X datasets.

In [None]:
from feature_engine.pipeline import Pipeline
from feature_engine.transformation import YeoJohnsonTransformer
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import StandardScaler

# Columns handed to the scaler: every X column except Age and TumorGrade.
# (Earlier variant also excluded the Subtype_* indicator columns:
#  ['Age', 'TumorGrade', 'Subtype_HR+/HER2-', 'Subtype_HR+/HER2+', 'Subtype_TNBC', 'Subtype_HR-/HER2+'])
scale_cols_X = [
    name for name in X_train.columns.tolist()
    if name not in ['Age', 'TumorGrade']
]

# Two-step preprocessing: Yeo-Johnson on the selected columns, then standard scaling
preprocess_pipeline_X = Pipeline([
    (
        'yeo_johnson',
        YeoJohnsonTransformer(variables=X_vars_to_transform),
    ),
    (
        'scaler',
        SklearnTransformerWrapper(transformer=StandardScaler(), variables=scale_cols_X),
    ),
])

# The pipeline learns its parameters from the training split only ...
preprocess_pipeline_X.fit(X_train)

# ... and is then applied unchanged to both splits
X_train_yjs = preprocess_pipeline_X.transform(X_train)
X_test_yjs = preprocess_pipeline_X.transform(X_test)


In [None]:
X_train

In [None]:
X_train_yjs

In [None]:
# Every Y column is passed to the scaler
scale_cols_Y = Y_train.columns.tolist()

# Same two steps as for X; YeoJohnsonTransformer gets no explicit `variables` here
preprocess_pipeline_Y = Pipeline([
    (
        'yeo_johnson',
        YeoJohnsonTransformer(),
    ),
    (
        'scaler',
        SklearnTransformerWrapper(transformer=StandardScaler(), variables=scale_cols_Y),
    ),
])

# Learn the transformation parameters from the training targets only ...
preprocess_pipeline_Y.fit(Y_train)

# ... then transform both splits with the fitted pipeline
Y_train_yjs = preprocess_pipeline_Y.transform(Y_train)
Y_test_yjs = preprocess_pipeline_Y.transform(Y_test)

In [None]:
Y_train

In [None]:
Y_train_yjs

#### **SVR Learning**

Time to test SVR. First, select a single Y column to use as the target/label (`y`) variable.

In [None]:
from sklearn.svm import SVR
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Target modelled in this cell (other candidates tried:
# "S_CHANG_CORE_SERUM_RESPONSE_UP", 'S_Buck14_score')
y_target = "S_Module11_Prolif_score"
y_train_tg = Y_train_yjs[y_target]
y_test_tg = Y_test_yjs[y_target]

# RBF-kernel SVR, trained on the transformed features and transformed target
model_svr = SVR(kernel='rbf', C=1.0, epsilon=0.1)
model_svr.fit(X_train_yjs, y_train_tg)

# Predictions, still on the transformed scale
y_transformed_trainpred = model_svr.predict(X_train_yjs)
y_transformed_testpred = model_svr.predict(X_test_yjs)

# The Y pipeline is inverted on a frame with all Y columns, so the predictions are
# placed into zero-filled frames of that shape before calling inverse_transform
y_trainpred_dummy_df = pd.DataFrame(0, index=X_train_yjs.index, columns=Y_train_yjs.columns)
y_testpred_dummy_df = pd.DataFrame(0, index=X_test_yjs.index, columns=Y_test_yjs.columns)
y_trainpred_dummy_df[y_target] = y_transformed_trainpred
y_testpred_dummy_df[y_target] = y_transformed_testpred

y_trainpred_dummy_df_inv = preprocess_pipeline_Y.inverse_transform(y_trainpred_dummy_df)
y_testpred_dummy_df_inv = preprocess_pipeline_Y.inverse_transform(y_testpred_dummy_df)

# Only the target column of the inverted frames is of interest
y_trainpred = y_trainpred_dummy_df_inv[y_target].to_numpy()
y_testpred = y_testpred_dummy_df_inv[y_target].to_numpy()

# Metrics are computed against the untransformed targets
y_train_raw = Y_train[y_target]
y_test_raw = Y_test[y_target]

train_r2 = r2_score(y_train_raw, y_trainpred)
test_r2 = r2_score(y_test_raw, y_testpred)
train_rmse = np.sqrt(mean_squared_error(y_train_raw, y_trainpred))
test_rmse = np.sqrt(mean_squared_error(y_test_raw, y_testpred))
train_mae = mean_absolute_error(y_train_raw, y_trainpred)
test_mae = mean_absolute_error(y_test_raw, y_testpred)

# Results table
print("SVR Model Performance:")
print(f"{'Metric':<10} {'Train':<10} {'Test':<10}")
print("-" * 30)
for metric_label, train_value, test_value in (
    ('R2', train_r2, test_r2),
    ('RMSE', train_rmse, test_rmse),
    ('MAE', train_mae, test_mae),
):
    print(f"{metric_label:<10} {train_value:<10.4f} {test_value:<10.4f}")

# Actual vs predicted, one panel per split, with the y = x reference line
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

for panel, actual_values, predicted_values, panel_title in (
    (ax1, y_train_raw, y_trainpred, 'Train Set'),
    (ax2, y_test_raw, y_testpred, 'Test Set'),
):
    panel.scatter(actual_values, predicted_values, alpha=0.5)
    panel.plot([actual_values.min(), actual_values.max()],
               [actual_values.min(), actual_values.max()], 'r--', lw=2)
    panel.set_xlabel('Actual')
    panel.set_ylabel('Predicted')
    panel.set_title(panel_title)
    # dashed gridlines and equal axis scaling
    panel.grid(True, linestyle='--', alpha=0.7)
    panel.axis('equal')

plt.tight_layout()
plt.show()


In [None]:
# Permutation importance of every feature, measured on the test split
# (10 shuffles per feature, fixed seed)
result = permutation_importance(model_svr, X_test_yjs, y_test_tg, n_repeats=10, random_state=42)

# One row per feature with its mean importance, most important first
feature_importance = pd.DataFrame({
    'feature': X_test_yjs.columns,
    'importance': result.importances_mean,
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

# Bar chart of the ranked importances
plt.figure(figsize=(18, 16))
plt.bar(feature_importance['feature'], feature_importance['importance'])
plt.xticks(rotation=90)
plt.title('Feature Importance (Permutation Importance)')
plt.tight_layout()
plt.show()

# Simplified feature visualization
def plot_feature_relationships(X, y_pred, feature_names, n_features=6):
    """Scatter the predictions against each of the first `n_features` features.

    Args:
        X: DataFrame holding the feature columns (indexed by feature name).
        y_pred: Predicted values, one per row of `X`.
        feature_names: Names of the columns of `X` to plot, in plotting order.
        n_features: Maximum number of features to plot; extra names are ignored.

    The panels are laid out three per row. Nothing is drawn when there is
    no feature to plot.
    """
    n_cols = 3

    # Only the features that are really plotted determine the grid size, so that
    # no blank rows are allocated and every unused panel can be hidden.
    selected = list(feature_names)[:max(n_features, 0)]
    if not selected:
        return

    n_rows = (len(selected) + n_cols - 1) // n_cols

    # squeeze=False always yields a 2-D array of axes, also for a single row
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6*n_rows), squeeze=False)
    panels = axes.ravel()  # row-major order: left to right, then top to bottom

    for ax, feature in zip(panels, selected):
        ax.scatter(X[feature], y_pred, alpha=0.5)
        ax.set_xlabel(feature)
        ax.set_ylabel('Predicted Value')
        ax.grid(True)

    # Hide the panels of the last row that received no feature
    for ax in panels[len(selected):]:
        ax.set_visible(False)

    fig.tight_layout()
    plt.show()

# Plot relationships for top 6 important features
top_features = feature_importance['feature'].head(6).tolist()
plot_feature_relationships(X_test_yjs, y_transformed_testpred, top_features)

In [None]:
# Print the first 10 actual vs predicted values of the test split
for actual, pred in zip(Y_test[y_target].head(10), y_testpred[:10]):
    print(f"Actual: {actual:.4f}, Predicted: {pred:.4f}")

# Create the scatter plot (test split, untransformed scale)
plt.figure(figsize=(8, 6))
plt.scatter(Y_test[y_target], y_testpred, alpha=0.5)

# Add the perfect prediction line (y=x)
plt.plot([Y_test[y_target].min(), Y_test[y_target].max()], [Y_test[y_target].min(), Y_test[y_target].max()], 
         'r--', label='Perfect Prediction')

# Labels and title -- the data shown is the test split, so the title says so
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs Predicted Values (Test Data)')
plt.legend()

# Add gridlines
plt.grid(True, linestyle='--', alpha=0.7)

# Make the plot square
plt.axis('equal')

plt.tight_layout()
plt.show()

#### **Iterative Learning over all Y labels**

The learning using SVR above was done on just one Y label. Let's put these into a set of functions so we can run this process iteratively on all Y columns we have set up.

First, set up a class for storing metrics.

In [None]:
class YTargetMetrics:
    """Train/test R2, RMSE and MAE recorded for one target column."""

    # Attribute names, in the order in which to_dict() reports them
    _FIELDS = (
        'target_name',
        'train_r2', 'test_r2',
        'train_rmse', 'test_rmse',
        'train_mae', 'test_mae',
    )

    def __init__(self, target_name, train_r2, test_r2, train_rmse, test_rmse, train_mae, test_mae):
        values = (target_name, train_r2, test_r2, train_rmse, test_rmse, train_mae, test_mae)
        for field, value in zip(self._FIELDS, values):
            setattr(self, field, value)

    def __str__(self):
        """Return a small fixed-width table: one row per metric, train and test columns."""
        table = [
            f"Model Performance for {self.target_name}:",
            f"{'Metric':<10} {'Train':<10} {'Test':<10}",
            '-' * 30,
        ]
        for label, train_value, test_value in (
            ('R2', self.train_r2, self.test_r2),
            ('RMSE', self.train_rmse, self.test_rmse),
            ('MAE', self.train_mae, self.test_mae),
        ):
            table.append(f"{label:<10} {train_value:<10.4f} {test_value:<10.4f}")
        return "\n".join(table)

    def to_dict(self):
        """Return the target name and all six metrics as a plain dict."""
        return {field: getattr(self, field) for field in self._FIELDS}

Then, define functions.

In [None]:
import os
from typing import TypeVar, Dict

def run_svr_model(
   y_target: str,
   Y_train: pd.DataFrame,
   Y_test: pd.DataFrame,
   X_train_transformed: pd.DataFrame,
   X_test_transformed: pd.DataFrame,
   Y_train_transformed: pd.DataFrame,
   Y_test_transformed: pd.DataFrame,
   preprocess_pipeline_Y: Pipeline,
   kernel: str = 'rbf',
   C: float = 1.0,
   epsilon: float = 0.1
) -> YTargetMetrics:
    """Run SVR model for a single target variable and return performance metrics.

    The model is trained on the transformed features/target. Its predictions are
    mapped back to the original scale with `preprocess_pipeline_Y` and compared with
    the untransformed targets. An actual-vs-predicted figure (and, for a linear
    kernel, a coefficient bar chart) is written below ``plots/<y_target>/``.

    Args:
        y_target: Name of the target column
        Y_train: Original training dataframe with untransformed target variables
        Y_test: Original test dataframe with untransformed target variables 
        X_train_transformed: Transformed training features
        X_test_transformed: Transformed test features
        Y_train_transformed: Transformed training targets
        Y_test_transformed: Transformed test targets
        preprocess_pipeline_Y: Fitted preprocessing pipeline for target variables;
            it is inverted on a frame that has the columns of the transformed targets
        kernel: SVR kernel. Defaults to 'rbf' (the previously hard-coded value).
        C: SVR regularization parameter. Defaults to 1.0.
        epsilon: Width of the SVR epsilon-tube. Defaults to 0.1.
    
    Returns:
        YTargetMetrics object containing model performance metrics
    """

    def _to_raw_scale(pred_transformed, features, targets_transformed):
        """Invert the Y pipeline for the predictions of `y_target` only."""
        # inverse_transform is applied to a frame with every target column, so the
        # predictions go into a zero-filled placeholder frame; the other columns
        # are discarded afterwards.
        dummy = pd.DataFrame(0, index=features.index, columns=targets_transformed.columns)
        dummy[y_target] = pred_transformed
        return preprocess_pipeline_Y.inverse_transform(dummy)[y_target].to_numpy()

    # assign untransformed, raw target data
    raw_y_train = Y_train[y_target]
    raw_y_test = Y_test[y_target]

    # Initialize and fit model (in the transformed space)
    model_instance = SVR(kernel=kernel, C=C, epsilon=epsilon)
    model_instance.fit(X_train_transformed, Y_train_transformed[y_target])

    # Predict in the transformed space, then map back to the original scale
    y_train_pred = _to_raw_scale(
        model_instance.predict(X_train_transformed), X_train_transformed, Y_train_transformed
    )
    y_test_pred = _to_raw_scale(
        model_instance.predict(X_test_transformed), X_test_transformed, Y_test_transformed
    )

    # Calculate metrics on the original scale
    train_r2 = r2_score(raw_y_train, y_train_pred)
    test_r2 = r2_score(raw_y_test, y_test_pred)
    train_rmse = np.sqrt(mean_squared_error(raw_y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(raw_y_test, y_test_pred))
    train_mae = mean_absolute_error(raw_y_train, y_train_pred)
    test_mae = mean_absolute_error(raw_y_test, y_test_pred)

    # Create the per-target plot directory if it doesn't exist
    os.makedirs(f'plots/{y_target}', exist_ok=True)

    # Plot actual vs predicted. The figure handle is kept so that saving and
    # closing act on this figure and not on whichever figure happens to be current.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6), dpi=300)
    panels = (
        (ax1, raw_y_train, y_train_pred, 'Training Set'),
        (ax2, raw_y_test, y_test_pred, 'Testing Set'),
    )
    for ax, actual, predicted, title in panels:
        ax.scatter(actual, predicted, alpha=0.5)
        # y = x reference line over the range of the actual values
        ax.plot([actual.min(), actual.max()],
                [actual.min(), actual.max()], 'r--', lw=2)
        ax.set_xlabel('Actual')
        ax.set_ylabel('Predicted')
        ax.set_title(title)
        # Add gridlines
        ax.grid(True, linestyle='--', alpha=0.7)
        # Make the plot square
        ax.axis('equal')
    fig.tight_layout()
    fig.savefig(f'plots/{y_target}/{y_target}-SVR-def-model-performance-comparison.png')
    plt.close(fig)

    # For SVR, instead of feature importance, we can plot feature coefficients 
    # if using linear kernel, or skip if using rbf kernel
    if model_instance.kernel == 'linear':
        coef_fig, coef_ax = plt.subplots(figsize=(18, 16), dpi=300)
        coefficients = pd.Series(model_instance.coef_[0], 
                               index=X_train_transformed.columns)
        coefficients.sort_values(ascending=True).plot(kind='barh', ax=coef_ax)
        coef_ax.set_title('SVR Feature Coefficients (Linear Kernel)')
        coef_fig.tight_layout()
        coef_fig.savefig(f"plots/{y_target}/{y_target}-SVR-def-model-feature-coefficients.png")
        plt.close(coef_fig)

    print(f"Model training and evaluation for {y_target} completed.")

    return YTargetMetrics(y_target, train_r2, test_r2, train_rmse, test_rmse, 
                         train_mae, test_mae)

Run the function in a loop across all Y labels.

In [None]:
# Dictionary to store results for each target
results_dict = {}

# Loop through each target
for y_target in Y_labels_all:
    print(f"\nProcessing target: {y_target}")
    
    results = run_svr_model(
        y_target=y_target,
        Y_train=Y_train,
        Y_test=Y_test,
        X_train_transformed=X_train_yjs,
        X_test_transformed=X_test_yjs,
        Y_train_transformed=Y_train_yjs,
        Y_test_transformed=Y_test_yjs,
        preprocess_pipeline_Y=preprocess_pipeline_Y
    )
    
    # Store results
    results_dict[y_target] = results
    
    # Print individual results
    print(results)

# Summary DataFrame of all results: one row per target, one column per metric.
# Building it row-wise with orient='index' keeps the metric columns numeric; the
# previous column-wise construction followed by .T turned every column into
# `object` dtype because each column mixed the target name with the floats.
summary_df = pd.DataFrame.from_dict(
    {target: results.to_dict() for target, results in results_dict.items()},
    orient='index'
)

# Print overall summary
print("\nOverall Summary:")


# Optionally save results
# summary_df.to_csv('svr_model_results.csv')

In [None]:
show(summary_df, maxBytes=0)

#### **Iterative Learning over Y Labels with `GridSearchCV` for Hyperparameter Tuning**

Now we can rewrite the functions to incorporate `GridSearchCV`.

In [None]:
# from sklearn.model_selection import GridSearchCV

# def run_svr_model_gridsearch(
#     y_target: str,
#     Y_train: pd.DataFrame,
#     Y_test: pd.DataFrame,
#     X_train_transformed: pd.DataFrame,
#     X_test_transformed: pd.DataFrame,
#     Y_train_transformed: pd.DataFrame,
#     Y_test_transformed: pd.DataFrame,
#     preprocess_pipeline_Y: Pipeline
# ) -> YTargetMetrics:
#     """Run SVR model with GridSearchCV for a single target variable and return performance metrics."""

#     # Define parameter grid
#     param_grid = {
#         'C': [0.1, 1, 10, 100],
#         'epsilon': [0.01, 0.1, 0.2],
#         'gamma': ['scale', 'auto', 0.1, 0.01]
#     }

#     # Initialize base model
#     base_model = SVR()

#     # Setup GridSearchCV
#     grid_search = GridSearchCV(
#         estimator=base_model,
#         param_grid=param_grid,
#         cv=5,
#         scoring='neg_mean_squared_error',
#         n_jobs=-1,
#         verbose=1
#     )

#     # Fit GridSearchCV
#     print(f"\nPerforming GridSearchCV for {y_target}...")
#     grid_search.fit(X_train_transformed, Y_train_transformed[y_target])

#     # Print best parameters
#     print(f"\nBest parameters for {y_target}:")
#     print(grid_search.best_params_)

#     # Use best model for predictions
#     model_instance = grid_search.best_estimator_
    
#     # Get predictions (transformed space)
#     y_train_pred_transformed = model_instance.predict(X_train_transformed)
#     y_test_pred_transformed = model_instance.predict(X_test_transformed)

#     # Create dummy DataFrames for inverse transform
#     dummy_train_y = pd.DataFrame(0, index=X_train_transformed.index, 
#                                 columns=Y_train_transformed.columns)
#     dummy_train_y[y_target] = y_train_pred_transformed

#     dummy_test_y = pd.DataFrame(0, index=X_test_transformed.index, 
#                                columns=Y_test_transformed.columns)
#     dummy_test_y[y_target] = y_test_pred_transformed

#     # Inverse transform predictions
#     dummy_train_y_inv = preprocess_pipeline_Y.inverse_transform(dummy_train_y)
#     dummy_test_y_inv = preprocess_pipeline_Y.inverse_transform(dummy_test_y)

#     # Extract the relevant target column
#     y_train_pred = dummy_train_y_inv[y_target].to_numpy()
#     y_test_pred = dummy_test_y_inv[y_target].to_numpy()

#     # Get raw target data
#     raw_y_train = Y_train[y_target]
#     raw_y_test = Y_test[y_target]

#     # Calculate metrics
#     train_r2 = r2_score(raw_y_train, y_train_pred)
#     test_r2 = r2_score(raw_y_test, y_test_pred)
#     train_rmse = np.sqrt(mean_squared_error(raw_y_train, y_train_pred))
#     test_rmse = np.sqrt(mean_squared_error(raw_y_test, y_test_pred))
#     train_mae = mean_absolute_error(raw_y_train, y_train_pred)
#     test_mae = mean_absolute_error(raw_y_test, y_test_pred)

#     # Create plots directory
#     os.makedirs(f'plots/{y_target}', exist_ok=True)

#     # Plot CV results
#     cv_results = pd.DataFrame(grid_search.cv_results_)
#     plt.figure(figsize=(15, 5))
#     plt.subplot(1, 2, 1)
#     plt.plot(cv_results['param_C'], -cv_results['mean_test_score'], 'o-')
#     plt.xlabel('C parameter')
#     plt.ylabel('Mean Squared Error')
#     plt.xscale('log')
    
#     plt.subplot(1, 2, 2)
#     plt.plot(cv_results['param_epsilon'], -cv_results['mean_test_score'], 'o-')
#     plt.xlabel('Epsilon parameter')
#     plt.ylabel('Mean Squared Error')
#     plt.tight_layout()
#     plt.savefig(f'plots/{y_target}/{y_target}-SVR-grid-search-results.png')
#     plt.close()

#     # Plot actual vs predicted
#     _, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6), dpi=300)

#     ax1.scatter(raw_y_train, y_train_pred, alpha=0.5)
#     ax1.plot([raw_y_train.min(), raw_y_train.max()], 
#              [raw_y_train.min(), raw_y_train.max()], 'r--', lw=2)
#     ax1.set_xlabel('Actual')
#     ax1.set_ylabel('Predicted')
#     ax1.set_title('Training Set')
#     ax1.grid(True)

#     ax2.scatter(raw_y_test, y_test_pred, alpha=0.5)
#     ax2.plot([raw_y_test.min(), raw_y_test.max()], 
#              [raw_y_test.min(), raw_y_test.max()], 'r--', lw=2)
#     ax2.set_xlabel('Actual')
#     ax2.set_ylabel('Predicted')
#     ax2.set_title('Testing Set')
#     ax2.grid(True)

#     plt.tight_layout()
#     plt.savefig(f'plots/{y_target}/{y_target}-SVR-tuned-model-performance-comparison.png')
#     plt.close()

#     # Create results object with additional grid search info
#     results = YTargetMetrics(y_target, train_r2, test_r2, train_rmse, test_rmse, train_mae, test_mae)
    
#     # Add grid search results to dictionary
#     grid_search_results = {
#         'best_params': grid_search.best_params_,
#         'best_score': -grid_search.best_score_,  # Convert back from negative MSE
#         'cv_results': grid_search.cv_results_
#     }

#     return results, grid_search_results

# # Modified function to handle multiple targets
# def run_svr_for_multiple_targets(
#     y_columns: list[str],
#     Y_train: pd.DataFrame,
#     Y_test: pd.DataFrame,
#     X_train_transformed: pd.DataFrame,
#     X_test_transformed: pd.DataFrame,
#     Y_train_transformed: pd.DataFrame,
#     Y_test_transformed: pd.DataFrame,
#     preprocess_pipeline_Y: Pipeline
# ) -> Dict[str, tuple[YTargetMetrics, dict]]:
    
#     results_dict = {}
    
#     for y_target in y_columns:
#         try:
#             print(f"\nProcessing target: {y_target}")
            
#             metrics, grid_results = run_svr_model_gridsearch(
#                 y_target=y_target,
#                 Y_train=Y_train,
#                 Y_test=Y_test,
#                 X_train_transformed=X_train_transformed,
#                 X_test_transformed=X_test_transformed,
#                 Y_train_transformed=Y_train_transformed,
#                 Y_test_transformed=Y_test_transformed,
#                 preprocess_pipeline_Y=preprocess_pipeline_Y
#             )
            
#             results_dict[y_target] = (metrics, grid_results)
#             print(f"\nResults for {y_target}:")
#             print(metrics)
#             print("\nBest parameters:", grid_results['best_params'])
#             print("Best CV score (RMSE):", np.sqrt(grid_results['best_score']))
            
#         except Exception as e:
#             print(f"Error processing {y_target}: {str(e)}")
#             continue
    
#     return results_dict

# # Usage:
# Y_columns = Y_labels_all
# all_results = run_svr_for_multiple_targets(
#     y_columns=Y_columns,
#     Y_train=Y_train,
#     Y_test=Y_test,
#     X_train_transformed=X_train_yjs,
#     X_test_transformed=X_test_yjs,
#     Y_train_transformed=Y_train_yjs,
#     Y_test_transformed=Y_test_yjs,
#     preprocess_pipeline_Y=preprocess_pipeline_Y
# )

# # Create summary DataFrame with best parameters
# summary_dict = {
#     target: {
#         **metrics.to_dict(),
#         **{'best_' + k: v for k, v in grid_results['best_params'].items()}
#     }
#     for target, (metrics, grid_results) in all_results.items()
# }

# summary_df = pd.DataFrame.from_dict(summary_dict, orient='index')
# print("\nOverall Summary:")

In [None]:
# show(summary_df)

In [None]:
# get the list from the dict
activator_t = imscore_series_dict['activator_T']
suppressor_t = imscore_series_dict['suppressor_T']
best_prog = imscore_series_dict['HR<1_best_10_prog']
worst_prog = imscore_series_dict['HR>1_worst_10_prog']

# merge
merged_cols = activator_t + suppressor_t + best_prog + worst_prog
merged_cols = list(set(merged_cols))
print(f"Total number of elements in merged_cols (unsorted): {len(merged_cols)}")

# there are repeated immune scores (at least in between two groups, can be more than two groups) so get a list of them first
from itertools import combinations
# list of all the sets
all_sets = [set(activator_t), set(suppressor_t), set(best_prog), set(worst_prog)]

# Get all possible combinations of 2 sets
set_combo = combinations(all_sets, 2)

# Find the union of all pairwise intersections, i.e. every score that occurs in at
# least two groups. It is sorted because the iteration order of a set of strings
# changes between kernel restarts, which would otherwise reorder merged_cols
# from run to run.
union_of_combo = sorted(set.union(*[set.intersection(c1, c2) for c1, c2 in set_combo]))

print(f"Elements that overlap between at least two sets: {union_of_combo}")

# rearrange the list element order based on another list:
# ESTIMATE first, then the overlapping scores, then the remaining scores group by group.
# dict.fromkeys drops any repeated name while keeping the first occurrence, so no
# target is listed (and later trained) twice.
merged_cols = list(dict.fromkeys(
    ["ESTIMATE"]
    + union_of_combo
    + [x for x in activator_t if x not in union_of_combo]
    + [x for x in suppressor_t if x not in union_of_combo]
    + [x for x in best_prog if x not in union_of_combo]
    + [x for x in worst_prog if x not in union_of_combo]
))

print(f"Total number of elements in merged_cols (sorted by original X feature order and groups): {len(merged_cols)}")


Rerun the grid search using the subset of Y labels. 

In [None]:
# Y_columns = merged_cols
# all_results = run_svr_for_multiple_targets(
#     y_columns=Y_columns,
#     Y_train=Y_train,
#     Y_test=Y_test,
#     X_train_transformed=X_train_yjs,
#     X_test_transformed=X_test_yjs,
#     Y_train_transformed=Y_train_yjs,
#     Y_test_transformed=Y_test_yjs,
#     preprocess_pipeline_Y=preprocess_pipeline_Y
# )

# # Create summary DataFrame with best parameters
# summary_dict_ss = {
#     target: {
#         **metrics.to_dict(),
#         **{'best_' + k: v for k, v in grid_results['best_params'].items()}
#     }
#     for target, (metrics, grid_results) in all_results.items()
# }

# summary_df_ss = pd.DataFrame.from_dict(summary_dict_ss, orient='index')
# print("\nOverall Summary:")

In [None]:
# show(summary_df_ss)

#### **Using `Yellowbrick` Package to Visualize Residuals**

To test this, we can use the Y target with the best prediction score, `S_Module11_Prolif_score`.

##### SVR

In [None]:
# Residual plot for a single SVR fitted on the transformed data, using the
# target S_Module11_Prolif_score. The plot is written to svr_residuals.png.
from yellowbrick.regressor import ResidualsPlot
from sklearn.svm import SVR

# Aliases for the transformed feature frames (no copy is made).
X_train_yb = X_train_yjs
X_test_yb = X_test_yjs

# Transformed target column for the train / test split.
y_train_target = Y_train_yjs["S_Module11_Prolif_score"]
y_test_target = Y_test_yjs["S_Module11_Prolif_score"]

# Instantiate the RBF-kernel SVR (not a linear model) with fixed hyperparameters
# and wrap it in the yellowbrick residuals visualizer.
model_inst = SVR(kernel='rbf', C=100.0, epsilon=0.2, gamma='scale')
visualizer = ResidualsPlot(model_inst)

visualizer.fit(X_train_yb, y_train_target)  # Fit the training data to the visualizer
visualizer.score(X_test_yb, y_test_target)  # Evaluate the model on the test data
visualizer.show(outpath="svr_residuals.png", dpi=300)

In [None]:
from sklearn.model_selection import GridSearchCV

def run_svr_model_gridsearch(
    y_target: str,
    Y_train: pd.DataFrame,
    Y_test: pd.DataFrame,
    X_train_transformed: pd.DataFrame,
    X_test_transformed: pd.DataFrame,
    Y_train_transformed: pd.DataFrame,
    Y_test_transformed: pd.DataFrame,
    preprocess_pipeline_Y: Pipeline
) -> tuple[YTargetMetrics, dict]:
    """Tune an SVR for one target with GridSearchCV, then evaluate and plot it.

    The grid search and the residual plot work on the transformed data. The
    predictions are then mapped back through ``preprocess_pipeline_Y`` so that
    R2 / RMSE / MAE are computed against the raw ``Y_train`` / ``Y_test`` values.

    Args:
        y_target: Name of the target column to model.
        Y_train: Raw training targets; ``Y_train[y_target]`` is the reference
            for the training metrics.
        Y_test: Raw test targets; ``Y_test[y_target]`` is the reference for the
            test metrics.
        X_train_transformed: Transformed training features.
        X_test_transformed: Transformed test features.
        Y_train_transformed: Transformed training targets (all target columns).
        Y_test_transformed: Transformed test targets (all target columns).
        preprocess_pipeline_Y: Fitted target pipeline used for
            ``inverse_transform``.

    Returns:
        ``(metrics, grid_search_results)`` where ``metrics`` is a
        ``YTargetMetrics`` built from the raw-scale scores and
        ``grid_search_results`` is a dict with the keys ``best_params``,
        ``best_score`` (mean CV MSE, sign flipped back to positive) and
        ``cv_results``.

    Side effects:
        Creates ``plots/<y_target>/`` and writes the residual plot, the grid
        search plot and the actual-vs-predicted plot into it.
    """
    # Create the output directory before anything is saved. It used to be
    # created only after the residual plot had been written, so the first run
    # for a target failed on the missing directory.
    plot_dir = f'plots/{y_target}'
    os.makedirs(plot_dir, exist_ok=True)

    # Define parameter grid
    param_grid = {
        'C': [0.1, 1, 10, 100],
        'epsilon': [0.01, 0.1, 0.2],
        'gamma': ['scale', 'auto', 0.1, 0.01]
    }

    # Initialize base model
    base_model = SVR()

    # Setup GridSearchCV
    grid_search = GridSearchCV(
        estimator=base_model,
        param_grid=param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=1
    )

    # Fit GridSearchCV
    print(f"\nPerforming GridSearchCV for {y_target}...")
    grid_search.fit(X_train_transformed, Y_train_transformed[y_target])

    # Print best parameters
    print(f"\nBest parameters for {y_target}:")
    print(grid_search.best_params_)

    # Use best model for predictions
    model_instance = grid_search.best_estimator_

    # yellowbrick visualizer (residuals are in the transformed space)
    visualizer = ResidualsPlot(model_instance)
    visualizer.fit(X_train_transformed, Y_train_transformed[y_target])  # Fit the training data to the visualizer
    visualizer.score(X_test_transformed, Y_test_transformed[y_target])  # Evaluate the model on the test data
    visualizer.show(outpath=f'{plot_dir}/{y_target}-SVR-tuned-model-residual-plot.png', dpi=300)

    # Get predictions (transformed space)
    y_train_pred_transformed = model_instance.predict(X_train_transformed)
    y_test_pred_transformed = model_instance.predict(X_test_transformed)

    # The inverse transform is applied to a frame holding every Y column, so the
    # predictions are placed in a zero-filled frame with the same columns.
    # NOTE(review): presumably preprocess_pipeline_Y was fitted on all Y columns
    # at once, and it must return DataFrames for the column lookup below — confirm.
    dummy_train_y = pd.DataFrame(0, index=X_train_transformed.index,
                                 columns=Y_train_transformed.columns)
    dummy_train_y[y_target] = y_train_pred_transformed

    dummy_test_y = pd.DataFrame(0, index=X_test_transformed.index,
                                columns=Y_test_transformed.columns)
    dummy_test_y[y_target] = y_test_pred_transformed

    # Inverse transform predictions
    dummy_train_y_inv = preprocess_pipeline_Y.inverse_transform(dummy_train_y)
    dummy_test_y_inv = preprocess_pipeline_Y.inverse_transform(dummy_test_y)

    # Extract the relevant target column
    y_train_pred = dummy_train_y_inv[y_target].to_numpy()
    y_test_pred = dummy_test_y_inv[y_target].to_numpy()

    # Get raw target data
    raw_y_train = Y_train[y_target]
    raw_y_test = Y_test[y_target]

    # Calculate metrics on the raw scale
    train_r2 = r2_score(raw_y_train, y_train_pred)
    test_r2 = r2_score(raw_y_test, y_test_pred)
    train_rmse = np.sqrt(mean_squared_error(raw_y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(raw_y_test, y_test_pred))
    train_mae = mean_absolute_error(raw_y_train, y_train_pred)
    test_mae = mean_absolute_error(raw_y_test, y_test_pred)

    # Plot CV results: every grid combination is drawn, against C on the left
    # and against epsilon on the right (mean_test_score is negated back to MSE).
    cv_results = pd.DataFrame(grid_search.cv_results_)
    plt.figure(figsize=(15, 5))
    plt.subplot(1, 2, 1)
    plt.plot(cv_results['param_C'], -cv_results['mean_test_score'], 'o-')
    plt.xlabel('C parameter')
    plt.ylabel('Mean Squared Error')
    plt.xscale('log')

    plt.subplot(1, 2, 2)
    plt.plot(cv_results['param_epsilon'], -cv_results['mean_test_score'], 'o-')
    plt.xlabel('Epsilon parameter')
    plt.ylabel('Mean Squared Error')
    plt.tight_layout()
    plt.savefig(f'{plot_dir}/{y_target}-SVR-grid-search-results.png')
    plt.close()

    # Plot actual vs predicted (raw scale); the dashed red line is y = x.
    _, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6), dpi=300)

    ax1.scatter(raw_y_train, y_train_pred, alpha=0.5)
    ax1.plot([raw_y_train.min(), raw_y_train.max()],
             [raw_y_train.min(), raw_y_train.max()], 'r--', lw=2)
    ax1.set_xlabel('Actual')
    ax1.set_ylabel('Predicted')
    ax1.set_title('Training Set')
    ax1.grid(True)

    ax2.scatter(raw_y_test, y_test_pred, alpha=0.5)
    ax2.plot([raw_y_test.min(), raw_y_test.max()],
             [raw_y_test.min(), raw_y_test.max()], 'r--', lw=2)
    ax2.set_xlabel('Actual')
    ax2.set_ylabel('Predicted')
    ax2.set_title('Testing Set')
    ax2.grid(True)

    plt.tight_layout()
    plt.savefig(f'{plot_dir}/{y_target}-SVR-tuned-model-performance-comparison.png')
    plt.close()

    # Create results object with additional grid search info
    results = YTargetMetrics(y_target, train_r2, test_r2, train_rmse, test_rmse, train_mae, test_mae)

    # Add grid search results to dictionary
    grid_search_results = {
        'best_params': grid_search.best_params_,
        'best_score': -grid_search.best_score_,  # Convert back from negative MSE
        'cv_results': grid_search.cv_results_
    }

    return results, grid_search_results

# Modified function to handle multiple targets
def run_svr_for_multiple_targets(
    y_columns: list[str],
    Y_train: pd.DataFrame,
    Y_test: pd.DataFrame,
    X_train_transformed: pd.DataFrame,
    X_test_transformed: pd.DataFrame,
    Y_train_transformed: pd.DataFrame,
    Y_test_transformed: pd.DataFrame,
    preprocess_pipeline_Y: Pipeline
) -> Dict[str, tuple[YTargetMetrics, dict]]:
    """Run the SVR grid search for each target in turn and collect the results.

    Every name in ``y_columns`` is passed to ``run_svr_model_gridsearch``
    together with the shared data frames and target pipeline. A target that
    raises is reported with a printed message and left out of the result; the
    loop then carries on with the next target.

    Returns:
        Dict mapping each successfully processed target name to the
        ``(metrics, grid_results)`` pair returned for it.
    """
    # Arguments that are the same for every target.
    shared_inputs = dict(
        Y_train=Y_train,
        Y_test=Y_test,
        X_train_transformed=X_train_transformed,
        X_test_transformed=X_test_transformed,
        Y_train_transformed=Y_train_transformed,
        Y_test_transformed=Y_test_transformed,
        preprocess_pipeline_Y=preprocess_pipeline_Y,
    )

    collected = {}
    for target_name in y_columns:
        try:
            print(f"\nProcessing target: {target_name}")

            target_metrics, search_info = run_svr_model_gridsearch(
                y_target=target_name, **shared_inputs
            )
            collected[target_name] = (target_metrics, search_info)

            # Per-target report
            print(f"\nResults for {target_name}:")
            print(target_metrics)
            print("\nBest parameters:", search_info['best_params'])
            print("Best CV score (RMSE):", np.sqrt(search_info['best_score']))
        except Exception as err:
            # Best effort: report the failure and move on to the next target.
            print(f"Error processing {target_name}: {str(err)}")

    return collected

# Usage:
# Run the SVR grid search for every target listed in Y_labels_all.
Y_columns = Y_labels_all
all_results = run_svr_for_multiple_targets(
    y_columns=Y_columns,
    Y_train=Y_train,
    Y_test=Y_test,
    X_train_transformed=X_train_yjs,
    X_test_transformed=X_test_yjs,
    Y_train_transformed=Y_train_yjs,
    Y_test_transformed=Y_test_yjs,
    preprocess_pipeline_Y=preprocess_pipeline_Y
)

# Create summary DataFrame with best parameters:
# one row per target, holding the metric fields from metrics.to_dict() plus
# one 'best_<param>' column per tuned hyperparameter.
summary_dict = {
    target: {
        **metrics.to_dict(),
        **{'best_' + k: v for k, v in grid_results['best_params'].items()}
    }
    for target, (metrics, grid_results) in all_results.items()
}

# Targets become the index (orient='index'); the table itself is displayed in the next cell.
summary_df = pd.DataFrame.from_dict(summary_dict, orient='index')
print("\nOverall Summary:")

In [None]:
# Display the SVR summary table with itables.
# NOTE(review): maxBytes=0 presumably turns off itables' size-based downsampling so every row is shown — confirm.
show(summary_df, maxBytes=0)

##### RF

In [None]:
# Residual plot for a single random forest fitted on the transformed data, using
# the target S_Module11_Prolif_score. The plot is written to rf_residuals.png.
# ResidualsPlot is imported in the SVR residuals cell, so that cell must run first.
from sklearn.ensemble import RandomForestRegressor

# Aliases for the transformed feature frames (no copy is made).
X_train_rf_yb = X_train_yjs
X_test_rf_yb = X_test_yjs

# Transformed target column; these two names are also assigned by the SVR residuals cell.
y_train_target = Y_train_yjs["S_Module11_Prolif_score"]
y_test_target = Y_test_yjs["S_Module11_Prolif_score"]

# Instantiate the random forest (not a linear model) with a fixed seed and
# wrap it in the yellowbrick residuals visualizer.
model_inst = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)
visualizer = ResidualsPlot(model_inst)

visualizer.fit(X_train_rf_yb, y_train_target)  # Fit the training data to the visualizer
visualizer.score(X_test_rf_yb, y_test_target)  # Evaluate the model on the test data
visualizer.show(outpath="rf_residuals.png", dpi=300)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

def run_rf_model_gridsearch(
    y_target: str,
    Y_train: pd.DataFrame,
    Y_test: pd.DataFrame,
    X_train_transformed: pd.DataFrame,
    X_test_transformed: pd.DataFrame,
    Y_train_transformed: pd.DataFrame,
    Y_test_transformed: pd.DataFrame,
    preprocess_pipeline_Y: Pipeline
) -> tuple[YTargetMetrics, dict]:
    """Tune a random forest for one target with GridSearchCV, then evaluate and plot it.

    The grid search and the residual plot work on the transformed data. The
    predictions are then mapped back through ``preprocess_pipeline_Y`` so that
    R2 / RMSE / MAE are computed against the raw ``Y_train`` / ``Y_test`` values.

    Args:
        y_target: Name of the target column to model.
        Y_train: Raw training targets; ``Y_train[y_target]`` is the reference
            for the training metrics.
        Y_test: Raw test targets; ``Y_test[y_target]`` is the reference for the
            test metrics.
        X_train_transformed: Transformed training features.
        X_test_transformed: Transformed test features.
        Y_train_transformed: Transformed training targets (all target columns).
        Y_test_transformed: Transformed test targets (all target columns).
        preprocess_pipeline_Y: Fitted target pipeline used for
            ``inverse_transform``.

    Returns:
        ``(metrics, grid_search_results)`` where ``metrics`` is a
        ``YTargetMetrics`` built from the raw-scale scores and
        ``grid_search_results`` is a dict with the keys ``best_params``,
        ``best_score`` (mean CV MSE, sign flipped back to positive),
        ``cv_results`` and ``feature_importance`` (the sorted importance table
        as a dict).

    Side effects:
        Creates ``plots/<y_target>/`` and writes the residual plot, the feature
        importance plot and the actual-vs-predicted plot into it.
    """
    # Create the output directory before anything is saved. It used to be
    # created only after the residual plot had been written, so the first run
    # for a target failed on the missing directory.
    plot_dir = f'plots/{y_target}'
    os.makedirs(plot_dir, exist_ok=True)

    # Define parameter grid for Random Forest
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    }

    # Initialize base model (fixed seed so repeated runs build the same forests)
    base_model = RandomForestRegressor(random_state=42, n_jobs=-1)

    # Setup GridSearchCV
    grid_search = GridSearchCV(
        estimator=base_model,
        param_grid=param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=1
    )

    # Fit GridSearchCV
    print(f"\nPerforming GridSearchCV for {y_target}...")
    grid_search.fit(X_train_transformed, Y_train_transformed[y_target])

    # Print best parameters
    print(f"\nBest parameters for {y_target}:")
    print(grid_search.best_params_)

    # Use best model for predictions
    model_instance = grid_search.best_estimator_

    # yellowbrick visualizer (residuals are in the transformed space)
    visualizer = ResidualsPlot(model_instance)
    visualizer.fit(X_train_transformed, Y_train_transformed[y_target])
    visualizer.score(X_test_transformed, Y_test_transformed[y_target])
    visualizer.show(outpath=f'{plot_dir}/{y_target}-RF-tuned-model-residual-plot.png', dpi=300)

    # Get predictions (transformed space)
    y_train_pred_transformed = model_instance.predict(X_train_transformed)
    y_test_pred_transformed = model_instance.predict(X_test_transformed)

    # The inverse transform is applied to a frame holding every Y column, so the
    # predictions are placed in a zero-filled frame with the same columns.
    # NOTE(review): presumably preprocess_pipeline_Y was fitted on all Y columns
    # at once, and it must return DataFrames for the column lookup below — confirm.
    dummy_train_y = pd.DataFrame(0, index=X_train_transformed.index,
                                 columns=Y_train_transformed.columns)
    dummy_train_y[y_target] = y_train_pred_transformed

    dummy_test_y = pd.DataFrame(0, index=X_test_transformed.index,
                                columns=Y_test_transformed.columns)
    dummy_test_y[y_target] = y_test_pred_transformed

    # Inverse transform predictions
    dummy_train_y_inv = preprocess_pipeline_Y.inverse_transform(dummy_train_y)
    dummy_test_y_inv = preprocess_pipeline_Y.inverse_transform(dummy_test_y)

    # Extract the relevant target column
    y_train_pred = dummy_train_y_inv[y_target].to_numpy()
    y_test_pred = dummy_test_y_inv[y_target].to_numpy()

    # Get raw target data
    raw_y_train = Y_train[y_target]
    raw_y_test = Y_test[y_target]

    # Calculate metrics on the raw scale
    train_r2 = r2_score(raw_y_train, y_train_pred)
    test_r2 = r2_score(raw_y_test, y_test_pred)
    train_rmse = np.sqrt(mean_squared_error(raw_y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(raw_y_test, y_test_pred))
    train_mae = mean_absolute_error(raw_y_train, y_train_pred)
    test_mae = mean_absolute_error(raw_y_test, y_test_pred)

    # Plot feature importances, most important feature first
    feature_importance = pd.DataFrame({
        'feature': X_train_transformed.columns,
        'importance': model_instance.feature_importances_
    }).sort_values('importance', ascending=False)

    plt.figure(figsize=(10, 6))
    plt.bar(range(len(feature_importance)), feature_importance['importance'])
    plt.xticks(range(len(feature_importance)), feature_importance['feature'], rotation=45, ha='right')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.title(f'Feature Importance for {y_target}')
    plt.tight_layout()
    plt.savefig(f'{plot_dir}/{y_target}-RF-feature-importance.png', dpi=300)
    plt.close()

    # Plot actual vs predicted (raw scale); the dashed red line is y = x.
    _, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6), dpi=300)

    ax1.scatter(raw_y_train, y_train_pred, alpha=0.5)
    ax1.plot([raw_y_train.min(), raw_y_train.max()],
             [raw_y_train.min(), raw_y_train.max()], 'r--', lw=2)
    ax1.set_xlabel('Actual')
    ax1.set_ylabel('Predicted')
    ax1.set_title('Training Set')
    ax1.grid(True)

    ax2.scatter(raw_y_test, y_test_pred, alpha=0.5)
    ax2.plot([raw_y_test.min(), raw_y_test.max()],
             [raw_y_test.min(), raw_y_test.max()], 'r--', lw=2)
    ax2.set_xlabel('Actual')
    ax2.set_ylabel('Predicted')
    ax2.set_title('Testing Set')
    ax2.grid(True)

    plt.tight_layout()
    plt.savefig(f'{plot_dir}/{y_target}-RF-tuned-model-performance-comparison.png')
    plt.close()

    # Create results object with additional grid search info
    results = YTargetMetrics(y_target, train_r2, test_r2, train_rmse, test_rmse, train_mae, test_mae)

    # Add grid search results to dictionary
    grid_search_results = {
        'best_params': grid_search.best_params_,
        'best_score': -grid_search.best_score_,  # Convert back from negative MSE
        'cv_results': grid_search.cv_results_,
        'feature_importance': feature_importance.to_dict()
    }

    return results, grid_search_results

def run_rf_for_multiple_targets(
    y_columns: list[str],
    Y_train: pd.DataFrame,
    Y_test: pd.DataFrame,
    X_train_transformed: pd.DataFrame,
    X_test_transformed: pd.DataFrame,
    Y_train_transformed: pd.DataFrame,
    Y_test_transformed: pd.DataFrame,
    preprocess_pipeline_Y: Pipeline
) -> Dict[str, tuple[YTargetMetrics, dict]]:
    """Run the random forest grid search for each target in turn and collect the results.

    Every name in ``y_columns`` is passed to ``run_rf_model_gridsearch``
    together with the shared data frames and target pipeline. For each target
    the metrics, best parameters, CV score and ten largest feature importances
    are printed. A target that raises is reported with a printed message and
    left out of the result; the loop then carries on with the next target.

    Returns:
        Dict mapping each successfully processed target name to the
        ``(metrics, grid_results)`` pair returned for it.
    """
    # Arguments that are the same for every target.
    shared_inputs = dict(
        Y_train=Y_train,
        Y_test=Y_test,
        X_train_transformed=X_train_transformed,
        X_test_transformed=X_test_transformed,
        Y_train_transformed=Y_train_transformed,
        Y_test_transformed=Y_test_transformed,
        preprocess_pipeline_Y=preprocess_pipeline_Y,
    )

    collected = {}
    for target_name in y_columns:
        try:
            print(f"\nProcessing target: {target_name}")

            target_metrics, search_info = run_rf_model_gridsearch(
                y_target=target_name, **shared_inputs
            )
            collected[target_name] = (target_metrics, search_info)

            # Per-target report
            print(f"\nResults for {target_name}:")
            print(target_metrics)
            print("\nBest parameters:", search_info['best_params'])
            print("Best CV score (RMSE):", np.sqrt(search_info['best_score']))

            # Rebuild the importance table from its dict form and show the top ten rows.
            importance_table = pd.DataFrame(search_info['feature_importance'])
            ranked = importance_table.sort_values('importance', ascending=False)
            print("\nTop 10 most important features:")
            print(ranked.head(10))
        except Exception as err:
            # Best effort: report the failure and move on to the next target.
            print(f"Error processing {target_name}: {str(err)}")

    return collected

# Usage:
# Run the random forest grid search for every target listed in Y_labels_all.
# NOTE: Y_columns, all_results, summary_dict and summary_df are the same names
# the SVR usage cell assigns, so running this cell replaces the SVR results.
Y_columns = Y_labels_all
all_results = run_rf_for_multiple_targets(
    y_columns=Y_columns,
    Y_train=Y_train,
    Y_test=Y_test,
    X_train_transformed=X_train_yjs,
    X_test_transformed=X_test_yjs,
    Y_train_transformed=Y_train_yjs,
    Y_test_transformed=Y_test_yjs,
    preprocess_pipeline_Y=preprocess_pipeline_Y
)

# Create summary DataFrame with best parameters:
# one row per target, holding the metric fields from metrics.to_dict() plus
# one 'best_<param>' column per tuned hyperparameter.
summary_dict = {
    target: {
        **metrics.to_dict(),
        **{'best_' + k: v for k, v in grid_results['best_params'].items()}
    }
    for target, (metrics, grid_results) in all_results.items()
}

# Targets become the index (orient='index'); the table itself is displayed in the next cell.
summary_df = pd.DataFrame.from_dict(summary_dict, orient='index')
print("\nOverall Summary:")

In [None]:
# Display the random forest summary table with itables.
# NOTE(review): maxBytes=0 presumably turns off itables' size-based downsampling so every row is shown — confirm.
show(summary_df, maxBytes=0)

##### EN

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet

def run_elasticnet_model_gridsearch(
    y_target: str,
    Y_train: pd.DataFrame,
    Y_test: pd.DataFrame,
    X_train_transformed: pd.DataFrame,
    X_test_transformed: pd.DataFrame,
    Y_train_transformed: pd.DataFrame,
    Y_test_transformed: pd.DataFrame,
    preprocess_pipeline_Y: Pipeline
) -> tuple[YTargetMetrics, dict]:
    """Tune an Elastic Net for one target with GridSearchCV, then evaluate and plot it.

    The grid search and the residual plot work on the transformed data. The
    predictions are then mapped back through ``preprocess_pipeline_Y`` so that
    R2 / RMSE / MAE are computed against the raw ``Y_train`` / ``Y_test`` values.

    Args:
        y_target: Name of the target column to model.
        Y_train: Raw training targets; ``Y_train[y_target]`` is the reference
            for the training metrics.
        Y_test: Raw test targets; ``Y_test[y_target]`` is the reference for the
            test metrics.
        X_train_transformed: Transformed training features.
        X_test_transformed: Transformed test features.
        Y_train_transformed: Transformed training targets (all target columns).
        Y_test_transformed: Transformed test targets (all target columns).
        preprocess_pipeline_Y: Fitted target pipeline used for
            ``inverse_transform``.

    Returns:
        ``(metrics, grid_search_results)`` where ``metrics`` is a
        ``YTargetMetrics`` built from the raw-scale scores and
        ``grid_search_results`` is a dict with the keys ``best_params``,
        ``best_score`` (mean CV MSE, sign flipped back to positive),
        ``cv_results``, ``coefficients`` (the non-zero coefficient table as a
        dict) and ``number_of_features_selected`` (count of non-zero
        coefficients).

    Side effects:
        Creates ``plots/<y_target>/`` and writes the residual plot, the
        coefficient plot, the regularization-path plot and the
        actual-vs-predicted plot into it.
    """
    # Create the output directory before anything is saved. It used to be
    # created only after the residual plot had been written, so the first run
    # for a target failed on the missing directory.
    plot_dir = f'plots/{y_target}'
    os.makedirs(plot_dir, exist_ok=True)

    # Define parameter grid for Elastic Net
    param_grid = {
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
        'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],  # 0=Ridge, 1=Lasso
        'max_iter': [2000]  # Higher max_iter to ensure convergence
        #'tol': [1e-4]
    }

    # Initialize base model
    base_model = ElasticNet(random_state=42)

    # Setup GridSearchCV
    grid_search = GridSearchCV(
        estimator=base_model,
        param_grid=param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=1
    )

    # Fit GridSearchCV
    print(f"\nPerforming GridSearchCV for {y_target}...")
    grid_search.fit(X_train_transformed, Y_train_transformed[y_target])

    # Print best parameters
    print(f"\nBest parameters for {y_target}:")
    print(grid_search.best_params_)

    # Use best model for predictions
    model_instance = grid_search.best_estimator_

    # yellowbrick visualizer (residuals are in the transformed space)
    visualizer = ResidualsPlot(model_instance)
    visualizer.fit(X_train_transformed, Y_train_transformed[y_target])
    visualizer.score(X_test_transformed, Y_test_transformed[y_target])
    visualizer.show(outpath=f'{plot_dir}/{y_target}-ElasticNet-tuned-model-residual-plot.png', dpi=300)

    # Get predictions (transformed space)
    y_train_pred_transformed = model_instance.predict(X_train_transformed)
    y_test_pred_transformed = model_instance.predict(X_test_transformed)

    # The inverse transform is applied to a frame holding every Y column, so the
    # predictions are placed in a zero-filled frame with the same columns.
    # NOTE(review): presumably preprocess_pipeline_Y was fitted on all Y columns
    # at once, and it must return DataFrames for the column lookup below — confirm.
    dummy_train_y = pd.DataFrame(0, index=X_train_transformed.index,
                                 columns=Y_train_transformed.columns)
    dummy_train_y[y_target] = y_train_pred_transformed

    dummy_test_y = pd.DataFrame(0, index=X_test_transformed.index,
                                columns=Y_test_transformed.columns)
    dummy_test_y[y_target] = y_test_pred_transformed

    # Inverse transform predictions
    dummy_train_y_inv = preprocess_pipeline_Y.inverse_transform(dummy_train_y)
    dummy_test_y_inv = preprocess_pipeline_Y.inverse_transform(dummy_test_y)

    # Extract the relevant target column
    y_train_pred = dummy_train_y_inv[y_target].to_numpy()
    y_test_pred = dummy_test_y_inv[y_target].to_numpy()

    # Get raw target data
    raw_y_train = Y_train[y_target]
    raw_y_test = Y_test[y_target]

    # Calculate metrics on the raw scale
    train_r2 = r2_score(raw_y_train, y_train_pred)
    test_r2 = r2_score(raw_y_test, y_test_pred)
    train_rmse = np.sqrt(mean_squared_error(raw_y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(raw_y_test, y_test_pred))
    train_mae = mean_absolute_error(raw_y_train, y_train_pred)
    test_mae = mean_absolute_error(raw_y_test, y_test_pred)

    # Plot coefficient values: only the non-zero ones, largest first
    non_zero_coef = pd.DataFrame({
        'feature': X_train_transformed.columns,
        'coefficient': model_instance.coef_
    })
    non_zero_coef = non_zero_coef[non_zero_coef['coefficient'] != 0].sort_values('coefficient', ascending=False)

    plt.figure(figsize=(12, 6))
    plt.bar(range(len(non_zero_coef)), non_zero_coef['coefficient'])
    plt.xticks(range(len(non_zero_coef)), non_zero_coef['feature'], rotation=45, ha='right')
    plt.xlabel('Features')
    plt.ylabel('Coefficient Value')
    plt.title(f'Non-zero Coefficients for {y_target}')
    plt.tight_layout()
    plt.savefig(f'{plot_dir}/{y_target}-ElasticNet-coefficients.png', dpi=300)
    plt.close()

    # Plot regularization path from the CV results
    alphas = param_grid['alpha']
    l1_ratios = param_grid['l1_ratio']
    cv_results = pd.DataFrame(grid_search.cv_results_)

    plt.figure(figsize=(12, 5))

    # Left panel: CV MSE against alpha, one line per l1_ratio
    plt.subplot(1, 2, 1)
    for l1_ratio in l1_ratios:
        mask = cv_results['param_l1_ratio'] == l1_ratio
        plt.plot(cv_results[mask]['param_alpha'],
                 -cv_results[mask]['mean_test_score'],
                 'o-', label=f'l1_ratio={l1_ratio}')
    plt.xscale('log')
    plt.xlabel('alpha')
    plt.ylabel('Mean Squared Error')
    plt.title('Regularization Path')
    plt.legend()

    # Right panel: CV MSE against l1_ratio, one line per alpha
    plt.subplot(1, 2, 2)
    for alpha in alphas:
        mask = cv_results['param_alpha'] == alpha
        plt.plot(cv_results[mask]['param_l1_ratio'],
                 -cv_results[mask]['mean_test_score'],
                 'o-', label=f'alpha={alpha}')
    plt.xlabel('l1_ratio')
    plt.ylabel('Mean Squared Error')
    plt.title('L1 Ratio Impact')
    plt.legend()

    plt.tight_layout()
    plt.savefig(f'{plot_dir}/{y_target}-ElasticNet-regularization-path.png', dpi=300)
    plt.close()

    # Plot actual vs predicted (raw scale); the dashed red line is y = x.
    _, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6), dpi=300)

    ax1.scatter(raw_y_train, y_train_pred, alpha=0.5)
    ax1.plot([raw_y_train.min(), raw_y_train.max()],
             [raw_y_train.min(), raw_y_train.max()], 'r--', lw=2)
    ax1.set_xlabel('Actual')
    ax1.set_ylabel('Predicted')
    ax1.set_title('Training Set')
    ax1.grid(True)

    ax2.scatter(raw_y_test, y_test_pred, alpha=0.5)
    ax2.plot([raw_y_test.min(), raw_y_test.max()],
             [raw_y_test.min(), raw_y_test.max()], 'r--', lw=2)
    ax2.set_xlabel('Actual')
    ax2.set_ylabel('Predicted')
    ax2.set_title('Testing Set')
    ax2.grid(True)

    plt.tight_layout()
    plt.savefig(f'{plot_dir}/{y_target}-ElasticNet-tuned-model-performance-comparison.png')
    plt.close()

    # Create results object with additional grid search info
    results = YTargetMetrics(y_target, train_r2, test_r2, train_rmse, test_rmse, train_mae, test_mae)

    # Add grid search results to dictionary
    grid_search_results = {
        'best_params': grid_search.best_params_,
        'best_score': -grid_search.best_score_,  # Convert back from negative MSE
        'cv_results': grid_search.cv_results_,
        'coefficients': non_zero_coef.to_dict(),
        'number_of_features_selected': (model_instance.coef_ != 0).sum()
    }

    return results, grid_search_results

def run_elasticnet_for_multiple_targets(
    y_columns: list[str],
    Y_train: pd.DataFrame,
    Y_test: pd.DataFrame,
    X_train_transformed: pd.DataFrame,
    X_test_transformed: pd.DataFrame,
    Y_train_transformed: pd.DataFrame,
    Y_test_transformed: pd.DataFrame,
    preprocess_pipeline_Y: Pipeline
) -> Dict[str, tuple[YTargetMetrics, dict]]:
    """Run the Elastic Net grid search for each target in turn and collect the results.

    Every name in ``y_columns`` is passed to ``run_elasticnet_model_gridsearch``
    together with the shared data frames and target pipeline. For each target
    the metrics, best parameters, CV score, number of non-zero coefficients and
    the ten coefficients of largest magnitude are printed. A target that raises
    is reported with a printed message and left out of the result; the loop
    then carries on with the next target.

    Returns:
        Dict mapping each successfully processed target name to the
        ``(metrics, grid_results)`` pair returned for it.
    """
    # Arguments that are the same for every target.
    shared_inputs = dict(
        Y_train=Y_train,
        Y_test=Y_test,
        X_train_transformed=X_train_transformed,
        X_test_transformed=X_test_transformed,
        Y_train_transformed=Y_train_transformed,
        Y_test_transformed=Y_test_transformed,
        preprocess_pipeline_Y=preprocess_pipeline_Y,
    )

    collected = {}
    for target_name in y_columns:
        try:
            print(f"\nProcessing target: {target_name}")

            target_metrics, search_info = run_elasticnet_model_gridsearch(
                y_target=target_name, **shared_inputs
            )
            collected[target_name] = (target_metrics, search_info)

            # Per-target report
            print(f"\nResults for {target_name}:")
            print(target_metrics)
            print("\nBest parameters:", search_info['best_params'])
            print("Best CV score (RMSE):", np.sqrt(search_info['best_score']))
            print(f"Number of features selected: {search_info['number_of_features_selected']}")

            # Rank the stored coefficients by magnitude and show the ten largest.
            coef_table = pd.DataFrame(search_info['coefficients'])
            coef_table['abs_coefficient'] = coef_table['coefficient'].abs()
            print("\nTop 10 most important features by coefficient magnitude:")
            strongest = coef_table.nlargest(10, 'abs_coefficient')
            print(strongest[['feature', 'coefficient']])
        except Exception as err:
            # Best effort: report the failure and move on to the next target.
            print(f"Error processing {target_name}: {str(err)}")

    return collected

# Usage:
# Run the Elastic Net grid search for every target listed in Y_labels_all.
# NOTE: Y_columns, all_results and summary_dict are the same names the SVR and
# RF usage cells assign, so running this cell replaces their values. The
# summary frame is stored under its own name, summary_df_en.
Y_columns = Y_labels_all
all_results = run_elasticnet_for_multiple_targets(
    y_columns=Y_columns,
    Y_train=Y_train,
    Y_test=Y_test,
    X_train_transformed=X_train_yjs,
    X_test_transformed=X_test_yjs,
    Y_train_transformed=Y_train_yjs,
    Y_test_transformed=Y_test_yjs,
    preprocess_pipeline_Y=preprocess_pipeline_Y
)

# Create summary DataFrame with best parameters:
# one row per target, holding the metric fields from metrics.to_dict(), one
# 'best_<param>' column per tuned hyperparameter, and the number of non-zero
# coefficients under 'features_selected'.
summary_dict = {
    target: {
        **metrics.to_dict(),
        **{'best_' + k: v for k, v in grid_results['best_params'].items()},
        'features_selected': grid_results['number_of_features_selected']
    }
    for target, (metrics, grid_results) in all_results.items()
}

# Targets become the index (orient='index'); the table itself is displayed in the next cell.
summary_df_en = pd.DataFrame.from_dict(summary_dict, orient='index')
print("\nOverall Summary:")

In [None]:
# Display the Elastic Net summary table with itables.
# NOTE(review): maxBytes=0 presumably turns off itables' size-based downsampling so every row is shown — confirm.
show(summary_df_en, maxBytes=0)

#### Stacking Models using `mlxtend` Package

We can try stacking models to improve the predictions. MH previously tried RF on this dataset: there was a slight level of overfitting, but it was not as bad as what was observed with XGBoost.

In [None]:
# Hand-picked list of ten Y target columns.
# NOTE(review): presumably the targets to use for the stacking experiment in this section — confirm.
y_subset = ['S_Module11_Prolif_score', 'S_CHANG_CORE_SERUM_RESPONSE_UP', 'S_KEGG_MMR', 'S_LYMPHS_PCA_16704732', 'S_CSR_Activated_15701700', 'S_ICR_INHIB_SCORE', 'S_IFNG_score_21050467', 'S_NK_cd56dim', 'S_Th1', 'ESTIMATE']

In [None]:
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt
# from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.svm import SVR
# from sklearn.linear_model import ElasticNet
# from xgboost import XGBRegressor
# from mlxtend.regressor import StackingRegressor
# from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
# from yellowbrick.regressor import ResidualsPlot
# import os
# from typing import Dict, Tuple
# from sklearn.pipeline import Pipeline
# from itertools import product

# def create_stacked_model(rf_params, elastic_params, svr_params, xgb_params):
#     """Create a stacked model with specified parameters."""
#     rf = RandomForestRegressor(**rf_params, random_state=42)
#     elastic = ElasticNet(**elastic_params, random_state=42)
#     svr = SVR(**svr_params)
#     meta_regressor = XGBRegressor(**xgb_params, random_state=42)
    
#     return StackingRegressor(
#         regressors=[rf, elastic, svr],
#         meta_regressor=meta_regressor,
#         use_features_in_secondary=True
#     )

# def run_stacked_model_gridsearch(
#     y_target: str,
#     Y_train: pd.DataFrame,
#     Y_test: pd.DataFrame,
#     X_train_transformed: pd.DataFrame,
#     X_test_transformed: pd.DataFrame,
#     Y_train_transformed: pd.DataFrame,
#     Y_test_transformed: pd.DataFrame,
#     preprocess_pipeline_Y: Pipeline
# ) -> Tuple[YTargetMetrics, dict]:
#     """Run Stacked model with GridSearchCV for a single target variable and return performance metrics."""
    
#     # Define parameter ranges for each model
#     rf_param_grid = {
#         'n_estimators': [100, 200],
#         'max_depth': [None, 10, 20]
#     }
    
#     elastic_param_grid = {
#         'alpha': [0.1, 1.0],
#         'l1_ratio': [0.2, 0.5]
#     }
    
#     svr_param_grid = {
#         'C': [0.1, 1.0, 10, 100],
#         'epsilon': [0.01, 0.1, 0.2]
#     }
    
#     xgb_param_grid = {
#         'learning_rate': [0.01, 0.1]
#     }
    
#     # Generate all combinations
#     best_score = float('inf')
#     best_model = None
#     best_params = None
#     cv_results = []
    
#     # Create parameter combinations
#     rf_params_list = [dict(zip(rf_param_grid.keys(), v)) 
#                      for v in product(*rf_param_grid.values())]
#     elastic_params_list = [dict(zip(elastic_param_grid.keys(), v)) 
#                           for v in product(*elastic_param_grid.values())]
#     svr_params_list = [dict(zip(svr_param_grid.keys(), v)) 
#                        for v in product(*svr_param_grid.values())]
#     xgb_params_list = [dict(zip(xgb_param_grid.keys(), v)) 
#                        for v in product(*xgb_param_grid.values())]
    
#     total_combinations = (len(rf_params_list) * len(elastic_params_list) * 
#                          len(svr_params_list) * len(xgb_params_list))
#     print(f"\nTotal parameter combinations to try: {total_combinations}")
    
#     current_combination = 0
#     for rf_p in rf_params_list:
#         for elastic_p in elastic_params_list:
#             for svr_p in svr_params_list:
#                 for xgb_p in xgb_params_list:
#                     current_combination += 1
#                     print(f"Trying combination {current_combination}/{total_combinations}")
                    
#                     # Create and fit model
#                     model = create_stacked_model(rf_p, elastic_p, svr_p, xgb_p)
                    
#                     # Perform cross-validation
#                     scores = []
#                     from sklearn.model_selection import KFold
#                     kf = KFold(n_splits=5, shuffle=True, random_state=42)
                    
#                     for train_idx, val_idx in kf.split(X_train_transformed):
#                         X_train_cv = X_train_transformed.iloc[train_idx]
#                         X_val_cv = X_train_transformed.iloc[val_idx]
#                         y_train_cv = Y_train_transformed[y_target].iloc[train_idx]
#                         y_val_cv = Y_train_transformed[y_target].iloc[val_idx]
                        
#                         model.fit(X_train_cv, y_train_cv)
#                         pred = model.predict(X_val_cv)
#                         score = mean_squared_error(y_val_cv, pred)
#                         scores.append(score)
                    
#                     mean_score = np.mean(scores)
#                     cv_results.append({
#                         'params': {
#                             'rf': rf_p,
#                             'elastic': elastic_p,
#                             'svr': svr_p,
#                             'xgb': xgb_p
#                         },
#                         'mean_test_score': -mean_score,
#                         'std_test_score': np.std(scores)
#                     })
                    
#                     if mean_score < best_score:
#                         best_score = mean_score
#                         best_params = {
#                             'rf': rf_p,
#                             'elastic': elastic_p,
#                             'svr': svr_p,
#                             'xgb': xgb_p
#                         }
#                         best_model = model
    
#     # Fit final model with best parameters
#     print(f"\nBest parameters for {y_target}:")
#     print(best_params)
    
#     # Use best model for predictions
#     model_instance = create_stacked_model(
#         best_params['rf'],
#         best_params['elastic'],
#         best_params['svr'],
#         best_params['xgb']
#     )
#     model_instance.fit(X_train_transformed, Y_train_transformed[y_target])
    
#     # yellowbrick visualizer
#     visualizer = ResidualsPlot(model_instance)
#     visualizer.fit(X_train_transformed, Y_train_transformed[y_target])
#     visualizer.score(X_test_transformed, Y_test_transformed[y_target])
#     visualizer.show(outpath=f'plots/{y_target}/{y_target}-Stacked-tuned-model-residual-plot.png', dpi=300)
    
#     # Get predictions (transformed space)
#     y_train_pred_transformed = model_instance.predict(X_train_transformed)
#     y_test_pred_transformed = model_instance.predict(X_test_transformed)

#     # Create dummy DataFrames for inverse transform
#     dummy_train_y = pd.DataFrame(0, index=X_train_transformed.index, 
#                                 columns=Y_train_transformed.columns)
#     dummy_train_y[y_target] = y_train_pred_transformed

#     dummy_test_y = pd.DataFrame(0, index=X_test_transformed.index, 
#                                columns=Y_test_transformed.columns)
#     dummy_test_y[y_target] = y_test_pred_transformed

#     # Inverse transform predictions
#     dummy_train_y_inv = preprocess_pipeline_Y.inverse_transform(dummy_train_y)
#     dummy_test_y_inv = preprocess_pipeline_Y.inverse_transform(dummy_test_y)

#     # Extract the relevant target column
#     y_train_pred = dummy_train_y_inv[y_target].to_numpy()
#     y_test_pred = dummy_test_y_inv[y_target].to_numpy()

#     # Get raw target data
#     raw_y_train = Y_train[y_target]
#     raw_y_test = Y_test[y_target]

#     # Calculate metrics
#     train_r2 = r2_score(raw_y_train, y_train_pred)
#     test_r2 = r2_score(raw_y_test, y_test_pred)
#     train_rmse = np.sqrt(mean_squared_error(raw_y_train, y_train_pred))
#     test_rmse = np.sqrt(mean_squared_error(raw_y_test, y_test_pred))
#     train_mae = mean_absolute_error(raw_y_train, y_train_pred)
#     test_mae = mean_absolute_error(raw_y_test, y_test_pred)

#     # Create plots directory
#     os.makedirs(f'plots/{y_target}', exist_ok=True)

#     # Plot feature importance for RandomForest (first base model)
#     # Fix: Access the correct attribute for regressors in mlxtend
#     rf_model = model_instance.regr_[0]  # Changed from regressors_ to regr_
    
#     # Add error handling in case the first regressor isn't a Random Forest
#     if isinstance(rf_model, RandomForestRegressor):
#         feature_importance = pd.DataFrame({
#             'feature': X_train_transformed.columns,
#             'importance': rf_model.feature_importances_
#         }).sort_values('importance', ascending=False)

#         plt.figure(figsize=(12, 6))
#         plt.bar(range(len(feature_importance)), feature_importance['importance'])
#         plt.xticks(range(len(feature_importance)), feature_importance['feature'], rotation=45, ha='right')
#         plt.xlabel('Features')
#         plt.ylabel('Importance')
#         plt.title(f'Feature Importance for {y_target} (Random Forest Base Model)')
#         plt.tight_layout()
#         plt.savefig(f'plots/{y_target}/{y_target}-Stacked-RF-importance.png', dpi=300)
#         plt.close()
#     else:
#         print(f"Warning: First base model is not a RandomForest. Skipping feature importance plot.")
#         feature_importance = pd.DataFrame()  # Empty DataFrame as placeholder

#     plt.figure(figsize=(12, 6))
#     plt.bar(range(len(feature_importance)), feature_importance['importance'])
#     plt.xticks(range(len(feature_importance)), feature_importance['feature'], rotation=45, ha='right')
#     plt.xlabel('Features')
#     plt.ylabel('Importance')
#     plt.title(f'Feature Importance for {y_target} (Random Forest Base Model)')
#     plt.tight_layout()
#     plt.savefig(f'plots/{y_target}/{y_target}-Stacked-RF-importance.png', dpi=300)
#     plt.close()

#     # Plot grid search results
#     cv_results_df = pd.DataFrame(cv_results)
    
#     plt.figure(figsize=(12, 6))
#     plt.plot(-cv_results_df['mean_test_score'], 'o-')
#     plt.xlabel('Parameter Combination Index')
#     plt.ylabel('Mean Squared Error')
#     plt.title('Grid Search Results')
#     plt.tight_layout()
#     plt.savefig(f'plots/{y_target}/{y_target}-Stacked-grid-search-results.png', dpi=300)
#     plt.close()

#     # Plot actual vs predicted
#     _, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6), dpi=300)

#     ax1.scatter(raw_y_train, y_train_pred, alpha=0.5)
#     ax1.plot([raw_y_train.min(), raw_y_train.max()], 
#              [raw_y_train.min(), raw_y_train.max()], 'r--', lw=2)
#     ax1.set_xlabel('Actual')
#     ax1.set_ylabel('Predicted')
#     ax1.set_title('Training Set')
#     ax1.grid(True)

#     ax2.scatter(raw_y_test, y_test_pred, alpha=0.5)
#     ax2.plot([raw_y_test.min(), raw_y_test.max()], 
#              [raw_y_test.min(), raw_y_test.max()], 'r--', lw=2)
#     ax2.set_xlabel('Actual')
#     ax2.set_ylabel('Predicted')
#     ax2.set_title('Testing Set')
#     ax2.grid(True)

#     plt.tight_layout()
#     plt.savefig(f'plots/{y_target}/{y_target}-Stacked-tuned-model-performance-comparison.png')
#     plt.close()

#     # Create results object
#     results = YTargetMetrics(y_target, train_r2, test_r2, train_rmse, test_rmse, train_mae, test_mae)
    
#     # Update the grid search results dictionary to handle case where feature_importance is empty
#     grid_search_results = {
#         'best_params': best_params,
#         'best_score': best_score,
#         'cv_results': cv_results,
#         'feature_importance': feature_importance.to_dict() if not feature_importance.empty else {},
#         'number_of_important_features': (feature_importance['importance'] > 0.01).sum() if not feature_importance.empty else 0
#     }

#     return results, grid_search_results


# def run_stacked_for_multiple_targets(
#     y_columns: list[str],
#     Y_train: pd.DataFrame,
#     Y_test: pd.DataFrame,
#     X_train_transformed: pd.DataFrame,
#     X_test_transformed: pd.DataFrame,
#     Y_train_transformed: pd.DataFrame,
#     Y_test_transformed: pd.DataFrame,
#     preprocess_pipeline_Y: Pipeline
# ) -> Dict[str, Tuple[YTargetMetrics, dict]]:
    
#     results_dict = {}
    
#     for y_target in y_columns:
#         try:
#             print(f"\nProcessing target: {y_target}")
            
#             metrics, grid_results = run_stacked_model_gridsearch(
#                 y_target=y_target,
#                 Y_train=Y_train,
#                 Y_test=Y_test,
#                 X_train_transformed=X_train_transformed,
#                 X_test_transformed=X_test_transformed,
#                 Y_train_transformed=Y_train_transformed,
#                 Y_test_transformed=Y_test_transformed,
#                 preprocess_pipeline_Y=preprocess_pipeline_Y
#             )
            
#             results_dict[y_target] = (metrics, grid_results)
#             print(f"\nResults for {y_target}:")
#             print(metrics)
#             print("\nBest parameters:", grid_results['best_params'])
#             print("Best CV score (RMSE):", np.sqrt(grid_results['best_score']))
#             print(f"Number of important features: {grid_results['number_of_important_features']}")
            
#             # Print top 10 most important features
#             importance_df = pd.DataFrame(grid_results['feature_importance'])
#             print("\nTop 10 most important features:")
#             print(importance_df.nlargest(10, 'importance')[['feature', 'importance']])
            
#         except Exception as e:
#             print(f"Error processing {y_target}: {str(e)}")
#             continue
    
#     return results_dict

# # Usage:
# Y_columns = y_subset
# all_results = run_stacked_for_multiple_targets(
#     y_columns=Y_columns,
#     Y_train=Y_train,
#     Y_test=Y_test,
#     X_train_transformed=X_train_yjs,
#     X_test_transformed=X_test_yjs,
#     Y_train_transformed=Y_train_yjs,
#     Y_test_transformed=Y_test_yjs,
#     preprocess_pipeline_Y=preprocess_pipeline_Y
# )

# # Create summary DataFrame with best parameters
# summary_dict = {
#     target: {
#         **metrics.to_dict(),
#         **{'best_' + k.split('__')[1]: v for k, v in grid_results['best_params'].items()},
#         'important_features': grid_results['number_of_important_features']
#     }
#     for target, (metrics, grid_results) in all_results.items()
# }

# summary_df_stacked = pd.DataFrame.from_dict(summary_dict, orient='index')
# print("\nOverall Summary:")

##### **VERDICT**

The stacked models above returned poor R2 scores, worse than those of the SVR-only model, so we will probably forgo this approach.