In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import jaccard_score
from sklearn.model_selection import StratifiedKFold

In [2]:
# Read the expression matrix with the first CSV column as the row index, then
# clear the index name so the index carries no header label.
x_data = pd.read_csv(
    '../../data/RNAseq_with_HGNC_symbols.csv',
    index_col=0,
).rename_axis(None)
x_data

Unnamed: 0,100130426,100133144,UBE2Q2P2,HMGB1P1,10431,136542,LOC155060,RNU12-2P,SSX9P,317712,...,ZXDA|7789,ZXDB|158586,ZXDC|79364,ZYG11A|440590,ZYG11B|79699,ZYX|7791,ZZEF1|23140,ZZZ3|26009,psiTPTE22|387590,tAKR|389932
sample_0,0.0,2.017209,3.265527,5.478487,10.431999,0.0,7.175175,0.591871,0.0,0.0,...,4.926711,8.210257,9.723516,7.220030,9.119813,12.003135,9.650743,8.921326,5.286759,0.000000
sample_1,0.0,0.592732,1.588421,7.586157,9.623011,0.0,6.816049,0.000000,0.0,0.0,...,4.593372,7.323865,9.740931,6.256586,8.381612,12.674552,10.517059,9.397854,2.094168,0.000000
sample_2,0.0,3.511759,4.327199,6.881787,9.870730,0.0,6.972130,0.452595,0.0,0.0,...,5.125213,8.127123,10.908640,5.401607,9.911597,9.045255,9.788359,10.090470,1.683023,0.000000
sample_3,0.0,3.663618,4.507649,6.659068,10.196184,0.0,7.843375,0.434882,0.0,0.0,...,6.076566,8.792959,10.141520,8.942805,9.601208,11.392682,9.694814,9.684365,3.292001,0.000000
sample_4,0.0,2.655741,2.821547,6.539454,9.738265,0.0,6.566967,0.360982,0.0,0.0,...,5.996032,8.891425,10.373790,7.181162,9.846910,11.922439,9.217749,9.461191,5.110372,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sample_796,0.0,1.865642,2.718197,7.350099,10.006003,0.0,6.764792,0.496922,0.0,0.0,...,6.088133,9.118313,10.004852,4.484415,9.614701,12.031267,9.813063,10.092770,8.819269,0.000000
sample_797,0.0,3.942955,4.453807,6.346597,10.056868,0.0,7.320331,0.000000,0.0,0.0,...,6.371876,9.623335,9.823921,6.555327,9.064002,11.633422,10.317266,8.745983,9.659081,0.000000
sample_798,0.0,3.249582,3.707492,8.185901,9.504082,0.0,7.536589,1.811101,0.0,0.0,...,5.719386,8.610704,10.485517,3.589763,9.350636,12.180944,10.681194,9.466711,4.677458,0.586693
sample_799,0.0,2.590339,2.787976,7.318624,9.987136,0.0,9.213464,0.000000,0.0,0.0,...,5.785237,8.605387,11.004677,4.745888,9.626383,11.198279,10.335513,10.400581,5.718751,0.000000


In [3]:
# Keep only the 'Class' column of the label file as a Series.
# NOTE(review): rows are matched to x_data by position, not by sample id —
# confirm both files share the same row order.
y_labels = pd.read_csv('../../data/labels.csv').loc[:, 'Class']
y_labels

0      PRAD
1      LUAD
2      PRAD
3      PRAD
4      BRCA
       ... 
796    BRCA
797    LUAD
798    COAD
799    PRAD
800    PRAD
Name: Class, Length: 801, dtype: object

In [4]:
# Show the raw labels before encoding.
print(y_labels)

# One integer code per class, numbered in order of first appearance in y_labels.
class_to_num = {label: code for code, label in enumerate(pd.unique(y_labels))}
print(class_to_num)

# Replace every class name with its integer code.
y_encoded = y_labels.map(class_to_num)
print(y_encoded)

# NumPy array of the codes, used by the splitter and the MI estimator below.
y_encoded_values = y_encoded.to_numpy()

0      PRAD
1      LUAD
2      PRAD
3      PRAD
4      BRCA
       ... 
796    BRCA
797    LUAD
798    COAD
799    PRAD
800    PRAD
Name: Class, Length: 801, dtype: object
{'PRAD': 0, 'LUAD': 1, 'BRCA': 2, 'KIRC': 3, 'COAD': 4}
0      0
1      1
2      0
3      0
4      2
      ..
796    2
797    1
798    4
799    0
800    0
Name: Class, Length: 801, dtype: int64


In [5]:
# Feature matrix as a NumPy array (rows follow x_data's row order).
X_values = x_data.to_numpy()
print(X_values)

[[ 0.          2.01720929  3.26552691 ...  8.92132623  5.28675919
   0.        ]
 [ 0.          0.59273209  1.58842082 ...  9.39785429  2.09416849
   0.        ]
 [ 0.          3.51175898  4.32719872 ... 10.09046974  1.68302267
   0.        ]
 ...
 [ 0.          3.24958187  3.70749166 ...  9.46671072  4.6774575
   0.5866927 ]
 [ 0.          2.59033853  2.78797567 ... 10.40058062  5.71875068
   0.        ]
 [ 0.          2.32524248  3.80593214 ...  9.84479363  4.55071601
   0.        ]]


In [6]:
# Dimensions of the feature matrix as a (rows, columns) tuple.
np.shape(X_values)

(801, 20531)

In [7]:
# Shuffled, class-stratified 5-fold splitter; the fixed seed keeps the fold
# assignment identical between runs.
k_f = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [8]:
K = 200  # number of top-ranked features kept per fold
MI_OUTPUT_DIR = "../../Output/mutual_information_results"

# Create the results directory up front so the first to_csv call cannot fail
# on a missing folder after the mutual-information estimate has already run.
os.makedirs(MI_OUTPUT_DIR, exist_ok=True)

fold_features = []  # one set of selected column names per fold

for fold_idx, (train_idx, _) in enumerate(k_f.split(X_values, y_encoded_values), start=1):
    print(f"Fold {fold_idx}")

    # Rank features on the training part of the split only; the held-out
    # indices are not used in this cell.
    X_train = X_values[train_idx]
    y_train = y_encoded_values[train_idx]

    # All features are treated as continuous; random_state pins the estimator's
    # internal randomness so the ranking is repeatable.
    mi_scores = mutual_info_classif(X_train, y_train, discrete_features=False, n_neighbors=3, random_state=0)
    mi_series = pd.Series(mi_scores, index=x_data.columns)

    # Keep the K highest-scoring features and remember their names for the
    # fold-to-fold overlap comparison.
    top_features = mi_series.sort_values(ascending=False).head(K)
    fold_features.append(set(top_features.index))

    top_features.to_csv(f"{MI_OUTPUT_DIR}/selected_features_fold_{fold_idx}.csv", index=True)

Fold 1
Fold 2
Fold 3
Fold 4
Fold 5


In [9]:
# Pairwise Jaccard similarity (|A ∩ B| / |A ∪ B|) between the feature sets
# selected in each fold. The matrix size follows the number of collected sets
# rather than a fixed 5, so it stays correct if the number of splits changes.
n_folds = len(fold_features)
jaccard_matrix = np.zeros((n_folds, n_folds))
for i, features_i in enumerate(fold_features):
    for j, features_j in enumerate(fold_features):
        union_size = len(features_i | features_j)
        # Two empty sets have no defined ratio; report 0 for that pair.
        if union_size == 0:
            jaccard_matrix[i, j] = 0
        else:
            jaccard_matrix[i, j] = len(features_i & features_j) / union_size

In [10]:
# Label rows and columns "Fold_1" … "Fold_n", with n taken from the matrix
# itself so the labels always match its size.
fold_names = [f"Fold_{i+1}" for i in range(jaccard_matrix.shape[0])]
jaccard_df = pd.DataFrame(
    jaccard_matrix,
    index=fold_names,
    columns=fold_names,
)
jaccard_df

Unnamed: 0,Fold_1,Fold_2,Fold_3,Fold_4,Fold_5
Fold_1,1.0,0.716738,0.769912,0.769912,0.769912
Fold_2,0.716738,1.0,0.731602,0.731602,0.73913
Fold_3,0.769912,0.731602,1.0,0.769912,0.777778
Fold_4,0.769912,0.731602,0.769912,1.0,0.754386
Fold_5,0.769912,0.73913,0.777778,0.754386,1.0


In [12]:
# Save the fold-by-fold similarity table, keeping the fold labels as the first column.
jaccard_df.to_csv(
    path_or_buf="../../Output/mutual_information_results/jaccard_matrix.csv",
    index=True,
)

In [None]:
# mi_scores = mutual_info_classif(X_values, y_encoded_values, discrete_features=False, n_neighbors=3, random_state=0)

In [None]:
# mi_wfeatures = pd.Series(mi_scores, index=x_data.columns)

In [None]:
# mi_wfeatures_sorted = mi_wfeatures.sort_values(ascending=False)[:200]

In [None]:
# Mutual information of every feature with the class label, estimated on all
# samples, then reduced to the 200 highest-scoring features. The computation
# lives in this cell because no other executable cell defines
# mi_wfeatures_sorted, so the notebook would otherwise stop here with a
# NameError on a fresh kernel.
# NOTE(review): this ranking sees every sample; if the resulting top-200 matrix
# is later evaluated with cross-validation, the selection leaks held-out data —
# confirm the intended use.
mi_scores = mutual_info_classif(X_values, y_encoded_values, discrete_features=False, n_neighbors=3, random_state=0)
mi_wfeatures = pd.Series(mi_scores, index=x_data.columns)
mi_wfeatures_sorted = mi_wfeatures.sort_values(ascending=False)[:200]
mi_wfeatures_sorted

In [None]:
# Index labels of the ranked Series, i.e. the names of the retained features.
top_200_feature_names = mi_wfeatures_sorted.keys()
top_200_feature_names

In [None]:
# All rows of the expression matrix, restricted to the selected feature columns.
x_data_wtop_200_features = x_data.loc[:, top_200_feature_names]

In [None]:
# Display the reduced expression matrix for a visual check.
x_data_wtop_200_features

In [None]:
# Write the reduced matrix to a CSV in the notebook's working directory.
x_data_wtop_200_features.to_csv(
    path_or_buf="RNAseq_with_HGNC_symbols_top200.csv",
)