In [51]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load your gene expression data
# Assuming you have a DataFrame named 'data' with columns for features and a target column 'type'
# Make sure to replace 'your_data.csv' with the actual file name or data loading method you are using
data = pd.read_csv('Brain_GSE50161.csv')

# Encode the target column 'type' if it's not already numerical
le = LabelEncoder()
data['type'] = le.fit_transform(data['type'])

# Separate features (X) and target variable (y)
X = data.drop('type', axis=1)
y = data['type']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply variance thresholding
# You can adjust the threshold value based on your data and requirements
# For example, setting threshold=0.01 means features with more than 99% of the same value will be removed
threshold = 4
selector = VarianceThreshold(threshold)
X_train_selected = selector.fit_transform(X_train)
X_test_selected = selector.transform(X_test)

# Print the selected features
selected_features = X.columns[selector.get_support()]
print(f"Selected Features: {selected_features}")

# Now you can use X_train_selected and X_test_selected in your classification model


Selected Features: Index(['1552365_at', '1552439_s_at', '1552754_a_at', '1552943_at',
       '1553635_s_at', '1555564_a_at', '1555778_a_at', '1555804_a_at',
       '1556057_s_at', '1556096_s_at', '1556351_at', '1557122_s_at',
       '1557395_at', '1557636_a_at', '1558034_s_at', '1562371_s_at',
       '1565483_at', '1565484_x_at', '1568612_at', '201291_s_at', '201292_at',
       '201890_at', '201909_at', '202018_s_at', '202237_at', '202409_at',
       '202410_x_at', '202437_s_at', '202454_s_at', '202508_s_at',
       '202859_x_at', '203000_at', '203001_s_at', '203131_at', '203400_s_at',
       '203797_at', '203798_s_at', '203819_s_at', '203820_s_at', '203854_at',
       '203889_at', '203998_s_at', '203999_at', '204187_at', '204260_at',
       '204298_s_at', '204379_s_at', '204409_s_at', '204465_s_at', '204584_at',
       '204712_at', '204846_at', '204850_s_at', '204851_s_at', '204865_at',
       '204932_at', '204933_s_at', '204955_at', '205000_at', '205113_at',
       '205278_at', '2053

In [52]:
# Create DataFrames with selected features for both training and testing sets
X_train_selected_df = pd.DataFrame(X_train_selected, columns=selected_features)
X_test_selected_df = pd.DataFrame(X_test_selected, columns=selected_features)

# Reset indices for the selected feature DataFrames and target variables
X_train_selected_df.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
X_test_selected_df.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

# Concatenate the selected features with the target variable for both training and testing sets
train_data_selected = pd.concat([X_train_selected_df, y_train], axis=1)
test_data_selected = pd.concat([X_test_selected_df, y_test], axis=1)




In [53]:
# Concatenate training and testing sets to get the complete dataset with selected features
complete_data_selected = pd.concat([train_data_selected, test_data_selected], axis=0)

# Save the complete dataset with selected features to a CSV file
complete_data_selected.to_csv('brain_cancer.csv', index=False)


In [54]:
complete_data_selected

Unnamed: 0,1552365_at,1552439_s_at,1552754_a_at,1552943_at,1553635_s_at,1555564_a_at,1555778_a_at,1555804_a_at,1556057_s_at,1556096_s_at,...,206456_at,206502_s_at,206552_s_at,206773_at,206785_s_at,206826_at,206898_at,206915_at,206935_at,type
0,8.865194,7.741421,9.936971,3.606764,9.305012,9.647561,7.918614,4.871236,5.409013,4.596628,...,4.377654,10.652172,3.877130,5.890641,4.315782,12.276582,4.099800,10.064219,10.682396,1
1,9.646929,8.582222,9.284966,3.679206,6.600509,8.149917,9.754584,5.810494,7.651897,5.686386,...,4.658157,8.016931,8.481872,8.938906,7.355310,12.580836,4.616757,9.799376,9.496678,1
2,6.026914,8.356390,8.644779,4.562770,5.010252,5.592693,5.109612,5.598259,5.853089,5.598683,...,5.790827,8.531334,9.432907,8.357959,11.889255,12.203605,5.677102,11.523480,5.576004,1
3,7.027073,8.195263,7.142456,10.566940,10.724790,7.213543,5.574216,9.975859,6.381179,4.802306,...,5.527334,5.621846,3.859794,8.058252,4.387647,10.564605,4.406017,4.486196,9.680311,0
4,8.104710,6.813598,5.449861,9.936684,11.301047,9.952337,5.901542,10.891082,6.016853,4.789472,...,9.610553,10.200455,3.435855,10.677271,4.495609,8.489932,3.982261,6.304304,4.012993,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21,9.737534,9.139117,8.409922,9.063715,10.581169,9.448405,6.022448,9.297876,5.318138,4.488305,...,5.580821,4.276531,3.334098,10.167172,6.834395,12.366607,4.110771,7.331065,5.006926,0
22,9.339168,11.353213,10.825649,5.319767,4.947586,8.689769,9.313559,5.929664,5.784070,4.795111,...,4.187664,5.014513,3.503013,8.904054,10.354614,13.312436,11.543474,10.075449,3.867851,4
23,6.543910,9.035169,5.887538,4.787898,7.228518,9.334745,5.657595,7.845522,5.551471,4.354077,...,3.602815,5.361588,3.668107,12.710418,4.264042,9.996748,4.366847,4.577296,6.959468,0
24,7.900752,9.077885,10.414084,4.048354,5.462156,10.396251,6.476389,5.012265,5.564910,4.646405,...,4.082586,9.589010,3.198039,9.784533,8.077392,13.170215,5.793898,6.975319,6.978295,4


In [55]:
import numpy as np


# Print the original classes and their corresponding numerical labels
print("Original classes:", le.classes_)
print("Numerical labels:", np.unique(data['type']))


Original classes: ['ependymoma' 'glioblastoma' 'medulloblastoma' 'normal'
 'pilocytic_astrocytoma']
Numerical labels: [0 1 2 3 4]
