## Cell 1: Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
import random

## Cell 2: Load and Preprocess the Dataset

In [15]:
# Load the dataset
df = pd.read_csv('Brain_GSE.csv')
df = shuffle(df, random_state=42)
X = df.drop(columns=['samples', 'type'])
y = df['type']
y_encoded, class_labels = pd.factorize(y) 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


## Cell 3: Generate Individual Function

In [3]:
# Generate a random individual with binary representation of selected features
def generate_individual(num_features):
    return np.random.choice([0, 1], size=num_features)


## Cell 4: Evaluate Individual Function

In [4]:
# Evaluate the fitness of an individual (feature subset) using Logistic Regression
def evaluate_individual(individual, X_train, y_train):
    selected_features = [i for i, bit in enumerate(individual) if bit == 1]
    
    if not selected_features:
        return 0
    
    X_train_selected = X_train[:, selected_features]
    model = LogisticRegression(max_iter=10000)
    model.fit(X_train_selected, y_train)
    
    X_test_selected = X_test[:, selected_features]
    y_pred = model.predict(X_test_selected)
    accuracy = accuracy_score(y_test, y_pred)
    
    return accuracy


## Cell 5: Crossover Function

In [5]:
# Crossover between two parents to create a child individual
def crossover(parent1, parent2):
    point = random.randint(1, len(parent1) - 1)
    child = np.concatenate([parent1[:point], parent2[point:]])
    return child


## Cell 6: Mutation Function

In [6]:
# Mutate an individual by flipping bits with a certain probability
def mutate(individual, mutation_rate=0.01):
    for i in range(len(individual)):
        if random.random() < mutation_rate:
            individual[i] = 1 - individual[i]
    return individual


## Cell 7: Genetic Algorithm Function

In [7]:
# Run the genetic algorithm to find the best feature subset
def genetic_algorithm(X_train, y_train, population_size=20, generations=10, mutation_rate=0.01):
    num_features = X_train.shape[1]
    
    population = [generate_individual(num_features) for _ in range(population_size)]
    
    for generation in range(generations):
        fitness_scores = [evaluate_individual(individual, X_train, y_train) for individual in population]
        
        best_individuals = np.argsort(fitness_scores)[-population_size//2:]
        best_population = [population[i] for i in best_individuals]
        
        new_population = best_population.copy()
        while len(new_population) < population_size:
            parent1, parent2 = random.sample(best_population, 2)
            child = crossover(parent1, parent2)
            child = mutate(child, mutation_rate)
            new_population.append(child)
        
        population = new_population
    
    best_individual = population[np.argmax(fitness_scores)]
    return best_individual


## Cell 8: Run the Genetic Algorithm

In [None]:
# Run the genetic algorithm to select the best feature subset
best_individual = genetic_algorithm(X_train, y_train)

selected_features = [i for i, bit in enumerate(best_individual) if bit == 1]
print("Selected Features: ", selected_features)


Selected Features:  [1, 2, 3, 5, 6, 7, 10, 11, 12, 13, 16, 18, 22, 23, 27, 28, 29, 30, 39, 40, 41, 43, 44, 45, 47, 51, 58, 59, 60, 63, 68, 70, 72, 73, 74, 77, 79, 81, 82, 84, 85, 86, 88, 89, 90, 91, 93, 94, 95, 96, 97, 99, 102, 103, 106, 108, 110, 112, 113, 114, 116, 117, 119, 122, 125, 127, 130, 131, 132, 133, 134, 135, 137, 138, 140, 142, 145, 147, 150, 151, 154, 157, 158, 163, 164, 170, 171, 174, 175, 181, 184, 185, 186, 188, 190, 191, 194, 196, 198, 201, 203, 204, 205, 206, 207, 208, 210, 214, 215, 220, 221, 222, 233, 236, 238, 239, 240, 241, 243, 244, 246, 249, 252, 255, 256, 257, 259, 260, 261, 263, 264, 265, 269, 271, 274, 276, 277, 278, 280, 282, 283, 287, 288, 290, 294, 296, 297, 298, 300, 301, 303, 304, 305, 306, 307, 309, 310, 312, 314, 315, 316, 319, 323, 324, 327, 333, 334, 335, 337, 338, 343, 344, 347, 349, 350, 351, 352, 353, 356, 357, 366, 368, 370, 371, 372, 373, 383, 385, 387, 389, 390, 391, 392, 394, 395, 396, 397, 400, 402, 403, 404, 406, 409, 413, 414, 415, 417, 42

## Cell 9: Train the Final Model

In [None]:
# Train the Logistic Regression model on the selected features
X_train_selected = X_train[:, selected_features]
X_test_selected = X_test[:, selected_features]

# Train Logistic Regression
final_model = LogisticRegression(max_iter=10000)
final_model.fit(X_train_selected, y_train)

# Evaluate on the test set
y_pred = final_model.predict(X_test_selected)
final_accuracy = accuracy_score(y_test, y_pred)

print(f"Final model accuracy on test set: {final_accuracy:.4f}")


## Cell 10: Evaluate the Model

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Classification Report
print(classification_report(y_test, y_pred))

# Confusion Matrix
print(confusion_matrix(y_test, y_pred))


## Testing with Selected Features

In [50]:
test_sample = X_test[25, :] 
# Select features based on the best individual from the genetic algorithm
best_features_indices = np.where(best_individual == 1)[0]  
selected_features = test_sample[best_features_indices]  

# Reshape the selected features to match the input shape of the model
selected_features = selected_features.reshape(1, -1)

predicted_class = final_model.predict(selected_features)[0]  
predicted_prob = final_model.predict_proba(selected_features)[0]  

decoded_class = class_labels[predicted_class]

# Display the results
print(f"Selected Features: {selected_features}")
print(f"Prediction Probabilities: {predicted_prob}")
print(f"Predicted Class: {decoded_class}")  

Selected Features: [[ 0.02748878 -0.82400104  0.678074   ... -1.61993928 -0.97926024
  -0.83927315]]
Prediction Probabilities: [1.22422727e-03 3.72261601e-05 9.98728745e-01 5.23805300e-06
 4.56309828e-06]
Predicted Class: pilocytic_astrocytoma
