In [1]:
import pandas as pd
import numpy as np

In [70]:
import numpy as np

def compute_markov_coefficients(data, labels, gamma):
    """
    Compute Markov coefficients for feature selection using Fisher-Markov selector.

    For every feature (column) x with n samples split into classes C_1..C_g of
    sizes n_1..n_g, the coefficient is

        (1/n) * sum_i (sum_{k in C_i} x_k)^2 / n_i
        - (gamma / n) * sum_k x_k^2
        + ((gamma - 1) / n^2) * (sum_k x_k)^2

    Parameters:
    - data: array-like, shape (n_samples, n_features)
        Input data matrix. Anything accepted by ``np.asarray`` (ndarray,
        nested list, DataFrame) is converted to a float array.
    - labels: array-like, shape (n_samples,)
        Class labels for the samples.
    - gamma: float
        Scalar weight appearing in the second and third terms of the formula
        above.

    Returns:
    - markov_coefficients: numpy array, shape (n_features,)
        Array containing the Markov coefficients for each feature.

    Raises:
    - ValueError: if ``data`` is not 2-D, has no samples, or ``labels`` does
      not hold exactly one label per sample.
    """
    data = np.asarray(data, dtype=float)
    labels = np.asarray(labels).reshape(-1)

    if data.ndim != 2:
        raise ValueError("data must be 2-D with shape (n_samples, n_features)")
    n_samples, n_features = data.shape
    if n_samples == 0:
        raise ValueError("data must contain at least one sample")
    if labels.shape[0] != n_samples:
        raise ValueError("labels must contain exactly one entry per sample")

    # sum(outer(v, v)) == (sum v)**2, so only per-column sums are needed.
    # This avoids materialising an n_samples x n_samples matrix per feature.
    within_class_sum = np.zeros(n_features)
    for c in np.unique(labels):
        class_data = data[labels == c]
        # np.unique only yields labels that occur, so every class is non-empty.
        within_class_sum += class_data.sum(axis=0) ** 2 / class_data.shape[0]
    within_class_sum /= n_samples

    total_sum = np.sum(np.square(data), axis=0)
    squared_column_sum = data.sum(axis=0) ** 2

    markov_coefficients = (
        within_class_sum
        - (gamma / n_samples) * total_sum
        + (gamma - 1) / (n_samples ** 2) * squared_column_sum
    )
    return markov_coefficients

def fisher_markov_selector(data, labels, gamma, beta):
    """
    Threshold the Markov coefficients to obtain a binary feature mask.

    Parameters:
    - data: numpy array, shape (n_samples, n_features)
        Input data matrix.
    - labels: numpy array, shape (n_samples,)
        Class labels for the samples.
    - gamma: float
        Forwarded unchanged to ``compute_markov_coefficients``.
    - beta: float
        Threshold; a feature is kept when its coefficient is strictly
        greater than this value.

    Returns:
    - feature_selector: numpy array, shape (n_features,)
        Float array holding 1 for every selected feature and 0 otherwise.
    """
    # Tuple-unpacking the shape also rejects inputs that are not 2-D.
    _, feature_count = data.shape
    scores = compute_markov_coefficients(data, labels, gamma)

    selection_mask = np.zeros(feature_count)
    # Strict comparison: a coefficient exactly equal to beta is not selected.
    selection_mask[scores > beta] = 1
    return selection_mask


# Toy Example

In [78]:
# Six samples x five features, three samples per class.
# NOTE(review): column 0 holds the same values as toy_y, so it acts as a
# perfectly discriminating feature - confirm this is intentional and not the
# label column left in the feature matrix by accident.
toy_data = np.array([
    [1, 1.2, 0.5, 0.8, 0.3],
    [1, 2.4, 1.1, 0.6, 0.9],
    [1, 0.8, 0.3, 0.7, 0.1],
    [2, 1.6, 0.9, 1.2, 0.4],
    [2, 2.0, 1.5, 1.1, 0.7],
    [2, 0.5, 0.2, 0.6, 0.2],
])

# Class label of each row above.
toy_y = np.array([1, 1, 1, 2, 2, 2])

# Selector parameters for this example.
gamma, beta = -0.5, 0.1

# Score every column and keep those whose coefficient exceeds beta.
selected_features_toy = fisher_markov_selector(toy_data, toy_y, gamma=gamma, beta=beta)
print("Selected Features:", selected_features_toy)

Selected Features: [1. 1. 1. 0. 0.]


# Iris data

In [79]:
from sklearn.datasets import load_iris

# Keep the Bunch object around and pull out the feature matrix and targets.
iris = load_iris()
iris_data, iris_y = iris.data, iris.target

# Selector parameters for Iris.
gamma, beta = -0.5, 0.5

# Score every Iris feature and keep those whose coefficient exceeds beta.
selected_features_iris = fisher_markov_selector(iris_data, iris_y, gamma=gamma, beta=beta)
print("Selected Features:", selected_features_iris)

Selected Features: [1. 0. 1. 1.]


# Optical digits (UCI optdigits)

In [100]:
# Load the training split of the UCI optdigits dataset: 64 pixel columns
# followed by the digit class. The file has no header row.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra"
column_names = ['pixel' + str(i) for i in range(64)]
column_names.append('class')
data = pd.read_csv(url, names=column_names, header=None)

# Separate the target column from the pixel features.
optical_y = data['class']
optical_data = data.drop(columns='class')

# Selector parameters for optdigits.
gamma, beta = -0.5, 0.1

# The selector indexes with numpy semantics, so pass plain arrays.
selected_features_optical = fisher_markov_selector(
    optical_data.to_numpy(), optical_y.to_numpy(), gamma, beta
)

print("Selected Features:", selected_features_optical)

Selected Features: [0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1.
 0. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 0.
 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1.]


# Gene Expression Data

In [64]:
import pandas as pd


# Read the expression table and discard its first two columns.
# NOTE(review): the meaning of the dropped columns depends on the CSV layout,
# which is not visible here - confirm against the file.
gene_data = pd.read_csv("cancer-GSE233242/NT.csv").iloc[:, 2:]
# Use the first remaining column as the row index, then drop two more leading
# columns (the index source and the one after it).
gene_data.index = gene_data.iloc[:, 0]
gene_data = gene_data.iloc[:, 2:]

# Distinct column-name prefixes (text before the first '.'), skipping the first column.
# NOTE(review): it is unclear why the first column is skipped here while the
# rename below covers every column - confirm.
prefixes = set([col.split('.')[0] for col in gene_data.columns[1:]])

# Neither `prefixes` nor `num_groups` is read again in the visible cells.
num_groups = len(prefixes)

# Collapse every column name to its prefix, so columns sharing a prefix end up
# with the same name. These names become the class labels further down.
gene_data.rename(columns={col: col.split('.')[0] for col in gene_data.columns}, inplace=True)

# Drop rows containing any missing value, then transpose so that each former
# column becomes a row (sample) and each former row becomes a column (feature).
gene_data = gene_data.dropna()
pv = gene_data.transpose()

# Per-feature mean and standard deviation across samples.
feature_average = pv.mean(axis=0)
feature_std = pv.std(axis=0)

# Z-score every feature.
# NOTE(review): a feature with zero standard deviation would produce NaN
# here - confirm none are present.
normalized_gene_data = (pv - feature_average) / feature_std
# The row index holds the collapsed column names, i.e. one name per sample.
y = normalized_gene_data.index

normalized_gene_data = np.array(normalized_gene_data)

# Map disease names to numerical labels
# (integers assigned in order of first appearance in the index)
disease_to_label = {disease: label for label, disease in enumerate(y.unique())}
y = y.map(disease_to_label)
gene_label = np.array(y)

# Print the normalised matrix, its shape and the integer labels for inspection.
print (normalized_gene_data, "data")
print (normalized_gene_data.shape, "data shape")
print (gene_label, "labels")

[[-0.65596487 -0.56239576  1.26912064 ... -0.21995141  0.13120849
   0.74682323]
 [-0.23739987 -0.35406809  0.19924177 ... -0.03714221  1.95347887
   2.25018585]
 [-0.02620014 -0.29920761  0.97409525 ... -0.91533748  0.10112479
  -1.33840002]
 ...
 [ 3.16960193  0.76242707 -0.61820316 ...  0.59140993 -0.64388149
  -1.31286269]
 [-0.76833216 -0.85001354  0.79121196 ...  0.10346723 -0.75766475
   1.53719591]
 [ 2.53450452  2.06903002  1.55672107 ...  1.0696598  -1.27938308
   0.70114378]] data
(86, 15044) data shape
[0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0
 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1
 0 1 0 1 0 1 0 1 0 1 0 1] labels


In [103]:
# Selector parameters for the gene-expression data.
gamma, beta = -0.5, 1

# Score every gene and keep those whose coefficient exceeds beta.
selected_features = fisher_markov_selector(normalized_gene_data, gene_label, gamma=gamma, beta=beta)
print("Selected Features:", selected_features)

Selected Features: [0. 0. 0. ... 0. 0. 0.]


In [104]:
# Number of genes flagged as selected (mask entries equal to 1).
int(np.sum(selected_features == 1))

32

# Classification on Iris data

## SVM 

In [107]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Split BEFORE selecting features. The selector is supervised (it reads the
# labels), so fitting it on the full dataset would let test-set labels
# influence which columns the classifier sees, inflating the reported accuracy.
X_train_iris_all, X_test_iris_all, y_train, y_test = train_test_split(
    iris_data, iris_y, test_size=0.2, random_state=42
)

# Fit the selector on the training fold only. gamma=-0.5 and beta=0.5 are the
# values used for the Iris selection earlier in the notebook; they are passed
# explicitly because the globals `gamma` / `beta` are overwritten by other cells.
selected_features_iris_train = fisher_markov_selector(X_train_iris_all, y_train, -0.5, 0.5)
iris_feature_mask = selected_features_iris_train == 1
if not iris_feature_mask.any():
    raise ValueError("No Iris feature exceeded beta on the training fold; lower beta.")

# Apply the same column mask to the full matrix, the training fold and the test fold.
data_with_selected_features = iris_data[:, iris_feature_mask]
X_train = X_train_iris_all[:, iris_feature_mask]
X_test = X_test_iris_all[:, iris_feature_mask]

# Train the classifier 
classifier = SVC(kernel='linear')
classifier.fit(X_train, y_train)

# Predict on the testing set
y_pred = classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [83]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score whichever predictions are currently held in y_pred against y_test.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Heading and per-class precision / recall / F1 table, one per line.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")


Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



## Random Forest

In [85]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree forest with a fixed seed on the current training split
# (fit returns the estimator itself, so construction and fitting can be chained).
classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out rows and report plain accuracy.
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [108]:
# Score whichever predictions are currently held in y_pred against y_test.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Heading and per-class precision / recall / F1 table, one per line.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")


Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



# Classification on optical digits (UCI optdigits)

In [109]:
# Split BEFORE selecting features. The selector is supervised (it reads the
# labels), so fitting it on every row would let held-out labels influence
# which pixels the classifier sees, inflating the reported accuracy.
X_train_opt_all, X_test_opt_all, y_train, y_test = train_test_split(
    optical_data, optical_y, test_size=0.25, random_state=42
)

# Fit the selector on the training fold only. gamma=-0.5 and beta=0.1 are the
# values used for the optdigits selection earlier in the notebook; they are
# passed explicitly because the globals `gamma` / `beta` are overwritten by
# other cells. The selector expects plain arrays, hence np.array(...).
selected_features_optical_train = fisher_markov_selector(
    np.array(X_train_opt_all), np.array(y_train), -0.5, 0.1
)
optical_feature_mask = selected_features_optical_train.astype(bool)
if not optical_feature_mask.any():
    raise ValueError("No optdigits feature exceeded beta on the training fold; lower beta.")

# Apply the same column mask to the full frame, the training fold and the test fold.
data_selected_features_opt = optical_data.iloc[:, optical_feature_mask]
X_train = X_train_opt_all.iloc[:, optical_feature_mask]
X_test = X_test_opt_all.iloc[:, optical_feature_mask]

classifier = SVC(kernel='linear')
classifier.fit(X_train, y_train)

# Predict on the testing set
y_pred = classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9769874476987448


In [110]:
# Heading and per-class precision / recall / F1 table, one per line.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        86
           1       0.94      0.98      0.96        98
           2       1.00      0.97      0.98       100
           3       0.98      0.97      0.98       108
           4       0.96      0.99      0.98        81
           5       0.96      1.00      0.98        94
           6       0.97      0.98      0.97        93
           7       0.99      0.98      0.99       118
           8       0.99      0.93      0.96        81
           9       0.98      0.97      0.97        97

    accuracy                           0.98       956
   macro avg       0.98      0.98      0.98       956
weighted avg       0.98      0.98      0.98       956

