In [None]:
import numpy as np
import json
import os
import pandas as pd

print(os.getcwd())

datasets = [
    "recist_pancancer_cosmic_card_7-COMBINED",
    "recist_pancancer_cosmic_card_7-BINARY",
    "recist_pancancer_union_cosmic_card_7-COMBINED",
    "recist_pancancer_union_cosmic_card_7-BINARY",
    "recist_melanoma_cosmic_card_4-COMBINED",
    "recist_melanoma_cosmic_card_4-BINARY",
    "recist_melanoma_union_cosmic_card_4-COMBINED",
    "recist_melanoma_union_cosmic_card_4-BINARY"
    ]
 
json_file = "../../results/ffs_results_recist_pancancer_union_cosmic_card_7-COMBINED_50.json"  # "../../results/ffs_results_recist_pancancer_cosmic_card_7-BINARY_50.json" 
output_file =   "../../data/data_processed/recist_pancancer_union_cosmic_card_7-COMBINED.csv"  # "../../data/data_processed/recist_pancancer_cosmic_card_7-BINARY.csv"

# Fix path issues for json_file
json_file = os.path.normpath(json_file)
print(f"Using JSON file: {json_file}")

with open(json_file, "r") as f:
    ffs_results = json.load(f)

print(f"Loaded {len(ffs_results)} results from {json_file}")

# Read the melanoma batch corrected data


df_melanoma = pd.read_csv(output_file, index_col=0)

print(f"Data loaded successfully!")
print(f"Shape: {df_melanoma.shape}")
print(f"Columns: {list(df_melanoma.columns[:5])}... (showing first 5)")
print(f"Target variable (recist) distribution:")
print(df_melanoma['recist'].value_counts())
print(f"\nFirst few rows:")
print(df_melanoma.head())

columns = list(df_melanoma.columns[1:])  # Exclude 'Unnamed: 0' and 'recist'
print(columns)  # Show first 5 gene names to verify

# Check for NaN values in df_melanoma
print("="*60)
print("CHECKING FOR NaN VALUES IN df_melanoma")
print("="*60)

# Check for NaN values in the entire dataframe
total_nan = df_melanoma.isnull().sum().sum()
print(f"Total NaN values in dataframe: {total_nan}")

# Check NaN values per column
nan_per_column = df_melanoma.isnull().sum()
columns_with_nan = nan_per_column[nan_per_column > 0]

if len(columns_with_nan) > 0:
    print(f"\nColumns with NaN values:")
    for col, nan_count in columns_with_nan.items():
        print(f"  {col}: {nan_count} NaN values")
else:
    print("\nNo NaN values found in any column")

# Specifically check the target variable 'recist'
nan_in_target = df_melanoma['recist'].isnull().sum()
print(f"\nNaN values in target variable 'recist': {nan_in_target}")

# Check for infinite values
inf_values = np.isinf(df_melanoma.select_dtypes(include=[np.number])).sum().sum()
print(f"Infinite values in numeric columns: {inf_values}")

# Show data info
print(f"\nDataframe shape: {df_melanoma.shape}")
print(f"Data types:")
print(df_melanoma.dtypes.value_counts())

# Remove observations with NaN values in the target variable 'recist'
print(f"Original shape: {df_melanoma.shape}")
df_melanoma_clean = df_melanoma.dropna(subset=['recist'])
print(f"Shape after removing NaN in target: {df_melanoma_clean.shape}")
print(f"Removed {df_melanoma.shape[0] - df_melanoma_clean.shape[0]} observations with NaN in 'recist'")

# Update the dataframe
df_melanoma = df_melanoma_clean

# Verify no NaN values remain in target
print(f"NaN values in 'recist' after cleaning: {df_melanoma['recist'].isnull().sum()}")
print(f"Target distribution after cleaning:")
print(df_melanoma['recist'].value_counts())

In [16]:
from collections import Counter
import os
import json
import pandas as pd

json_file = "../../results/ffs_results_mirna_nb_50.json" #"../../results/Imvigor_RFE_Results/rfe_results_imvigor_10.json"  # "../../results/ffs_results_recist_pancancer_cosmic_card_7-BINARY_50.json" 
#output_file =   "../../data/data_processed/recist_pancancer_union_cosmic_card_7-COMBINED.csv"  # "../../data/data_processed/recist_pancancer_cosmic_card_7-BINARY.csv"

json_true = "../../data/mirna_nbe_info.json"

with open(json_true, "r") as f:
    info = json.load(f)

true_features = info["support"]
print("True features:")
true_possitions = [idx for idx, val in enumerate(true_features) if val == True]
print(true_possitions)
import numpy as np

# Fix path issues for json_file
json_file = os.path.normpath(json_file)
print(f"Using JSON file: {json_file}")

with open(json_file, "r") as f:
    ffs_results = json.load(f)
# Extract all selected features from all runs
all_features = []
TPR = []
ii = 0
for run_data in ffs_results.values():
    # Parse the string representation of the array
    feature_str = run_data['selected_features']
    
    # Remove brackets and split by whitespace
    features = feature_str.strip().split()

    features = feature_str.replace('[', '').replace(']', '').split()
    feature_str = feature_str.replace('[', '').replace(']', '').split()

    # Convert to integers
    features = [int(f) for f in feature_str]
    # Calculate True Positive Rate (TPR)
    true_positives = sum(1 for f in features if f in true_possitions)
    tpr = true_positives / len(true_possitions) if len(true_possitions) > 0 else 0
    TPR.append(tpr)

    all_features.extend(features)

    ii = ii +1 
    print(ii)
    if ii == 19:
        break

print(f"Average TPR over runs: {np.mean(TPR):.4f} ± {np.std(TPR):.4f}")
# Count frequency of each feature
feature_counts = Counter(all_features)

# Get top k most frequent features
k = 10  # You can change this value
top_k_features = [feature for feature, count in feature_counts.most_common(k)]

print(f"Top {k} most frequent features:")
print(top_k_features)

# Also show the counts for reference
print(f"\nFeature frequencies:")
for feature, count in feature_counts.most_common(k):
    print(f"Feature {feature}: {count} times")



    # Map feature indices to actual gene names
    #top_k_feature_names = [columns[feature] for feature in top_k_features]


    # Create a summary dataframe
    feature_summary = pd.DataFrame({
        'feature_index': top_k_features,
        'gene_name': top_k_feature_names
    })

print(f"\nSummary DataFrame:")
print(feature_summary)

print(feature_summary.to_latex(index=False))




True features:
[13, 41, 72, 110, 116, 128, 137, 165, 197, 221, 222, 245, 248, 255, 292]
Using JSON file: ../../results/ffs_results_mirna_nb_50.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
Average TPR over runs: 0.7368 ± 0.0506
Top 10 most frequent features:
[72, 116, 137, 165, 197, 221, 222, 255, 292, 128]

Feature frequencies:
Feature 72: 19 times


NameError: name 'top_k_feature_names' is not defined

In [None]:
from sklearn.utils import resample

def downsample_to_minority_class(X, y, random_state=42):
    """
    Downsample all classes to the size of the minority class.
    
    Parameters:
    -----------
    X : numpy.ndarray
        Feature matrix
    y : numpy.ndarray
        Target labels
    random_state : int
        Random seed for reproducibility
    
    Returns:
    --------
    X_downsampled : numpy.ndarray
        Downsampled feature matrix
    y_downsampled : numpy.ndarray
        Downsampled target labels
    """
    # Identify the minority class size
    class_counts = pd.Series(y).value_counts()
    min_class_size = class_counts.min()
    
    print(f"Original class distribution:")
    print(class_counts)
    print(f"\nMinority class size: {min_class_size}")
    
    # Separate by class
    indices_by_class = {}
    for class_label in class_counts.index:
        indices_by_class[class_label] = np.where(y == class_label)[0]
    
    # Downsample each class to minority class size
    downsampled_indices = []
    for class_label, indices in indices_by_class.items():
        downsampled = resample(indices, 
                              replace=False, 
                              n_samples=min_class_size,
                              random_state=random_state)
        downsampled_indices.extend(downsampled)
    
    # Shuffle the indices
    np.random.seed(random_state)
    np.random.shuffle(downsampled_indices)
    
    # Create downsampled dataset
    X_downsampled = X[downsampled_indices]
    y_downsampled = y[downsampled_indices]
    
    print(f"\nDownsampled class distribution:")
    print(pd.Series(y_downsampled).value_counts())
    print(f"\nNew dataset shape: X={X_downsampled.shape}, y={y_downsampled.shape}")
    
    return X_downsampled, y_downsampled

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

# Get top k most frequent features
k = 10  # You can change this value
for k in range(10, 45):
    top_k_features = [feature for feature, count in feature_counts.most_common(k)]

    print(f"Top {k} most frequent features:")
    print(top_k_features)

    # Also show the counts for reference
    print(f"\nFeature frequencies:")
    for feature, count in feature_counts.most_common(k):
        print(f"Feature {feature}: {count} times")



        # Map feature indices to actual gene names
        top_k_feature_names = [columns[feature] for feature in top_k_features]


        # Create a summary dataframe
        feature_summary = pd.DataFrame({
            'feature_index': top_k_features,
            'gene_name': top_k_feature_names
        })

    print(f"\nSummary DataFrame:")
    print(feature_summary)

    print(feature_summary.to_latex(index=False))

    # Extract the selected genes from feature_summary
    selected_genes = feature_summary['gene_name'].tolist()
    print(f"Selected genes for logistic regression: {selected_genes}")

    # Prepare the data
    X = df_melanoma[selected_genes].values
    y = df_melanoma['recist'].values

    X, y = downsample_to_minority_class(X, y, random_state=42)

    print(f"Data shape: X={X.shape}, y={y.shape}")
    print(f"Target distribution: {pd.Series(y).value_counts().to_dict()}")

    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Set up cross-validation
    cv = StratifiedKFold(n_splits=4, shuffle=True)

    # Create logistic regression model
    lr = LogisticRegression( max_iter=1000, class_weight='balanced')

    # Perform cross-validation with multiple metrics
    scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
    cv_results = cross_validate(lr, X_scaled, y, cv=cv, scoring=scoring, return_train_score=True)

    # Print cross-validation results
    print("="*60)
    print("CROSS-VALIDATION RESULTS")
    print("="*60)

    for metric in scoring:
        train_scores = cv_results[f'train_{metric}']
        test_scores = cv_results[f'test_{metric}']
        print(f"{metric.upper()}:")
        print(f"  Train: {train_scores.mean():.3f} ± {train_scores.std():.3f}")
        print(f"  Test:  {test_scores.mean():.3f} ± {test_scores.std():.3f}")
        print()

    # Fit final model on all data for interpretation
    lr_final = LogisticRegression(random_state=42, max_iter=1000)
    lr_final.fit(X_scaled, y)

    # Feature importance (coefficients)
    feature_importance = pd.DataFrame({
        'gene': selected_genes,
        'coefficient': lr_final.coef_[0],
        'abs_coefficient': np.abs(lr_final.coef_[0])
    }).sort_values('abs_coefficient', ascending=False)

    print("="*60)
    print("FEATURE IMPORTANCE (Logistic Regression Coefficients)")
    print("="*60)
    print(feature_importance)

    # Visualize feature importance
    plt.figure(figsize=(12, 8))
    plt.subplot(2, 2, 1)
    plt.barh(range(len(feature_importance)), feature_importance['coefficient'])
    plt.yticks(range(len(feature_importance)), feature_importance['gene'], fontsize=8)
    plt.xlabel('Coefficient Value')
    plt.title('Feature Coefficients')
    plt.grid(True, alpha=0.3)

    # Cross-validation scores visualization
    plt.subplot(2, 2, 2)
    test_scores = [cv_results[f'test_{metric}'] for metric in scoring]
    plt.boxplot(test_scores, labels=[m.replace('_macro', '') for m in scoring])
    plt.ylabel('Score')
    plt.title('Cross-Validation Performance')
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3)


    # Prediction probabilities for one fold (for illustration)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)
    lr_temp = LogisticRegression(random_state=42, max_iter=1000,class_weight='balanced')
    lr_temp.fit(X_train, y_train)
    y_pred_proba = lr_temp.predict_proba(X_test)
    y_pred = lr_temp.predict(X_test)


    # Confusion matrix
    plt.subplot(2, 2, 3)
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=lr_temp.classes_, yticklabels=lr_temp.classes_)
    plt.title('Confusion Matrix (Test Set)')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')

    # Prediction probability distribution
    plt.subplot(2, 2, 4)
    for i, class_name in enumerate(lr_temp.classes_):
        class_probs = y_pred_proba[:, i]
        plt.hist(class_probs, alpha=0.6, label=f'{class_name}', bins=20)
    plt.xlabel('Prediction Probability')
    plt.ylabel('Frequency')
    plt.title('Prediction Probability Distribution')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Detailed classification report
    print("="*60)
    print("CLASSIFICATION REPORT (Single Train-Test Split)")
    print("="*60)
    print(classification_report(y_test, y_pred))

    print("="*60)
    print("SUMMARY")
    print("="*60)
    print(f"Number of features used: {len(selected_genes)}")
    print(f"Cross-validation accuracy: {cv_results['test_accuracy'].mean():.3f} ± {cv_results['test_accuracy'].std():.3f}")
    print(f"Most important features (by |coefficient|):")
    for i, row in feature_importance.head(3).iterrows():
        print(f"  {row['gene']}: {row['coefficient']:.3f}")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

# Get top k most frequent features
k = 10  # You can change this value
for k in range(13, 34):
    top_k_features = [feature for feature, count in feature_counts.most_common(k)]

    print(f"Top {k} most frequent features:")
    print(top_k_features)

    # Also show the counts for reference
    print(f"\nFeature frequencies:")
    for feature, count in feature_counts.most_common(k):
        print(f"Feature {feature}: {count} times")



        # Map feature indices to actual gene names
        top_k_feature_names = [columns[feature] for feature in top_k_features]


        # Create a summary dataframe
        feature_summary = pd.DataFrame({
            'feature_index': top_k_features,
            'gene_name': top_k_feature_names
        })

    print(f"\nSummary DataFrame:")
    print(feature_summary)

    print(feature_summary.to_latex(index=False))

    # Extract the selected genes from feature_summary
    selected_genes = feature_summary['gene_name'].tolist()
    print(f"Selected genes for logistic regression: {selected_genes}")

    # Prepare the data
    X = df_melanoma[selected_genes].values
    y = df_melanoma['recist'].values
    y_combined = np.where((y == 'PR-SD') | (y == 'PD'), 'PD-PR-SD', y)

    X, y_combined = downsample_to_minority_class(X, y_combined, random_state=42)
    
    print(f"Data shape: X={X.shape}, y={y.shape}")
    print(f"Target distribution: {pd.Series(y_combined).value_counts().to_dict()}")

    # Standardize features
    #scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Set up cross-validation
    cv = StratifiedKFold(n_splits=4, shuffle=True)

    # Create logistic regression model
    lr = LogisticRegression( max_iter=1000, class_weight='balanced')

    # Perform cross-validation with multiple metrics
    scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
    cv_results = cross_validate(lr, X_scaled, y_combined, cv=cv, scoring=scoring, return_train_score=True)

    # Print cross-validation results
    print("="*60)
    print("CROSS-VALIDATION RESULTS")
    print("="*60)

    for metric in scoring:
        train_scores = cv_results[f'train_{metric}']
        test_scores = cv_results[f'test_{metric}']
        print(f"{metric.upper()}:")
        print(f"  Train: {train_scores.mean():.3f} ± {train_scores.std():.3f}")
        print(f"  Test:  {test_scores.mean():.3f} ± {test_scores.std():.3f}")
        print()

    # Fit final model on all data for interpretation
    lr_final = LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')
    lr_final.fit(X_scaled, y_combined)

    # Feature importance (coefficients)
    feature_importance = pd.DataFrame({
        'gene': selected_genes,
        'coefficient': lr_final.coef_[0],
        'abs_coefficient': np.abs(lr_final.coef_[0])
    }).sort_values('abs_coefficient', ascending=False)

    print("="*60)
    print("FEATURE IMPORTANCE (Logistic Regression Coefficients)")
    print("="*60)
    print(feature_importance)

    # Visualize feature importance
    plt.figure(figsize=(12, 8))
    plt.subplot(2, 2, 1)
    plt.barh(range(len(feature_importance)), feature_importance['coefficient'])
    plt.yticks(range(len(feature_importance)), feature_importance['gene'], fontsize=8)
    plt.xlabel('Coefficient Value')
    plt.title('Feature Coefficients')
    plt.grid(True, alpha=0.3)

    # Cross-validation scores visualization
    plt.subplot(2, 2, 2)
    test_scores = [cv_results[f'test_{metric}'] for metric in scoring]
    plt.boxplot(test_scores, labels=[m.replace('_macro', '') for m in scoring])
    plt.ylabel('Score')
    plt.title('Cross-Validation Performance')
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3)


    # Prediction probabilities for one fold (for illustration)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_combined, test_size=0.2, random_state=42, stratify=y_combined)
    lr_temp = LogisticRegression( max_iter=1000)
    lr_temp.fit(X_train, y_train)
    y_pred_proba = lr_temp.predict_proba(X_test)
    y_pred = lr_temp.predict(X_test)


    # Confusion matrix
    plt.subplot(2, 2, 3)
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=lr_temp.classes_, yticklabels=lr_temp.classes_)
    plt.title('Confusion Matrix (Test Set)')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')

    # Prediction probability distribution
    plt.subplot(2, 2, 4)
   

    # Elegimos la probabilidad de la clase mayoritaria (o la positiva, index 1 generalmente)
    # Asumimos que lr_temp.classes_[1] es 'PD-PR-SD'
    pos_class_index = 1 
    probs_pos_class = y_pred_proba[:, pos_class_index]

    # Separamos las probabilidades basadas en la verdad (y_test)
    for class_label in lr_temp.classes_:
        # Buscamos los índices donde la etiqueta REAL es class_label
        mask = (y_test == class_label)
        
        # Graficamos las probabilidades SOLO de esos pacientes
        plt.hist(probs_pos_class[mask], alpha=0.6, label=f'True {class_label}', bins=10)

    plt.xlabel(f'Probability of being {lr_temp.classes_[pos_class_index]}')
    plt.ylabel('Frequency')
    plt.title('Separation of Classes')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Detailed classification report
    print("="*60)
    print("CLASSIFICATION REPORT (Single Train-Test Split)")
    print("="*60)
    print(classification_report(y_test, y_pred))

    print("="*60)
    print("SUMMARY")
    print("="*60)
    print(f"Number of features used: {len(selected_genes)}")
    print(f"Cross-validation accuracy: {cv_results['test_accuracy'].mean():.3f} ± {cv_results['test_accuracy'].std():.3f}")
    print(f"Most important features (by |coefficient|):")
    for i, row in feature_importance.head(3).iterrows():
        print(f"  {row['gene']}: {row['coefficient']:.3f}")

In [None]:
y_pred_proba