In [2]:
import numpy as np
np.set_printoptions(threshold=10000,suppress=True)
import pandas as pd
import warnings
import matplotlib
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [10]:
# Lire correctement le fichier Wave.txt en utilisant un séparateur d'espaces
# (le fichier contient des valeurs séparées par des espaces, pas par des tabulations)
# Utiliser sep='\s+' pour gérer tout nombre d'espaces like '  ' ou '\t'

df = pd.read_csv('Wave.txt', sep='\s+', header=None, engine='python')
print("Loaded df shape:", df.shape)

# Renommer la dernière colonne en 'label' pour clarifier les diagnostics
n_cols = df.shape[1]
feature_names = [f'X{i}' for i in range(n_cols - 1)] + ['label']
df.columns = feature_names

print("Columns preview (first 5 cols):")
print(df.iloc[:, :5].head())

# Vérification des labels
y = df['label']
print("label column name:", y.name)
print("dtype:", y.dtype)
print("shape df:", df.shape)
print("n unique labels:", y.nunique())
print("value counts (sorted):\n", y.value_counts().sort_index())

# Créer un programme qui permet de découper votre base de données X avec un échantillonnage stratifié
# par rapport aux labels en deux sous-ensembles d’apprentissage A et de test T de tailles respectivement
# 1/2 et 1/2. La fonction gère les classes rares (moins de 2 échantillons) en revenant à un split aléatoire.

def stratified_split(df, test_size=0.5, random_state=42, min_count_for_stratify=2):
    """Retourne A (train) et T (test). Si certaines classes ont moins de
    `min_count_for_stratify` échantillons, on utilisera un split aléatoire (pas de stratification).
    """
    from sklearn.model_selection import train_test_split

    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    counts = y.value_counts()
    # Afficher les comptes par classe pour aide au débogage
    print("Classe counts:\n", counts)

    if counts.min() < min_count_for_stratify:
        rare = counts[counts < min_count_for_stratify]
        print(f"⚠️ Certaines classes sont trop rares ({len(rare)} classes), p.ex. :\n{rare}\n"
              "Utilisation d'un split aléatoire (stratify=None).")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, stratify=None, random_state=random_state
        )
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, stratify=y, random_state=random_state
        )

    # Combiner les ensembles d'apprentissage et de test
    A = pd.concat([X_train, y_train], axis=1)
    T = pd.concat([X_test, y_test], axis=1)

    print(f"A shape: {A.shape}, T shape: {T.shape}")
    return A, T


A, T = stratified_split(df)

Loaded df shape: (5000, 41)
Columns preview (first 5 cols):
     X0    X1    X2    X3    X4
0 -0.23 -1.21  1.20  1.23 -0.10
1  0.38  0.38 -0.31 -0.09  1.52
2 -0.69  1.00  1.08  1.48  2.44
3  0.40  0.68  0.27  1.39  1.03
4 -0.81  1.59 -0.69  1.16  4.22
label column name: label
dtype: int64
shape df: (5000, 41)
n unique labels: 3
value counts (sorted):
 label
0    1692
1    1653
2    1655
Name: count, dtype: int64
Classe counts:
 label
0    1692
2    1655
1    1653
Name: count, dtype: int64
A shape: (2500, 41), T shape: (2500, 41)


In [23]:
# Base A  partiellement etiquetée  pct (% des donnees labellisee par rapport a la taille totale de A )

def semi_label(A, pct=0.1, random_state=42):
    np.random.seed(random_state)
    A_labeled = A.copy()
    n_samples = A.shape[0]
    n_labeled = int(n_samples * pct)

    # Sélectionner aléatoirement les indices des échantillons à étiqueter
    labeled_indices = np.random.choice(n_samples, n_labeled, replace=False)

    # Mettre les labels non sélectionnés à NaN
    A_labeled.loc[~A_labeled.index.isin(labeled_indices), 'label'] = np.nan

    print(f"Nombre d'échantillons dans A: {n_samples}")
    print(f"Nombre d'échantillons étiquetés dans A: {n_labeled}")

    # separation A_etiq  et a_non_etiq
    A_etiq = A_labeled.dropna(subset=['label'])
    A_non_etiq = A_labeled[A_labeled['label'].isna()]

    return A_etiq, A_non_etiq
    

In [28]:
A_labeled = semi_label(A, pct=0.1)
A_etiq = A_labeled[0]
print(A_etiq)
A_non_etiq = A_labeled[1]

Nombre d'échantillons dans A: 2500
Nombre d'échantillons étiquetés dans A: 250
        X0    X1    X2    X3    X4    X5    X6    X7    X8    X9  ...   X31  \
1011  1.26  0.79  0.28  3.12  1.37  6.73  5.07  5.35  2.99  3.59  ... -3.00   
1977  0.78  1.36  2.06  1.68  0.91  2.44  1.49  3.39  0.77  2.25  ... -0.80   
2054 -0.65  0.41  0.05  1.26  0.38 -0.87 -0.45  0.98  0.25  1.29  ... -0.86   
756   0.15 -0.11  1.21 -1.41  1.36  0.90  3.14  2.35  3.88  6.66  ...  1.09   
450  -0.84 -0.83  2.19  2.11  3.53  4.13  5.51  4.63  4.18  4.38  ... -1.00   
...    ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   ...   
1047  0.47 -1.03  1.24  0.49  0.16  2.06  2.87  2.14  0.52  0.39  ...  0.04   
2442  0.82 -0.31  1.92  0.65  1.12  0.85 -0.07  2.19  1.39  0.12  ...  0.15   
1655  0.55 -2.33  2.36  3.10  0.95  2.49  3.48  4.24  4.58  5.20  ...  0.18   
2032  1.68  1.06  0.44  3.66  2.08  1.62  3.60  3.46  3.72 -0.57  ...  0.70   
2045 -0.70 -0.83  1.29  0.39 -0.77  1.82  0.91  2.79

In [None]:
# NOTE: placeholder S1 removed — Fisher-score calculation is implemented in the next cell
# This cell kept for informational purpose only.
# Original S1(v) was incorrect; the correct calculations use S1 (between-class) and S2 (within-class)
# computed only on the labeled samples of A_labeled.


In [18]:
# S1(v) = sum_k n_k * (mu_k - mu)^2 / sum_k n_k * sigma (ecart type) ²
# S2(v) = sum_k n_k * sigma_k^2

def S1(x, y):
    mu = x.mean()
    s1 = 0.0
    s2 = 0.0
    for k, group in x.groupby(y):
        n_k = len(group)
        mu_k = group.mean()
        s1 += n_k * (mu_k - mu) ** 2
        var_k = group.var(ddof=0)
        s2 += n_k * var_k
    return s1 / s2 if s2 != 0 else np.nan



In [31]:
def S2(x, t=10):
    x = x.values  # vecteur numpy
    var = np.var(x)
    if var == 0:
        return np.nan
    
    n = len(x)
    score = 0.0
    
    for i in range(n):
        for j in range(n):
            S_ij = np.exp(- ((x[i] - x[j])**2) / t)
            score += S_ij * ((x[i] - x[j])**2) / var
    
    return score


In [35]:
def compute_scores(A_etiq, A_non_etiq):
    
    X_etiq = A_etiq.drop(columns=["label"])
    y = A_etiq["label"]
    X_non = A_non_etiq.drop(columns=["label"])
    
    scores = {}
    
    for v in X_etiq.columns:
        s1 = S1(X_etiq[v], y)
        s2 = S2(X_non[v])
        scores[v] = s1 / s2 if s2 not in [0, np.nan] else np.nan
    
    return pd.Series(scores).sort_values(ascending=False)


In [None]:
scores = compute_scores(A_etiq, A_non_etiq)
print(scores)
