In [13]:
import sys
sys.path.append('../')
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from semi_supervised_transformers import PreprocessingPipeline
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.preprocessing import LabelEncoder

In [14]:
# Location of the processed dataset, kept in a single named constant so the path
# can be changed in one place when the notebook is run on another machine.
DATA_PATH = '/Users/annabzinkowska/DTU/master_thesis/data/processed_data_all_bertopic.csv'

# Read the label column explicitly as strings. Later cells identify unlabeled rows
# by comparing `category_bertopic` with the string '-1'; if pandas inferred an
# integer dtype for any part of that column, those comparisons would silently
# treat unlabeled rows as labeled.
data = pd.read_csv(DATA_PATH, dtype={'category_bertopic': str})

  data = pd.read_csv('/Users/annabzinkowska/DTU/master_thesis/data/processed_data_all_bertopic.csv')


In [15]:
# Feature columns handed to the preprocessing pipeline as categorical inputs.
categorical_columns = [
    'month_mode',
    'quarter_mode',
    'year_mode',
    'day_week_mode',
    'customer_country_mode',
]

# Feature columns handed to the preprocessing pipeline as numerical inputs.
numerical_columns = [
    'quantity_sum',
    'price_sum',
    'unit_weight',
    'unit_price_mean',
    'customer_country_count',
    'customer_id_count',
]

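The cells below wrap each candidate classifier in scikit-learn's `SelfTrainingClassifier`, fit it on the labeled training rows together with the unlabeled rows (those whose `category_bertopic` is `'-1'`), evaluate it on the held-out validation set, and then display a ranking of the classifiers by accuracy.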

In [16]:
# Setup for the self-training experiment.
# Inputs from earlier cells: `data` (the full DataFrame) and the column-name
# lists `categorical_columns` and `numerical_columns`.

# Fixed seed so the train/validation split (and any enabled classifier) is reproducible
random_seed = 42

# Confidence threshold for self-training. The training cell passes this value to
# SelfTrainingClassifier and reuses it when counting confident predictions, so it
# must be assigned here; 0.75 is also scikit-learn's default.
threshold = 0.75

# Splitting data into labeled and unlabeled samples:
# rows whose `category_bertopic` is the string '-1' are the unlabeled ones
labeled_data = data[data['category_bertopic'] != '-1']
unlabeled_data = data[data['category_bertopic'] == '-1']

X_labeled = labeled_data.drop(columns='category_bertopic')
y_labeled = labeled_data['category_bertopic']
X_unlabeled = unlabeled_data.drop(columns='category_bertopic')

# Hold out 20% of the labeled rows for evaluation, stratified by label
X_train, X_val, y_train, y_val = train_test_split(
    X_labeled,
    y_labeled,
    test_size=0.2,
    stratify=y_labeled,
    random_state=random_seed,
)

# Check lengths
print("Length of X_train:", len(X_train))
print("Length of y_train:", len(y_train))
print("Length of X_val:", len(X_val))
print("Length of y_val:", len(y_val))

# Preprocess the data: the pipeline is fitted on the training split only and then
# applied unchanged to the validation and unlabeled rows
pipeline = PreprocessingPipeline(categorical_columns, numerical_columns, text_column=None)
X_train_processed = pipeline.fit_transform(X_train, include_text=False)
X_val_processed = pipeline.transform(X_val, include_text=False)
X_unlabeled_processed = pipeline.transform(X_unlabeled, include_text=False)

# Double-check lengths after processing
print("Length of X_train_processed:", len(X_train_processed))
print("Length of X_val_processed:", len(X_val_processed))
print("Length of X_unlabeled_processed:", len(X_unlabeled_processed))

# Candidate base classifiers as (display name, estimator) pairs.
# Every entry is currently disabled; uncomment the ones to evaluate.
classifiers = [
   #('Random Forest', RandomForestClassifier(random_state=random_seed)),
   #('XGBoost', xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=random_seed)),
   #('LightGBM', lgb.LGBMClassifier(random_state=random_seed)),
   #('SVM', SVC(probability=True, random_state=random_seed)),
   #('KNN', KNeighborsClassifier()),
]

# Fit the LabelEncoder on labels from the labeled dataset
le = LabelEncoder()
le.fit(y_labeled)

# Encode the labeled data as integers
y_train_encoded = le.transform(y_train)
y_val_encoded = le.transform(y_val)

# Next integer after the largest encoded label. Kept for backward compatibility
# only: the training cell marks unlabeled samples with -1, which is the value
# SelfTrainingClassifier expects, and does not read this variable.
unlabeled_placeholder = len(le.classes_)

# Number of confidently pseudo-labeled samples per classifier, as (name, count)
pseudo_labeled_counts = []

# Validation accuracy per classifier, as (name, accuracy)
results = []



Length of X_train: 14573
Length of y_train: 14573
Length of X_val: 3644
Length of y_val: 3644
Length of X_train_processed: 14573
Length of X_val_processed: 3644
Length of X_unlabeled_processed: 16969


In [17]:
# Train and evaluate every candidate classifier with self-training.
#
# Reads from earlier cells: `classifiers`, `threshold`, `le`, `X_train_processed`,
# `X_val_processed`, `X_unlabeled_processed`, `y_train_encoded`, `y_val_encoded`.
# NOTE: `threshold` must already be assigned when this cell runs.

# Start from empty result lists so that re-running this cell does not append
# duplicate entries to the rankings printed at the end.
results = []
pseudo_labeled_counts = []

for name, base_classifier in classifiers:
    # The threshold criterion needs class probabilities from the base estimator
    if not hasattr(base_classifier, 'predict_proba'):
        print(f"Skipping {name} as it does not support predict_proba")
        continue

    self_training_model = SelfTrainingClassifier(base_classifier, criterion='threshold', threshold=threshold)

    # Stack labeled and unlabeled features; unlabeled targets are marked with -1
    combined_X = np.vstack((X_train_processed, X_unlabeled_processed))
    combined_y = np.concatenate([y_train_encoded, [-1] * len(X_unlabeled_processed)])

    # Placeholder labels for the unlabeled data; -1 means "not pseudo-labeled"
    pseudo_labels = np.full(len(X_unlabeled_processed), -1)
    self_training_model.fit(combined_X, combined_y)

    # Get the fitted model's class probabilities for the unlabeled data
    confidence_scores = self_training_model.predict_proba(X_unlabeled_processed)

    # Samples whose top class probability reaches the confidence threshold
    confident_samples = np.max(confidence_scores, axis=1) >= threshold

    # Assign the most probable class to the confident samples only
    pseudo_labels[confident_samples] = self_training_model.classes_[np.argmax(confidence_scores, axis=1)][confident_samples]

    # Count the pseudo-labeled samples for this classifier
    pseudo_labeled_count = np.sum(pseudo_labels != -1)
    pseudo_labeled_counts.append((name, pseudo_labeled_count))

    # Write the pseudo-labels into the unlabeled part of the target vector
    combined_y[len(y_train_encoded):] = pseudo_labels

    # Refit with the confident pseudo-labels treated as known labels
    self_training_model.fit(combined_X, combined_y)

    # Evaluate the classifier after self-training
    y_pred = self_training_model.predict(X_val_processed)
    acc = accuracy_score(y_val_encoded, y_pred)

    # Convert encoded labels back to the original label strings for reporting
    y_pred_original = le.inverse_transform(y_pred)
    y_val_original = le.inverse_transform(y_val_encoded)

    print(f"Results for {name}:")
    print("Accuracy:", acc)
    print("Classification Report:")
    print(classification_report(y_val_original, y_pred_original))
    print("Confusion Matrix:")
    print(confusion_matrix(y_val_original, y_pred_original))
    print("Pseudo-labeled Count:", pseudo_labeled_count)  # Print the number of pseudo-labeled samples
    print("-" * 50)

    results.append((name, acc))

# Sort and display classifiers by accuracy, best first
results.sort(key=lambda x: x[1], reverse=True)
print("Classifier rankings:")
for i, (name, acc) in enumerate(results):
    print(f"{i + 1}. {name}: {acc:.4f}")

# Print the number of pseudo-labeled samples for each classifier
print("Pseudo-labeled Counts:")
for name, count in pseudo_labeled_counts:
    print(f"{name}: {count}")



In [None]:
# Sanity check: compare the raw label values in the data (including the '-1'
# marker for unlabeled rows) with the classes the LabelEncoder was fitted on.
labels_in_data = data['category_bertopic'].unique()
labels_in_encoder = le.classes_
print("Unique labels in data:", labels_in_data)
print("Labels seen by the encoder:", labels_in_encoder)

Unique labels in data: ['-1' 'C' 'D' 'B' 'E' 'A']
Labels seen by the encoder: ['A' 'B' 'C' 'D' 'E']
