In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import category_encoders as ce
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, accuracy_score
from utils.AL_utils import select_lowest_probabieties, select_samples, save_metrics_to_file, calculate_metrics

# Load Data

In [2]:
file_name = 'car.data'
df = pd.read_csv(os.path.join(os.getcwd(), 'car+evaluation', file_name), header=None)

In [3]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


# Preprocess data

In [4]:
# Rename columns
columns_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
df.columns = columns_names

# Split data for X and y
X = df.drop(columns='class', axis=1)
y = df['class']

In [5]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
# Split data to DL and DU
X_labelled, X_unlabelled, y_labeled, y_unlabelled = train_test_split(X_train, y_train, test_size=0.9, stratify=y_train)

# Prepare data for model
encoder = ce.OrdinalEncoder(cols=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'])
X_labelled = encoder.fit_transform(X_labelled)
X_test = encoder.transform(X_test)
X_unlabelled = encoder.transform(X_unlabelled)


# Whole budget
B =1000
# Budget per cycle
b = 50
# Number of cycle
c = 0

results_folder = 'results'

while B>0:
    # 1. Initialize and train the model
    dt_clf = DecisionTreeClassifier(criterion='gini', max_depth=100, random_state=0)
    dt_clf.fit(X_labelled, y_labeled)
    probalilities = dt_clf.predict_proba(X_unlabelled)
    # 2. Select samples based on choosen metrics and ask Oracle
    X_lowest_prob, y_lowest_proba, X_rest, y_rest = select_samples(probalility=probalilities, df=X_unlabelled, n_samples=b, labels=y_unlabelled, metric='margin_sampling')

    # 3. Add samples labelled by Oracle to DL
    X_labelled = pd.concat([X_labelled, X_lowest_prob])
    y_labeled = pd.concat([y_labeled, y_lowest_proba])
    # 4. Update DUL
    X_unlabelled = X_rest
    y_unlabelled = y_rest

    # Calculate accuracy for this cycle
    y_pred = dt_clf.predict(X_test)

    metrics_filename = 'classic_AL.csv'

    # Calculate metrics
    precision, recall, f1, cm, accuracy = calculate_metrics(y_test, y_pred)
    
    # Save the metrics to the file
    save_metrics_to_file(results_folder, metrics_filename, c, precision, recall, f1, cm, accuracy, X_labelled, y_test)
        
    # Print accurracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nOverall Accuracy: {accuracy:.4f}")
    
    # Update cycle and budget
    c +=1
    B -=b

Metrics saved for cycle 0 to results/classic_AL.csv

Overall Accuracy: 0.8150
Metrics saved for cycle 1 to results/classic_AL.csv

Overall Accuracy: 0.8237
Metrics saved for cycle 2 to results/classic_AL.csv

Overall Accuracy: 0.8497
Metrics saved for cycle 3 to results/classic_AL.csv

Overall Accuracy: 0.8526
Metrics saved for cycle 4 to results/classic_AL.csv

Overall Accuracy: 0.8555
Metrics saved for cycle 5 to results/classic_AL.csv

Overall Accuracy: 0.8353
Metrics saved for cycle 6 to results/classic_AL.csv

Overall Accuracy: 0.8439
Metrics saved for cycle 7 to results/classic_AL.csv

Overall Accuracy: 0.8584
Metrics saved for cycle 8 to results/classic_AL.csv

Overall Accuracy: 0.8757
Metrics saved for cycle 9 to results/classic_AL.csv

Overall Accuracy: 0.8757
Metrics saved for cycle 10 to results/classic_AL.csv

Overall Accuracy: 0.8786
Metrics saved for cycle 11 to results/classic_AL.csv

Overall Accuracy: 0.9017
Metrics saved for cycle 12 to results/classic_AL.csv

Overall 