In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import category_encoders as ce
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, accuracy_score
from utils.AL_utils import select_lowest_probabieties, select_samples, save_metrics_to_file, calculate_metrics, select_samples_weighted, calculate_dynamic_class_weights_based_on_model
%load_ext autoreload
%autoreload 2

# Load Data

In [2]:
file_name = 'car.data'
folder_name = 'car+evaluation'
df = pd.read_csv(os.path.join(os.getcwd(), folder_name, file_name), header=None)

In [3]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


# Preprocess data

In [4]:
# Rename columns
columns_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
df.columns = columns_names

# Split data for X and y
X = df.drop(columns='class', axis=1)
y = df['class']

# Custom AL

In [5]:
# Split data to train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

# Split data for DL and DUL
X_labelled, X_unlabelled, y_labeled, y_unlabelled = train_test_split(X_train, y_train, test_size=0.9, stratify=y_train)

# Encode variables with ordinal encoding

encoder = ce.OrdinalEncoder(cols=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'])
X_labelled = encoder.fit_transform(X_labelled)
X_test = encoder.transform(X_test)
X_unlabelled = encoder.transform(X_unlabelled)


# Whole budget
B =1000
# Budget per cycle
b = 50
# Number of cycle
c = 0

while B>0:
    dt_clf = DecisionTreeClassifier(criterion='gini', max_depth=100, random_state=0)
    dt_clf.fit(X_labelled, y_labeled)
    probalilities = dt_clf.predict_proba(X_unlabelled)
    class_weights = calculate_dynamic_class_weights_based_on_model(dt_clf, X_labelled, y_labeled)
    X_lowest_prob, y_lowest_proba, X_rest, y_rest = select_samples_weighted(probalility=probalilities, df=X_unlabelled, n_samples=b, labels=y_unlabelled, metric='margin_sampling', class_weights=class_weights)

    X_labelled = pd.concat([X_labelled, X_lowest_prob])
    y_labeled = pd.concat([y_labeled, y_lowest_proba])
    X_unlabelled = X_rest
    y_unlabelled = y_rest
    # Calculate accuracy for this cycle
    y_pred = dt_clf.predict(X_test)
    metrics_filename = 'custom_AL.csv'
    results_folder = 'results'

    # Calculate metrics
    precision, recall, f1, cm, accuracy = calculate_metrics(y_test, y_pred)
    print(precision, recall, f1)
    
    # Save the metrics to the file
    save_metrics_to_file(results_folder, metrics_filename, c, precision, recall, f1, cm, accuracy, X_labelled, y_test)

    c +=1
    B -=b

[0.60810811 0.2        0.90361446 0.375     ] [0.58441558 0.21428571 0.92975207 0.23076923] [0.59602649 0.20689655 0.91649695 0.28571429]
Metrics saved for cycle 0 to results/custom_AL.csv
[0.61971831 0.23076923 0.90118577 0.44444444] [0.57142857 0.21428571 0.94214876 0.30769231] [0.59459459 0.22222222 0.92121212 0.36363636]
Metrics saved for cycle 1 to results/custom_AL.csv
[0.69333333 0.14285714 0.90944882 0.7       ] [0.67532468 0.07142857 0.95454545 0.53846154] [0.68421053 0.0952381  0.93145161 0.60869565]
Metrics saved for cycle 2 to results/custom_AL.csv
[0.7        0.44444444 0.92369478 0.75      ] [0.72727273 0.28571429 0.95041322 0.46153846] [0.7133758  0.34782609 0.93686354 0.57142857]
Metrics saved for cycle 3 to results/custom_AL.csv
[0.73493976 0.6        0.94262295 1.        ] [0.79220779 0.42857143 0.95041322 0.69230769] [0.7625     0.5        0.94650206 0.81818182]
Metrics saved for cycle 4 to results/custom_AL.csv
[0.73626374 0.5        0.97033898 1.        ] [0.870129