In [11]:
# Plot styling: rcParams is matplotlib's global mapping of default plot settings
from matplotlib import rcParams

# Tick labels on both axes are drawn at size 14, axis labels at size 16
rcParams.update({
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'axes.labelsize': 16,
})

# Import the torch and numpy libraries for numerical computations
import torch
import numpy as np

# Fix the random seeds of both numerical libraries so repeated runs draw the same numbers
seed = 2022
np.random.seed(seed)     # NumPy global random state
torch.manual_seed(seed)  # PyTorch default generator

# Import the logging, glob, os, and pandas libraries for file handling and data manipulation
import logging
import glob
import os
import pandas as pd

# Load the labelled seed set used to start training. The file is parsed as
# tab-separated text even though it carries a .csv extension.
# Alternative source file, kept for reference:
#initial_data = pd.read_csv("initial_train_updated.CSV", delimiter='\t', index_col=False)
initial_data = pd.read_csv("initial_train_smalltext.csv", delimiter='\t', index_col=False)

# Number of distinct values in the 'label' column of the seed set
num_classes = initial_data['label'].nunique()

# Root directory holding the additional CSV files
path = r'./Training Data'

# Collect every *.csv file below `path`, including subdirectories.
# glob returns matches in a filesystem-dependent order, so sort them: the row
# order of `new_data` is then the same on every machine and every run.
all_files = sorted(glob.glob(os.path.join(path, "**/*.csv"), recursive=True))
if not all_files:
    # Without this guard pd.concat fails with the opaque "No objects to concatenate"
    raise FileNotFoundError(f"No CSV files found under {path!r}")

# Read each file with pandas' default delimiter and stack all rows into one
# DataFrame with a fresh 0..n-1 index
new_data = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

In [12]:
# Import the numpy library for numerical computations
import numpy as np
# Import the LABEL_IGNORED and LABEL_UNLABELED constants from the small_text.base module
from small_text.base import LABEL_IGNORED, LABEL_UNLABELED

# Encode the labels of the seed set as integer class ids:
# 'label' becomes a categorical column and 'code' holds each row's category code.
initial_data['label'] = pd.Categorical(initial_data['label'])
initial_data['code'] = initial_data['label'].cat.codes

# Build the unlabeled pool from the 'example' column of new_data.
#   'example' - the value taken from new_data
#   'label'   - the literal string 'LABEL_UNLABELED' as a readable placeholder
#   'code'    - the LABEL_UNLABELED constant imported from small_text.base
# Scalars passed to assign() are broadcast to every row, and reset_index keeps
# the plain 0..n-1 index regardless of how new_data is indexed.
df = (
    new_data[['example']]
    .assign(label='LABEL_UNLABELED', code=LABEL_UNLABELED)
    .reset_index(drop=True)
)

# Stack the labelled rows on top of the pool rows. `keys` adds an outer index
# level ('Initial Training' / 'Twitter Data') that records each row's origin.
input_data = pd.concat([initial_data, df], keys=['Initial Training', 'Twitter Data'])

In [17]:
# Import the TextDataset class from the small_text module.
from small_text import TextDataset

# Class ids handed to the dataset: 0 .. num_classes-1 (num_classes is set in an earlier cell)
target_labels = np.arange(num_classes)
print(target_labels)

# Wrap the data in a small_text TextDataset:
# texts come from the 'example' column, integer labels from the 'code' column.
pool_texts = input_data['example'].tolist()
pool_codes = input_data['code'].to_numpy()
train = TextDataset.from_arrays(pool_texts, pool_codes, target_labels=target_labels)

[0 1 2 3 4 5 6]




In [13]:
from sklearn.metrics import accuracy_score, f1_score
from IPython.display import clear_output

import pickle

num_queries = 1

# Earlier draft of evaluate() that also scored a separate test set (disabled):
#def evaluate(active_learner, train, test):
#    y_pred = active_learner.classifier.predict(train)
#    y_pred_test = active_learner.classifier.predict(test)
#    test_acc = accuracy_score(y_pred_test, test.y)
#    print('Train accuracy: {:.2f}'.format(accuracy_score(y_pred, train.y)))
#    print('Test accuracy: {:.2f}'.format(test_acc))
#    return test_acc

# Restore the previously saved active learner from disk.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
model_path = 'active_learner_cer.pkl'
with open(model_path, 'rb') as file:
    active_learner = pickle.load(file)

def evaluate(active_learner, train_set):
    """Print and return the macro-averaged F1 score of the learner on ``train_set``.

    Args:
        active_learner: Object exposing ``classifier.predict(dataset)``
            (presumably a small_text PoolBasedActiveLearner -- confirm).
        train_set: Dataset passed to ``predict``; its ``y`` attribute supplies
            the reference labels.

    Returns:
        The macro-averaged F1 score computed by ``f1_score``.
    """
    y_pred = active_learner.classifier.predict(train_set)
    y_true = train_set.y

    # Show predictions next to the reference labels for a quick visual check
    print(y_pred)
    print(y_true)

    # scikit-learn's convention is f1_score(y_true, y_pred). Per-class F1 is
    # symmetric in the two arguments, so the macro value is the same in either
    # order, but the conventional order stays correct if `average` ever changes.
    train_f1 = f1_score(y_true, y_pred, average='macro')
    print('Train f1: {:.2f}'.format(train_f1))
    print('---')

    # Hand the score back so it can be collected, e.g. results.append(evaluate(...))
    return train_f1

# Empty list, apparently intended to collect the values returned by evaluate()
results = list()
# Disabled calls from earlier experiments:
#results.append(evaluate(active_learner, train[indices_labeled], test))
#results.append(evaluate(active_learner, train[indices_labeled]))

In [20]:
import pickle
import pandas as pd
import numpy as np
from small_text import TextDataset
from sklearn.metrics import accuracy_score, classification_report

In [21]:
# 1. Load the Model
# Deserialize the saved active learner from its pickle file.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
model_path = 'active_learner_cer.pkl'
with open(model_path, 'rb') as file:
    active_learner = pickle.load(file)

In [22]:
# 2. Load the Test Data
# The labelled test file is parsed as tab-separated text.
#test_data = pd.read_csv("labeled_testdata_smalltext.csv")
test_data = pd.read_csv("labeled_testdata_smalltext.csv", delimiter='\t', index_col=False)
test_examples = test_data["example"].tolist()
true_labels = test_data["label"].tolist()

# Wrap the test texts in a TextDataset. The dataset is given all-zero placeholder
# labels; the real labels stay in `true_labels` and are only used for scoring.
# np.zeros(..., dtype=int) always yields integers, whereas np.zeros_like on a
# Python list would inherit whatever dtype the labels happen to have.
# NOTE: `target_labels` must already be defined by an earlier cell.
test_dataset = TextDataset.from_arrays(
    test_examples,
    np.zeros(len(true_labels), dtype=int),
    target_labels=target_labels)

# Prediction is not repeated here; the "# 3. Predict with the Model" cell
# computes predicted_probas and predicted_labels.



In [24]:
# 3. Predict with the Model
# Get the classifier's probability output for the test set, then take the
# index of the largest value along axis 1 as the predicted label.
classifier = active_learner.classifier
predicted_probas = classifier.predict_proba(test_dataset)
predicted_labels = np.argmax(predicted_probas, axis=1)

In [25]:
# 4. Evaluate the Model
# Compare the predicted labels with the true labels and report the accuracy score
accuracy = accuracy_score(y_true=true_labels, y_pred=predicted_labels)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.97


In [26]:
# Detailed classification report:
# per-class precision, recall, f1-score and support, plus the averaged rows
report = classification_report(y_true=true_labels, y_pred=predicted_labels)
print(report)

              precision    recall  f1-score   support

           0       1.00      0.90      0.95        10
           1       0.96      0.93      0.95        28
           2       0.92      1.00      0.96        23
           3       1.00      1.00      1.00        27
           4       1.00      0.91      0.95        11
           5       1.00      1.00      1.00        24
           6       0.96      0.97      0.97        77

    accuracy                           0.97       200
   macro avg       0.98      0.96      0.97       200
weighted avg       0.97      0.97      0.97       200



In [None]:
from sklearn.metrics import f1_score

