In [1]:
#pip install small-text[transformers]

In [2]:
#pip install setfit

In [3]:
#pip install hnswlib

In [4]:
#pip install openpyxl

In [5]:
#!/bin/bash

In [4]:
# --- Imports: standard library first, then third-party packages ---
import glob
import logging
import os

import numpy as np
import pandas as pd
import torch
from matplotlib import rcParams

# Plot appearance: larger tick labels and axis labels.
rcParams.update({'axes.labelsize': 16, 'xtick.labelsize': 14, 'ytick.labelsize': 14})

# Seed both NumPy and PyTorch so that random draws are repeatable.
seed = 2022
np.random.seed(seed)
torch.manual_seed(seed)

# Load the initially labelled training set. Despite the .csv extension the
# file is tab-separated.
#initial_data = pd.read_csv("initial_train_updated.CSV", delimiter='\t', index_col=False)
initial_data = pd.read_csv("initial_train_smalltext.csv", delimiter='\t', index_col=False)
print(initial_data)

# Number of distinct values in the 'label' column of the initial training set.
num_classes = initial_data['label'].nunique()

print("----------------------------")

# Directory that holds the additional CSV files.
path = r'./Training Data'

# Collect every *.csv file below `path`, including nested sub-directories.
all_files = glob.glob(os.path.join(path, "**/*.csv"), recursive=True)
if not all_files:
    # Fail with a clear message; pd.concat would otherwise raise the much
    # less helpful "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found below {path!r}")

# Stack all files into one frame with a fresh 0..n-1 index.
new_data = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)
print(new_data)

print("----------------------------")

# Preview the first two columns of (up to) the first two training rows.
print('First 2 training samples:\n')
for i in range(min(2, len(initial_data))):
    print(initial_data.iloc[i, 0], ' ', initial_data.iloc[i, 1])

# Preview (up to) the first two new rows. Only the first column is shown:
# the previous version printed initial_data's second column next to it,
# which belongs to unrelated rows of a different frame.
print('First 2 future samples:\n')
for i in range(min(2, len(new_data))):
    print(new_data.iloc[i, 0])

print("----------------------------")

# Show the 'example' column of the new data.
print(new_data.example)

                                              example  \
0   hier twittert das Team von Petra Pau/Marzahn-H...   
1   Die soziale Opposition: Feministisch. Sozialis...   
2   MdB DIE LINKE l Stellvertretende Fraktionsvors...   
3   MdB für die Menschen in der Städteregion Aache...   
4   Mitglied des Bundestages | Sprecherin für Arbe...   
..                                                ...   
65  Mitglied des Deutschen Bundestages | Auswärtig...   
66  Ostwestfale seit Geburt! Zur Zeit aktiv in MV,...   
67              Nein.  Nicht F. Jacobi,  ein anderer!   
68  Dipl.-Ing.(BA) Mechatronik | industrial automa...   
69    🌈 Für die Menschen und Twitter-Bots "ganz Ohr".   

                            label  
0   0 ökonomisch links + libertär  
1   0 ökonomisch links + libertär  
2   0 ökonomisch links + libertär  
3   0 ökonomisch links + libertär  
4   0 ökonomisch links + libertär  
..                            ...  
65              6 keine Kategorie  
66              6 keine Kat

In [6]:
# Import the numpy library for numerical computations
import numpy as np
# Import the LABEL_IGNORED and LABEL_UNLABELED constants from the small_text.base module
from small_text.base import LABEL_IGNORED, LABEL_UNLABELED

# Turn the string labels into a categorical column and keep each row's
# integer category code next to it in a new 'code' column.
initial_data['label'] = initial_data['label'].astype('category')
initial_data['code'] = initial_data['label'].cat.codes

print(initial_data)
print("--------")

# Wrap the new data in a frame with the same three columns as initial_data:
#   'example' - the values of new_data's 'example' column
#   'label'   - the literal string 'LABEL_UNLABELED' for every row
#   'code'    - the value of small_text's LABEL_UNLABELED constant for every row
n_unlabeled = len(new_data)
df = pd.DataFrame({
    'example': new_data['example'].tolist(),
    'label': ['LABEL_UNLABELED'] * n_unlabeled,
    'code': [LABEL_UNLABELED] * n_unlabeled,
})

print(df)
print("--------")

# Stack both frames row-wise; `keys` adds an outer index level that records
# which frame each row came from.
input_data = pd.concat(
    [initial_data, df],
    keys=['Initial Training', 'Twitter Data'],
)

print(input_data)

                                              example  \
0   hier twittert das Team von Petra Pau/Marzahn-H...   
1   Die soziale Opposition: Feministisch. Sozialis...   
2   MdB DIE LINKE l Stellvertretende Fraktionsvors...   
3   MdB für die Menschen in der Städteregion Aache...   
4   Mitglied des Bundestages | Sprecherin für Arbe...   
..                                                ...   
65  Mitglied des Deutschen Bundestages | Auswärtig...   
66  Ostwestfale seit Geburt! Zur Zeit aktiv in MV,...   
67              Nein.  Nicht F. Jacobi,  ein anderer!   
68  Dipl.-Ing.(BA) Mechatronik | industrial automa...   
69    🌈 Für die Menschen und Twitter-Bots "ganz Ohr".   

                            label  code  
0   0 ökonomisch links + libertär     0  
1   0 ökonomisch links + libertär     0  
2   0 ökonomisch links + libertär     0  
3   0 ökonomisch links + libertär     0  
4   0 ökonomisch links + libertär     0  
..                            ...   ...  
65              6 kei

In [10]:
from sklearn.metrics import accuracy_score, f1_score
from IPython.display import clear_output

import pickle

# NOTE(review): num_queries is not referenced by any cell visible here -- confirm it is still needed.
num_queries = 1

# Earlier train/test accuracy variant of evaluate(), kept for reference:
# def evaluate(active_learner, train, test):
#     y_pred = active_learner.classifier.predict(train)
#     y_pred_test = active_learner.classifier.predict(test)
#     test_acc = accuracy_score(y_pred_test, test.y)
#     print('Train accuracy: {:.2f}'.format(accuracy_score(y_pred, train.y)))
#     print('Test accuracy: {:.2f}'.format(test_acc))
#     return test_acc

# Restore the previously saved active learner from disk.
# NOTE: unpickling can execute arbitrary code -- only load files you created yourself.
with open('active_learner_cer.pkl', 'rb') as pickle_file:
    active_learner = pickle.load(pickle_file)

def evaluate(active_learner, train_set):
    """Print and return the macro-averaged F1 score of the learner on ``train_set``.

    Args:
        active_learner: Object exposing ``classifier.predict(dataset)``
            (described in this notebook as a pool-based active learner).
        train_set: Labelled dataset; the true labels are read from
            ``train_set.y``.

    Returns:
        The macro F1 score computed by ``f1_score``. Earlier versions only
        printed the score and returned ``None``, so callers doing
        ``results.append(evaluate(...))`` collected nothing useful.
    """
    y_pred = active_learner.classifier.predict(train_set)
    y_true = train_set.y

    # Show predictions and true labels for a quick visual comparison.
    print(y_pred)
    print(y_true)

    # scikit-learn's signature is f1_score(y_true, y_pred); pass the arrays
    # in that order so the call matches the documented contract.
    train_f1 = f1_score(y_true, y_pred, average='macro')
    print('Train f1: {:.2f}'.format(train_f1))
    print('---')
    return train_f1

# Collects the values returned by evaluate() calls.
results = []
# Disabled calls kept for reference:
# results.append(evaluate(active_learner, train[indices_labeled], test))
# results.append(evaluate(active_learner, train[indices_labeled]))

In [17]:
# 3: import of xlsx with true labels (repeat)
import numpy as np
import pickle 
import pandas as pd

# Labelling round being imported. It is also used as the suffix of the Excel
# sheet name below, so only this number needs to change between rounds.
# (Previously the progress message printed `i`, a loop variable left over
# from an unrelated cell.)
iteration = 5
labels_sheet = f'predicted_labels_cer_{iteration}'

# Restore the saved active learner.
# NOTE: unpickling can execute arbitrary code -- only load files you created yourself.
with open('active_learner_cer.pkl', 'rb') as file:
    active_learner = pickle.load(file)

train = active_learner.dataset

indices_labeled = active_learner.indices_labeled

results = []

# NOTE(review): this queries the pool again after reloading the learner. The
# labels imported below only line up if this call returns the same instances,
# in the same order, as the ones that were exported for labelling -- confirm
# that the query strategy is deterministic.
indices_queried = active_learner.query(num_samples=70)
probas = active_learner.classifier.predict_proba(train[indices_queried])
y_predicted = np.argmax(probas, axis=1)

# Read the manually assigned labels from the 'true label' column.
df = pd.read_excel('CER_ClassificationProcess.xlsx', sheet_name=labels_sheet, usecols=['true label'])
true_labels = df['true label']

# Empty cells would be read as NaN and silently cast to meaningless integers.
if true_labels.isna().any():
    raise ValueError(
        f"Sheet {labels_sheet!r} has {int(true_labels.isna().sum())} empty 'true label' cell(s)")
if len(true_labels) != len(indices_queried):
    raise ValueError(
        f"Sheet {labels_sheet!r} has {len(true_labels)} labels "
        f"but {len(indices_queried)} instances were queried")

y = np.asarray(true_labels.tolist()).astype("int16")

# Return the labels for the current query to the active learner.
active_learner.update(y)

# Write the new labels into the dataset's label array as well.
train_set_y = train.y
train_set_y[indices_queried] = y
train.y = train_set_y

indices_labeled = np.concatenate([indices_queried, indices_labeled])

print(y)

print('---------------')
print(f'Iteration #{iteration} ({len(indices_labeled)} samples)')
#results.append(evaluate(active_learner, train[indices_labeled], test))
results.append(evaluate(active_learner, train[indices_labeled]))

# Persist the updated learner for the next step.
with open('active_learner_cer.pkl', 'wb') as file:
    pickle.dump(active_learner, file)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 16800
  Num epochs = 1
  Total optimization steps = 525
  Total train batch size = 32


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/525 [00:00<?, ?it/s]

[3 6 2 6 3 3 5 3 5 2 0 1 2 6 3 5 5 5 6 3 2 6 6 5 0 5 5 3 5 5 3 6 1 5 2 6 5
 5 6 5 5 5 6 5 3 6 5 5 5 0 6 5 4 3 6 1 4 5 2 6 5 3 6 1 5 5 6 3 5 3]
---------------
Iteration #1 (420 samples)
[3 6 2 6 3 3 5 3 5 2 0 1 2 6 3 5 5 5 6 3 2 6 6 5 0 5 5 3 5 5 3 6 1 5 2 6 5
 5 6 5 5 5 6 5 3 6 5 5 5 0 6 5 4 3 6 1 4 5 2 6 5 3 6 1 5 5 6 3 5 3 0 0 0 0
 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 2 2 2 5 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 4
 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 6 6 6 6 6 6 6 6 6 6 6 6 3 1 1 6 6 6
 1 1 0 6 6 1 0 6 3 1 6 3 6 3 3 1 1 6 2 6 2 1 1 5 5 1 0 1 0 6 6 6 6 6 3 1 0
 3 2 6 6 6 5 6 4 1 3 4 6 6 6 2 2 6 1 6 2 3 3 6 6 2 3 0 4 2 2 6 4 2 2 6 6 3
 2 2 2 3 0 0 2 2 6 0 1 5 4 0 0 2 0 0 0 2 3 4 6 3 0 3 2 4 3 6 2 2 3 4 2 0 6
 2 0 3 2 0 2 4 6 6 4 1 2 3 6 0 2 2 2 0 3 4 1 6 6 2 1 0 3 1 2 5 1 1 5 1 1 2
 6 3 2 1 6 2 3 5 2 6 1 1 5 2 6 1 6 6 0 6 4 5 5 6 6 1 6 2 3 1 1 5 2 2 0 6 5
 2 6 0 1 5 6 6 4 6 6 2 3 6 2 6 5 1 6 6 5 6 6 6 6 3 1 6 2 6 2 4 4 0 6 2 2 6
 1 3 1 6 6 4 2 1 0 3 6 0 4 2 3 6 1 3 3 1 3 3 5 3 5 3 3 5 3 3 1 3

In [16]:
# 4: second & following predictions
import csv
import pickle

# Imported here so the cell also runs when the first cell has not been executed.
import os

import numpy as np

# Restore the saved active learner.
# NOTE: unpickling can execute arbitrary code -- only load files you created yourself.
with open('active_learner_cer.pkl', 'rb') as file:
    active_learner = pickle.load(file)

# NOTE(review): these set attributes on the wrapped model through the private
# `_clf` attribute; whether the model honours them depends on its type -- confirm.
#active_learner._clf_factory.kwargs['max_iter'] = 2000
active_learner._clf.model.max_iter = 2000
active_learner._clf.model.class_weight = 'balanced'

train = active_learner.dataset

indices_labeled = active_learner.indices_labeled

results = []

# Ask the learner for the next instances and predict a label for each one.
indices_queried = active_learner.query(num_samples=70)
probas = active_learner.classifier.predict_proba(train[indices_queried])
y_predicted = np.argmax(probas, axis=1)

print(y_predicted)

# Write the queried texts and predicted labels to a CSV file; the
# 'true label' column is left empty for manual annotation.
predictions_path = 'Predicted Labels/predicted_labels_cer_5.csv'
# The target folder may not exist yet on a fresh checkout.
os.makedirs(os.path.dirname(predictions_path), exist_ok=True)
# Explicit UTF-8: the texts contain umlauts and emoji, which the platform
# default encoding cannot always represent.
with open(predictions_path, mode='w', newline='', encoding='utf-8') as file:
    fieldnames = ['text', 'prediction', 'true label']
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    texts = train.x  # read once instead of on every row
    for idx, prediction in zip(indices_queried, y_predicted):
        writer.writerow({'text': texts[idx], 'prediction': prediction, 'true label': ""})

# Ask the user to continue
input("Press Enter to continue...")
#Simulate user interaction here. Replace this for real-world usage.
#y = train.y[indices_queried]

# Persist the learner, including the state of the query just made.
with open('active_learner_cer.pkl', 'wb') as file:
    pickle.dump(active_learner, file)

[3 3 6 6 3 3 5 3 5 2 6 6 6 6 3 5 5 5 6 3 2 6 6 5 0 5 5 3 5 5 3 6 1 5 2 6 5
 5 6 5 6 5 6 5 3 6 5 5 5 0 2 5 4 3 3 1 4 5 6 6 5 3 6 1 5 5 6 3 5 3]
[]


Press Enter to continue... 


In [6]:
from sampling import _get_class_histogram

# Histogram over active_learner.y with 16 bins, computed by the helper
# imported from `sampling`; the bare name on the last line displays it.
class_histogram = _get_class_histogram(active_learner.y, num_classes=16)
class_histogram

array([ 102,   62,   24,   12,   78,  127,   73,  106,  221,  144,   29,
         29,   17,   25,   26, 1325])