# Install dependencies

In [None]:
!pip install transformers datasets evaluate transformers[torch]

# Import libraries

In [2]:
import re

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, Dataset, DatasetDict, ClassLabel
from huggingface_hub import notebook_login
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

# Configs

In [3]:
# Path of the CSV file that holds the labelled samples
data_file = 'sample_data_for_task1.csv'

# Pre-trained German model; the tokenizer is loaded under the same name
model_name = "dbmdz/bert-base-german-cased"
tokenizer_name = model_name

# Label encoding: class id -> label name, and the inverse mapping derived from it
id2label = dict(enumerate(["ft", "pkg", "ct", "mr", "ch", "cnc"]))
label2id = {label: class_id for class_id, label in id2label.items()}

# Read & Clean CSV

In [5]:
# Load the raw samples (a comma is read_csv's default separator)
# and print ten randomly chosen rows as a quick look at the data.
df = pd.read_csv(data_file)
print(df.sample(n=10))

                                                   text label
9582                             musterbau verpackungen   pkg
25395                              drehteil bearbeitung    mr
37192                            RECYCLING lebensmittel    ft
4535   cnc fräsen metall 5 achs bearbeitung, Schweissen   cnc
14586                                  Pressen retrofit    mr
14105                                    kran gebraucht    ct
10455                                       ethyl ester    ch
20026                             Lippenpflege hyaluron    ch
11371               kunststoff lasern stanzen schneiden    mr
19745                                  Bandkante fräsen    mr


### Clean CSV

In [6]:
# Drop rows whose 'text' is empty or consists only of whitespace (Outliers).
# The mask is computed once, so the reported count always matches the rows removed.
# Missing (NaN) cells compare unequal to '' and are therefore kept, as before.
blank_text_mask = df['text'].str.strip().eq('')
print(f"Number of rows to be dropped due to empty 'text' values: {int(blank_text_mask.sum())}")
df = df[~blank_text_mask]

Number of rows to be dropped due to empty 'text' values: 300


In [8]:
# Drop rows containing Chinese texts '吉祥' (Outliers)
# Pattern for the CJK Unified Ideographs range U+4E00-U+9FFF, compiled once
# instead of on every call.
_CHINESE_CHAR_RE = re.compile(r'[\u4e00-\u9fff]')

def contains_chinese(text):
    """Return True if ``text`` contains at least one Chinese character.

    Args:
        text: Value taken from the 'text' column. Anything that is not a
            string (for example NaN from a missing cell) is reported as
            not containing Chinese instead of raising a TypeError.

    Returns:
        bool: True when a character in the range U+4E00-U+9FFF occurs in
        ``text``, False otherwise.
    """
    if not isinstance(text, str):
        return False
    return _CHINESE_CHAR_RE.search(text) is not None

# Boolean Series marking every row whose 'text' contains Chinese characters
chinese_mask = df['text'].apply(contains_chinese)

# Keep only the rows that are not marked
df = df.loc[~chinese_mask]

In [12]:
# Drop rows having mail addresses in column text '@mail.com' (Outliers)
# regex=False matches '@mail.com' literally (as a regex, '.' would match any character);
# na=False keeps the mask strictly boolean if a 'text' cell is missing.
mail_mask = df['text'].str.contains('@mail.com', regex=False, na=False)
print(f"Number of rows to be dropped due to '@mail.com' values: {int(mail_mask.sum())}")
df = df[~mail_mask]

Number of rows to be dropped due to '@mail.com' values: 300


In [7]:
# Fill empty labels
## 100 rows have an empty 'label' value
## every row with an empty label has a 'text' value containing the word 'drehteile'
## more than 80% of the labeled rows whose 'text' contains 'drehteile' carry the label 'mr'
## so the unlabeled rows are assigned the label 'mr'
df = df.assign(label=df['label'].fillna('mr'))

### Encode labels

In [14]:
# Encode the string labels as integer class ids.
# Series.map yields NaN for every value missing from label2id (this includes
# already-encoded ids if the cell is run a second time), so stop with a clear
# error instead of passing NaN labels on to the split and the trainer.
encoded_labels = df['label'].map(label2id)
unmapped_labels = df.loc[encoded_labels.isna(), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Labels missing from label2id: {list(unmapped_labels)}")
df['label'] = encoded_labels.astype('int64')

# Ingest Dataset

In [15]:
# Split 80/20 with a fixed seed, stratified on the encoded label
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['label']
)

# Wrap each pandas split as a datasets.Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Bundle both splits under the keys used by the rest of the notebook
datasets_dict = DatasetDict({'train': train_dataset, 'test': test_dataset})

def remove_index_column(example):
    """Remove the "__index_level_0__" entry from ``example`` and return it.

    The entry is deleted in place on the mapping that was passed in, exactly
    as before. A mapping that does not carry the key is returned unchanged
    instead of raising a KeyError.

    Args:
        example: Mutable mapping describing one dataset row.

    Returns:
        The same ``example`` object, without the "__index_level_0__" key.
    """
    if "__index_level_0__" in example:
        del example["__index_level_0__"]
    return example

# Run remove_index_column over every example of both splits
datasets_dict = datasets_dict.map(function=remove_index_column)

print(datasets_dict)

Map:   0%|          | 0/29116 [00:00<?, ? examples/s]

Map:   0%|          | 0/7279 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 29116
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7279
    })
})


# PreProcess

In [16]:
# Load the tokenizer published under the configured checkpoint name
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=tokenizer_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/456 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/240k [00:00<?, ?B/s]

In [17]:
def preprocess_text(examples):
  """Tokenize the "text" field of ``examples`` with the notebook's tokenizer.

  Truncation is switched on; no padding is requested at this point.
  """
  texts = examples["text"]
  return tokenizer(texts, truncation=True)

In [18]:
# Tokenize both splits, handing the examples to preprocess_text in batches
tokenized_datasets_dict = datasets_dict.map(function=preprocess_text, batched=True)

Map:   0%|          | 0/29116 [00:00<?, ? examples/s]

Map:   0%|          | 0/7279 [00:00<?, ? examples/s]

In [19]:
# Padding collator built around the tokenizer; it is handed to the Trainer below
data_collator = DataCollatorWithPadding(tokenizer)

# Set-up Evaluation

In [20]:
# Load the "accuracy" metric used by compute_metrics
accuracy = evaluate.load(path="accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [21]:
def compute_metrics(eval_pred):
    """Compute the accuracy metric for one evaluation pass.

    Args:
        eval_pred: Object that unpacks into ``(predictions, labels)``. The
            predicted class of each row is the arg-max of ``predictions``
            along axis 1.

    Returns:
        The result of ``accuracy.compute`` for the arg-max predictions
        against ``labels``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

# Train

In [22]:
# Load the pre-trained checkpoint with a classification head that has one
# output per label, and attach both label mappings to the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
training_args = TrainingArguments(
    # Where checkpoints are written; nothing is pushed automatically
    output_dir="tc",
    push_to_hub=False,
    # Optimisation settings
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and save once per epoch, then reload the best checkpoint.
    # NOTE(review): no metric_for_best_model is given, so "best" presumably
    # follows the library default rather than accuracy - confirm.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# NOTE(review): the 'test' split doubles as the evaluation set during
# training - confirm that no separate validation split is intended.
train_split = tokenized_datasets_dict["train"]
eval_split = tokenized_datasets_dict["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_split,
    eval_dataset=eval_split,
    compute_metrics=compute_metrics,
)

# Fine-tune; the returned TrainOutput is displayed as the cell result
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.4131,0.336171,0.892293
2,0.2452,0.290099,0.911114
3,0.1666,0.296303,0.918945


TrainOutput(global_step=5460, training_loss=0.3145412347255609, metrics={'train_runtime': 526.4881, 'train_samples_per_second': 165.907, 'train_steps_per_second': 10.371, 'total_flos': 637366254747696.0, 'train_loss': 0.3145412347255609, 'epoch': 3.0})

# Publish the model to HuggingFace

In [24]:
# Interactive Hugging Face Hub login (notebook_login is imported in the
# import cell at the top of the notebook with the other dependencies).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [25]:
# Upload the trained model and its training artifacts to the Hugging Face Hub
trainer.push_to_hub()

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

events.out.tfevents.1709781886.1332935de582.621.0:   0%|          | 0.00/8.16k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.86k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/waelChr/tc/commit/fff08fd82d234ce69601cc6b62a8378d19f0dd21', commit_message='End of training', commit_description='', oid='fff08fd82d234ce69601cc6b62a8378d19f0dd21', pr_url=None, pr_revision=None, pr_num=None)