# Install dependencies

In [None]:
! pip install transformers datasets

In [None]:
!pip install transformers[torch]

In [None]:
!pip install transformers datasets evaluate

# Import libraries

In [None]:
import pandas as pd
import numpy as np
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import load_dataset, Dataset, DatasetDict, ClassLabel
import evaluate
import torch
from sklearn.model_selection import train_test_split

# Configs

In [None]:
# Path of the CSV file holding the labelled samples
data_file = 'sample_data_for_task1.csv'

# Pre-trained German model; the tokenizer is loaded from the same checkpoint name
model_name = "dbmdz/bert-base-german-cased"
tokenizer_name = model_name

# Label encoding: class id -> label string, plus the inverse lookup derived from it
# so the two mappings cannot drift apart
id2label = dict(enumerate(("ft", "pkg", "ct", "mr", "ch", "cnc")))
label2id = {label: idx for idx, label in id2label.items()}

# Read & Clean CSV

In [None]:
# Load the comma-separated file named by `data_file` into a DataFrame
df = pd.read_csv(filepath_or_buffer=data_file, sep=',')
# Print ten randomly sampled rows as a quick look at the data
print(df.sample(n=10))

                                                 text label
21676                                    reis vietnam    ft
20392                               weine für premium    ft
18094                         kunststoff bilderrahmen   pkg
33323  kümmel Fenchel Anis extrakt gefriergetrocknete    ft
35858                                  trink flaschen   pkg
7978                   desinfektionsmittel konzentrat    ch
35951                       servicemonteure metallbau    ct
9632                                        PET Dosen   pkg
11278                              Versandtüte papier   pkg
23475                           gitternetz kunststoff   pkg


### Clean CSV

In [None]:
# Drop rows whose 'text' is missing or consists only of whitespace.
# The previous exact comparison with ' ' let empty strings, multi-space strings and
# NaN values through; NaN in particular would break the string handling further down.
blank_text_mask = df['text'].isna() | df['text'].astype(str).str.strip().eq('')
print(f"Number of rows to be dropped due to empty 'text' values: {int(blank_text_mask.sum())}")
# .copy() yields an independent DataFrame, so later column assignments do not write
# into a filtered slice of the original frame.
df = df[~blank_text_mask].copy()

Number of rows to be dropped due to empty 'text' values: 300


In [None]:
# Fill empty labels
## 100 rows have an empty 'label' value
## every one of those rows has a 'text' value containing the word 'drehteile'
## more than 80% of the labeled rows whose 'text' contains 'drehteile' carry the label 'mr'
## so the unlabeled rows are assigned the label 'mr'
missing_label_mask = df['label'].isna()
df.loc[missing_label_mask, 'label'] = 'mr'

In [None]:
# Drop rows containing Chinese text such as '吉祥'

# Compiled once when the cell runs instead of on every call.
# U+4E00..U+9FFF is the Unicode CJK Unified Ideographs block.
_CHINESE_CHARS = re.compile(r'[\u4e00-\u9fff]+')


def contains_chinese(text):
    """Return True when ``text`` contains at least one character in U+4E00..U+9FFF.

    Args:
        text: The value to inspect, normally a string.

    Returns:
        bool: True if a character in the range is found. Non-string values
        (for example NaN from a missing CSV field, or None) return False
        instead of raising ``TypeError``.
    """
    if not isinstance(text, str):
        return False
    return _CHINESE_CHARS.search(text) is not None

# Boolean mask: True for every row whose 'text' contains Chinese characters
chinese_mask = df['text'].apply(contains_chinese)

# Keep only the rows without Chinese text. The explicit .copy() makes `df` an
# independent DataFrame, so assigning a column afterwards no longer triggers
# pandas' SettingWithCopyWarning.
df = df[~chinese_mask].copy()

### Encode labels

In [None]:
# Translate the string labels into integer class ids.
encoded_labels = df['label'].map(label2id)

# .map() yields NaN for every value that is not a key of label2id (an unexpected
# label, or labels already encoded by an earlier run of this cell). Fail loudly
# here instead of handing NaN labels to the train/test split and the model.
unmapped_labels = df.loc[encoded_labels.isna(), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Labels missing from label2id: {list(unmapped_labels)}")

# .assign() builds a new DataFrame rather than writing a column into a filtered
# slice, which is what raised the SettingWithCopyWarning.
df = df.assign(label=encoded_labels.astype('int64'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['label'].map(label2id)


# Ingest Dataset

In [None]:
# Hold out 20% of the rows as the test split, stratified on 'label' and with a
# fixed random_state so the split is repeatable.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# Wrap each pandas split as a Dataset
train_dataset, test_dataset = (Dataset.from_pandas(split) for split in (train_df, test_df))

# Bundle both splits under the keys 'train' and 'test'
datasets_dict = DatasetDict({'train': train_dataset, 'test': test_dataset})

def remove_index_column(example):
    """Remove the "__index_level_0__" field from ``example`` if it is present.

    Args:
        example: A mapping of column name to value for one row; modified in place.

    Returns:
        The same ``example`` object without the "__index_level_0__" key.
        An example that never had the key is returned unchanged instead of
        raising ``KeyError``.
    """
    if "__index_level_0__" in example:
        del example["__index_level_0__"]
    return example

# Run remove_index_column over every example of each split and keep the result
datasets_dict = datasets_dict.map(remove_index_column)

# Show the splits with their features and row counts
print(datasets_dict)

Map:   0%|          | 0/29356 [00:00<?, ? examples/s]

Map:   0%|          | 0/7339 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 29356
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7339
    })
})


# Preprocess

In [None]:
# Load the tokenizer identified by `tokenizer_name` (set to the model name in the config cell)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/456 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/240k [00:00<?, ?B/s]

In [None]:
def preprocess_text(examples):
  """Tokenize the "text" field of ``examples`` with the notebook-level tokenizer.

  Truncation is enabled; no padding is requested here.
  """
  texts = examples["text"]
  encoded = tokenizer(texts, truncation=True)
  return encoded

In [None]:
# Tokenize both splits; batched=True passes batches of examples to preprocess_text
tokenized_datasets_dict = datasets_dict.map(preprocess_text, batched=True)

Map:   0%|          | 0/29356 [00:00<?, ? examples/s]

Map:   0%|          | 0/7339 [00:00<?, ? examples/s]

{'text': 'eimer mit deckel lebensmittel',
 'label': 1,
 'input_ids': [102, 2746, 345, 212, 506, 14564, 17935, 1049, 103],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Collator built from the tokenizer; passed to the Trainer below to assemble padded batches
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Set-up Evaluation

In [None]:
# Load the "accuracy" metric; compute_metrics calls its .compute() method
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_pred):
    """Compute the accuracy metric for one evaluation pass.

    ``eval_pred`` unpacks into (predictions, labels). The predicted class of each
    row is the argmax of the predictions along axis 1, and the result of
    ``accuracy.compute`` is returned as-is.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(references=references, predictions=predicted_classes)

# Train

In [None]:
# Load the pre-trained checkpoint with a sequence-classification head that has one
# output per entry of label2id, and attach both label mappings.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    label2id=label2id,
    id2label=id2label,
    num_labels=len(label2id),
)

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = TrainingArguments(
    # Output location; nothing is pushed to the Hub during training
    output_dir="tc",
    push_to_hub=False,
    # Optimisation hyper-parameters
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and save once per epoch, and load the best model when training ends
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Wire the model, data, tokenizer, collator and metric function into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets_dict["train"],
    eval_dataset=tokenized_datasets_dict["test"],
)

# Run training; as the last expression of the cell, the returned value is displayed
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0586,0.646626,0.907344
2,0.047,0.638914,0.91225
3,0.0724,0.544017,0.917427


TrainOutput(global_step=5505, training_loss=0.057410420932816335, metrics={'train_runtime': 324.83, 'train_samples_per_second': 271.12, 'train_steps_per_second': 16.947, 'total_flos': 642673888681104.0, 'train_loss': 0.057410420932816335, 'epoch': 3.0})

# Publish the model to HuggingFace

In [None]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget; logging in is needed before pushing to the Hub
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the trainer's output to the Hugging Face Hub; the returned value is displayed
trainer.push_to_hub()

optimizer.pt:   0%|          | 0.00/880M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

Upload 8 LFS files:   0%|          | 0/8 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.86k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

events.out.tfevents.1709696261.08f52ca9a3f5.893.7:   0%|          | 0.00/8.46k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.86k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/waelChr/tc/commit/ca406f470443dc7ce3a92da13f9a546fbefc0621', commit_message='End of training', commit_description='', oid='ca406f470443dc7ce3a92da13f9a546fbefc0621', pr_url=None, pr_revision=None, pr_num=None)