## Imports

In [None]:
# Mount Google Drive at /content/drive; the dataset CSV and the training
# checkpoints used later in this notebook are read from / written to paths under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q datasets transformers[sentencepiece] langid watermark
!pip install --upgrade accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m116.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m87.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m22.4 MB/s[0m

In [None]:
# Record the versions of the key libraries so the run can be reproduced later.
%load_ext watermark
%watermark -p torch,datasets,sklearn,transformers,langid

torch       : 2.0.1+cu118
datasets    : 2.12.0
sklearn     : 1.2.2
transformers: 4.29.2
langid      : 1.1.6



In [None]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
import time
from pathlib import Path

#import langid
import torch
from datasets import load_dataset
from sklearn.metrics import f1_score, accuracy_score, classification_report
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    DataCollatorWithPadding, 
    pipeline,
    Trainer,
    TrainingArguments
)

In [None]:
!pip install ipython-autotime --q
import time
%load_ext autotime

time: 344 µs (started: 2023-05-29 09:21:47 +00:00)


## Data Exploration

In [None]:
# Load the language-detection training set from Google Drive and preview it.
lang_csv_path = Path('/content/drive/MyDrive/Kemet/Data/Language_det_train.csv')
lang_df = pd.read_csv(lang_csv_path)
lang_df

Unnamed: 0,Text,Language
0,στη Γαλλία νωρίτερα ραντεβού χρησιμοποιήθηκε α...,Greek
1,e con ciò lei salì nella sua carrozza e senza ...,Italian
2,buna değmez.,Turkish
3,Viktiga skillnader är att i en wiki lagras sid...,Sweedish
4,تعرف على ما إذا كان شخص ما يقول نكتة رائعة يمك...,Arabic
...,...,...
9815,ഇംഗ്ലീഷ് വിക്കിപീഡിയയിൽ പലപ്പോഴും ഭൂരിപക്ഷം ആള...,Malayalam
9816,"Les algorithmes utilisés permettent, dans une ...",French
9817,"تم استخدام مصطلح ""التعلم الآلي"" لأول مرة في عا...",Arabic
9818,De hade under år 2000 försökt starta uppslagsv...,Sweedish


time: 2.44 s (started: 2023-05-29 09:22:01 +00:00)


In [None]:
# Rename the columns to the lowercase `text` / `labels` names that the
# tokenization and label-encoding cells below read.
lang_df = lang_df.rename(columns={'Text': 'text', 'Language': 'labels'})

time: 2.79 ms (started: 2023-05-29 09:22:04 +00:00)


In [None]:
# Number of rows per language, to check how balanced the classes are.
print(lang_df.labels.value_counts())

English       1316
French         963
Spanish        778
Portugeese     702
Italian        663
Russian        657
Sweedish       642
Malayalam      564
Dutch          519
Arabic         509
Turkish        450
German         446
Tamil          446
Danish         407
Kannada        351
Greek          347
Hindi           60
Name: labels, dtype: int64
time: 8.49 ms (started: 2023-05-29 09:22:04 +00:00)


In [None]:
# Map each language name to its two-letter code. The keys must match the
# dataset's spelling exactly (including 'Portugeese' and 'Sweedish').
language_mapping = {
    'Arabic': 'ar',
    'Danish': 'da',
    'Dutch': 'nl',
    'English': 'en',
    'French': 'fr',
    'German': 'de',
    'Greek': 'el',
    'Hindi': 'hi',
    'Italian': 'it',
    'Kannada': 'kn',
    'Malayalam': 'ml',
    'Portugeese': 'pt',
    'Russian': 'ru',
    'Spanish': 'es',
    'Sweedish': 'sv',
    'Tamil': 'ta',
    'Turkish': 'tr',
}

lang_df['labels'] = lang_df['labels'].replace(language_mapping)

# `replace` leaves values without a mapping entry untouched, so an unexpected
# language name would silently become its own class. Fail loudly instead.
# (Re-running this cell is safe: already-converted codes pass the check.)
unmapped = set(lang_df['labels']) - set(language_mapping.values())
assert not unmapped, f"Labels without a language code: {sorted(map(str, unmapped))}"

time: 24.9 ms (started: 2023-05-29 09:22:04 +00:00)


In [None]:
# Hold out 20% of the rows as a test set, stratified by language so every
# class keeps its proportion in both splits; the seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    lang_df["text"],
    lang_df["labels"],
    test_size=0.2,
    random_state=42,
    stratify=lang_df["labels"],
)

# Re-join text and label column-wise and wrap each split in a `datasets.Dataset`.
ds_train = Dataset.from_pandas(pd.concat([x_train, y_train], axis=1))
ds_test = Dataset.from_pandas(pd.concat([x_test, y_test], axis=1))

time: 115 ms (started: 2023-05-29 09:22:04 +00:00)


## Tokenization

In [None]:
# Google Drive directory under which the training output directory is created.
gdrive_dir = Path('/content/drive/MyDrive/Kemet/Checkpoints')
# Identifier of the pretrained language-detection checkpoint that is fine-tuned below.
model_ckpt = "papluca/xlm-roberta-base-language-detection"
# Tokenizer loaded from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

Downloading (…)okenizer_config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

time: 5.84 s (started: 2023-05-29 01:01:18 +00:00)


In [None]:
def tokenize_text(sequence):
    """Run the notebook's tokenizer over the ``"text"`` field of ``sequence``.

    Inputs longer than 128 tokens are truncated; no padding is requested here.
    """
    texts = sequence["text"]
    return tokenizer(texts, truncation=True, max_length=128)

time: 784 µs (started: 2023-05-29 01:01:24 +00:00)


Tokenize all sub-datasets:

In [None]:
# Tokenize both splits; batched=True hands `tokenize_text` a batch of rows per call.
tok_train = ds_train.map(tokenize_text, batched=True)
tok_test = ds_test.map(tokenize_text, batched=True)

Map:   0%|          | 0/7856 [00:00<?, ? examples/s]

Map:   0%|          | 0/1964 [00:00<?, ? examples/s]

time: 1.6 s (started: 2023-05-29 01:01:24 +00:00)


Prepare forward and backward mappings between label strings and integer ids:

In [None]:
# Collect the distinct language codes and sort them so the integer ids are
# assigned in a stable, alphabetical order.
languages = lang_df.labels.values
all_langs = sorted(set(languages))

# id -> code and code -> id lookups, both derived from the same ordering.
id2label = dict(enumerate(all_langs))
label2id = {lang: idx for idx, lang in enumerate(all_langs)}
label2id

{'ar': 0,
 'da': 1,
 'de': 2,
 'el': 3,
 'en': 4,
 'es': 5,
 'fr': 6,
 'hi': 7,
 'it': 8,
 'kn': 9,
 'ml': 10,
 'nl': 11,
 'pt': 12,
 'ru': 13,
 'sv': 14,
 'ta': 15,
 'tr': 16}

time: 5.1 ms (started: 2023-05-29 01:01:26 +00:00)


In [None]:
def encode_labels(example):
    """Replace the string label of ``example`` with its integer id.

    The id is looked up in the notebook-level ``label2id`` mapping. The
    example is updated in place and returned.
    """
    label_id = label2id[example["labels"]]
    example["labels"] = label_id
    return example

time: 312 µs (started: 2023-05-29 01:01:26 +00:00)


Encode targets:

In [None]:
# Convert the string labels of both splits to integer ids. `encode_labels`
# works on one example at a time, hence batched=False.
tok_train = tok_train.map(encode_labels, batched=False)
tok_test = tok_test.map(encode_labels, batched=False)

Map:   0%|          | 0/7856 [00:00<?, ? examples/s]

Map:   0%|          | 0/1964 [00:00<?, ? examples/s]

time: 769 ms (started: 2023-05-29 01:01:26 +00:00)


In [None]:
# Use dynamic padding: `tokenize_text` does not pad, so padding is left to
# this collator when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

time: 512 µs (started: 2023-05-29 01:01:27 +00:00)


## Model training

In [None]:
# Load the pretrained classifier and size its output layer for our label set.
# The checkpoint's classification head has a different number of outputs than
# `len(all_langs)`, so `ignore_mismatched_sizes=True` is needed for loading to
# succeed; the mismatched head weights are newly initialized and must be trained.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(all_langs),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at papluca/xlm-roberta-base-language-detection and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([20, 768]) in the checkpoint and torch.Size([17, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([20]) in the checkpoint and torch.Size([17]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


time: 8.14 s (started: 2023-05-29 01:02:01 +00:00)


We define here the metrics that we're going to monitor during training:

In [None]:
def compute_metrics(pred):
    """Score one evaluation pass with accuracy and weighted F1.

    ``pred`` must expose ``label_ids`` (the reference class ids) and
    ``predictions``; the predicted class is the argmax over the last axis
    of ``pred.predictions``.

    Returns a dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

time: 458 µs (started: 2023-05-29 01:02:09 +00:00)


To train our model, we'll use the HF `Trainer`. The first step is to create an instance of the `TrainingArguments` class, which will contain all the hyperparameters the `Trainer` will use for training and evaluation.

In [None]:
# Hyperparameters
epochs = 2
lr = 2e-5
train_bs = 64
eval_bs = 2 * train_bs

# Log the training loss and save a checkpoint roughly once per epoch.
# NOTE: this is a floor division, so a trailing partial batch is not counted
# and the interval can be one step shorter than a real epoch. Checkpoint
# directories are therefore named after multiples of this value (e.g. the
# `checkpoint-244` directory loaded in the Prediction section).
logging_steps = save_steps = len(tok_train) // train_bs

# Output directory (on Google Drive) for checkpoints
output_dir = gdrive_dir / "xlm-roberta-base-finetuned-language-detection"

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=epochs,
    learning_rate=lr,
    per_device_train_batch_size=train_bs,
    per_device_eval_batch_size=eval_bs,
    evaluation_strategy="epoch",
    logging_steps=logging_steps,
    save_steps=save_steps,
    fp16=True,  # Remove if GPU doesn't support it
)

time: 90 ms (started: 2023-05-29 01:02:09 +00:00)


Then, we can instantiate the `Trainer`:

In [None]:
# Wire the model, hyperparameters, data splits, collator and metric function
# together. The test split doubles as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tok_train,
    eval_dataset=tok_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

time: 7.79 s (started: 2023-05-29 01:02:09 +00:00)


Let's train the model!

In [None]:
# Run fine-tuning; with evaluation_strategy="epoch" the metrics from
# `compute_metrics` are reported at the end of every epoch.
trainer.train()

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.9285,0.175624,0.982688,0.979765
2,0.1704,0.101748,0.989817,0.989826


TrainOutput(global_step=246, training_loss=0.5459021973173793, metrics={'train_runtime': 152.0558, 'train_samples_per_second': 103.33, 'train_steps_per_second': 1.618, 'total_flos': 853688333053920.0, 'train_loss': 0.5459021973173793, 'epoch': 2.0})

time: 2min 32s (started: 2023-05-29 01:02:17 +00:00)


## Prediction

In [None]:
# Run inference on the first GPU when one is available, otherwise on the CPU (-1).
device = 0 if torch.cuda.is_available() else -1

# Fine-tuned checkpoint saved on Google Drive by the training section. The path
# is spelled out in full so this section does not need variables from the
# training cells.
checkpoint_path = '/content/drive/MyDrive/Kemet/Checkpoints/xlm-roberta-base-finetuned-language-detection/checkpoint-244'

pipe = pipeline(
    "text-classification",
    model=checkpoint_path,
    tokenizer='papluca/xlm-roberta-base-language-detection',
    device=device,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Xformers is not installed correctly. If you want to use memorry_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


time: 30.8 s (started: 2023-05-29 09:22:29 +00:00)


In [None]:
# Convert the test split to a pandas DataFrame for the evaluation cells below.
# The guard keeps this cell re-runnable: once `ds_test` is a DataFrame it no
# longer has a `to_pandas` method, so an unconditional call would raise.
if not isinstance(ds_test, pd.DataFrame):
    ds_test = ds_test.to_pandas()
ds_test.head(3)

Unnamed: 0,text,labels,__index_level_0__
0,അതായത് ഏതെങ്കിലും വിഷയത്തെപ്പറ്റി ലേഖനമെഴുതാനു...,ml,5535
1,Neredeyse bir kuşunki gibi küçük gözleri vardı...,tr,8883
2,where did you get the news like how do you.,en,5565


time: 36.6 ms (started: 2023-05-29 09:22:19 +00:00)


In [None]:
# Classify every test text in one pipeline call and report the wall-clock time.
start_time = time.perf_counter()
raw_outputs = pipe(ds_test.text.values.tolist(), truncation=True, max_length=128)
# Keep only the predicted label of each result.
model_preds = [output['label'] for output in raw_outputs]
print(f"{time.perf_counter() - start_time:.2f} seconds")

34.77 seconds
time: 34.8 s (started: 2023-05-29 09:23:08 +00:00)


In [None]:
# Sanity check: list the distinct language codes present in the test split.
ds_test.labels.unique()

array(['ml', 'tr', 'en', 'el', 'nl', 'fr', 'sv', 'ar', 'de', 'pt', 'es',
       'it', 'ta', 'kn', 'da', 'ru', 'hi'], dtype=object)

time: 3.77 ms (started: 2023-05-29 09:23:49 +00:00)


Classification report for the model

In [None]:
# Per-language precision, recall and F1 of the pipeline predictions against
# the true labels of the test split, printed with three decimals.
print(classification_report(ds_test.labels.values.tolist(), model_preds, digits=3))

              precision    recall  f1-score   support

          ar      1.000     0.980     0.990       102
          da      0.975     0.963     0.969        81
          de      0.989     0.989     0.989        89
          el      1.000     1.000     1.000        69
          en      0.985     0.992     0.989       263
          es      0.956     0.981     0.968       156
          fr      0.990     1.000     0.995       193
          hi      1.000     1.000     1.000        12
          it      0.992     0.970     0.981       133
          kn      1.000     1.000     1.000        70
          ml      1.000     1.000     1.000       113
          nl      0.990     1.000     0.995       104
          pt      0.986     0.986     0.986       140
          ru      1.000     1.000     1.000       132
          sv      1.000     0.984     0.992       128
          ta      1.000     1.000     1.000        89
          tr      1.000     0.989     0.994        90

    accuracy              

In [None]:
def predict_language(text, pipe):
    """Predict the language label of one text, or of every text in a list.

    Args:
        text: A single string, or a list of strings.
        pipe: A callable text-classification pipeline. It is invoked as
            ``pipe(text, truncation=True, max_length=128)`` and is expected
            to return a list of dicts that each carry a ``'label'`` key.

    Returns:
        The predicted label when ``text`` is a single string. When ``text``
        is a list, a list with one label per input, in the same order
        (an empty list yields an empty list without calling ``pipe``).
    """
    if isinstance(text, list):
        if not text:
            # Nothing to classify; skip the pipeline call on an empty batch.
            return []
        batch_result = pipe(text, truncation=True, max_length=128)
        # Previously only the first prediction was returned for a batch,
        # silently dropping the rest.
        return [prediction['label'] for prediction in batch_result]

    result = pipe(text, truncation=True, max_length=128)
    return result[0]['label']

time: 477 µs (started: 2023-05-29 09:24:15 +00:00)


In [None]:
# Sample input for a quick single-text check of `predict_language`.
text = " Hello world ! "

time: 398 µs (started: 2023-05-29 09:24:05 +00:00)


In [None]:
# Classify the sample text with the fine-tuned pipeline and show the result.
language = predict_language(text, pipe)
print(f"The language is: {language}")

The language is: en
time: 20.9 ms (started: 2023-05-29 09:24:18 +00:00)
