#### Environment: Kaggle GPU
#### Training time: about 10 min

In [None]:
# Show the GPU attached to this session (model, driver/CUDA version, memory in use).
!nvidia-smi

Fri Feb 18 17:09:10 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.119.04   Driver Version: 450.119.04   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip -qq install -U pip
!pip -qq install ktrain
!pip install -q tensorflow==2.4.1

In [None]:
import numpy as np 
import pandas as pd
import random
import os
import re
import ktrain
from ktrain import text
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from transformers import TFAutoModelForSequenceClassification
from pathlib import Path

import warnings
warnings.filterwarnings('ignore')

In [None]:
SEED = 2022


def set_seeds(seed=SEED):
    """Seed every random number generator this notebook relies on.

    Exports ``PYTHONHASHSEED`` and seeds Python's ``random`` module,
    TensorFlow and NumPy with the same value.

    Args:
        seed: Integer seed; defaults to the notebook-wide ``SEED``.
    """
    # The environment variable is stored as a string.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same order as before: Python, TensorFlow, NumPy.
    for seeder in (random.seed, tf.random.set_seed, np.random.seed):
        seeder(seed)

def set_global_determinism(seed=SEED):
    """Seed all RNGs and configure TensorFlow for repeatable runs.

    Calls ``set_seeds``, sets the ``TF_DETERMINISTIC_OPS`` and
    ``TF_CUDNN_DETERMINISTIC`` environment variables to ``'1'`` and limits
    TensorFlow's inter-op and intra-op thread pools to one thread each.

    Args:
        seed: Integer seed forwarded to ``set_seeds``; defaults to ``SEED``.
    """
    set_seeds(seed=seed)

    # Environment flags intended to make TensorFlow pick deterministic kernels.
    os.environ.update({
        'TF_DETERMINISTIC_OPS': '1',
        'TF_CUDNN_DETERMINISTIC': '1',
    })

    # One thread per pool, inter-op first and then intra-op, as in the original order.
    threading_config = tf.config.threading
    threading_config.set_inter_op_parallelism_threads(1)
    threading_config.set_intra_op_parallelism_threads(1)


# Apply the deterministic setup once, before any model is built.
set_global_determinism(seed=SEED)

In [None]:
# Input data folder and writable output folder.
# Defaults are the Kaggle locations; set SWAHILI_DATA_DIR / SWAHILI_WORK_DIR to
# run the notebook elsewhere without editing the code.
path = Path(os.environ.get('SWAHILI_DATA_DIR', '/kaggle/input/swahili-sents'))
wrk_path = Path(os.environ.get('SWAHILI_WORK_DIR', '/kaggle/working'))

In [None]:
# Load the labelled training tweets and the unlabelled test tweets.
train = pd.read_csv(path / 'Train.csv')
test = pd.read_csv(path / 'Test.csv')

# Fail fast if a column used by later cells is missing, instead of hitting a
# KeyError halfway through preprocessing or after training.
for csv_name, frame, required in (
    ('Train.csv', train, {'Tweets', 'Labels'}),
    ('Test.csv', test, {'ID', 'Tweets'}),
):
    missing = required - set(frame.columns)
    assert not missing, f'{csv_name} is missing expected column(s): {sorted(missing)}'

In [None]:
# Raw-string patterns: '\d' and '\s' inside a normal string literal are invalid
# escape sequences and trigger a Deprecation/SyntaxWarning on recent Pythons.
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_tweet(tweet):
    """Normalise one tweet for the tokenizer.

    Removes every run of digits, lower-cases the text, collapses each run of
    whitespace into a single space and strips leading/trailing whitespace.

    Args:
        tweet: Raw tweet text (``str``).

    Returns:
        The cleaned tweet as a ``str``.
    """
    without_digits = _DIGITS_RE.sub('', tweet).lower()
    return _WHITESPACE_RE.sub(' ', without_digits).strip()


# Same cleaning for both splits so train and test text are processed identically.
train['Tweets'] = train['Tweets'].apply(clean_tweet)
test['Tweets'] = test['Tweets'].apply(clean_tweet)

In [None]:
# Encode the raw label values as consecutive integer class ids.
# `le` keeps the fitted mapping (le.classes_).
# NOTE: this overwrites train['Labels'] in place; re-running the cell would
# re-fit the encoder on the already-encoded ids.
le = LabelEncoder()
le.fit(train['Labels'])
train['Labels'] = le.transform(train['Labels'])

In [None]:
# Distinct encoded label values in ascending order.
CLASS_NAMES = sorted(set(train['Labels'].tolist()))

In [None]:
# --- Fine-tuning configuration ---------------------------------------------
MODEL_NAME = 'Davlan/xlm-roberta-base-finetuned-swahili'  # Hugging Face checkpoint id
MAX_LEN = 256      # passed to the preprocessor as `maxlen`
BATCH_SIZE = 16    # used by both the preprocessor and the learner
FOLDS = 3          # number of stratified CV splits
LR = 3e-5          # learning rate passed to learner.fit
EPOCHS = 2         # passed to learner.fit as n_cycles
# NOTE(review): the preprocessing logs report a 99th-percentile length of about
# 40 tokens, so MAX_LEN may be larger than needed - confirm before changing.

# Distinct encoded label values in ascending order.
CLASS_NAMES = sorted(set(train['Labels'].tolist()))

# ktrain preprocessor wrapper around the Hugging Face checkpoint.
t = text.Transformer(
    model_name=MODEL_NAME,
    maxlen=MAX_LEN,
    class_names=CLASS_NAMES,
    batch_size=BATCH_SIZE,
)
# Manually flag the preprocessor as if preprocess_train had already run.
# NOTE(review): presumably this silences a ktrain check - confirm against the ktrain source.
t.preprocess_train_called = True

Downloading:   0%|          | 0.00/683 [00:00<?, ?B/s]

In [None]:
# Test tweets as a NumPy array; the same array is scored by every fold's model.
test_data = np.asarray(test.Tweets)

# Shuffled, seeded stratified K-fold split over the encoded labels.
folds = StratifiedKFold(n_splits=FOLDS, random_state=SEED, shuffle=True)

oof_preds = []        # one array of test-set class probabilities per fold
oof_loss_score = []   # held-out accuracy of each fold after the last epoch
                      # (name kept because later cells read it)

for train_index, val_index in folds.split(train.Tweets, train.Labels):
    # split() yields positional indices, so select rows with .iloc; label-based
    # .loc is only equivalent while `train` keeps its default RangeIndex.
    X_train = train['Tweets'].iloc[train_index].tolist()
    X_val = train['Tweets'].iloc[val_index].tolist()
    y_train = train['Labels'].iloc[train_index].to_numpy()
    y_val = train['Labels'].iloc[val_index].to_numpy()

    train_set = t.preprocess_train(X_train, y_train)
    val_set = t.preprocess_test(X_val, y_val)

    # Build the classifier and learner for this fold.
    model = t.get_classifier()
    learner = ktrain.get_learner(model, train_data=train_set, val_data=val_set, batch_size=BATCH_SIZE)

    history = learner.fit(LR, n_cycles=EPOCHS, checkpoint_folder=wrk_path/'tmp')
    learner.validate(class_names=t.get_classes())

    # Record accuracy on the held-out fold. The previous key 'accuracy' is the
    # *training* accuracy, which overstated the cross-validated score.
    oof_loss_score.append(history.history['val_accuracy'][-1])

    # Class probabilities for the test set from this fold's model.
    preds = ktrain.get_predictor(learner.model, preproc=t).predict(test_data, return_proba=True)

    oof_preds.append(preds)

preprocessing train...
language: sw
train sequence lengths:
	mean : 16
	95percentile : 33
	99percentile : 39


Downloading:   0%|          | 0.00/356 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Is Multi-Label? False
preprocessing test...
language: sw
test sequence lengths:
	mean : 16
	95percentile : 34
	99percentile : 40


404 Client Error: Not Found for url: https://huggingface.co/Davlan/xlm-roberta-base-finetuned-swahili/resolve/main/tf_model.h5


Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Epoch 1/2
Epoch 2/2
              precision    recall  f1-score   support

           0       0.60      0.23      0.33        80
           1       0.69      0.68      0.68       447
           2       0.50      0.61      0.55       228

    accuracy                           0.61       755
   macro avg       0.59      0.51      0.52       755
weighted avg       0.62      0.61      0.60       755

preprocessing train...
language: sw
train sequence lengths:
	mean : 16
	95percentile : 34
	99percentile : 39


Is Multi-Label? False
preprocessing test...
language: sw
test sequence lengths:
	mean : 16
	95percentile : 32
	99percentile : 39


404 Client Error: Not Found for url: https://huggingface.co/Davlan/xlm-roberta-base-finetuned-swahili/resolve/main/tf_model.h5


Epoch 1/2
Epoch 2/2
              precision    recall  f1-score   support

           0       0.61      0.29      0.39        79
           1       0.67      0.85      0.75       447
           2       0.61      0.40      0.48       228

    accuracy                           0.66       754
   macro avg       0.63      0.51      0.54       754
weighted avg       0.64      0.66      0.63       754

preprocessing train...
language: sw
train sequence lengths:
	mean : 16
	95percentile : 33
	99percentile : 40


Is Multi-Label? False
preprocessing test...
language: sw
test sequence lengths:
	mean : 16
	95percentile : 35
	99percentile : 39


404 Client Error: Not Found for url: https://huggingface.co/Davlan/xlm-roberta-base-finetuned-swahili/resolve/main/tf_model.h5


Epoch 1/2
Epoch 2/2
              precision    recall  f1-score   support

           0       0.59      0.59      0.59        80
           1       0.74      0.80      0.77       446
           2       0.65      0.54      0.59       228

    accuracy                           0.70       754
   macro avg       0.66      0.64      0.65       754
weighted avg       0.70      0.70      0.70       754

CPU times: user 5min 32s, sys: 49.1 s, total: 6min 21s
Wall time: 9min 6s


In [None]:
# Average of the per-fold accuracy values collected in the training loop.
mean_accuracy = np.mean(oof_loss_score)
print(f'Mean Accuracy: {mean_accuracy}')

# Average the test-set class probabilities over the folds; one column per class.
fold_averaged_probs = np.mean(oof_preds, axis=0)
sub = pd.DataFrame(fold_averaged_probs, columns=t.get_classes())

Mean Accuracy: 0.6924445033073425


In [None]:
# Predicted class per row = the class column with the highest averaged probability.
# Only the class-probability columns are searched, so re-running this cell after
# 'Labels' and 'ID' have been added still gives the same result.
class_columns = t.get_classes()
sub['Labels'] = sub[class_columns].idxmax(axis='columns').values

# Attach the test IDs positionally; a row-count mismatch raises instead of
# silently producing NaN through index alignment.
sub['ID'] = test['ID'].values

In [None]:
# Encoded class id -> label value written to the submission.
# NOTE(review): presumably the inverse of the LabelEncoder mapping fitted on the
# training labels - verify against le.classes_.
ENCODED_TO_SUBMISSION_LABEL = {0: -1, 1: 0, 2: 1}

# Derive the labels from the class-probability columns rather than mapping the
# 'Labels' column in place: an in-place map is not idempotent (a second run
# would turn -1 into NaN and shift 0 -> -1, 1 -> 0).
sub['Labels'] = sub[t.get_classes()].idxmax(axis='columns').map(ENCODED_TO_SUBMISSION_LABEL)

In [None]:
# Keep only the two columns that go into the submission file, then preview them.
submission_columns = ['ID', 'Labels']
sub = sub.loc[:, submission_columns]
sub.head(2)

Unnamed: 0,ID,Labels
0,4,0
1,7,1


In [None]:
# Sanity checks before writing the file: one prediction per test row and no
# missing labels (a NaN here would mean a label id failed to map).
assert len(sub) == len(test), f'expected {len(test)} rows, got {len(sub)}'
assert sub['Labels'].notna().all(), 'submission contains missing labels'
sub.shape

(755, 2)

In [None]:
# Write the submission without the DataFrame index column.
submission_file = wrk_path / 'submission_6.csv'
sub.to_csv(submission_file, index=False)