# Introduction

As requested, a minimal notebook approaching this task using transformers.

Questions: ask on the Discussion board and I'll try to answer :)

# Setup

Install the requirements and upload the files using the panel on the left.

In [None]:
!pip install -q simpletransformers

[K     |████████████████████████████████| 215kB 4.6MB/s 
[K     |████████████████████████████████| 7.4MB 14.8MB/s 
[K     |████████████████████████████████| 51kB 8.0MB/s 
[K     |████████████████████████████████| 1.3MB 46.5MB/s 
[K     |████████████████████████████████| 2.9MB 57.8MB/s 
[K     |████████████████████████████████| 71kB 7.1MB/s 
[K     |████████████████████████████████| 1.7MB 56.3MB/s 
[K     |████████████████████████████████| 317kB 56.6MB/s 
[K     |████████████████████████████████| 133kB 66.1MB/s 
[K     |████████████████████████████████| 112kB 62.3MB/s 
[K     |████████████████████████████████| 4.4MB 50.1MB/s 
[K     |████████████████████████████████| 6.7MB 51.9MB/s 
[K     |████████████████████████████████| 102kB 13.8MB/s 
[K     |████████████████████████████████| 163kB 59.8MB/s 
[K     |████████████████████████████████| 1.1MB 58.5MB/s 
[K     |████████████████████████████████| 890kB 52.6MB/s 
[K     |████████████████████████████████| 122kB 57.8MB/s 
[

# Load Data

The simpletransformers library expects two columns: the text first, then the label. That is easy enough to arrange with our data.

In [None]:
from simpletransformers.classification import ClassificationModel # Ignore wandb warning
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, accuracy_score
import logging



In [None]:
# Read the training data, splitting rows strictly on '\n'
df = (
    pd.read_csv('Train.csv', lineterminator='\n')
    # One row's text is parsed as a float, so force every entry to str
    .assign(text=lambda frame: frame['text'].astype(str))
)
print(df.shape)
df.head()

(8401, 3)


Unnamed: 0,ID,text,label
0,IQOTJAT,m3alem bourjilia w illi ma yefehmouch yelzmou ...,1
1,HY9M63D,Ya m3alllam,1
2,7SV55S2,Ma7lek! Zin ou fannena 7loua.,1
3,KDLJVZR,hhhhh ya bliiiiiiiida ya Hanen,0
4,0MAU5GY,Nikraha w ma5yebha pffff,0


In [None]:
# Read the test data, splitting rows strictly on '\n'
test = pd.read_csv(filepath_or_buffer='Test.csv', lineterminator='\n')
print(test.shape)
test.head(n=2)

(3400, 3)


Unnamed: 0,ID,text,label
0,7I09CSF,m3alma berjouliya,0
1,EO5QHN8,جميلة,0


In [None]:
# Step 2: Normalise the text of both frames: lower-case it and strip ASCII punctuation.
# Lower-casing is required as python interprets 'dog' and 'DOG' differently.
import string

# Deletion table covering string.punctuation (ASCII symbols only)
translator = str.maketrans('', '', string.punctuation)


def normalize_text(texts):
    """Return the Series `texts` lower-cased with ASCII punctuation removed.

    Values are cast to str first, so a non-string entry (e.g. a float) is
    converted instead of raising AttributeError on `.lower()`.
    """
    return texts.astype(str).str.lower().str.translate(translator)


# Apply the identical cleaning to train and test so the model sees consistent input
df['text'] = normalize_text(df['text'])
test['text'] = normalize_text(test['text'])

# Preview a handful of cleaned rows rather than dumping 50
df.head(10)

Unnamed: 0,ID,text,label
0,IQOTJAT,m3alem bourjilia w illi ma yefehmouch yelzmou ...,1
1,HY9M63D,ya m3alllam,1
2,7SV55S2,ma7lek zin ou fannena 7loua,1
3,KDLJVZR,hhhhh ya bliiiiiiiida ya hanen,0
4,0MAU5GY,nikraha w ma5yebha pffff,0
5,EA1VCJ2,3sal denya,1
6,2SZ11JC,سي جلول يعطيك ألف صحة و ربي يبارك فيك وكأنك شر...,1
7,59OI2D8,ya mama mali masta lasta p p,0
8,E1TVXKP,bravo neji jelloul,1
9,L5L4SAC,نسحو يفهم في كل شيء,1


In [None]:
# Split the labeled data (df) into a train set and a small held-out eval set for local scoring.
# frac=0.999 keeps almost every row for training, so the eval set is only a quick sanity check.
SPLIT_SEED = 42  # fixed so the same rows are held out on every run
train = df.sample(frac=0.999, random_state=SPLIT_SEED)
train_df = train[['text', 'label']]
# Eval rows are those whose ID was not sampled into train
eval_df = df.loc[~df.ID.isin(train.ID.values)][['text', 'label']]
train_df.shape, eval_df.shape

((8393, 2), (8, 2))

# Modelling

Using simpletransformers - see https://github.com/ThilinaRajapakse/simpletransformers for docs.

In [None]:
# Optional logging setup: call logging.basicConfig(level=logging.INFO) and set the
# "transformers" logger to logging.WARNING.

# Training options handed to simpletransformers
args = dict(
    reprocess_input_data=True,
    overwrite_output_dir=True,
    num_train_epochs=7,
)

# Build a 'bert' ClassificationModel from the asafaya/bert-large-arabic checkpoint
model = ClassificationModel('bert', "asafaya/bert-large-arabic", args=args)

# Fine-tune on the training split
model.train_model(train_df)

# Score the held-out split
result, model_outputs, wrong_predictions = model.eval_model(eval_df)

Some weights of the model checkpoint at asafaya/bert-large-arabic were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at asafaya/bert-large-

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8393.0), HTML(value='')))




HBox(children=(HTML(value='Epoch'), FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(HTML(value='Running Epoch 0 of 7'), FloatProgress(value=0.0, max=1050.0), HTML(value='')))








HBox(children=(HTML(value='Running Epoch 1 of 7'), FloatProgress(value=0.0, max=1050.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 2 of 7'), FloatProgress(value=0.0, max=1050.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 3 of 7'), FloatProgress(value=0.0, max=1050.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 4 of 7'), FloatProgress(value=0.0, max=1050.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 5 of 7'), FloatProgress(value=0.0, max=1050.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 6 of 7'), FloatProgress(value=0.0, max=1050.0), HTML(value='')))





RuntimeError: ignored

In [None]:
# Display the first value returned by model.eval_model - nice for interpretation
result

In [None]:
# View the raw outputs returned by model.eval_model (second element of its return value)
model_outputs

array([[-1.43164062,  1.57226562],
       [-2.51171875,  2.79296875],
       [ 2.80859375, -3.10742188],
       ...,
       [ 2.39453125, -2.53125   ],
       [ 1.53613281, -1.59570312],
       [-0.50146484,  0.77636719]])

In [None]:
# Predicted class = index of the largest raw output along the last axis
pred_class = np.asarray(model_outputs).argmax(axis=-1)
pred_class

array([1, 1, 0, ..., 0, 0, 1])

In [None]:
# F1 score of the predicted classes against the eval labels
f1_score(y_true=eval_df['label'], y_pred=pred_class)

0.8930338789493719

In [None]:
# Accuracy on the same eval labels, for interest
accuracy_score(y_true=eval_df['label'], y_pred=pred_class)

0.8884920634920634

# Generating Submission File

We generate the model's predictions for the test set and save them in the same format as the sample submission.

In [None]:
# Predict on the test text. simpletransformers documents `to_predict` as a Python
# list of str, so pass a plain list rather than a pandas Series.
# Note: this rebinds model_outputs, replacing the outputs from eval_model.
pred_labels, model_outputs = model.predict(test['text'].tolist())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3400.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=425.0), HTML(value='')))




In [None]:
# Load the sample submission; its 'label' column is overwritten with predictions later
ss = pd.read_csv('SampleSubmission.csv')
ss.head()

Unnamed: 0,ID,label
0,7I09CSF,0
1,EO5QHN8,0
2,NYI236K,0
3,15973AH,0
4,9ZGKVAX,0


In [None]:
# Predictions are in Test.csv row order and are assigned by position, so first
# confirm that the submission template lists exactly the same IDs in the same order.
assert len(pred_labels) == len(ss), "number of predictions differs from submission rows"
assert (ss['ID'].values == test['ID'].values).all(), "SampleSubmission IDs are not in Test.csv order"

# Overwrite the placeholder labels with the model's predictions
ss['label'] = pred_labels
ss.head()

Unnamed: 0,ID,label
0,7I09CSF,1
1,EO5QHN8,1
2,NYI236K,1
3,15973AH,0
4,9ZGKVAX,1


In [None]:
# Write the submission file, omitting the DataFrame index
ss.to_csv('Sub1.csv', index=False)

# Good luck :)

by Johno Whitaker