<a href="https://colab.research.google.com/github/arefrazavi/news_classification/blob/main/bert/fasttext_bert_separate_pretrained_model_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Required Packages

In [1]:
import os

# Disable asynchronously kernel launches which is useful for debugging.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

! pip install datasets transformers numpy sklearn

Collecting datasets
  Downloading datasets-1.11.0-py3-none-any.whl (264 kB)
[K     |████████████████████████████████| 264 kB 7.7 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.9.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 49.5 MB/s 
Collecting fsspec>=2021.05.0
  Downloading fsspec-2021.7.0-py3-none-any.whl (118 kB)
[K     |████████████████████████████████| 118 kB 75.1 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 63.1 MB/s 
Collecting huggingface-hub<0.1.0
  Downloading huggingface_hub-0.0.15-py3-none-any.whl (43 kB)
[K     |████████████████████████████████| 43 kB 2.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 55.6 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1

# Gather Data

In [2]:
import pandas as pd

# Read the train and test splits from the CSV files stored on Google Drive.
train_dataset_df = pd.read_csv(
    "/content/drive/MyDrive/Datasets/fasttext_train_dataset.csv"
)
test_dataset_df = pd.read_csv(
    "/content/drive/MyDrive/Datasets/fasttext_test_dataset.csv"
)

# Last expression of the cell: rendered as a table preview of the training data.
train_dataset_df

Unnamed: 0,text,label
0,ارزش سهام 10 شركت بورس از يك ميليارد دلار فرات...,Economy
1,اعلام برنامه واگذاري 60 ميليارد دلار سهام شركت...,Economy
2,روندهابانك هاي خصوصي در جستجوي شرايط مناسب,Economy
3,نبض بورس در دست سهام ساختمان,Economy
4,اصلاح آيين نامه قيمت گذاري سهام دولتي آغاز شد,Economy
...,...,...
145872,بسكتبال جام برتر تهران در ماه مبارك رمضان,Sport
145873,داوران و كمك داوران بين المللي فوتبال,Sport
145874,روبرتو باجو يار قرضي پروجيا,Sport
145875,ژرژ وه آ از آ. ث. ميلان جدا مي شود,Sport


# Prepare Data

In [3]:
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, Dataset


# Map each class name to an integer class id. Ids start at 0; the original
# author's note records a "target 4 out of range" error, which suggests an
# earlier 1-based mapping was rejected by the loss function.
label_mapping = {'Sport': 0, 'Politics': 1, 'Economy': 2, 'Social': 3}

# Clean the frames WITHOUT mutating or rebinding the loaded ones, so that
# re-running this cell always starts from the same input. (Previously the
# training frame was overwritten by its own 80% split, so a second run of the
# cell silently shrank the training set again.)
train_clean_df = train_dataset_df.dropna(how="any")
test_clean_df = test_dataset_df.dropna(how="any")

# Fail early, with a readable message, on labels the mapping does not cover;
# otherwise the tokenization step would stop later with a bare KeyError.
for split_name, split_df in (("train", train_clean_df), ("test", test_clean_df)):
    unknown_labels = set(split_df["label"]) - set(label_mapping)
    if unknown_labels:
        raise ValueError(
            f"Unexpected labels in the {split_name} data: {sorted(map(str, unknown_labels))}"
        )

# Hold out 20% of the training data for validation (fixed seed => same split).
train_split_df, validation_dataset_df = train_test_split(
    train_clean_df, test_size=0.2, random_state=42, shuffle=True
)

# Store the three splits together in a DatasetDict.
raw_datasets = DatasetDict({
    "train": Dataset.from_pandas(train_split_df),
    "validation": Dataset.from_pandas(validation_dataset_df),
    "test": Dataset.from_pandas(test_clean_df),
})

# Tokenizer matching the pre-trained Persian BERT checkpoint used for the model.
tokenizer = AutoTokenizer.from_pretrained(
    "HooshvareLab/bert-fa-base-uncased",
)

# TODO: pad to min(longest sample length, 512) instead of always padding to 512.
def preprocess_documents(documents):
    """Tokenize a batch of documents and encode their labels as class ids.

    Args:
        documents: A batch with a "text" list and a "label" list of class
            names (this function is applied with ``batched=True``).

    Returns:
        The tokenizer output for the batch, with an added "label" entry
        holding the integer id of each document's class.

    Raises:
        KeyError: If a label is not a key of ``label_mapping``.
    """
    # 512 is the maximum sequence length of the BERT model; every sample is
    # truncated and padded to exactly that length.
    encoded_batch = tokenizer(
        documents["text"],
        max_length=512,
        truncation=True,
        padding="max_length",
    )
    encoded_batch["label"] = list(map(label_mapping.__getitem__, documents["label"]))

    return encoded_batch


# Tokenize every split; `batched=True` hands whole batches to the function.
tokenized_datasets = raw_datasets.map(preprocess_documents, batched=True)

# Divide the dataset into three subsets for training, validation and testing.
train_dataset = tokenized_datasets["train"]
validation_dataset = tokenized_datasets["validation"]
test_dataset = tokenized_datasets["test"]


print("An example of a sample training documents after preprocessing:")
# Slice the first rows once. Indexing a whole column inside the loop
# (`train_dataset['text'][i]`) would load the entire column on every iteration.
preview = train_dataset[:10]
for label, text in zip(preview['label'], preview['text']):
  print("Label: ", label, ", Text: ", text)

Downloading:   0%|          | 0.00/440 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

  0%|          | 0/117 [00:00<?, ?ba/s]

  0%|          | 0/30 [00:00<?, ?ba/s]

  0%|          | 0/23 [00:00<?, ?ba/s]

An example of a sample training documents after preprocessing:
Label:  0 , Text:  شاهرخي سرمربي تيم فوتبال پاس تهران شد
Label:  1 , Text:  دبير شوراي عالي امنيت ملي:درهاي مذاكره بسته نيست طرح روسيه قابل بررسي است
Label:  0 , Text:  19 روز تا آغاز دووميداني داخل سالن آسياقرقيزستان هم اعلام آمادگي كرد
Label:  1 , Text:  در گفت وگوي روساي جمهوري ايران و كنيا درنايروبي مناسبات مهم بين المللي و راههاي گسترش همكاري تهران و نايروبي بررسي شد
Label:  0 , Text:  استقلال را نجات دهيدآبي هاي اهواز در تهران استقلال را متوقف كردند
Label:  1 , Text:  گردهمايي بزرگ دانشجويان در اعتراض به نقض حقوق روحانيون و دانشجويان
Label:  2 , Text:  با ادامه روند اخذ 3 درصد عوارض فروش كارخانه هاي توليدكننده فولاد تعطيل خواهند شد
Label:  2 , Text:  در نيمه نخست بهمن ماهنيمي از نقدينگي بازار به سوي سهام خودرو سرازير شد
Label:  0 , Text:  پرسپوليس قهرمان ليگ تا پايان سال 78 شد
Label:  2 , Text:  دهمين كنفرانس بازرگاني و توسعه ملل متحد در بانكوك گشايش يافت


# Define and Train Model

In [None]:
from transformers import AutoConfig, AutoModelForSequenceClassification, TrainingArguments, Trainer
import sys

output_dir = "/content/drive/MyDrive/nlp_output_dir/fasttext_classification"
# To continue from a previously saved fine-tuned model, point
# `model_name_or_path` at `output_dir` instead of the hub checkpoint.
model_name_or_path = "HooshvareLab/bert-fa-base-uncased"

# Class names in order of first appearance in the training split.
# NOTE: this order is NOT guaranteed to match the ids in `label_mapping`.
labels = raw_datasets["train"].unique("label")

# Load the pre-trained BERT encoder with a fresh classification head.
# Registering the id <-> name tables in the model config keeps the class names
# with the saved checkpoint instead of generic "LABEL_n" placeholders.
id2label = {class_id: name for name, class_id in label_mapping.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(label_mapping),
    id2label=id2label,
    label2id=dict(label_mapping),
)
print('\n---Model Architectures: ', model.config.architectures)


# Create a trainer from our pre-trained model to fine-tune it. Checkpoints are
# saved and the validation split is evaluated once per epoch.
training_args = TrainingArguments(output_dir=output_dir, save_total_limit=6, load_best_model_at_end=True, save_strategy="epoch", evaluation_strategy="epoch")
trainer = Trainer(
    model=model, args=training_args, train_dataset=train_dataset, eval_dataset=validation_dataset
)

# TODO: tune parameters using the validation split. Candidates noted by the
# author: num_attention_heads, vocab_size, num_hidden_layers, hidden_size,
# initializer_range.

# Fine-tune our model.
trainer.train()

Downloading:   0%|          | 0.00/654M [00:00<?, ?B/s]

Some weights of the model checkpoint at HooshvareLab/bert-fa-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification w


---Model Architectures:  ['BertForMaskedLM']


The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__.
***** Running training *****
  Num examples = 116523
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 43698


Epoch,Training Loss,Validation Loss


# Save Model

In [None]:
# Persist the fine-tuned model held by `trainer`. No path is passed, so the
# destination is the Trainer's default -- presumably the `output_dir` given to
# TrainingArguments; confirm against the transformers documentation.
trainer.save_model()

# Evaluate Model

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute classification accuracy for a Trainer evaluation pass.

    Accuracy is calculated directly with NumPy. The previous implementation
    relied on `datasets.load_metric`, which is deprecated and no longer
    available in recent `datasets` releases.

    Args:
        eval_pred: A ``(logits, labels)`` pair; ``logits`` has one row per
            sample and one column per class, ``labels`` holds the true ids.

    Returns:
        ``{"accuracy": value}`` where ``value`` is the fraction of samples
        whose highest-scoring class equals the true label.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit in each row.
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

# Same training arguments as for fine-tuning: per-epoch checkpointing and
# evaluation, at most 6 checkpoints kept, best model restored at the end.
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=6,
    load_best_model_at_end=True,
)

# A second Trainer around the same model: it evaluates on the test split and
# reports the metrics produced by `compute_metrics`.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer.evaluate()

In [None]:
import csv
import os

import numpy as np

# Drop the gold `label` column so the model is run purely for inference
# (no loss or metrics are computed without labels).
predict_dataset = test_dataset.remove_columns("label")

# Raw logits from the model, reduced to the highest-scoring class id per sample.
prediction_output = trainer.predict(predict_dataset, metric_key_prefix="predict")
predictions = np.argmax(prediction_output.predictions, axis=1)

# Decode class ids with the inverse of the mapping used for encoding. Decoding
# through the order in which labels first appear in the training split only
# matches these ids by coincidence.
id_to_label = {class_id: name for name, class_id in label_mapping.items()}

# Read the text column once instead of re-loading it for every prediction.
texts = predict_dataset['text']
text_label_predictions = [
    {"text": text, "label": id_to_label[int(class_id)]}
    for text, class_id in zip(texts, predictions)
]

output_predict_file = os.path.join(training_args.output_dir, "predict_results.csv")
# `newline=""` is what the csv module expects for files it writes; UTF-8 is
# set explicitly because the texts are Persian.
with open(output_predict_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=["text", "label"])
    writer.writeheader()
    writer.writerows(text_label_predictions)

# Show only a short preview; the complete result is in the CSV file.
text_label_predictions[:10]