<a href="https://colab.research.google.com/github/ayami-n/Flax_text_prediction/blob/main/Main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/Flax_text_prediction

Mounted at /content/drive
/content/drive/MyDrive/Flax_text_prediction


# Import libs

In [2]:
# pytorch
import torch
import torch.nn as nn
from torch.nn.utils.clip_grad import clip_grad_norm
from torch.utils.data import DataLoader, Dataset

# Transformers
!pip install transformers
import transformers
from transformers import EarlyStoppingCallback
from transformers import Trainer,TrainingArguments
from transformers import BertTokenizer, BertForSequenceClassification

# others
import pandas as pd
from tqdm import tqdm
from typing import Callable, Any
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
import os

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 28.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 60.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 64.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 13.6 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninsta

# GPU Check

In [3]:
if torch.cuda.is_available():
  print("GPU")
else:
  print("CPU")  

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.empty_cache()

GPU


# Config

In [4]:
### Model Config ####
model_checkpoint = 'bert-base-cased' # https://huggingface.co/docs/transformers/model_doc/roberta#roberta: siebert/sentiment-roberta-large-english
tokenizer = BertTokenizer.from_pretrained(model_checkpoint)  # this tokenizer converts numeric from string: the values are different if you select different model_checkpoint

Downloading vocab.txt:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

# Tokenaization and Loading Data

In [5]:
df = pd.read_csv("./kaggle/train.csv")
df.drop(['discourse_id','essay_id','discourse_type'],inplace=True,axis=1)
tar_map = {"Ineffective":0, "Adequate":1,"Effective":2}
df["discourse_effectiveness"] = df["discourse_effectiveness"].map(tar_map)

x_train,x_test,y_train_,y_test_ = train_test_split(df['discourse_text'], df['discourse_effectiveness'], random_state=42, test_size=0.05)

y_train = pd.get_dummies(y_train_).values
y_test = pd.get_dummies(y_test_).values

In [6]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [7]:
max_len = 128
X_train_tokenized = tokenizer(list(x_train.values),
                              add_special_tokens=True,padding='max_length',
                              truncation=True,max_length=max_len,
                              return_attention_mask=True)

X_val_tokenized = tokenizer(list(x_test.values),
                              add_special_tokens=True,padding='max_length',
                              truncation=True,max_length=max_len,
                              return_attention_mask=True)

train_dataset = Dataset(X_train_tokenized, list(y_train_))
val_dataset = Dataset(X_val_tokenized, list(y_test_))

# Create a model

In [8]:
# the model size is 400 MB
model = BertForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)

Downloading pytorch_model.bin:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [9]:
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)
    
    result = classification_report(labels,pred,output_dict=True)

    return result

In [19]:
# Define Trainer
args = TrainingArguments(
    output_dir="./",
    evaluation_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    save_strategy="epoch",
    seed=0,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


# Training

In [20]:
trainer.train()

***** Running training *****
  Num examples = 34926
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 5460


Epoch,Training Loss,Validation Loss,0,1,2,Accuracy,Macro avg,Weighted avg
1,0.7584,0.733078,"{'precision': 0.6909090909090909, 'recall': 0.11275964391691394, 'f1-score': 0.19387755102040816, 'support': 337}","{'precision': 0.6643887623386484, 'recall': 0.8309591642924976, 'f1-score': 0.7383966244725738, 'support': 1053}","{'precision': 0.6381156316916489, 'recall': 0.6636971046770601, 'f1-score': 0.6506550218340611, 'support': 449}",0.65851,"{'precision': 0.6644711616464627, 'recall': 0.5358053042954906, 'f1-score': 0.5276430657756811, 'support': 1839}","{'precision': 0.662833958134046, 'recall': 0.6585100598151169, 'f1-score': 0.6171900408194624, 'support': 1839}"
2,0.5522,0.811712,"{'precision': 0.4911242603550296, 'recall': 0.49258160237388726, 'f1-score': 0.4918518518518519, 'support': 337}","{'precision': 0.7176591375770021, 'recall': 0.6638176638176638, 'f1-score': 0.6896891958559447, 'support': 1053}","{'precision': 0.6129032258064516, 'recall': 0.7193763919821826, 'f1-score': 0.6618852459016393, 'support': 449}",0.646003,"{'precision': 0.6072288745794944, 'recall': 0.6252585527245779, 'f1-score': 0.6144754312031453, 'support': 1839}","{'precision': 0.6505696008674958, 'recall': 0.6460032626427407, 'f1-score': 0.6466466953345404, 'support': 1839}"
3,0.3285,1.100665,"{'precision': 0.558282208588957, 'recall': 0.27002967359050445, 'f1-score': 0.364, 'support': 337}","{'precision': 0.6843971631205674, 'recall': 0.7331433998100665, 'f1-score': 0.707932141219624, 'support': 1053}","{'precision': 0.583941605839416, 'recall': 0.7126948775055679, 'f1-score': 0.641925777331996, 'support': 449}",0.643284,"{'precision': 0.6088736591829801, 'recall': 0.5719559836353796, 'f1-score': 0.57128597285054, 'support': 1839}","{'precision': 0.6367597053193768, 'recall': 0.643284393692224, 'f1-score': 0.6287902222546656, 'support': 1839}"
4,0.1732,1.561139,"{'precision': 0.5353535353535354, 'recall': 0.314540059347181, 'f1-score': 0.39626168224299063, 'support': 337}","{'precision': 0.691743119266055, 'recall': 0.7160493827160493, 'f1-score': 0.7036864209052729, 'support': 1053}","{'precision': 0.5789473684210527, 'recall': 0.7104677060133631, 'f1-score': 0.638, 'support': 449}",0.641109,"{'precision': 0.602014674346881, 'recall': 0.5803523826921978, 'f1-score': 0.5793160343827545, 'support': 1839}","{'precision': 0.6355448691801795, 'recall': 0.6411092985318108, 'f1-score': 0.6313126634742469, 'support': 1839}"
5,0.1026,1.873053,"{'precision': 0.5286343612334802, 'recall': 0.3560830860534125, 'f1-score': 0.425531914893617, 'support': 337}","{'precision': 0.6911898274296094, 'recall': 0.7226970560303894, 'f1-score': 0.7065923862581245, 'support': 1053}","{'precision': 0.5968688845401174, 'recall': 0.6792873051224945, 'f1-score': 0.6354166666666667, 'support': 449}",0.644916,"{'precision': 0.6055643577344023, 'recall': 0.5860224824020989, 'f1-score': 0.5891803226061361, 'support': 1839}","{'precision': 0.6383723747567016, 'recall': 0.644915715062534, 'f1-score': 0.6377096908005914, 'support': 1839}"


***** Running Evaluation *****
  Num examples = 1839
  Batch size = 32
Trainer is attempting to log a value of "{'precision': 0.6909090909090909, 'recall': 0.11275964391691394, 'f1-score': 0.19387755102040816, 'support': 337}" of type <class 'dict'> for key "eval/0" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.6643887623386484, 'recall': 0.8309591642924976, 'f1-score': 0.7383966244725738, 'support': 1053}" of type <class 'dict'> for key "eval/1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.6381156316916489, 'recall': 0.6636971046770601, 'f1-score': 0.6506550218340611, 'support': 449}" of type <class 'dict'> for key "eval/2" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attemp

TrainOutput(global_step=5460, training_loss=0.3697059778066782, metrics={'train_runtime': 3634.347, 'train_samples_per_second': 48.05, 'train_steps_per_second': 1.502, 'total_flos': 1.148687403446016e+16, 'train_loss': 0.3697059778066782, 'epoch': 5.0})