<a href="https://colab.research.google.com/github/ayami-n/Flax_text_prediction/blob/main/Main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive inside the Colab VM at /content/drive so the project files are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Work from the project folder on the mounted Drive; later cells use relative paths
# (e.g. "./kaggle/train.csv") that resolve against this directory.
%cd /content/drive/MyDrive/Flax_text_prediction

/content/drive/MyDrive/Flax_text_prediction


# Import libs

In [None]:
%%capture
!pip install datasets
!pip install git+https://github.com/huggingface/transformers.git
!pip install flax
!pip install git+https://github.com/deepmind/optax.git

In [21]:
import jax
from jax import random  # to create random values for initializing a model (Flax requires it)
import jax.numpy as jnp

# Flax for building the model
try:
    import flax
except ModuleNotFoundError: # Install flax if missing
    !pip install --quiet flax
    import flax

from flax import linen as nn
from flax.training import train_state, checkpoints

# Optax for the optimizer
import optax

# Transformers
# NOTE(review): shell install in the middle of the import cell; the dedicated install cell
# already installs transformers (from git) — consider keeping installs in that cell only.
!pip install transformers
from transformers import FlaxAutoModelForSequenceClassification, AutoConfig
from transformers import RobertaTokenizer, RobertaConfig # as we use the RoBERTa model

# Others
import pandas as pd
from tqdm import tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Config

In [16]:
# Pretrained checkpoint to start from (Hugging Face Hub identifier).
model_checkpoint = 'siebert/sentiment-roberta-large-english' # https://huggingface.co/docs/transformers/model_doc/roberta#roberta
num_labels = 3 # number of target classes (Ineffective / Adequate / Effective)
seed = 0  # seed passed to from_pretrained when the model is built
max_len = 256  # every input text is padded/truncated to this many tokens

# Load the checkpoint's configuration, overriding the number of labels with num_labels.
config = AutoConfig.from_pretrained(model_checkpoint, num_labels=num_labels)  # config used to build our model

# Tokenization

In [31]:
# Mapping from the textual effectiveness rating to an integer class id.
new_label = {"discourse_effectiveness": {"Ineffective": 0, "Adequate": 1, "Effective": 2}}

# Read the training set and apply the label mapping in a single chain.
df = pd.read_csv("./kaggle/train.csv").replace(new_label)

In [25]:
def tokenaize(texts, tokenizer, max_len):
    """Convert raw texts into token ids and attention masks.

    Each text is encoded on its own with ``tokenizer``, truncated or padded
    to ``max_len`` tokens, with special tokens added and JAX tensors requested.

    Args:
        texts: iterable of strings to encode (a progress bar is shown while iterating).
        tokenizer: callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` entries for a single text.
        max_len: target sequence length used for truncation and padding.

    Returns:
        Tuple ``(input_ids, attention_mask)`` of ``jnp`` arrays, each built by
        stacking the per-text results in input order.
        NOTE(review): each per-text result presumably carries a leading batch
        axis of size 1, giving shape (n_texts, 1, max_len) — confirm before reshaping.
    """
    # Encode one text at a time; slow for large inputs (over a minute on the full training set).
    encodings = [
        tokenizer(text, max_length=max_len, truncation=True, padding='max_length',
                  add_special_tokens=True, return_tensors='jax')
        for text in tqdm(texts)
    ]

    # Split the tokenizer output into its two components and stack each into one array.
    ids = jnp.array([enc['input_ids'] for enc in encodings])
    masks = jnp.array([enc['attention_mask'] for enc in encodings])
    return ids, masks

In [28]:
# Load the tokenizer that belongs to model_checkpoint; it converts strings to numeric ids,
# and the ids differ if a different model_checkpoint is selected.
tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)  # tokenizer matching the chosen checkpoint
# Encode every discourse text to max_len tokens. The column is passed as a NumPy array
# (author's note: iterating pandas is slow, NumPy is fastest, a list is also an option).
input_ids, attention_mask = tokenaize(df['discourse_text'].to_numpy(), tokenizer, max_len)  # returns (input_ids, attention_mask)

100%|██████████| 36765/36765 [01:25<00:00, 430.79it/s]


# Create a model

In [14]:
# Build the sequence classifier from the pretrained checkpoint using our config and seed.
# ignore_mismatched_sizes=True allows an arbitrary number of outputs: weights whose shapes
# differ from the checkpoint (the classifier head) are newly initialized instead of raising.
model = FlaxAutoModelForSequenceClassification.from_pretrained(model_checkpoint, config=config, seed=seed, ignore_mismatched_sizes=True)  # the new head still needs training on the downstream task

Some weights of FlaxRobertaForSequenceClassification were not initialized from the model checkpoint at siebert/sentiment-roberta-large-english and are newly initialized because the shapes did not match:
- ('classifier', 'out_proj', 'bias'): found shape (2,) in the checkpoint and (3,) in the model instantiated
- ('classifier', 'out_proj', 'kernel'): found shape (1024, 2) in the checkpoint and (1024, 3) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Smoke test: push a single example through the model and inspect the raw output and logits.
# `inputs` was never defined by any cell in this notebook (NameError on Restart & Run All),
# so build it here from the first training text with the same tokenizer settings as tokenaize().
inputs = tokenizer(df['discourse_text'].iloc[0], max_length=max_len, truncation=True,
                   padding='max_length', add_special_tokens=True, return_tensors='jax')
out = model(**inputs)
print(out)
print(out.logits)

FlaxSequenceClassifierOutput(logits=DeviceArray([[-0.4148041 , -0.48419115,  0.02517768]], dtype=float32), hidden_states=None, attentions=None)
[[-0.4148041  -0.48419115  0.02517768]]
