<a href="https://colab.research.google.com/github/ayami-n/Flax_text_prediction/blob/main/Main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so the project files stored there can be
# read by later cells (Colab-specific helper).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Switch the working directory to the project folder on Drive; the relative
# ./kaggle/... paths read further down are resolved against it.
%cd /content/drive/MyDrive/Flax_text_prediction

/content/drive/MyDrive/Flax_text_prediction


# Import libs

In [None]:
%%capture
!pip install datasets
!pip install git+https://github.com/huggingface/transformers.git
!pip install flax
!pip install git+https://github.com/deepmind/optax.git

In [3]:
import jax
from jax import random  # source of random values for initializing a model (Flax requires explicit RNG keys)
import jax.numpy as jnp

# Flax for building the model
try:
    import flax
except ModuleNotFoundError: # install flax on the fly if it is missing, then retry the import
    !pip install --quiet flax
    import flax

from flax import linen as nn
from flax.training import train_state, checkpoints

# Optax for the optimizer
import optax

# Transformers
# NOTE(review): this installs the latest PyPI release on every run of the cell
# (the captured output shows transformers 4.20.1 being downloaded); the version
# is not pinned.
!pip install transformers
from transformers import FlaxAutoModelForSequenceClassification, AutoConfig
from transformers import RobertaTokenizer, RobertaConfig # RoBERTa-specific classes, since the checkpoint is a RoBERTa model

# Others
import pandas as pd
from tqdm import tqdm

[K     |████████████████████████████████| 197 kB 29.2 MB/s 
[K     |████████████████████████████████| 145 kB 71.8 MB/s 
[K     |████████████████████████████████| 217 kB 75.5 MB/s 
[K     |████████████████████████████████| 596 kB 65.7 MB/s 
[K     |████████████████████████████████| 51 kB 7.1 MB/s 
[K     |████████████████████████████████| 72 kB 717 kB/s 
[?25hLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 32.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 11.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 61.9 MB/s 
Installing collected 

# Config

In [4]:
# Pretrained checkpoint to start from.
# RoBERTa docs: https://huggingface.co/docs/transformers/model_doc/roberta#roberta
model_checkpoint = 'siebert/sentiment-roberta-large-english'

# Number of target classes.
num_labels = 3

# Seed used when building the model.
seed = 0

# Every input text is brought to this same length.
max_len = 512

# Configuration for the checkpoint, with the number of labels set to ours.
config = AutoConfig.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
)

# Tokenizer that turns strings into numeric ids; the ids differ from one
# model_checkpoint to another.
tokenizer = RobertaTokenizer.from_pretrained(
    model_checkpoint,
)

Downloading:   0%|          | 0.00/687 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256 [00:00<?, ?B/s]

# Tokenization

In [8]:
'''
https://stackoverflow.com/questions/65246703/how-does-max-length-padding-and-truncation-arguments-work-in-huggingface-bertt
adding [CLS] token at the beginning of the sentence, and [SEP] token at the end of sentence.
'''

# Special-token strings taken from the loaded tokenizer. They are concatenated
# into the `text` column when the training data is loaded.
# NOTE(review): [SEP]/[CLS] are the BERT names; the actual strings depend on the
# checkpoint's tokenizer (presumably '</s>' and '<s>' for RoBERTa — confirm).
sep = tokenizer.sep_token  # separator token: marks the boundary between segments / end of a sentence
cls = tokenizer.cls_token  # classification token: placed at the beginning of the input

In [11]:
# Integer class id for each effectiveness label; the outer key restricts the
# replacement to the `discourse_effectiveness` column.
new_label = {
    "discourse_effectiveness": {"Ineffective": 0, "Adequate": 1, "Effective": 2},
}

# Read the training set, add a `text` column laid out as
# <cls><discourse_type><sep><discourse_text>, then encode the labels.
df = (
    pd.read_csv("./kaggle/train.csv")
    .assign(text=lambda frame: cls + frame['discourse_type'] + sep + frame['discourse_text'])
    .replace(new_label)
)

In [14]:
def tokenaize(texts, tokenizer, max_len):
    """Tokenize each text to a fixed length and return ids and masks as JAX arrays.

    Every text is encoded on its own, padded/truncated to ``max_len`` with the
    tokenizer's special tokens added. The per-text results are gathered as
    NumPy arrays on the host and converted to JAX arrays once at the end.
    Requesting a JAX tensor per text and then calling ``jnp.array`` on the
    resulting Python list is far slower for large inputs.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Callable tokenizer accepting the keyword arguments used
            below and returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` entries.
        max_len: Length every encoded text is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_mask)`` of JAX arrays. The per-text
        tokenizer outputs are stacked, unchanged, along a new leading axis in
        input order, so any batch axis the tokenizer adds per text is kept.
        When ``texts`` is empty both results are empty arrays.
    """
    import numpy as np  # local import: numpy is not imported at notebook level

    ids_per_text = []
    mask_per_text = []

    for text in tqdm(texts):  # progress bar over the individual texts
        token = tokenizer(text, max_length=max_len, truncation=True, padding='max_length',
                          add_special_tokens=True, return_tensors='np')
        ids_per_text.append(token['input_ids'])
        mask_per_text.append(token['attention_mask'])

    if not ids_per_text:
        # np.stack rejects an empty list; keep returning empty arrays instead.
        return jnp.array(ids_per_text), jnp.array(mask_per_text)

    # One host-side stack and one device transfer per output.
    return jnp.asarray(np.stack(ids_per_text)), jnp.asarray(np.stack(mask_per_text))

In [None]:
# Encode the raw discourse texts. They are handed over as a NumPy array because
# iterating the pandas Series directly was found to be slower (a plain list is
# also an option).
# NOTE(review): this tokenizes `discourse_text`, not the `text` column built
# when the training data was loaded — confirm which one is intended.
input_ids, attention_mask = tokenaize(
    texts=df['discourse_text'].to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len,
)

# Show the ids first, then the attention mask.
print(input_ids, attention_mask, sep="\n")

In [None]:
# Find the discourse text with the most whitespace-separated words.
# The running maximum lives in `max_words` rather than `max`: assigning to
# `max` rebinds the Python builtin for every later cell of the notebook.
max_words = 0
max_str = "Hello inu"  # placeholder; kept only if no text contains at least one word
for val in df['discourse_text'].to_numpy():
    n_words = len(val.split())
    if n_words > max_words:  # strict '>' keeps the first text among ties
        max_words = n_words
        max_str = val

['Hi,', "i'm", 'Isaac,', "i'm", 'going', 'to', 'be', 'writing', 'about', 'how', 'this', 'face', 'on', 'Mars', 'is', 'a', 'natural', 'landform', 'or', 'if', 'there', 'is', 'life', 'on', 'Mars', 'that', 'made', 'it.', 'The', 'story', 'is', 'about', 'how', 'NASA', 'took', 'a', 'picture', 'of', 'Mars', 'and', 'a', 'face', 'was', 'seen', 'on', 'the', 'planet.', 'NASA', "doesn't", 'know', 'if', 'the', 'landform', 'was', 'created', 'by', 'life', 'on', 'Mars,', 'or', 'if', 'it', 'is', 'just', 'a', 'natural', 'landform.']


# Create a model

In [None]:
# Load the pretrained Flax sequence-classification model.
# ignore_mismatched_sizes=True allows an arbitrary number of outputs: checkpoint
# weights whose shapes do not match the configured model are newly initialized
# instead of aborting the load.
model = FlaxAutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    config=config,
    seed=seed,
    ignore_mismatched_sizes=True,
)

Some weights of FlaxRobertaForSequenceClassification were not initialized from the model checkpoint at siebert/sentiment-roberta-large-english and are newly initialized because the shapes did not match:
- ('classifier', 'out_proj', 'bias'): found shape (2,) in the checkpoint and (3,) in the model instantiated
- ('classifier', 'out_proj', 'kernel'): found shape (1024, 2) in the checkpoint and (1024, 3) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Smoke-test the model on a single example.
# No earlier cell defines `inputs`, so it is built here from the first training
# text with the same tokenizer settings that `tokenaize` uses; this keeps the
# cell runnable on a fresh kernel.
sample_text = df['discourse_text'].iloc[0]
inputs = tokenizer(sample_text, max_length=max_len, truncation=True, padding='max_length',
                   add_special_tokens=True, return_tensors='jax')

out = model(**inputs)  # tokenizer output is unpacked into keyword arguments
print(out)
print(out.logits)

FlaxSequenceClassifierOutput(logits=DeviceArray([[-0.4148041 , -0.48419115,  0.02517768]], dtype=float32), hidden_states=None, attentions=None)
[[-0.4148041  -0.48419115  0.02517768]]


# Validation

In [None]:
# Load the test set from the kaggle folder (path is relative to the working directory).
test = pd.read_csv("./kaggle/test.csv") 