<a href="https://colab.research.google.com/github/carlosinator/cil-sentiment/blob/main/infer_submission.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Authenticate this Colab session with a Google account; presumably required so the
# later gsutil cells can read from the gs://cil_2023 bucket — TODO confirm.
from google.colab import auth
auth.authenticate_user()

In [2]:
%%bash
pip3 install transformers emoji==0.6.0 keras_nlp

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.4/7.4 MB 36.9 MB/s eta 0:00:00
Collecting emoji==0.6.0
  Downloading emoji-0.6.0.tar.gz (51 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 51.0/51.0 kB 6.0 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting keras_nlp
  Downloading keras_nlp-0.6.0-py3-none-any.whl (576 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 576.5/576.5 kB 38.5 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 268.8/268.8 kB 29.8 MB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.8/7.8 MB 78.2 M

In [3]:
!git clone https://github.com/carlosinator/cil-sentiment.git

Cloning into 'cil-sentiment'...
remote: Enumerating objects: 119, done.[K
remote: Counting objects: 100% (119/119), done.[K
remote: Compressing objects: 100% (108/108), done.[K
remote: Total 119 (delta 58), reused 17 (delta 4), pack-reused 0[K
Receiving objects: 100% (119/119), 162.02 KiB | 9.53 MiB/s, done.
Resolving deltas: 100% (58/58), done.


In [4]:
import tensorflow as tf
import tensorflow_probability as tfp
import keras_nlp
import numpy as np
import pandas as pd
import sklearn
from sklearn import metrics
import transformers
from transformers import AutoTokenizer, TFAutoModel, AutoConfig, TFAutoModelForSequenceClassification
import matplotlib.pyplot as plt
import pickle
import re
import subprocess as sp
import os
from threading import Thread , Timer
import sched, time

import sys
# Make the modules of the cloned repository importable. Both paths are relative to
# the working directory, so the clone cell must have run before this one.
# gru_models is presumably found in ./cil-sentiment/models and utils in
# ./cil-sentiment/ — TODO confirm against the repository layout.
sys.path.append("./cil-sentiment/models")
sys.path.append("./cil-sentiment/")
from gru_models import GRUModel, VGRUModel
import utils

# reproducibility
transformers.set_seed(0) # sets the seed in random, numpy, and tf

Using TensorFlow backend


## Copy Test Data

In [5]:
# copy preprocessed data from google cloud storage into the working directory;
# the parsing cell below reads it as ./test_preprocessed.txt
# (presumably relies on the Colab authentication done in the first cell — TODO confirm)
!gsutil cp "gs://cil_2023/test_preprocessed.txt" .

Copying gs://cil_2023/test_preprocessed.txt...
/ [1 files][798.3 KiB/798.3 KiB]                                                
Operation completed over 1 objects/798.3 KiB.                                    


## Parse Test Data into df

In [6]:
# Read the file line by line and split each line on the FIRST comma only, so that
# commas inside the tweet text stay part of the tweet. Each kept entry is [id, tweet].
data = []
skipped_lines = 0
# Explicit encoding so that decoding does not depend on the runtime's locale default
# (assumes the file is UTF-8, which is what the Colab default already implied).
with open('test_preprocessed.txt', 'r', encoding='utf-8') as file:
    for line in file:
        stripped_line = line.strip()
        if not stripped_line:
            continue  # blank lines carry no sample
        split_line = stripped_line.split(',', 1)  # split only on the first comma
        if len(split_line) == 2:
            data.append(split_line)
        else:
            # No comma at all: the line has no id/tweet pair. It is still dropped,
            # but counted, because every dropped line is a row that will be missing
            # from the submission.
            skipped_lines += 1

if skipped_lines:
    print(f"Warning: skipped {skipped_lines} line(s) without a comma separating id and tweet")

# Create a DataFrame from the list; both columns hold strings
df = pd.DataFrame(data, columns=['id', 'tweet'])

## Tokenize Dataset

In [16]:
# Encode every test tweet with the BERTweet tokenizer into fixed-length TF tensors
model_name = "vinai/bertweet-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tweet_texts = df['tweet'].tolist()
encoding_options = dict(
    padding="max_length",  # pad every tweet up to max_length tokens
    max_length=73,  # presumably the sequence length the model was trained with — TODO confirm
    truncation=True,  # cut tweets that are longer than max_length
    return_tensors='tf',
)
tokenized_tweets = tokenizer.batch_encode_plus(tweet_texts, **encoding_options)



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# Configuration: which trained model variant to run inference with
USE_MODEL = "read"
# Name of the saved-model directory, both under gs://cil_2023/models/ and locally
run_name = f"inference_{USE_MODEL}_fullmodel"

In [9]:
# download the model: recursively copy the saved-model directory named by run_name
# from the bucket into the working directory (about 528 MiB per the recorded output)
!gsutil cp -r {"gs://cil_2023/models/" + run_name} .

Copying gs://cil_2023/models/inference_read_fullmodel/fingerprint.pb...
Copying gs://cil_2023/models/inference_read_fullmodel/keras_metadata.pb...
Copying gs://cil_2023/models/inference_read_fullmodel/saved_model.pb...
Copying gs://cil_2023/models/inference_read_fullmodel/variables/variables.data-00000-of-00001...
==> NOTE: You are downloading one or more large file(s), which would
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").

- [4 files][527.7 MiB/527.7 MiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying gs://cil_2023/models/inference_read_fullmodel/variables/variables.index...
- [5 files][527.7 MiB/5

In [10]:
# Load the saved model from the local directory named by run_name, i.e. the
# directory (saved_model.pb + variables/) copied from the bucket in the previous cell
model = tf.keras.models.load_model(run_name)

In [19]:
# Run the model over all tokenized tweets.
# The tokenizer output is converted to a plain dict of tensors before being fed in.
model_inputs = dict(tokenized_tweets)
probabilities = model.predict(model_inputs)



In [20]:
# Turn the raw model output into submission labels and write predictions.csv.
if USE_MODEL == "basemodel":
  # on base model, we will get logits, so apply a softmax.
  # The result is bound to a NEW name: `probabilities` keeps the raw model output,
  # so this cell can be re-run without first re-running the inference cell.
  class_probabilities = tf.nn.softmax(probabilities["logits"])
else:
  class_probabilities = probabilities

# Convert probabilities into class predictions (0 or 1)
# and then map 0 to 1 and 1 to -1 (that is Leo's great mapping:))
predicted_classes = np.argmax(class_probabilities, axis=-1)
predictions = np.where(predicted_classes == 0, 1, -1)

# Every parsed tweet must get exactly one prediction; otherwise ids and labels
# would be paired incorrectly or rows would silently go missing.
assert len(predictions) == len(df), (
    f"got {len(predictions)} predictions for {len(df)} tweets"
)

# Write the results to a text file: a header line, then one "<id>,<label>" row per tweet
with open('predictions.csv', 'w') as file:
    file.write("Id,Prediction\n")
    for tweet_id, prediction in zip(df['id'], predictions):
        file.write(f"{tweet_id},{prediction}\n")

In [21]:
# Print the loaded model's layers with their parameter counts
model.summary()


Model: "gru_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 tf_roberta_model (TFRobert  multiple                  134899968 
 aModel)                                                         
                                                                 
 layer_normalization (Layer  multiple                  32        
 Normalization)                                                  
                                                                 
 dense (Dense)               multiple                  12304     
                                                                 
 bidirectional (Bidirection  multiple                  1248      
 al)                                                             
                                                                 
 dense_1 (Dense)             multiple                  34        
                                                         