In [None]:
# RUN WITH GPU in Google Colab

!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m88.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.1-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m111.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.1 tokenizers-0.13.2 transformers-4.26.1


## training

You can get the training data and the fine-tuned model here: https://drive.google.com/drive/folders/1SfzXn-D5tVNpZQOTrAcGaROdyP_Hj1uO?usp=share_link

In [None]:
# This training was run in Colab with a GPU.

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import BertTokenizer

In [None]:
# Load the cleaned training data, discard rows containing missing values,
# then print a column/dtype summary.
df = pd.read_csv('local_train_data_clean.csv').dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 167613 entries, 0 to 167620
Data columns (total 4 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   Unnamed: 0            167613 non-null  int64 
 1   text_combine          167613 non-null  object
 2   category              167613 non-null  object
 3   text_combine_cleaned  167613 non-null  object
dtypes: int64(1), object(3)
memory usage: 6.4+ MB


In [None]:
# Categories kept for training; rows with any other category are filtered out below.
# The length of this tuple is also used as the number of output classes of the model.
category_used = ('POLITICS', 'ENTERTAINMENT', 'WELLNESS', 'HEALTHY LIVING', 'QUEER VOICES', 'TRAVEL', 'BUSINESS', 'SPORTS', 'COMEDY')

In [None]:
# Preview the first rows of the loaded frame (before any category filtering).
df.head()

Unnamed: 0.1,Unnamed: 0,text_combine,category,text_combine_cleaned
0,0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,over million american roll up sleeve for omicr...
1,1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,american airline flyer charged banned for life...
2,2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,of the funniest tweet about cat and dog this w...
3,3,The Funniest Tweets From Parents This Week (Se...,PARENTING,the funniest tweet from parent this week sept ...
4,4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,woman who called cop on black birdwatcher lose...


In [None]:
# Keep only rows whose category is in `category_used`, and only the raw text
# and category columns; renumber the index from 0.
is_used = df['category'].isin(category_used)
df = df.loc[is_used, ['text_combine', 'category']].reset_index(drop=True)

In [None]:
# Use the generic column names 'text' and 'label' from here on.
df = df.rename(columns=dict(text_combine='text', category='label'))

In [None]:
# Encode every label as an integer id (ids are assigned in order of first appearance).
category_codes = pd.factorize(df['label'])[0]
df['category_id'] = category_codes

# One row per distinct (label, id) pair.
category_id_df = df[['label', 'category_id']].drop_duplicates()

# Lookup tables in both directions, reused at prediction time.
category_to_id = dict(zip(category_id_df['label'], category_id_df['category_id']))
id_to_category = dict(zip(category_id_df['category_id'], category_id_df['label']))

# Preview the frame with the new column.
df.head()

Unnamed: 0,text,label,category_id
0,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,0
1,"Maury Wills, Base-Stealing Shortstop For Dodge...",SPORTS,1
2,Golden Globes Returning To NBC In January Afte...,ENTERTAINMENT,2
3,Biden Says U.S. Forces Would Defend Taiwan If ...,POLITICS,3
4,‘Beautiful And Sad At The Same Time’: Ukrainia...,POLITICS,3


In [None]:
# Show the id -> category-name mapping built in the previous cell.
id_to_category

{0: 'COMEDY',
 1: 'SPORTS',
 2: 'ENTERTAINMENT',
 3: 'POLITICS',
 4: 'WELLNESS',
 5: 'BUSINESS',
 6: 'QUEER VOICES',
 7: 'TRAVEL',
 8: 'HEALTHY LIVING'}

In [None]:
# Load the tokenizer for the same 'bert-base-uncased' checkpoint that the model is initialized from later.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Tokenize the first text once, with the same settings used for the full dataset
# (fixed length 256, truncated and padded, TensorFlow tensors).
sample_text = df['text'].iloc[0]
token = tokenizer.encode_plus(
    sample_text,
    add_special_tokens=True,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_tensors='tf',
)

In [None]:
# Preallocate one row of 256 positions per example.
# Token ids and attention masks are integers and the model's Input layers are declared
# as int32, so allocate int32 instead of numpy's default float64 (half the memory and
# no float round-trip for ids).
X_input_ids = np.zeros((len(df), 256), dtype='int32')
X_attn_masks = np.zeros((len(df), 256), dtype='int32')

In [None]:
def generate_training_data(df, ids, masks, tokenizer):
    """Tokenize every row of ``df['text']`` into the preallocated arrays.

    Args:
        df: DataFrame with a ``text`` column.
        ids: 2D array with one row per row of ``df``; row ``i`` is overwritten
            in place with the token ids of the i-th text.
        masks: Array indexed like ``ids``; row ``i`` is overwritten in place
            with the attention mask of the i-th text.
        tokenizer: Object exposing ``encode_plus`` whose result has
            ``input_ids`` and ``attention_mask`` attributes.

    Returns:
        The same ``ids`` and ``masks`` objects that were passed in.
    """
    # Sequence length follows the width of the preallocated array, so the
    # tokenizer output always fits the row it is written to.
    max_length = ids.shape[1]
    # Wrap the column itself (not the enumerate generator) and pass the total,
    # so tqdm can render a real progress bar instead of a bare counter.
    for i, text in enumerate(tqdm(df['text'], total=len(df))):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [None]:
# Fill the preallocated arrays with token ids and attention masks for every row of df.
X_input_ids, X_attn_masks = generate_training_data(df, X_input_ids, X_attn_masks, tokenizer)

0it [00:00, ?it/s]

In [None]:
# Target matrix: one row per example, one column per category (filled in as one-hot later).
labels = np.zeros(shape=(df.shape[0], len(category_used)))
labels.shape

(92865, 9)

In [None]:
# Sanity check: one row per example, 256 token positions.
X_input_ids.shape

(92865, 256)

In [None]:
# Sanity check: same shape as the token-id array.
X_attn_masks.shape

(92865, 256)

In [None]:
# One-hot encode the targets: in row i, set the column given by that row's category_id to 1.
row_positions = np.arange(len(df))
labels[row_positions, df['category_id'].to_numpy()] = 1

In [None]:
# Build a tf.data pipeline whose elements are (input_ids, attention_mask, one-hot label)
# triples, one per example. Batching is applied in a later cell.
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))

In [None]:
def SentimentDatasetMapFunction(input_ids, attn_masks, labels):
    """Regroup a flat (input_ids, attn_masks, labels) element into (features, labels).

    The features dict uses the keys 'input_ids' and 'attention_mask', which
    match the names of the model's two input layers.
    """
    features = dict(input_ids=input_ids, attention_mask=attn_masks)
    return features, labels

In [None]:
# Convert each element to the ({'input_ids', 'attention_mask'}, labels) structure that Keras fit() consumes.
dataset = dataset.map(SentimentDatasetMapFunction)

In [None]:
# Shuffle with a 10000-element buffer, then batch (batch size 1, incomplete batches dropped).
# reshuffle_each_iteration=False keeps the shuffled order identical on every pass over the
# dataset. The train/validation split below is done with take()/skip(), so with the default
# (reshuffling on every iteration) validation batches would be drawn from a different order
# and overlap the training data.
dataset = dataset.shuffle(10000, reshuffle_each_iteration=False).batch(1, drop_remainder=True)

In [None]:
# Displays the repr of a one-batch dataset, i.e. the shapes and dtypes of a batch (no data is fetched).
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(1, 256), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(1, 256), dtype=tf.float64, name=None)}, TensorSpec(shape=(1, 9), dtype=tf.float64, name=None))>

In [None]:
p = 0.8  # fraction of the data used for training
# `dataset` is already batched and take()/skip() count batches, so size the split from the
# number of batches rather than the number of rows. With batch size 1 both are equal; with
# any larger batch size a row count would overshoot and leave no validation data.
train_size = int(len(dataset) * p)

In [None]:
# Size of the training split.
train_size

74292

In [None]:
# First `train_size` elements of the (batched) dataset for training, the rest for validation.
# NOTE(review): `dataset` is shuffled upstream; unless the shuffle order is the same on every
# iteration, the two splits are not disjoint across passes — confirm the shuffle settings.
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

In [None]:
# Number of batches in the training split.
len(train_dataset)

74292

In [None]:
from transformers import TFBertModel

In [None]:
# BERT base model with pretrained weights.
# NOTE: the name `model` is rebound to the Keras classifier in the next code cell, which reads
# `model.bert`; re-run this cell before re-running that one.
model = TFBertModel.from_pretrained('bert-base-uncased')

Downloading tf_model.h5:   0%|          | 0.00/536M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Two int32 inputs of fixed length 256: token ids and the attention mask.
input_ids = tf.keras.layers.Input(shape=(256,), dtype='int32', name='input_ids')
attn_masks = tf.keras.layers.Input(shape=(256,), dtype='int32', name='attention_mask')

# Run BERT's main layer and keep element 1 of its outputs:
# 0 -> activation layer (3D), 1 -> pooled output layer (2D).
bert_outputs = model.bert(input_ids, attention_mask=attn_masks)
bert_embds = bert_outputs[1]

# Classification head: a 512-unit ReLU layer followed by a softmax over the categories.
hidden_dense = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')
intermediate_layer = hidden_dense(bert_embds)
softmax_dense = tf.keras.layers.Dense(len(category_used), activation='softmax', name='output_layer')
output_layer = softmax_dense(intermediate_layer)

# From here on `model` refers to the full classifier, not the bare BERT model.
model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  109482240   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                           

In [None]:
# Training configuration: accuracy metric reported as 'accuracy', categorical cross-entropy
# on the one-hot targets, and Adam with a learning rate of 1e-5.
acc = tf.keras.metrics.CategoricalAccuracy(name='accuracy')
loss_func = tf.keras.losses.CategoricalCrossentropy()
optim = tf.keras.optimizers.Adam(learning_rate=1e-5)

In [None]:
# Attach the optimizer, loss and metric defined in the previous cell.
model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

In [None]:
# Train for one epoch, with validation on val_dataset; the returned History object is kept in `hist`.
hist = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=1
)



## save model

In [None]:
# Save the trained classifier to the 'result' directory (relative to the working directory).
model.save('result')



In [None]:
# Save the tokenizer files to the 'tokenizer' directory (relative to the working directory).
tokenizer.save_pretrained('tokenizer')

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json')

## load and predict

In [None]:
# Label/id pairs from training; the same ids index the model's output probabilities.
category_id_df

Unnamed: 0,label,category_id
0,COMEDY,0
1,SPORTS,1
2,ENTERTAINMENT,2
3,POLITICS,3
52,WELLNESS,4
72,BUSINESS,5
169,QUEER VOICES,6
470,TRAVEL,7
10376,HEALTHY LIVING,8


In [None]:
# Load the classifier saved in the 'result' directory.
model_loaded = tf.keras.models.load_model('result')

# NOTE(review): this loads the tokenizer by its checkpoint name rather than from the 'tokenizer'
# directory saved earlier — presumably equivalent since nothing modified the tokenizer; confirm.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def prepare_data(input_text, tokenizer):
    """Tokenize one text into the input dict expected by the classifier.

    The text is truncated/padded to 256 tokens with special tokens added,
    using the same settings as the training data.

    Args:
        input_text: Raw text to classify.
        tokenizer: Object exposing ``encode_plus`` whose result has
            ``input_ids`` and ``attention_mask`` attributes.

    Returns:
        Dict with keys 'input_ids' and 'attention_mask' holding int32 tensors.
    """
    token = tokenizer.encode_plus(
        input_text,
        max_length=256,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_tensors='tf'
    )
    # The model's input layers are declared int32; token ids and masks are integers,
    # so keep them integral instead of converting them to float64.
    return {
        'input_ids': tf.cast(token.input_ids, tf.int32),
        'attention_mask': tf.cast(token.attention_mask, tf.int32)
    }

def make_prediction(model, processed_data, classes=id_to_category):
    """Predict the class of one prepared input.

    Args:
        model: Object with a ``predict`` method.
        processed_data: Input passed unchanged to ``model.predict``.
        classes: Mapping from class index to class name.

    Returns:
        Tuple ``(name, probs)``: the name of the highest-scoring class and the
        first row of the prediction output.
    """
    predictions = model.predict(processed_data)
    probs = predictions[0]  # only the first row of the output is used
    best_idx = np.argmax(probs)
    return classes[best_idx], probs

In [None]:
def math_word_in_df(df,words):
    """Count, per label, the rows whose text contains at least one of the given words.

    Matching is a case-sensitive substring test, and a row is counted at most
    once no matter how many of the words it contains.

    Args:
        df: DataFrame with ``text`` and ``label`` columns.
        words: Whitespace-separated words to look for.

    Returns:
        Dict mapping each label with at least one matching row to its number of
        matching rows, in order of first match.
    """
    needles = words.split()
    res_ = {}
    # Iterate the two columns directly; iterrows() builds a Series for every
    # row and is far slower on a frame of this size.
    for text, label in zip(df['text'], df['label']):
        if any(needle in text for needle in needles):
            res_[label] = res_.get(label, 0) + 1
    return res_


In [None]:
# Count, per label, the texts containing 'eat' or 'Eat' (case-sensitive substring match).
res_ = math_word_in_df(df,'eat Eat')

In [None]:
# Per-label match counts from the previous cell.
res_

{'COMEDY': 340,
 'POLITICS': 3077,
 'ENTERTAINMENT': 1540,
 'SPORTS': 517,
 'WELLNESS': 1707,
 'BUSINESS': 560,
 'TRAVEL': 856,
 'QUEER VOICES': 568,
 'HEALTHY LIVING': 1025}

In [None]:
input_text = 'eat apple loss weight'
# Alternative input to try:
# input_text = 'apple inc loss weight'
processed_data = prepare_data(input_text, tokenizer)
result,probs = make_prediction(model_loaded, processed_data=processed_data)
# The model predicts a news category (not a sentiment), so label the output accordingly.
# `probs` holds one probability per category id.
print(f"Predicted Category: {result} {probs}")

Predicted Sentiment: WELLNESS [0.01699809 0.00254215 0.00610758 0.00311848 0.5672853  0.01046719
 0.00338109 0.00354401 0.3865562 ]


In [None]:
# Zip the saved model and tokenizer directories so they can be downloaded.
# The paths are absolute Colab paths under /content.
!zip -r /content/result_zip /content/result
!zip -r /content/tokenizer_zip /content/tokenizer

  adding: content/result/ (stored 0%)
  adding: content/result/fingerprint.pb (stored 0%)
  adding: content/result/assets/ (stored 0%)
  adding: content/result/variables/ (stored 0%)
  adding: content/result/variables/variables.index (deflated 79%)
  adding: content/result/variables/variables.data-00000-of-00001 (deflated 14%)
  adding: content/result/saved_model.pb (deflated 92%)
  adding: content/result/keras_metadata.pb (deflated 96%)
  adding: content/tokenizer/ (stored 0%)
  adding: content/tokenizer/special_tokens_map.json (deflated 42%)
  adding: content/tokenizer/vocab.txt (deflated 53%)
  adding: content/tokenizer/tokenizer_config.json (deflated 45%)


In [None]:
# from google.colab import files
# files.download("/content/sentiment_model_zip.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Full text of the second training example.
df['text'].iloc[1]

'Maury Wills, Base-Stealing Shortstop For Dodgers, Dies At 89 Maury Wills, who helped the Los Angeles Dodgers win three World Series titles with his base-stealing prowess, has died.'

In [None]:
# The same example with its label and encoded category id.
df.iloc[1]

text           Maury Wills, Base-Stealing Shortstop For Dodge...
label                                                     SPORTS
category_id                                                    1
Name: 1, dtype: object

In [None]:
# Category-name -> id mapping, for reference.
category_to_id

{'COMEDY': 0,
 'SPORTS': 1,
 'ENTERTAINMENT': 2,
 'POLITICS': 3,
 'WELLNESS': 4,
 'BUSINESS': 5,
 'QUEER VOICES': 6,
 'TRAVEL': 7,
 'HEALTHY LIVING': 8}

In [None]:
# Id -> category-name mapping, for reference (this is the default `classes` of make_prediction).
id_to_category

{0: 'COMEDY',
 1: 'SPORTS',
 2: 'ENTERTAINMENT',
 3: 'POLITICS',
 4: 'WELLNESS',
 5: 'BUSINESS',
 6: 'QUEER VOICES',
 7: 'TRAVEL',
 8: 'HEALTHY LIVING'}