In [None]:
import html
import re

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm.notebook import tqdm_notebook


In [None]:
!pip install transformers



In [None]:
from transformers import BertTokenizer, TFBertModel

In [None]:
# Load the labelled tweets; later cells read its 'tweet' and 'labels' columns.
dataset = pd.read_csv('train_val.csv')

In [None]:
def remove_words(dataset):
    """Strip @-mention tokens from every text in ``dataset``.

    Each text is split on whitespace, tokens that start with '@' are
    dropped, and the remaining tokens are re-joined with single spaces.

    Args:
        dataset: Iterable of strings.

    Returns:
        list[str]: One cleaned string per input text, in input order.
    """
    return [
        ' '.join(token for token in text.split() if not token.startswith('@'))
        for text in dataset
    ]

In [None]:
# Select the tweet text column.
tweet=dataset['tweet']

In [None]:
# Drop @-mention tokens from every tweet; remove_words returns a plain list.
tweets=remove_words(tweet)

In [None]:
# Write the mention-free tweets back over the original column.
dataset['tweet']=tweets

In [None]:
import nltk
from nltk.corpus import stopwords

In [None]:
# Fetch the NLTK stopword corpus and keep the English list as a set;
# clean() tests token membership against it.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from nltk.stem import WordNetLemmatizer

# Shared lemmatizer instance used by clean().
lemmatizer = WordNetLemmatizer()

In [None]:
from nltk import word_tokenize
# NLTK resources for word_tokenize and WordNetLemmatizer, both used in clean().
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
import nltk
# Additional NLTK resource download, performed before clean() is defined.
nltk.download('omw-1.4')
def clean(text):
    """Normalise one raw tweet into a space-separated string of lemmas.

    Steps, in order:
      1. replace HTML tags with a space;
      2. decode HTML entities (``&amp;``, ``&gt;`` ...) so that entity names
         do not survive as tokens;
      3. drop URLs, which would otherwise be shredded into tokens such as
         ``http`` plus a short-link id;
      4. replace every character outside ``[A-Za-z0-9 ]`` with a space and
         lower-case the result;
      5. tokenize, remove English stopwords, keep tokens longer than two
         characters and lemmatize them.

    Relies on the notebook-level ``stop_words`` set and ``lemmatizer``.

    Args:
        text: Raw tweet text (str).

    Returns:
        str: The cleaned tokens joined by single spaces.
    """
    cleantext = re.sub(r'<[^>]*>', ' ', text)
    # Tags are stripped before decoding so a decoded '<' or '>' can never
    # start or end a bogus tag match.
    cleantext = html.unescape(cleantext)
    cleantext = re.sub(r'(?i)(?:https?://|www\.)\S+', ' ', cleantext)

    # Hyphens fall under this class too, so no separate hyphen pass is needed.
    cleantext = re.sub(r'[^A-Za-z0-9 ]', ' ', cleantext).lower()

    words = nltk.tokenize.word_tokenize(cleantext)
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in words
        if word not in stop_words and len(word) > 2
    )

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# Register progress_apply on pandas objects so the bar advances while clean()
# runs; wrapping the finished .apply() result in tqdm only rendered an idle bar
# and assigned a tqdm object to the column.
tqdm_notebook.pandas(desc='cleaning tweets')
dataset['tweet'] = dataset['tweet'].progress_apply(clean)

  0%|          | 0/9921 [00:00<?, ?it/s]

In [None]:
dataset

Unnamed: 0,ID,tweet,labels
0,1296010336907038720t,astrazeneca made kidney cell little girl abort...,ingredients
1,1336808189677940736t,begin please find safe alternative vaccine iss...,side-effect
2,1329488407307956231t,well mean congratulation covid19 first ever th...,side-effect
3,1364194604459900934t,wish vaccine given vaccine passport abroad int...,mandatory
4,1375938799247765515t,trying speak writing letter government speakin...,side-effect rushed
...,...,...,...
9916,1388469392866938880t,former pfizer chief scientific officer experim...,side-effect
9917,1352957607393300485t,manufacturer saying manufacturer recommendatio...,pharma
9918,1357484621542268928t,complete oxford astrazeneca vaccine swissmedic...,none
9919,1371121610057388037t,opinion vaccine side effect possible penicilli...,side-effect


In [None]:
# Turn the whitespace-separated label string into a list of labels per tweet.
# NOTE(review): the column is overwritten in place, so re-running this cell on
# its own would act on lists rather than strings — confirm before re-executing.
dataset['labels'] = dataset['labels'].str.split()

In [None]:
dataset

Unnamed: 0,ID,tweet,labels
0,1296010336907038720t,astrazeneca made kidney cell little girl abort...,[ingredients]
1,1336808189677940736t,begin please find safe alternative vaccine iss...,[side-effect]
2,1329488407307956231t,well mean congratulation covid19 first ever th...,[side-effect]
3,1364194604459900934t,wish vaccine given vaccine passport abroad int...,[mandatory]
4,1375938799247765515t,trying speak writing letter government speakin...,"[side-effect, rushed]"
...,...,...,...
9916,1388469392866938880t,former pfizer chief scientific officer experim...,[side-effect]
9917,1352957607393300485t,manufacturer saying manufacturer recommendatio...,[pharma]
9918,1357484621542268928t,complete oxford astrazeneca vaccine swissmedic...,[none]
9919,1371121610057388037t,opinion vaccine side effect possible penicilli...,[side-effect]


In [None]:
# Tokenizer for the same 'bert-base-uncased' checkpoint that the model cell loads.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def tokenize_tweets(text, max_length=256):
    """Encode a collection of tweets into fixed-length BERT input tensors.

    The whole collection is tokenized in one batched call instead of one
    ``encode_plus`` call per tweet followed by ``tf.concat``, which avoids
    creating and stitching together one tiny tensor per tweet.

    Uses the notebook-level ``tokenizer``.

    Args:
        text: Iterable of tweet strings (e.g. a pandas Series).
        max_length: Sequence length every tweet is padded or truncated to.
            Must match the length declared on the model's ``tf.keras.Input``
            layers (256 in this notebook).

    Returns:
        dict: ``'input_ids'``, ``'attention_mask'`` and ``'token_type_ids'``,
        each a TensorFlow tensor with one row per tweet and ``max_length``
        columns.
    """
    encoded = tokenizer(
        list(text),  # the tokenizer expects a plain list of str, not a Series
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors='tf',
    )
    return {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
        'token_type_ids': encoded['token_type_ids'],
    }

In [None]:
# Encode every cleaned training tweet into the three tensors the model takes as inputs.
train_tokens = tokenize_tweets(dataset['tweet'])

In [None]:

# Load the pretrained 'bert-base-uncased' encoder that the classification head is built on.
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [None]:
# Symbolic inputs for the three tokenizer outputs.
# The length 256 mirrors the max_length of 256 used in tokenize_tweets.
input_ids = tf.keras.Input(shape=(256,), dtype=tf.int32)
attention_mask = tf.keras.Input(shape=(256,), dtype=tf.int32)
token_type_ids = tf.keras.Input(shape=(256,), dtype=tf.int32)


In [None]:
# Multi-hot encode the per-tweet label lists. `mlb` is kept because later cells
# use it to map predicted indicator rows back to label names.
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform(dataset['labels'])

In [None]:
train_labels.shape

(9921, 12)

In [None]:
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Dropout

# Hyper-parameters of the classification head.
dropout_rate = 0.5
weight_decay = 0.001

# First element of the BERT output, called on the three symbolic inputs.
bert_output = bert_model(
    input_ids,
    attention_mask=attention_mask,
    token_type_ids=token_type_ids,
)[0]

# Take position 0 along the sequence axis, apply dropout, then one
# L2-regularised sigmoid unit per label class.
dropout_layer = Dropout(rate=dropout_rate)(bert_output[:, 0, :])
output = tf.keras.layers.Dense(
    len(mlb.classes_),
    activation='sigmoid',
    kernel_regularizer=regularizers.l2(weight_decay),
)(dropout_layer)

In [None]:
# NOTE(review): this runs after bert_model and the Dense head were already
# created in earlier cells; Keras documents that the global policy should be set
# before layers are constructed — confirm it has the intended effect here.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

In [None]:
# Assemble the end-to-end model and configure it for multi-label training:
# sigmoid outputs scored with binary cross-entropy.
model = tf.keras.Model(
    inputs=[input_ids, attention_mask, token_type_ids],
    outputs=output,
)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss='binary_crossentropy',
    metrics=['binary_accuracy'],
)

In [None]:
# Order matches the `inputs` list of the Keras model: ids, attention mask, token types.
train_inputs = [train_tokens['input_ids'], train_tokens['attention_mask'], train_tokens['token_type_ids']]

In [None]:
# Fine-tune on the full training set.
# NOTE(review): no validation_data / validation_split is passed, so the 10
# epochs run without any held-out monitoring — confirm this is intended.
model.fit(train_inputs, train_labels, batch_size=16, epochs=10)

Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x789afa3b5480>

In [None]:
# Load the unlabelled test tweets; a 'labels' column is added to this frame further down.
test_data= pd.read_csv('test.csv')

In [None]:
# Apply the same preprocessing as the training tweets: strip @-mentions first,
# then clean(). Previously the test tweets skipped remove_words, so mention
# handles reached the model at inference time but never during training.
# progress_apply drives the bar while clean() runs instead of wrapping an
# already-finished result in tqdm.
tqdm_notebook.pandas(desc='cleaning test tweets')
test_data['tweet'] = pd.Series(
    remove_words(test_data['tweet']), index=test_data.index
).progress_apply(clean)

  0%|          | 0/486 [00:00<?, ?it/s]

In [None]:
# Encode the cleaned test tweets with the same tokenizer settings as the training set.
test_tokens = tokenize_tweets(test_data['tweet'])

In [None]:
# Same input order as train_inputs: ids, attention mask, token types.
test_inputs_bert = [test_tokens['input_ids'], test_tokens['attention_mask'], test_tokens['token_type_ids']]

In [None]:
# NOTE(review): this result is only displayed; the next cell runs the same
# prediction again and stores it in `data`.
model.predict(test_inputs_bert)

In [None]:
# Sigmoid scores for the test tweets: one row per tweet, one column per label class.
data = model.predict(test_inputs_bert)



In [None]:
data

array([[1.2121243e-02, 7.8991298e-03, 9.6684955e-03, ..., 4.7059353e-03,
        9.2838228e-01, 1.8168969e-02],
       [1.1034927e-03, 5.7126174e-04, 1.8776801e-03, ..., 4.3797729e-04,
        9.9901211e-01, 1.1676891e-03],
       [1.1825699e-03, 5.0597382e-04, 3.2000244e-03, ..., 3.3555398e-04,
        9.9864072e-01, 1.0001975e-03],
       ...,
       [8.7287679e-04, 1.1448642e-03, 1.7388174e-02, ..., 1.1960371e-04,
        9.2777771e-01, 3.9049145e-04],
       [3.6821631e-03, 8.6431584e-04, 4.9905773e-02, ..., 1.3508547e-04,
        6.1460968e-02, 4.5218741e-04],
       [7.6812301e-03, 2.0955510e-03, 3.1805101e-03, ..., 1.3309509e-04,
        9.6614140e-01, 5.6818069e-04]], dtype=float32)

In [None]:
data[0]

array([0.01212124, 0.00789913, 0.0096685 , 0.0209371 , 0.00200069,
       0.0050014 , 0.00359124, 0.00306634, 0.8190514 , 0.00470594,
       0.9283823 , 0.01816897], dtype=float32)

In [None]:
# Decision threshold applied to each per-class sigmoid score.
threshold = 0.5

binary_data = (data >= threshold).astype(int)

# A tweet whose scores all fall below the threshold would otherwise receive no
# label at all (an empty string in the exported file); give such rows their
# single highest-scoring class instead.
no_label_rows = np.flatnonzero(binary_data.sum(axis=1) == 0)
binary_data[no_label_rows, data[no_label_rows].argmax(axis=1)] = 1

In [None]:
binary_data[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0])

In [None]:
print(binary_data)

[[0 0 0 ... 0 1 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 1 0]
 ...
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]]


In [None]:
# Map each 0/1 indicator row back to a tuple of label names using the fitted binarizer.
activated_classes = mlb.inverse_transform(binary_data)

In [None]:
# Label names in the column order used by the binarized label matrix.
mlb_classes = mlb.classes_

print(mlb_classes)





['conspiracy' 'country' 'ineffective' 'ingredients' 'mandatory' 'none'
 'pharma' 'political' 'religious' 'rushed' 'side-effect' 'unnecessary']


In [None]:
activated_classes

[('religious', 'side-effect'),
 ('side-effect',),
 ('side-effect',),
 ('side-effect',),
 ('side-effect',),
 ('side-effect',),
 ('country', 'side-effect'),
 ('side-effect',),
 ('side-effect',),
 ('side-effect',),
 ('ineffective',),
 ('ineffective',),
 ('side-effect',),
 ('pharma',),
 ('side-effect',),
 ('ingredients', 'side-effect'),
 ('side-effect',),
 ('ineffective', 'side-effect'),
 ('side-effect',),
 ('mandatory',),
 ('ineffective',),
 ('ineffective',),
 ('ineffective', 'side-effect'),
 ('none',),
 ('ineffective',),
 ('ineffective',),
 ('ineffective', 'side-effect'),
 ('ineffective', 'side-effect'),
 ('ineffective', 'side-effect'),
 (),
 ('pharma',),
 ('pharma',),
 ('pharma',),
 ('country',),
 ('pharma',),
 ('side-effect',),
 ('side-effect',),
 ('pharma',),
 ('pharma',),
 ('side-effect',),
 ('side-effect',),
 ('none',),
 ('none',),
 ('none',),
 ('side-effect',),
 ('none',),
 ('side-effect',),
 ('side-effect', 'unnecessary'),
 ('ineffective',),
 ('ineffective',),
 ('mandatory', 'side

In [None]:
# Attach the predicted label tuples to the test frame, one tuple per row.
test_data['labels']=activated_classes

In [None]:
# Flatten each tuple into a single space-separated string, the format the
# training file's 'labels' column had before it was split.
test_data['labels'] = test_data['labels'].apply(lambda x: ' '.join(map(str, x)))

In [None]:
test_data

Unnamed: 0,id,tweet,labels
0,1070378532260470789t,study link hpv vaccine historically high infer...,religious side-effect
1,973746711964372993t,death tainted measles vaccine affecting anti p...,side-effect
2,1043031076787040257t,apreciat videoclip youtube http uknvkypp3w tre...,side-effect
3,1066338147527741440t,video mmr vaccine increase risk autism african...,side-effect
4,963522018544152576t,oral polio vaccine infecting unvaccinated kid ...,side-effect
...,...,...,...
481,1099937642169405440t,know hard believe another vaccine scare story ...,none
482,1536674508731518985t,seizure day mmr vaccine tale normal child diag...,side-effect
483,1183314494874968064t,kenyan doctor say unicef making woman barren p...,side-effect
484,1327581896243556352t,experience vaccine 60 school giving sugar cube...,pharma


In [None]:
# Re-read the raw test file: test_data['tweet'] was overwritten with cleaned text earlier.
test_tweet= pd.read_csv('test.csv')

In [None]:
# Put the original, uncleaned tweet text back next to the predicted labels.
test_data['tweet']=test_tweet['tweet']

In [None]:
test_data

Unnamed: 0,id,tweet,labels
0,1070378532260470789t,Study Links HPV Vaccine to Historically High I...,religious side-effect
1,973746711964372993t,Deaths from tainted measles vaccine affecting ...,side-effect
2,1043031076787040257t,"Am apreciat un videoclip pe @YouTube, https://...",side-effect
3,1066338147527741440t,VIDEO --&gt;&gt; MMR #Vaccine increase risk of...,side-effect
4,963522018544152576t,Oral Polio Vaccine: Infecting Unvaccinated Kid...,side-effect
...,...,...,...
481,1099937642169405440t,I know it's hard to believe... But another vac...,none
482,1536674508731518985t,Seizures on the Very Day of the MMR Vaccine: T...,side-effect
483,1183314494874968064t,Kenyan Doctors Say UNICEF Is Making Women Barr...,side-effect
484,1327581896243556352t,Not doing it. My experience with vaccines? In ...,pharma


In [None]:
# Export the test frame with its predicted labels; the DataFrame index is not written.
test_data.to_csv('test_data.csv', index=False)