In [51]:
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tqdm.notebook import tqdm_notebook


In [52]:
!pip install transformers



In [53]:
!pip install sentencepiece



In [54]:
from transformers import XLNetTokenizer, TFXLNetModel


In [55]:
# Load the labelled tweets; later cells read the 'tweet' and 'labels' columns.
# NOTE(review): the displayed frame further down has an 'Unnamed: 0' column,
# which looks like a previously saved index — consider index_col=0; confirm.
dataset = pd.read_csv('train_val.csv')

In [56]:
def remove_words(dataset):
    """Strip @-mention tokens from every text in ``dataset``.

    Each text is split on whitespace, tokens that start with ``'@'`` are
    dropped, and the remaining tokens are re-joined with single spaces
    (so runs of whitespace are collapsed as a side effect).

    Args:
        dataset: Iterable of tweet texts (e.g. a list or a pandas Series).
            Entries that are not ``str`` -- ``None`` or the float ``NaN``
            pandas uses for missing cells -- are treated as empty text
            instead of raising ``AttributeError``.

    Returns:
        list[str]: One cleaned string per input entry, in input order.
    """
    cleaned_tweet = []
    for text in dataset:
        if not isinstance(text, str):
            # Missing cell: keep the row (so positions still line up with
            # the source column) but give it empty text.
            cleaned_tweet.append('')
            continue
        kept_words = [word for word in text.split() if not word.startswith('@')]
        cleaned_tweet.append(' '.join(kept_words))
    return cleaned_tweet

In [57]:
# Raw tweet text column, before any cleaning.
tweet=dataset['tweet']

In [58]:
# Drop @-mention tokens; the result is a plain list with one string per row.
tweets=remove_words(tweet)

In [59]:
# Overwrite the raw text in place with the mention-free strings; from this
# cell on, dataset['tweet'] no longer holds the original tweets.
dataset['tweet']=tweets

In [60]:
import nltk
from nltk.corpus import stopwords

In [61]:
# Fetch the NLTK stop-word corpus and keep the English list as a set; clean()
# tests token membership against it.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [62]:
from nltk.stem import WordNetLemmatizer

# Single shared lemmatizer instance; clean() calls lemmatizer.lemmatize on each kept token.
lemmatizer = WordNetLemmatizer()

In [63]:
from nltk import word_tokenize
# NLTK data packages: 'punkt' for word_tokenize and 'wordnet' for WordNetLemmatizer.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [64]:
import nltk
# Open Multilingual WordNet data — presumably required by WordNetLemmatizer
# on this NLTK version; TODO confirm it is still needed.
nltk.download('omw-1.4')
def clean(text):
    """Normalise one tweet into a space-joined string of lemmas.

    Steps, in order: replace HTML-like tags with a space, turn hyphens into
    spaces, blank out every character that is not an ASCII letter, digit or
    space, lowercase, tokenize with NLTK, drop English stop words, drop
    tokens of two characters or fewer, and lemmatize what is left.

    Relies on the notebook-level ``stop_words`` set and ``lemmatizer``.

    Args:
        text: The tweet text to normalise.

    Returns:
        str: The surviving lemmas joined by single spaces (possibly empty).
    """
    without_tags = re.sub(re.compile('<[^>]*>'), ' ', text)
    without_hyphens = re.sub("[-]", " ", without_tags)
    alnum_only = re.sub("[^A-Za-z0-9 ]", " ", without_hyphens)

    lemmas = []
    for token in nltk.tokenize.word_tokenize(alnum_only.lower()):
        if token in stop_words:
            continue
        # Very short tokens are discarded rather than lemmatized.
        if len(token) > 2:
            lemmas.append(lemmatizer.lemmatize(token))

    return ' '.join(lemmas)

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [65]:
# Register tqdm's pandas integration so the bar advances while clean() runs.
# Wrapping the finished result of .apply() in tqdm_notebook cannot track that
# work, and it assigns a tqdm object to the column instead of a Series.
tqdm_notebook.pandas(desc='Cleaning tweets')
dataset['tweet'] = dataset['tweet'].progress_apply(clean)

  0%|          | 0/9921 [00:00<?, ?it/s]

In [66]:
# Split each whitespace-separated label string into a list of label names.
# The isinstance guard makes the cell safe to re-run: calling .str.split() on
# a column that already holds lists would turn every row into NaN.
dataset['labels'] = dataset['labels'].apply(
    lambda labels: labels.split() if isinstance(labels, str) else labels
)

In [67]:
dataset

Unnamed: 0,ID,tweet,labels
0,1296010336907038720t,astrazeneca made kidney cell little girl abort...,[ingredients]
1,1336808189677940736t,begin please find safe alternative vaccine iss...,[side-effect]
2,1329488407307956231t,well mean congratulation covid19 first ever th...,[side-effect]
3,1364194604459900934t,wish vaccine given vaccine passport abroad int...,[mandatory]
4,1375938799247765515t,trying speak writing letter government speakin...,"[side-effect, rushed]"
...,...,...,...
9916,1388469392866938880t,former pfizer chief scientific officer experim...,[side-effect]
9917,1352957607393300485t,manufacturer saying manufacturer recommendatio...,[pharma]
9918,1357484621542268928t,complete oxford astrazeneca vaccine swissmedic...,[none]
9919,1371121610057388037t,opinion vaccine side effect possible penicilli...,[side-effect]


In [68]:
# Tokenizer for the same 'xlnet-base-cased' checkpoint that the model cell loads.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

In [69]:
def tokenize_tweets(text, max_length=300):
    """Tokenize tweets into fixed-length model inputs.

    Uses the notebook-level ``tokenizer``.

    Args:
        text: Iterable of tweet strings (e.g. a list or a pandas Series).
        max_length: Length every sequence is padded or truncated to. The
            default of 300 matches the ``shape=(300,)`` Keras inputs defined
            for the model; change both together.

    Returns:
        dict: ``'input_ids'``, ``'attention_mask'`` and ``'token_type_ids'``,
        each a TensorFlow tensor with one row per tweet and ``max_length``
        columns.
    """
    # One batched tokenizer call replaces the per-tweet encode_plus followed
    # by tf.concat: it yields the same stacked tensors without creating and
    # concatenating thousands of 1-row tensors.
    encoded = tokenizer(
        list(text),
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors='tf'
    )

    # Return a plain dict (not the tokenizer's own container type) so callers
    # keep getting exactly the three keys they index.
    return {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
        'token_type_ids': encoded['token_type_ids']
    }

In [70]:

# Tokenize the cleaned training tweets; the result is a dict of three tensors
# keyed 'input_ids', 'attention_mask' and 'token_type_ids'.
train_tokens = tokenize_tweets(dataset['tweet'])


In [73]:
# Pretrained XLNet encoder. Despite the variable name, this is TFXLNetModel,
# not a BERT model.
bert_model = TFXLNetModel.from_pretrained('xlnet-base-cased')

InvalidArgumentError: ignored

In [None]:
# Symbolic Keras inputs, one per tensor returned by tokenize_tweets. The
# length 300 must stay equal to the max_length used when tokenizing.
input_ids = tf.keras.Input(shape=(300,), dtype=tf.int32)
attention_mask = tf.keras.Input(shape=(300,), dtype=tf.int32)
token_type_ids = tf.keras.Input(shape=(300,), dtype=tf.int32)


In [None]:
# Encode each row's list of label names as a binary indicator row.
# mlb.classes_ (the fitted label vocabulary) sizes the output layer later.
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform(dataset['labels'])

In [None]:
from tensorflow.keras.layers import Dropout
from tensorflow.keras import regularizers

# [0] selects the encoder's per-token hidden states.
bert_output = bert_model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)[0]
weight_decay = 0.001
dropout_rate=0.5

# Unlike BERT, XLNet appends its <cls> summary token at the END of the
# sequence, and its tokenizer pads on the left. Position 0 is therefore
# normally a <pad> token; the sequence summary lives at the last position.
pooled_output = bert_output[:, -1, :]
dropout_layer = Dropout(rate=dropout_rate)(pooled_output)

# One sigmoid unit per label: each label is predicted independently.
output = tf.keras.layers.Dense(len(mlb.classes_), activation='sigmoid',kernel_regularizer=regularizers.l2(weight_decay))(dropout_layer)

In [None]:
# NOTE(review): a Keras dtype policy only applies to layers constructed after
# it is set. The encoder and the classification head are built in earlier
# cells, so this call likely has no effect on them — confirm, and if mixed
# precision is wanted, run this before the model is constructed.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

In [None]:
# Wire the three token inputs to the sigmoid head and compile for multi-label
# classification: binary cross-entropy is applied to each label output.
model = tf.keras.Model(inputs=[input_ids, attention_mask, token_type_ids], outputs=output)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4), loss='binary_crossentropy',metrics=['binary_accuracy'])

In [None]:
# Order must match the inputs=[input_ids, attention_mask, token_type_ids]
# list given to tf.keras.Model.
train_inputs = [train_tokens['input_ids'], train_tokens['attention_mask'], train_tokens['token_type_ids']]

In [None]:
# Trains on every row of `dataset`. No validation data is passed, so no
# held-out metric is reported during training.
model.fit(train_inputs, train_labels, batch_size=16, epochs=10)

In [None]:
# Load the test tweets; the cells below read only the 'tweet' column.
test_data= pd.read_csv('test.csv')

In [None]:
# Run the test tweets through the same preprocessing as the training tweets
# (remove_words, then clean) so the model sees identically prepared text;
# tokenizing the raw tweets would feed it mentions, stop words and casing it
# never saw during training.
test_tweets_clean = [clean(tweet) for tweet in remove_words(test_data['tweet'])]
test_tokens = tokenize_tweets(test_tweets_clean)

In [None]:
# Same input order as the inputs=[...] list given to tf.keras.Model.
test_inputs_bert = [test_tokens['input_ids'], test_tokens['attention_mask'], test_tokens['token_type_ids']]

In [None]:
# Sigmoid outputs of the head, one row per test tweet and one column per
# entry of mlb.classes_. These are scores in [0, 1], not thresholded labels.
data = model.predict(test_inputs_bert)
