In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
from tqdm.notebook import tqdm_notebook

In [3]:
import nltk
from nltk.corpus import stopwords

In [4]:
train = pd.read_csv('train_val.csv')

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9921 entries, 0 to 9920
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      9921 non-null   object
 1   tweet   9921 non-null   object
 2   labels  9921 non-null   object
dtypes: object(3)
memory usage: 232.6+ KB


In [6]:
train.head()

Unnamed: 0,ID,tweet,labels
0,1296010336907038720t,@cath__kath AstraZeneca is made with the kidne...,ingredients
1,1336808189677940736t,It begins. Please find safe alternatives to th...,side-effect
2,1329488407307956231t,"@PaolaQP1231 Well, I mean congratulations Covi...",side-effect
3,1364194604459900934t,@BorisJohnson for those of us that do not wish...,mandatory
4,1375938799247765515t,She has been trying to speak out: writing lett...,side-effect rushed


In [7]:
train.isnull().sum()

ID        0
tweet     0
labels    0
dtype: int64

In [8]:
train.shape

(9921, 3)

In [9]:
tweet=train['tweet']

In [10]:
tweet

0       @cath__kath AstraZeneca is made with the kidne...
1       It begins. Please find safe alternatives to th...
2       @PaolaQP1231 Well, I mean congratulations Covi...
3       @BorisJohnson for those of us that do not wish...
4       She has been trying to speak out: writing lett...
                              ...                        
9916    Former Pfizer Chief Scientific Officer on Expe...
9917    @garygilligan Not what the manufacturer's are ...
9918    //That's a complete no for now on the Oxford/A...
9919    Opinion: Vaccine side effects..- possible to h...
9920    @BorisJohnson resign now before you destroy th...
Name: tweet, Length: 9921, dtype: object

In [11]:
def remove_words(dataset):
    cleaned_tweet=[]
    for text in dataset:
        words=text.split()
        cleaned_words=[]
        for word in words:
            if not word.startswith('@'):
                cleaned_words.append(word);
        cleaned_text = ' '.join(cleaned_words)
        cleaned_tweet.append(cleaned_text)
    return cleaned_tweet

In [12]:
tweets=remove_words(tweet)

In [13]:
tweets

['AstraZeneca is made with the kidney cells of a little girl aborted back in the 70s.',
 'Well, I mean congratulations Covid19 for being the first ever “thing” to eradicate influenza. In other news, Covid vaccines will spur the rise of influenza In 2021-2022 season. Influenza will be returning for a shot at the title belt. Nov 2, 2021 on pay per view. Order today.',
 'for those of us that do not wish a vaccine so will not be given a vaccine passport to go abroad, how do you intend to deal with us, your objective I suppose is to push us to one side and penalise us, this is your great and fare justice system you talk about.',
 'She has been trying to speak out: writing letters to government, speaking to Unison. She’s been questioning everything that’s been going on for a long time: “Vaccine rollout, vaccine injury. Realising it’s an experimental agent. It’s going to keep going,”... continue thread....',
 '😕 I\'m confused: 💉 WITH the vaccine, we\'re still required to socially distance, we

In [14]:
train['tweet']=tweets

In [15]:
train.head()

Unnamed: 0,ID,tweet,labels
0,1296010336907038720t,AstraZeneca is made with the kidney cells of a...,ingredients
1,1336808189677940736t,It begins. Please find safe alternatives to th...,side-effect
2,1329488407307956231t,"Well, I mean congratulations Covid19 for being...",side-effect
3,1364194604459900934t,for those of us that do not wish a vaccine so ...,mandatory
4,1375938799247765515t,She has been trying to speak out: writing lett...,side-effect rushed


In [16]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [17]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [18]:
from nltk import word_tokenize
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [19]:
import nltk
nltk.download('omw-1.4')
def clean(text):

  cleanr = re.compile('<[^>]*>')           # remove html
  cleantext = re.sub(cleanr, ' ', text)

  cleantext = re.sub("[-]", " " , cleantext)   # remove - sign

  cleantext = re.sub("[^A-Za-z0-9 ]", " " , cleantext)  # remove evey character except alphabet
  cleantext = cleantext.lower()

  words = nltk.tokenize.word_tokenize(cleantext)
  words_new = [i for i in words if i not in stop_words]

  w = [lemmatizer.lemmatize(word) for word in words_new if len(word)>2]

  return ' '.join(w)

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [20]:
train['tweet'] = tqdm_notebook(train['tweet'].apply(clean))

  0%|          | 0/9921 [00:00<?, ?it/s]

In [21]:
train.head(20)

Unnamed: 0,ID,tweet,labels
0,1296010336907038720t,astrazeneca made kidney cell little girl abort...,ingredients
1,1336808189677940736t,begin please find safe alternative vaccine iss...,side-effect
2,1329488407307956231t,well mean congratulation covid19 first ever th...,side-effect
3,1364194604459900934t,wish vaccine given vaccine passport abroad int...,mandatory
4,1375938799247765515t,trying speak writing letter government speakin...,side-effect rushed
5,1361038049556140034t,confused vaccine still required socially dista...,ineffective mandatory
6,1365287445114322946t,need crazy twisted politician telling whether ...,political
7,1364157842022891520t,respect natalie view even differs mine genuine...,mandatory
8,1413367208537989123t,taking one team teamcanada new astra seen just...,side-effect
9,1405695355996426240t,canadian received astrazeneca vaccine excluded...,side-effect ineffective


In [22]:
train['labels'].value_counts

<bound method IndexOpsMixin.value_counts of 0              ingredients
1              side-effect
2              side-effect
3                mandatory
4       side-effect rushed
               ...        
9916           side-effect
9917                pharma
9918                  none
9919           side-effect
9920             political
Name: labels, Length: 9921, dtype: object>

In [23]:
train['labels'] = train['labels'].str.split()

In [24]:
train.head()

Unnamed: 0,ID,tweet,labels
0,1296010336907038720t,astrazeneca made kidney cell little girl abort...,[ingredients]
1,1336808189677940736t,begin please find safe alternative vaccine iss...,[side-effect]
2,1329488407307956231t,well mean congratulation covid19 first ever th...,[side-effect]
3,1364194604459900934t,wish vaccine given vaccine passport abroad int...,[mandatory]
4,1375938799247765515t,trying speak writing letter government speakin...,"[side-effect, rushed]"


In [25]:
from sklearn.preprocessing import MultiLabelBinarizer

In [26]:
mlb=MultiLabelBinarizer()

In [28]:
labels=mlb.fit_transform(train['labels'])

In [29]:
train

Unnamed: 0,ID,tweet,labels
0,1296010336907038720t,astrazeneca made kidney cell little girl abort...,[ingredients]
1,1336808189677940736t,begin please find safe alternative vaccine iss...,[side-effect]
2,1329488407307956231t,well mean congratulation covid19 first ever th...,[side-effect]
3,1364194604459900934t,wish vaccine given vaccine passport abroad int...,[mandatory]
4,1375938799247765515t,trying speak writing letter government speakin...,"[side-effect, rushed]"
...,...,...,...
9916,1388469392866938880t,former pfizer chief scientific officer experim...,[side-effect]
9917,1352957607393300485t,manufacturer saying manufacturer recommendatio...,[pharma]
9918,1357484621542268928t,complete oxford astrazeneca vaccine swissmedic...,[none]
9919,1371121610057388037t,opinion vaccine side effect possible penicilli...,[side-effect]


In [30]:
labels

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [31]:
import numpy as np

In [32]:
np.shape(labels)

(9921, 12)

In [33]:
train['labels']=labels

In [37]:
!pip install Keras-Preprocessing
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

Collecting Keras-Preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Keras-Preprocessing
Successfully installed Keras-Preprocessing-1.1.2


In [38]:
import tensorflow as tf
import tensorflow.keras.layers as layers

model = Sequential()
model.add(Embedding(input_dim=30000, output_dim=20, input_length=20))
model.add(LSTM(units=100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(units=1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [39]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=30000)
tokenizer.fit_on_texts(train['tweet'])
sequences = tokenizer.texts_to_sequences(train['tweet'])
padded_sequences = pad_sequences(sequences, maxlen=20)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(padded_sequences, train['labels'], epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f01b4522620>

In [40]:
from keras.layers import Dense, LSTM, Embedding, Bidirectional

In [41]:
model.add(Embedding(input_dim=30000, output_dim=20, input_length=20))
model.add(Bidirectional(LSTM(units=100, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(units=1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [42]:
tokenizer = Tokenizer(num_words=30000)
tokenizer.fit_on_texts(train['tweet'])
sequences = tokenizer.texts_to_sequences(train['tweet'])
padded_sequences = pad_sequences(sequences, maxlen=20)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(padded_sequences, train['labels'], epochs=10, batch_size=32)

Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f01af461780>