# Live Tweets Disaster Analysis with BERT

Copyright @ 2020 **ABCOM Information Systems Pvt. Ltd.** All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

See the License for the specific language governing permissions and limitations under the License.

## Install transformers


In [None]:
!pip install transformers

## Import necessary packages

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
from tokenizers import BertWordPieceTokenizer
from tqdm.notebook import tqdm
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import backend as K
import transformers
from transformers import TFAutoModel, AutoTokenizer
import matplotlib.pyplot as plt

## Configure TPU

In [None]:
# Try to locate a TPU. A ValueError raised inside the try block is treated as
# "no TPU available" and leaves `tpu` set to None.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver() 
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

# With a TPU: connect to it, initialise it and build a TPUStrategy.
# Without one: use whatever strategy tf.distribute.get_strategy() returns.
if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()

## Get tweets training data from GitHub 
This data was originally posted on Kaggle

In [None]:
!wget https://raw.githubusercontent.com/abcom-mltutorials/Live-Tweets-Disaster-Analysis-/master/train.csv?raw=true

In [None]:
# load the data with pandas
# wget stores the download in the current working directory under the last
# segment of the URL, so a relative path is used instead of Colab's
# /content/ directory; the notebook then also runs outside Colab.
train = pd.read_csv('train.csv?raw=true')

The goal is to classify tweets as real disaster (target=1) or no disaster (target=0) with the help of a BERT transformer.

In [None]:
# preview the first rows of the loaded frame
train.head()

In [None]:
# (number of rows, number of columns)
train.shape

In [None]:
# number of missing values in each column
train.isnull().sum()

Checking which keywords are most commonly found in the tweets.

In [None]:
# Collect the keyword of every row. Rows without a keyword are skipped so
# that NaN is not counted (and plotted) as if it were a keyword, and the
# column is converted in one vectorised call instead of a Python loop.
keyword_combined = train['keyword'].dropna().tolist()

In [None]:
import collections
# map each keyword to the number of times it occurs in keyword_combined
keyword_counters = collections.Counter(keyword_combined) 

In [None]:
 # keep only the 15 most frequent keywords, as a frame with the columns
 # 'keyword' and 'count'
 keyword_with_counts = pd.DataFrame(keyword_counters.most_common(15), 
                             columns=['keyword', 'count'])

In [None]:
fig, ax = plt.subplots(figsize=(8, 8))

# Horizontal bar chart of the keyword frequencies. Sorting ascending by
# count puts the most frequent keyword at the top of the chart.
keyword_with_counts.sort_values(by='count').plot.barh(x='keyword',
                      y='count',
                      ax=ax,
                      color="purple")

# This chart shows the `keyword` column, not words taken from the tweet text,
# so the title says "Keywords".
ax.set_title("Common Keywords Found in Tweets")
ax.set_xlabel("count")

plt.show()

## Removing unwanted columns

In [None]:
# dropping id, location, keyword column
# Reassigning (instead of inplace=True) and ignoring columns that are already
# gone keeps this cell re-runnable: a second run no longer raises KeyError.
train = train.drop(columns=['id', 'location', 'keyword'], errors='ignore')

In [None]:
# class balance: number of tweets for each target value
train['target'].value_counts()

## Cleaning the data
Install clean-text to clean the tweets, which may contain URLs, numbers, etc. that are not helpful for our model.

In [None]:
!pip install clean-text[gpl]

In [None]:
from cleantext import clean

In [None]:
def text_cleaning(text):
    """Return `text` after normalising it with cleantext's ``clean``.

    Every replacement string is empty, so URLs, e-mail addresses, phone
    numbers, numbers, digits and currency symbols are removed rather than
    replaced by a placeholder token.
    """
    cleaning_options = dict(
        fix_unicode=True,            # fix various unicode errors
        to_ascii=True,               # transliterate to closest ASCII representation
        lower=True,                  # lowercase text
        no_line_breaks=True,         # fully strip line breaks
        no_urls=True,                # URLs -> replace_with_url
        no_emails=True,              # e-mail addresses -> replace_with_email
        no_phone_numbers=True,       # phone numbers -> replace_with_phone_number
        no_numbers=True,             # numbers -> replace_with_number
        no_digits=True,              # digits -> replace_with_digit
        no_currency_symbols=True,    # currency symbols -> replace_with_currency_symbol
        no_punct=True,               # fully remove punctuation
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="",
        replace_with_digit="",
        replace_with_currency_symbol="",
        lang="en",                   # set to 'en' for English
    )
    return clean(text, **cleaning_options)

In [None]:
# Clean every tweet. The whole column is assigned at once: the chained form
# train['text'].iloc[i] = ... may write to a temporary copy and leave `train`
# unchanged (pandas warns about it with SettingWithCopyWarning).
train['text'] = train['text'].map(text_cleaning)

Removing stopwords

In [None]:
import nltk
from nltk.corpus import stopwords
# download the stopword corpus that stopwords.words() reads on the next line
nltk.download('stopwords')
# English stopwords, used below to filter the words of each tweet
stoplist = stopwords.words('english')

In [None]:
# Split each cleaned tweet into words and drop the stopwords, so every row of
# train['text'] becomes a list of words.
# - The column is assigned as a whole; the chained form
#   train['text'].iloc[i] = ... may write to a temporary copy.
# - A set makes each membership test O(1) instead of scanning the list.
# - Rows that are already lists are kept as they are, so re-running the cell
#   does not fail on `.split()`.
stop_words = set(stoplist)
train['text'] = train['text'].map(
    lambda tweet: [word for word in tweet.split() if word not in stop_words]
    if isinstance(tweet, str) else tweet
)

In [None]:
# inspect the text column after stopword removal
train['text']

Let's look at frequency distribution of all unique words.

In [None]:
# empty list that will receive the content of each row of train['text']
text_combined=[] 

In [None]:
# Build the list straight from the column instead of appending row by row to
# a list created in another cell: re-running this cell then no longer adds
# every tweet a second time (which would double all word counts).
text_combined = train['text'].tolist()

In [None]:
from itertools import chain
# flatten the per-tweet entries of text_combined into one flat list
flatten_list_text = list(chain.from_iterable(text_combined)) 

In [None]:
# count how many times each word occurs in flatten_list_text
import collections
word_counters = collections.Counter(flatten_list_text) 

In [None]:
# keep only the 15 most frequent words, as a frame with the columns
# 'words' and 'count'
words_with_counts = pd.DataFrame(word_counters.most_common(15),  
                             columns=['words', 'count'])

In [None]:
fig, ax = plt.subplots(figsize=(8, 8))

# Horizontal bars of the word frequencies, drawn from the frame sorted by
# ascending count.
words_with_counts.sort_values(by='count').plot.barh(
    x='words', y='count', ax=ax, color="purple"
)

ax.set(title="Common Words Found in Tweets")

plt.show()

## Modeling

In [None]:
def build_model(transformer, max_len=512):
    """Build and compile a binary classifier on top of a transformer encoder.

    Args:
        transformer: Callable applied to the token-id input; element ``[0]``
            of its result is sliced with ``[:, 0, :]``.
            (Assumed to be a (batch, sequence, hidden) tensor whose first
            position is the [CLS] token -- confirm for other encoders.)
        max_len: Length of the integer id sequence the model accepts.

    Returns:
        A compiled Keras ``Model`` with a single int32 input named
        ``input_word_ids`` and a single sigmoid output unit.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]

    # Only the vector at sequence position 0 feeds the classification head.
    cls_token = sequence_output[:, 0, :]
    x = tf.keras.layers.Dropout(0.35)(cls_token)

    # make output dense layer
    out = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the supported argument name; the old `lr` alias is
    # deprecated and rejected by recent Keras optimizers.
    model.compile(Adam(learning_rate=3e-5), loss='binary_crossentropy',
                  metrics=[tf.keras.metrics.AUC()])

    return model

In [None]:
# Load the pretrained 'bert-base-uncased' encoder with transformers.TFBertModel
# and wrap it with build_model. Both calls run inside strategy.scope(), i.e.
# under the distribution strategy selected in the TPU configuration cell.
with strategy.scope():
    transformer_layer = transformers.TFBertModel.from_pretrained('bert-base-uncased')
    model = build_model(transformer_layer, max_len=512)

In [None]:
# print an overview of the model built above
model.summary()

# Tokenizing

In [None]:
import transformers
# tokenizer of the same 'bert-base-uncased' checkpoint that the model uses
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# NOTE: the directory name says "distilbert", but what is saved here is the
# 'bert-base-uncased' tokenizer. The name is kept because a later cell reads
# 'distilbert_base_uncased/vocab.txt'.
save_path = 'distilbert_base_uncased/'
# exist_ok=True replaces the separate os.path.exists() check
os.makedirs(save_path, exist_ok=True)
tokenizer.save_pretrained(save_path)

In [None]:
from tokenizers import BertWordPieceTokenizer
# build the tokenizers-library WordPiece tokenizer from the vocab.txt file in
# the directory written by tokenizer.save_pretrained(); lowercasing is enabled
fast_tokenizer = BertWordPieceTokenizer('distilbert_base_uncased/vocab.txt', lowercase=True)
fast_tokenizer

In [None]:
def fast_encode(texts, tokenizer, size=256, maxlen=512):
    """Encode `texts` into a 2-D array of token ids, `size` texts at a time.

    Args:
        texts: Sliceable collection whose slices provide ``.tolist()``
            (for example a pandas Series of strings).
        tokenizer: Object offering ``enable_truncation``, ``enable_padding``
            and ``encode_batch``; it is configured in place.
        size: Number of texts passed to each ``encode_batch`` call.
        maxlen: Length every encoded sequence is truncated or padded to.

    Returns:
        ``np.array`` built from the ``ids`` of every encoding, in input order.
    """
    # Fix the output length: longer texts are cut, shorter ones are padded.
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(length=maxlen)

    encoded_ids = []
    for start in tqdm(range(0, len(texts), size)):
        batch = texts[start:start + size].tolist()
        encoded_ids += [encoding.ids for encoding in tokenizer.encode_batch(batch)]

    return np.array(encoded_ids)

Encode the tweets using fast_tokenizer

In [None]:
# After stopword removal each row of train['text'] is a list of words.
# astype(str) would encode the list's repr (e.g. "['forest', 'fire']") and
# feed brackets, quotes and commas to the tokenizer, so the words are joined
# back into a plain sentence instead. Rows that are not lists are passed
# through str() as before.
tweet_texts = train['text'].map(
    lambda words: ' '.join(words) if isinstance(words, (list, tuple)) else str(words)
)
x = fast_encode(tweet_texts, fast_tokenizer, maxlen=512)

In [None]:
# number of examples per batch in the tf.data pipelines below
BATCH_SIZE=64

In [None]:
# labels as an array, taken from the 'target' column
y=train['target'].values

Splitting the data into train and test.

In [None]:
from sklearn.model_selection import train_test_split
# hold out 10% of the encoded tweets for evaluation; random_state=42 is passed
# so the split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.1, random_state=42)

Creating dataset for bert model

In [None]:
# Training input pipeline, built one stage at a time:
# (ids, label) pairs -> repeat -> shuffle(2048) -> batch -> prefetch.
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_dataset = train_dataset.repeat().shuffle(2048)
train_dataset = train_dataset.batch(BATCH_SIZE)
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# Batched dataset over the held-out inputs only (no labels).
# NOTE(review): no visible cell uses `test_data`; the prediction cell passes
# X_test to model.predict directly -- confirm whether this is still needed.
test_data = (
    tf.data.Dataset# create dataset
    .from_tensor_slices(X_test) 
    .batch(BATCH_SIZE)
)

# Training

In [None]:
# train_dataset repeats indefinitely, so steps_per_epoch (150 batches) is what
# delimits each of the 10 epochs.
with strategy.scope():
    train_history = model.fit(
      train_dataset,
      steps_per_epoch=150,
      epochs = 10
    )

# Testing

Let's predict on X_test

In [None]:
# model outputs for the held-out token ids
predictions = model.predict(X_test)

Flattening predictions from 2d list to 1d list

In [None]:
# `chain` comes from the earlier `from itertools import chain` cell
flattened_predictions = list(chain.from_iterable(predictions))

In [None]:
# Threshold at 0.5: scores up to and including 0.5 become class 0, every
# other score becomes class 1.
flattened_predictions = [0 if score <= 0.5 else 1 for score in flattened_predictions]

In [None]:
# first five predicted class labels
flattened_predictions[:5]

Checking out the accuracy with the predictions that we made on X_test and comparing it with y_test.

In [None]:
from sklearn.metrics import accuracy_score
# accuracy of the thresholded predictions against y_test, rounded to 3 decimals
round(accuracy_score(y_test, flattened_predictions),3)

## Live analysis of tweets from Twitter
We have trained the model, so it can now be used to classify live tweets that we gather from Twitter.

Install tweepy.
tweepy is a Python client for the official Twitter API.


In [None]:
!pip install tweepy

In [None]:
import re 
import tweepy 
from tweepy import OAuthHandler 
from cleantext import clean

In order to fetch tweets through the Twitter API, you need to register an app through your Twitter account. Follow these steps:

1. Open 'https://apps.twitter.com/' and click the button ‘Create New App’.
2. Fill in the application details.
3. You can leave the callback URL field empty.
4. Once the app is created, you will be redirected to the app page.
5. Open the ‘Keys and Access Tokens’ tab.
6. Copy ‘Consumer Key’, ‘Consumer Secret’, ‘Access token’ and ‘Access Token Secret’.

A function has to be defined to tokenize our tweet so that its ids can be passed to the model for making a prediction.

In [None]:
# convert tweet into tokens.    
def convert_lines(tweet, max_seq_length, tokenizer):
    """Turn one tweet into a padded row of token ids.

    Stopwords are removed from the whitespace-separated words *before*
    tokenization, which is the order used when the training text was
    prepared. Filtering after tokenization, as done previously, could drop
    the leading piece of a word split into sub-word tokens whenever that
    piece happened to equal a stopword, and truncating before filtering
    could discard tokens that would still have fitted.

    Args:
        tweet: Text to encode (already cleaned by the caller).
        max_seq_length: Total length of the returned id sequence, including
            the [CLS] and [SEP] tokens.
        tokenizer: Object offering ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        ``np.array`` of shape (1, max_seq_length) holding the token ids,
        padded on the right with 0.
    """
    # Requires the NLTK stopword corpus to be downloaded beforehand.
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))

    # remove stopwords at word level
    words = [word for word in tweet.split() if word not in stop_words]

    # two positions are reserved for [CLS] and [SEP]
    token_budget = max_seq_length - 2
    tokens = tokenizer.tokenize(' '.join(words))[:token_budget]

    token_ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens + ["[SEP]"])
    # pad with id 0 up to the requested length
    token_ids += [0] * (token_budget - len(tokens))

    return np.array([token_ids])

The function below uses the BERT model that we trained earlier, together with the ids of the cleaned, tokenized tweet, to classify the tweet as real disaster or no disaster.


In [None]:
def predict_disaster(tweet):
    """Classify one tweet with the trained model.

    The tweet is cleaned with text_cleaning, converted to token ids with
    convert_lines and passed to the global `model`.

    Args:
        tweet: Raw tweet text.

    Returns:
        'no disaster' when the model score is <= 0.5, otherwise
        'real disaster'.
    """
    maxlen = 512

    cleaned_tweet = text_cleaning(tweet)
    token_ids = convert_lines(cleaned_tweet, maxlen, tokenizer)

    # The model created by build_model has exactly one input (the token ids).
    # Passing extra segment and mask arrays, as was done before, does not
    # match that signature.
    scores = model.predict(token_ids, verbose=1, batch_size=32)

    # Reduce the model output to a plain float before comparing it.
    probability = float(np.ravel(scores)[0])

    if probability <= 0.5:
        return 'no disaster'
    return 'real disaster'

The load_tweets function defined below is responsible for loading tweets from Twitter using the user's keys and tokens. It returns the text of each loaded tweet along with the class predicted for it by the predict_disaster function.

In [None]:
def load_tweets(query, consumer_key, consumer_secret, access_token, access_token_secret,count = 10):
    """Search Twitter for `query` and classify every returned tweet.

    Args:
        query: Search string sent to the Twitter search endpoint.
        consumer_key, consumer_secret: Credentials for the OAuth handler.
        access_token, access_token_secret: Access token pair set on the
            OAuth handler.
        count: Passed to the search call as its ``count`` argument.

    Returns:
        List of dicts with the keys 'text' (tweet text) and 'class' (result
        of predict_disaster). The list is empty when authentication fails and
        holds whatever was parsed so far when the search raises a tweepy
        error, so callers always receive a list.
    """
    # tweepy 4 renamed TweepError to TweepyException; support both names.
    tweepy_error = (getattr(tweepy, 'TweepyException', None)
                    or getattr(tweepy, 'TweepError', Exception))

    # attempt authentication
    try:
        # create OAuthHandler object
        auth_handle = OAuthHandler(consumer_key, consumer_secret)

        # set access token and secret
        auth_handle.set_access_token(access_token, access_token_secret)

        # create tweepy API object to fetch tweets
        api = tweepy.API(auth_handle)
    except Exception:
        # Without an API object nothing can be fetched. Return instead of
        # continuing into a NameError on `api`.
        print("Error: Authentication Failed")
        return []

    # tweepy 4 renamed API.search to API.search_tweets; support both names.
    search = getattr(api, 'search_tweets', None) or api.search

    # list to store parsed tweets
    tweets = []

    try:
        # call twitter api to fetch tweets and parse them one by one
        for tweet in search(q = query, count = count):
            parsed_tweet = {
                'text': tweet.text,                     # text of the tweet
                'class': predict_disaster(tweet.text),  # predicted class
            }

            if tweet.retweet_count > 0:
                # if tweet has retweets, ensure that it is appended only once
                if parsed_tweet not in tweets:
                    tweets.append(parsed_tweet)
            else:
                tweets.append(parsed_tweet)
    except tweepy_error as e:
        # print error (if any)
        print("Error : " + str(e))

    # return parsed tweets
    return tweets

In [None]:
#  Your keys and tokens from the Twitter Dev Console
# Each value is read from an environment variable when it is set, so real
# secrets never have to be saved in the notebook. The placeholders remain as
# fallbacks and can still be edited by hand.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'YOUR CONSUMER_KEY')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'YOUR CONSUMER_SECRET')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'YOUR ACCESS_TOKEN')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', 'YOUR ACCESS_TOKEN_SECRET')

In [None]:
# input any query and tweets regarding it would come up. 
# the last argument (200) is passed as load_tweets' `count`
tweets = load_tweets('crime', consumer_key, consumer_secret, access_token, access_token_secret, 200) 

Let's split the tweets into two lists according to their classes and compute the percentage of Real Disaster tweets
and the percentage of No Disaster tweets.

In [None]:
# `tweets` can be empty (nothing matched, request failed) or None; treat both
# as "no tweets" so the percentages below never divide by zero.
fetched_tweets = tweets or []
total_tweets = len(fetched_tweets)


def _percentage(part):
    """Share of `part` among all fetched tweets in percent (0.0 if none)."""
    return round(100 * len(part) / total_tweets, 2) if total_tweets else 0.0


real_d = [tweet for tweet in fetched_tweets if tweet['class'] == 'real disaster']
print("Real Disaster tweets percentage: {} %".format(_percentage(real_d)))

no_d = [tweet for tweet in fetched_tweets if tweet['class'] == 'no disaster']
print("No Disaster tweets percentage: {} %".format(_percentage(no_d)))

In [None]:
# printing up to 10 tweets classified as 'real disaster' 
print("\n\n Real Disaster tweets:") 
for tweet in real_d[:10]: 
    print(tweet['text']) 

# printing up to 10 tweets classified as 'no disaster' 
print("\n\n No Disaster tweets:") 
for tweet in no_d[:10]: 
    print(tweet['text']) 