In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
import tensorflow as tf
from tensorflow import keras
from keras import layers
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score

In [32]:
# Load the tweets dataset from a CSV file in the working directory and preview the first 10 rows.
Data = pd.read_csv('tweets.csv')
Data.head(10)

Unnamed: 0,id,keyword,location,text,target
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,1,ablaze,,Telangana: Section 144 has been imposed in Bha...,1
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https:...,1
3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https:...,1
4,4,ablaze,,"""Lord Jesus, your love brings freedom and pard...",0
5,5,ablaze,OC,"If this child was Chinese, this tweet would ha...",0
6,6,ablaze,"London, England",Several houses have been set ablaze in Ngemsib...,1
7,7,ablaze,Bharat,Asansol: A BJP office in Salanpur village was ...,1
8,8,ablaze,"Accra, Ghana","National Security Minister, Kan Dapaah's side ...",0
9,9,ablaze,Searching,This creature who’s soul is no longer clarent ...,0


In [33]:
# Preview the last rows of the dataset as well.
Data.tail()

Unnamed: 0,id,keyword,location,text,target
11365,11365,wrecked,Blue State in a red sea,Media should have warned us well in advance. T...,0
11366,11366,wrecked,arohaonces,i feel directly attacked 💀 i consider moonbin ...,0
11367,11367,wrecked,🇵🇭,i feel directly attacked 💀 i consider moonbin ...,0
11368,11368,wrecked,auroraborealis,"ok who remember ""outcast"" nd the ""dora"" au?? T...",0
11369,11369,wrecked,,Jake Corway wrecked while running 14th at IRP.,1


In [34]:
# Now that the data is loaded, we need to answer a few questions:
# 1. What is the dataset telling us?
# 2. What problems does the data have?
# 3. Which parts of the dataset are not needed?
# 4. Which kind of algorithm should I use: supervised or unsupervised?
# 5. If supervised, which algorithm best solves my problem?

In [35]:
def missing_data(Data):
    """
    Summarise the missing values in each column of a DataFrame.

    Args:
    -----
    Data = pandas DataFrame to inspect

    Returns a DataFrame with one row per column of `Data` and the columns
    'Column' (the column label), 'Missing_Count' (number of null entries) and
    'Missing_Percentage' (nulls as a percentage of the row count, rounded to
    2 decimals), sorted by 'Missing_Count' in descending order. The row labels
    of the result are the positions of the columns in `Data`.
    """
    # Count the nulls once and derive the percentage from the same Series.
    missing_count = Data.isnull().sum()
    missing_percentage = (missing_count * 100 / Data.shape[0]).round(2)
    # Build the summary directly instead of reset_index() + merge: this does not
    # rely on the reset column being called 'index' (it is not when the columns
    # axis has a name) and does not multiply rows when column labels repeat.
    Final = pd.DataFrame({
        'Column': list(missing_count.index),
        'Missing_Count': missing_count.to_numpy(),
        'Missing_Percentage': missing_percentage.to_numpy(),
    })
    Final = Final.sort_values(by = 'Missing_Count', ascending = False)
    return Final

# Show the per-column missing-value summary for the loaded dataset.
missing_data(Data)

Unnamed: 0,Column,Missing_Count,Missing_Percentage
2,location,3418,30.06
0,id,0,0.0
1,keyword,0,0.0
3,text,0,0.0
4,target,0,0.0


In [36]:
# Print column dtypes and non-null counts.
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11370 entries, 0 to 11369
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        11370 non-null  int64 
 1   keyword   11370 non-null  object
 2   location  7952 non-null   object
 3   text      11370 non-null  object
 4   target    11370 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 444.3+ KB


In [37]:
# Summary statistics for the numeric columns.
Data.describe()

Unnamed: 0,id,target
count,11370.0,11370.0
mean,5684.5,0.185928
std,3282.380615,0.389066
min,0.0,0.0
25%,2842.25,0.0
50%,5684.5,0.0
75%,8526.75,0.0
max,11369.0,1.0


In [38]:
# Look at the full text of the row labelled 0.
Data['text'][0]

'Communal violence in Bhainsa, Telangana. "Stones were pelted on Muslims\' houses and some houses and vehicles were set ablaze…'

In [39]:
# Shuffle all rows (frac=1 samples every row); random_state=42 fixes the order across reruns.
Data_shuffled = Data.sample(frac=1,random_state=42)

In [40]:
# Preview the shuffled frame; the original row labels are kept.
Data_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
3495,3495,demolished,Observation drone,How many illegal buildings should be demolishe...,0
5461,5461,fatality,,Who’s fatality is this tho ????,0
9794,9794,sunk,London,#OnThisDay 2018 Chinese state media confirmed ...,1
11105,11105,windstorm,"London, UK",With any luck you will miss the windstorm on e...,0
1803,1803,buildings%20on%20fire,,"Inferno on Black Friday 1939: 71 deaths, 3,700...",1


In [41]:
# Count rows per target value to check the class balance.
Data.target.value_counts()

0    9256
1    2114
Name: target, dtype: int64

In [42]:
import random
# Pick a random start position and show five consecutive rows of the shuffled frame.
random_index = random.randint(0, len(Data) - 5)
sample_rows = Data_shuffled[['text','target']].iloc[random_index:random_index + 5]
for row in sample_rows.itertuples():
    # Each tuple is (row label, text, target); the row label is not used.
    __,Text,Target = row
    if Target < 1:
        label = 'NOT Disater Tweet'
    else:
        label = 'Disater Tweet'
    print(f'Target:{Target}', label)
    print(f'Text:\n{Text}\n')
    print('---\n')


Target:1 Disater Tweet
Text:
A DSP Dalvinder Singh was caught with two terrorists!! No screaming and yelling by Arnab of or Navika of …

---

Target:0 NOT Disater Tweet
Text:
if u hate rats.. just remember that im out there and plague outbreak risks are high

---

Target:0 NOT Disater Tweet
Text:
my suggestion: yell angry things as Gaeilge at him until he gets the hint. Alternate suggestion: spike-li… https://t.co/2YvB6cfitk

---

Target:1 Disater Tweet
Text:
Too bad. Six people death, four still missing as sinkhole in China swallows bus with passengers https://t.co/EBFGY3po4Y

---

Target:0 NOT Disater Tweet
Text:
the info, the thoughts, suffering, images, it's such an unholy inundation of misery. the world is a living, writhing nigh…

---



In [43]:
# Hold out 10% of the shuffled tweets for validation; random_state=42 fixes the split.
# NOTE(review): the split is not stratified on the label; given the class imbalance
# reported by Data.target.value_counts(), consider whether stratify= is wanted.
Train_sentences,Val_sentences,Train_labels,Val_labels = train_test_split(Data_shuffled['text'].to_numpy(),
                                                                         Data_shuffled['target'].to_numpy(),
                                                                         test_size = 0.1,
                                                                         random_state=42)

In [44]:
# Sanity check: sentences and labels must have matching lengths in each split.
len(Train_sentences),len(Train_labels),len(Val_sentences),len(Val_labels)

(10233, 10233, 1137, 1137)

In [45]:
# Total number of rows, for comparison with the split sizes.
len(Data_shuffled)

11370

In [46]:
# Peek at the first five training sentences and their labels.
Train_sentences[:5],Train_labels[:5]

(array(['🔻 GDP % at 6 yr low 🔺 Inflation at 6 yr high 🔻 Nominal GDP% at 45 yr low 🔺 Unemployment at 45 yr high 🔺 Petrol @ 80 🔺 Oni…',
        'SHAMEFUL "BEMIDJI -- With applause from a loud, passionate crowd Beltrami County on Tuesday, Jan. 7, became the first lo…',
        'Avalanches hit Army &amp; BSF posts, 4 jawans rescued but 1 jawan is still missing. Bad weather hampers rescue operations. https:…',
        "Meanwhile in The Shawa after today's emergency alert #PickeringNuclearStation #radiation https://t.co/pn3jzVhuV3",
        'Oh fuck I’m dying. Trevor is tossing out body bags today.'],
       dtype=object),
 array([0, 0, 1, 0, 0], dtype=int64))

In [47]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [48]:
# max_vocab_length: vocabulary cap for the text vectorizer and input_dim of the embedding layer.
# max_length: fixed number of token ids produced per sentence.
# NOTE(review): a later cell reports a mean training-sentence length of 17 tokens, so
# max_length = 15 cuts off longer sentences — confirm this is intended.
max_vocab_length = 10000
max_length = 15

In [49]:
# TextVectorization with every argument spelled out (no vocabulary cap, no fixed output length).
# NOTE(review): this lowercase `text_vectorizer` is not referenced again in the visible cells;
# the capitalised `Text_vectorizer` is the one that is adapted and used — confirm before removing.
text_vectorizer = TextVectorization(
    max_tokens=None,
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    ngrams=None,
    output_mode="int",
    output_sequence_length=None,
    pad_to_max_tokens=False,)

In [50]:
# Vectorizer used by the model: integer token ids, vocabulary capped at max_vocab_length,
# every sentence output with exactly max_length ids.
Text_vectorizer = TextVectorization(max_tokens = max_vocab_length,
                                    output_mode = 'int',
                                    output_sequence_length = max_length)

In [51]:
# Number of whitespace-separated tokens in the first training sentence.
len(Train_sentences[0].split())

32

In [52]:
# The whitespace-separated tokens of the first training sentence.
Train_sentences[0].split()

['🔻',
 'GDP',
 '%',
 'at',
 '6',
 'yr',
 'low',
 '🔺',
 'Inflation',
 'at',
 '6',
 'yr',
 'high',
 '🔻',
 'Nominal',
 'GDP%',
 'at',
 '45',
 'yr',
 'low',
 '🔺',
 'Unemployment',
 'at',
 '45',
 'yr',
 'high',
 '🔺',
 'Petrol',
 '@',
 '80',
 '🔺',
 'Oni…']

In [53]:
# Mean number of whitespace-separated tokens per training sentence, rounded to an integer.
token_counts = [len(sentence.split()) for sentence in Train_sentences]
round(sum(token_counts) / len(Train_sentences))

17

In [54]:
# Fit the vectorizer's vocabulary on the training sentences only.
Text_vectorizer.adapt(Train_sentences)

In [55]:
# Vectorize a hand-written sentence to see the token ids and the padding up to max_length.
sample_sentence = 'I love football always and forever'
Text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[   9,  143, 1889,  240,    7, 2219,    0,    0,    0,    0,    0,
           0,    0,    0,    0]], dtype=int64)>

In [56]:
# Vectorize a randomly chosen training sentence and show it next to its original text.
# random.choice is not seeded here, so a different sentence appears on each run.
random_sentence = random.choice(Train_sentences)
print(f'Original text:\n {random_sentence}\
    \n\nVectorized version:')
Text_vectorizer([random_sentence])

Original text:
 Telnet backdoor opens 1M+ #IoT radios to hijack. Attackers can drop #malware, add the device to a botnet or send their own…    

Vectorized version:


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[   1, 6246, 1996,    1, 8391,    1,    3,  830, 3853,   49, 1446,
           1, 1936,    2, 3751]], dtype=int64)>

In [57]:
from tensorflow.keras import  layers
# Embedding layer: one 128-dimensional vector per token id, for ids below max_vocab_length,
# initialised from a uniform distribution; input sequences have max_length ids.
embedding = layers.Embedding(input_dim = max_vocab_length,
                             output_dim = 128,
                             embeddings_initializer='uniform',
                             input_length = max_length)
embedding

<keras.layers.core.embedding.Embedding at 0x213a1d3bd90>

In [58]:
# Push a randomly chosen training sentence through the vectorizer and the embedding layer
# to inspect the resulting tensor (unseeded, so the sentence differs between runs).
random_sentence = random.choice(Train_sentences)
print(f'Original text:\n {random_sentence}\
    \n\nEmbedded version:')
sample_embed = embedding(Text_vectorizer([random_sentence]))
sample_embed

Original text:
 200114 Army: everyone help the Philippine ARMYs... please.🙏🏻 #PrayforThePhilippines V: these my heart hurts a lot. plea…    

Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-5.9388652e-03,  3.4871247e-02,  2.4203781e-02, ...,
          2.0155419e-02,  3.0349676e-02,  3.6196496e-02],
        [ 3.7085507e-02,  3.9435897e-02, -3.6928128e-02, ...,
         -1.1354625e-02,  2.1473017e-02, -1.5304804e-02],
        [-2.0642234e-02, -5.6622401e-03,  2.2385780e-02, ...,
          2.3334097e-02,  3.4723613e-02,  4.4364695e-02],
        ...,
        [ 1.3190135e-03, -5.2034855e-05, -1.2083102e-02, ...,
         -1.0186650e-02,  1.7207231e-02,  1.4497545e-02],
        [ 1.6335700e-02, -4.7901940e-02,  5.4353476e-03, ...,
          4.8815560e-02, -4.9407184e-02,  1.3194252e-02],
        [-4.7256541e-02,  3.6661241e-02,  3.5488497e-02, ...,
         -3.1789862e-02,  1.4147017e-02,  4.9874846e-02]]], dtype=float32)>

RNN model (LSTM)

Input (text) -> Tokenize -> Embedding -> Layers (RNNs/Dense) -> Output (label probability)

In [59]:
from tensorflow.keras import layers
# Functional-API model: raw string -> token ids -> embeddings -> two stacked LSTMs -> dense head.
# The print calls show the tensor shape after each stage.
Inputs = layers.Input(shape = (1,), dtype = "string")
x = Text_vectorizer(Inputs)
x = embedding(x)
print(x.shape)
# return_sequences=True keeps one output per timestep so the next LSTM receives a sequence.
x = layers.LSTM(64,return_sequences=True)(x)
print(x.shape)
# The second LSTM returns only its final output.
x = layers.LSTM(64)(x)
print(x.shape)
x = layers.Dense(64, activation = 'relu')(x)
# Single sigmoid unit: one value in [0, 1] per input, used as the probability of target 1.
outputs = layers.Dense(1, activation = 'sigmoid')(x)
model = tf.keras.Model(Inputs,outputs, name = 'model_LSTM')


(None, 15, 128)
(None, 15, 64)
(None, 64)


In [60]:
# Binary cross-entropy matches the single sigmoid output; Adam is used with its default settings.
model.compile(loss = 'binary_crossentropy',
              optimizer = tf.keras.optimizers.Adam(),
              metrics = ['accuracy'])
# Train for 5 epochs, evaluating on the held-out validation split after each epoch.
model_history = model.fit(Train_sentences,
                          Train_labels,
                          epochs = 5,
                          validation_data = (Val_sentences,Val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [61]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Score the predictions of a binary classification model against the true labels.

  Args:
  -----
  y_true = 1D array of true labels
  y_pred = 1D array of predicted labels

  Returns a dictionary with the keys "accuracy", "precision", "recall" and "f1".
  Accuracy is multiplied by 100; the other three are the "weighted" averages
  exactly as returned by precision_recall_fscore_support.
  """
  # Accuracy is reported on a 0-100 scale
  accuracy_pct = accuracy_score(y_true, y_pred) * 100
  # The fourth value returned is not needed, so only the first three are kept
  precision, recall, f1 = precision_recall_fscore_support(y_true, y_pred, average="weighted")[:3]
  return dict(accuracy=accuracy_pct, precision=precision, recall=recall, f1=f1)

In [62]:
# Predict on the validation sentences and show the first ten raw model outputs.
model_pred_probs = model.predict(Val_sentences)
model_pred_probs[:10]



array([[1.6289806e-01],
       [1.0126011e-04],
       [7.1551945e-06],
       [1.4978070e-04],
       [3.4152381e-05],
       [1.5440255e-05],
       [9.0542209e-01],
       [5.5006435e-03],
       [9.9222910e-01],
       [3.7777747e-04]], dtype=float32)

In [63]:
# Round each predicted probability to 0 or 1 and drop the size-1 axis to get a 1D label tensor.
model_preds = tf.squeeze(tf.round(model_pred_probs))
model_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 0., 0., 0., 0., 0., 1., 0., 1., 0.], dtype=float32)>

In [64]:
# Score the rounded predictions against the validation labels.
# Note: "accuracy" is on a 0-100 scale while the other metrics are on a 0-1 scale.
model_results = calculate_results(Val_labels, model_preds)
model_results

{'accuracy': 87.77484608619174,
 'precision': 0.8732342782690036,
 'recall': 0.8777484608619174,
 'f1': 0.8749747297002402}

In [65]:
# Collect text, true label, predicted label and predicted probability in one frame for error analysis.
val_df = pd.DataFrame({"text": Val_sentences,
                       "target": Val_labels,
                       "pred": model_preds,
                       "pred_prob": tf.squeeze(model_pred_probs)})
val_df.head()

Unnamed: 0,text,target,pred,pred_prob
0,When he arrived at a juvenile detention center...,0,0.0,0.162898
1,"Nowadays, Sid’s game is completely being destr...",0,0.0,0.000101
2,If you're wondering why Libtard Twitter has go...,0,0.0,7e-06
3,Next time on #Casualty! Tune in Saturday 21.20...,0,0.0,0.00015
4,"""Amazing show, amazing voice!"" We've been inun...",0,0.0,3.4e-05


In [66]:
# Keep only the misclassified validation rows, ordered from highest to lowest predicted probability:
# the top rows are the most confident false positives, the bottom rows the most confident false negatives.
most_wrong = val_df[val_df["target"] != val_df["pred"]].sort_values("pred_prob", ascending=False)
most_wrong[:10]

Unnamed: 0,text,target,pred,pred_prob
46,". ""killed"" a Nigerian Nigerian Dauda Onoruoiza...",0,1.0,0.999892
782,if the shooting down of the plane was an accid...,0,1.0,0.999208
445,Hi! My friends and I are organizing a relief o...,0,1.0,0.995828
468,"Along with , we’re doing relief operations for...",0,1.0,0.995584
557,in case of a nuclear attack to a sprawling bur...,0,1.0,0.995295
351,I've heard it's touched places throughout the ...,0,1.0,0.994648
1063,#BeInspired #nonfiction A British soldiers jou...,0,1.0,0.99313
582,How about some compassion for the families of ...,0,1.0,0.991903
1039,19.00 obs from #Kirkwall Airport showed sustai...,0,1.0,0.988749
414,real-time-ish NSW burned areas finally availab...,0,1.0,0.987509


In [67]:
# Print the first ten rows of most_wrong (change the count to view a different number of rows)
for row in most_wrong.head(10).itertuples():
  # Each tuple is (row label, text, target, pred, pred_prob); the row label is not used
  _, text, target, pred, prob = row
  print(f"Target: {target}, Pred: {int(pred)}, Prob: {prob}",
        f"Text:\n{text}\n",
        "----\n",
        sep="\n")

Target: 0, Pred: 1, Prob: 0.9998921155929565
Text:
. "killed" a Nigerian Nigerian Dauda Onoruoiza on Ukrainian plane with #FakeNews htt…

----

Target: 0, Pred: 1, Prob: 0.9992079138755798
Text:
if the shooting down of the plane was an accident (human error), why were the Iranians shooting mi… https://t.co/JesxuYkYVD

----

Target: 0, Pred: 1, Prob: 0.9958281517028809
Text:
Hi! My friends and I are organizing a relief operation in the nearby evacuation areas in our city. We'll be volunte… https://t.co/gNobrzzKoW

----

Target: 0, Pred: 1, Prob: 0.9955840110778809
Text:
Along with , we’re doing relief operations for the survivors of the Taal Volcano Eruption. We’re reaching o…

----

Target: 0, Pred: 1, Prob: 0.9952945709228516
Text:
in case of a nuclear attack to a sprawling bureaucratic agency tasked with mobilizing help in the midst of disaster.

----

Target: 0, Pred: 1, Prob: 0.9946480989456177
Text:
I've heard it's touched places throughout the UK....but thankfully no reports of a

In [68]:
# Print the last seven rows of most_wrong (the lowest predicted probabilities among the errors)
for row in most_wrong.tail(7).itertuples():
  # Each tuple is (row label, text, target, pred, pred_prob); the row label is not used
  _, text, target, pred, prob = row
  print(f"Target: {target}, Pred: {int(pred)}, Prob: {prob}",
        f"Text:\n{text}\n",
        "----\n",
        sep="\n")

Target: 1, Pred: 0, Prob: 6.812390347477049e-05
Text:
It seems a thunderstorm is heading our way. Better retreat to a cave. A tent won't stand a chance.

----

Target: 1, Pred: 0, Prob: 5.9176261856919155e-05
Text:
I just survived the London rush hour tube ride (which is basically like surviving the zombie apocalypse), but what… https://t.co/OC1WtHjXiw

----

Target: 1, Pred: 0, Prob: 5.30556462763343e-05
Text:
I'm not crying, you are! A fur parent makes sure her fur baby is safe during a rescue operation in Talisay, Batangas. 🐶 #Taa…

----

Target: 1, Pred: 0, Prob: 4.5020329707767814e-05
Text:
#Islamabad alert! A negative blood required in Islamabad today, please contact me if someone here with A negative blo…

----

Target: 1, Pred: 0, Prob: 2.9518754672608338e-05
Text:
HIJACK : REGENTS PARK. JHB. GP. GUN METAL GREY FORD FIESTA. XZT022GP.

----

Target: 1, Pred: 0, Prob: 2.9518754672608338e-05
Text:
HIJACK : REGENTS PARK. JHB. GP. GUN METAL GREY FORD FIESTA. XZT022GP.

----

Target: