In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
import tensorflow as tf
from tensorflow import keras
from keras import layers
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score

In [2]:
Data = pd.read_csv('tweets.csv')
Data.head(10)

Unnamed: 0,id,keyword,location,text,target
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,1,ablaze,,Telangana: Section 144 has been imposed in Bha...,1
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https:...,1
3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https:...,1
4,4,ablaze,,"""Lord Jesus, your love brings freedom and pard...",0
5,5,ablaze,OC,"If this child was Chinese, this tweet would ha...",0
6,6,ablaze,"London, England",Several houses have been set ablaze in Ngemsib...,1
7,7,ablaze,Bharat,Asansol: A BJP office in Salanpur village was ...,1
8,8,ablaze,"Accra, Ghana","National Security Minister, Kan Dapaah's side ...",0
9,9,ablaze,Searching,This creature who’s soul is no longer clarent ...,0


In [3]:
Data.tail()

Unnamed: 0,id,keyword,location,text,target
11365,11365,wrecked,Blue State in a red sea,Media should have warned us well in advance. T...,0
11366,11366,wrecked,arohaonces,i feel directly attacked 💀 i consider moonbin ...,0
11367,11367,wrecked,🇵🇭,i feel directly attacked 💀 i consider moonbin ...,0
11368,11368,wrecked,auroraborealis,"ok who remember ""outcast"" nd the ""dora"" au?? T...",0
11369,11369,wrecked,,Jake Corway wrecked while running 14th at IRP.,1


In [4]:
#Now that we have imported the data we need to understand some things
#1 what is our dataset saying to us 
#2 what are the  problems of our data
#3 what are the things not needed in my dataset
#4 which algorithm should i use is it supervised or unsupervised
#5 if supervised which algorithm is the best to solve my problem

In [5]:
def missing_data(Data):
    missing_data_count = Data.isnull().sum()*100 / Data.shape[0]
    Data_missing_count = pd.DataFrame(missing_data_count).round(2)
    Data_missing_count = Data_missing_count.reset_index().rename(
               columns= {
                       'index': 'Column',
                        0 : 'Missing_Percentage',
               }
)
    Data_missing_value = Data.isnull().sum()
    Data_missing_value = Data_missing_value.reset_index().rename(
                columns= {
                    'index': 'Column',
                     0: 'Missing_Count',
                }
)
    Final = Data_missing_value.merge(Data_missing_count,how = 'inner', left_on = 'Column', right_on = 'Column')  
    Final = Final.sort_values(by = 'Missing_Count', ascending = False)
    return Final

missing_data(Data)

Unnamed: 0,Column,Missing_Count,Missing_Percentage
2,location,3418,30.06
0,id,0,0.0
1,keyword,0,0.0
3,text,0,0.0
4,target,0,0.0


In [6]:
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11370 entries, 0 to 11369
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        11370 non-null  int64 
 1   keyword   11370 non-null  object
 2   location  7952 non-null   object
 3   text      11370 non-null  object
 4   target    11370 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 444.3+ KB


In [7]:
Data.describe()

Unnamed: 0,id,target
count,11370.0,11370.0
mean,5684.5,0.185928
std,3282.380615,0.389066
min,0.0,0.0
25%,2842.25,0.0
50%,5684.5,0.0
75%,8526.75,0.0
max,11369.0,1.0


In [8]:
Data['text'][0]

'Communal violence in Bhainsa, Telangana. "Stones were pelted on Muslims\' houses and some houses and vehicles were set ablaze…'

In [9]:
Data_shuffled = Data.sample(frac=1,random_state=42)

In [10]:
Data_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
3495,3495,demolished,Observation drone,How many illegal buildings should be demolishe...,0
5461,5461,fatality,,Who’s fatality is this tho ????,0
9794,9794,sunk,London,#OnThisDay 2018 Chinese state media confirmed ...,1
11105,11105,windstorm,"London, UK",With any luck you will miss the windstorm on e...,0
1803,1803,buildings%20on%20fire,,"Inferno on Black Friday 1939: 71 deaths, 3,700...",1


In [11]:
Data.target.value_counts()

0    9256
1    2114
Name: target, dtype: int64

In [12]:
import random
random_index = random.randint(0, len(Data) -5)
for row in Data_shuffled[['text','target']][random_index:random_index + 5].itertuples():
    __,Text,Target = row
    print(f'Target:{Target}','Disater Tweet' if Target < 1 else 'Disater Tweet')
    print(f'Text:\n{Text}\n')
    print('---\n')


Target:0 Disater Tweet
Text:
CARATS SPREAD THIS SHIT LIKE FIRE CMON NOW #ODETOYOUINDAL #OdeToYouinHou https://t.co/taUpio9TzQ

---

Target:0 Disater Tweet
Text:
Nope. She is honestly a hard worker. She just really tries to prove herself and is sometime… https://t.co/9bwcJ6qzgV

---

Target:0 Disater Tweet
Text:
when i was drowning, that’s when i can finally breathe #Lover #BestRemix #iHeartAwards

---

Target:0 Disater Tweet
Text:
Under whites black peoples have experienced mass… https://t.co/X3NgrMfR2e

---

Target:0 Disater Tweet
Text:
#Spain undiscovered #gems: #Requena is known for its many wineries. This picturesque town was originally the epicen… https://t.co/hQ0isff9ql

---



In [13]:
Train_sentences,Val_sentences,Train_labels,Val_labels = train_test_split(Data_shuffled['text'].to_numpy(),
                                                                         Data_shuffled['target'].to_numpy(),
                                                                         test_size = 0.1,
                                                                         random_state=42)

In [14]:
len(Train_sentences),len(Train_labels),len(Val_sentences),len(Val_labels)

(10233, 10233, 1137, 1137)

In [15]:
len(Data_shuffled)

11370

In [16]:
Train_sentences[:5],Train_labels[:5]

(array(['🔻 GDP % at 6 yr low 🔺 Inflation at 6 yr high 🔻 Nominal GDP% at 45 yr low 🔺 Unemployment at 45 yr high 🔺 Petrol @ 80 🔺 Oni…',
        'SHAMEFUL "BEMIDJI -- With applause from a loud, passionate crowd Beltrami County on Tuesday, Jan. 7, became the first lo…',
        'Avalanches hit Army &amp; BSF posts, 4 jawans rescued but 1 jawan is still missing. Bad weather hampers rescue operations. https:…',
        "Meanwhile in The Shawa after today's emergency alert #PickeringNuclearStation #radiation https://t.co/pn3jzVhuV3",
        'Oh fuck I’m dying. Trevor is tossing out body bags today.'],
       dtype=object),
 array([0, 0, 1, 0, 0], dtype=int64))

In [17]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [18]:
max_vocab_length = 10000
max_length = 15

In [19]:
text_vectorizer = TextVectorization(
    max_tokens=None,
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    ngrams=None,
    output_mode="int",
    output_sequence_length=None,
    pad_to_max_tokens=False,)

In [20]:
Text_vectorizer = TextVectorization(max_tokens = max_vocab_length,
                                    output_mode = 'int',
                                    output_sequence_length = max_length)

In [21]:
len(Train_sentences[0].split())

32

In [22]:
Train_sentences[0].split()

['🔻',
 'GDP',
 '%',
 'at',
 '6',
 'yr',
 'low',
 '🔺',
 'Inflation',
 'at',
 '6',
 'yr',
 'high',
 '🔻',
 'Nominal',
 'GDP%',
 'at',
 '45',
 'yr',
 'low',
 '🔺',
 'Unemployment',
 'at',
 '45',
 'yr',
 'high',
 '🔺',
 'Petrol',
 '@',
 '80',
 '🔺',
 'Oni…']

In [23]:
round(sum([len(i.split()) for i in Train_sentences])/len(Train_sentences))

17

In [24]:
Text_vectorizer.adapt(Train_sentences)

In [25]:
sample_sentence = 'I love football always and forever'
Text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[   9,  143, 1889,  240,    7, 2219,    0,    0,    0,    0,    0,
           0,    0,    0,    0]], dtype=int64)>

In [26]:
random_sentence = random.choice(Train_sentences)
print(f'Original text:\n {random_sentence}\
    \n\nVectorized version:')
Text_vectorizer([random_sentence])

Original text:
 When a customer really annoys you but the boss is watching from a distance https://t.co/IRfUwfyaKv    

Vectorized version:


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[  41,    4,    1,  120,    1,   12,   28,    2, 2538,    8,  574,
          22,    4, 3170,    1]], dtype=int64)>

In [27]:
from tensorflow.keras import  layers
embedding = layers.Embedding(input_dim = max_vocab_length,
                             output_dim = 128,
                             embeddings_initializer='uniform',
                             input_length = max_length)
embedding

<keras.layers.core.embedding.Embedding at 0x1e4af8ee400>

In [28]:
random_sentence = random.choice(Train_sentences)
print(f'Original text:\n {random_sentence}\
    \n\nEmbedded version:')
sample_embed = embedding(Text_vectorizer([random_sentence]))
sample_embed

Original text:
 😺✏ — Buy my mom a house far from here, pay off my student loans, pay off my moms vechile, move to Toronto, also get… https://t.co/JCqIJeFx6r    

Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.00120265,  0.01996168,  0.03680942, ...,  0.00661474,
          0.02705259, -0.01175797],
        [ 0.01354922, -0.01267743, -0.0297364 , ..., -0.04749265,
          0.00950684,  0.04086694],
        [ 0.02340953, -0.01676515, -0.03880345, ...,  0.04293073,
         -0.0491998 ,  0.0327204 ],
        ...,
        [-0.03674228,  0.02675709,  0.03150507, ...,  0.02241683,
         -0.04025275,  0.0406711 ],
        [-0.04725102, -0.00312213,  0.04376352, ...,  0.04051067,
         -0.01792122, -0.00012263],
        [-0.00120265,  0.01996168,  0.03680942, ...,  0.00661474,
          0.02705259, -0.01175797]]], dtype=float32)>

RNN MODEL LSTM

Input(text) -> Tokenize -> Embedding -> Layers(Rnns/dense) -> Output (label probability)

In [29]:
from tensorflow.keras import layers
Inputs = layers.Input(shape = (1,), dtype = "string")
x = Text_vectorizer(Inputs)
x = embedding(x)
print(x.shape)
x = layers.LSTM(64,return_sequences=True)(x)
print(x.shape)
x = layers.LSTM(64)(x)
print(x.shape)
x = layers.Dense(64, activation = 'relu')(x)
outputs = layers.Dense(1, activation = 'sigmoid')(x)
model = tf.keras.Model(Inputs,outputs, name = 'model_LSTM')


(None, 15, 128)
(None, 15, 64)
(None, 64)


In [30]:
model.compile(loss = 'binary_crossentropy',
              optimizer = tf.keras.optimizers.Adam(),
              metrics = ['accuracy'])
model_history = model.fit(Train_sentences,
                          Train_labels,
                          epochs = 5,
                          validation_data = (Val_sentences,Val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [31]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Calculates model accuracy, precision, recall and f1 score of a binary classification model.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary of accuracy, precision, recall, f1-score.
  """
  # Calculate model accuracy
  model_accuracy = accuracy_score(y_true, y_pred) * 100
  # Calculate model precision, recall and f1 score using "weighted" average
  model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  model_results = {"accuracy": model_accuracy,
                  "precision": model_precision,
                  "recall": model_recall,
                  "f1": model_f1}
  return model_results

In [32]:
model_pred_probs = model.predict(Val_sentences)
model_pred_probs[:10]



array([[9.3567804e-02],
       [7.4619447e-06],
       [2.8052918e-07],
       [8.8243805e-06],
       [1.3105819e-05],
       [5.6646979e-07],
       [9.9968690e-01],
       [3.6952479e-04],
       [9.7284275e-01],
       [4.1288913e-05]], dtype=float32)

In [33]:
model_preds = tf.squeeze(tf.round(model_pred_probs))
model_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 0., 0., 0., 0., 0., 1., 0., 1., 0.], dtype=float32)>

In [34]:
model_results = calculate_results(Val_labels, model_preds)
model_results

{'accuracy': 87.86279683377309,
 'precision': 0.8757483056491281,
 'recall': 0.8786279683377308,
 'f1': 0.8770009127780442}

In [35]:
val_df = pd.DataFrame({"text": Val_sentences,
                       "target": Val_labels,
                       "pred": model_preds,
                       "pred_prob": tf.squeeze(model_pred_probs)})
val_df.head()

Unnamed: 0,text,target,pred,pred_prob
0,When he arrived at a juvenile detention center...,0,0.0,0.0935678
1,"Nowadays, Sid’s game is completely being destr...",0,0.0,7.461945e-06
2,If you're wondering why Libtard Twitter has go...,0,0.0,2.805292e-07
3,Next time on #Casualty! Tune in Saturday 21.20...,0,0.0,8.824381e-06
4,"""Amazing show, amazing voice!"" We've been inun...",0,0.0,1.310582e-05


In [36]:
most_wrong = val_df[val_df["target"] != val_df["pred"]].sort_values("pred_prob", ascending=False)
most_wrong[:10]

Unnamed: 0,text,target,pred,pred_prob
46,". ""killed"" a Nigerian Nigerian Dauda Onoruoiza...",0,1.0,0.999915
782,if the shooting down of the plane was an accid...,0,1.0,0.999867
6,Someone's having trouble staying awake 😴 #dogs...,0,1.0,0.999687
1039,19.00 obs from #Kirkwall Airport showed sustai...,0,1.0,0.999508
1063,#BeInspired #nonfiction A British soldiers jou...,0,1.0,0.999385
547,"At 8:07am local time two years ago today, huma...",0,1.0,0.999027
402,The Australia wildfire relief is getting a hug...,0,1.0,0.998926
525,LOOK: Residents had gone back to volcano islan...,0,1.0,0.9988
557,in case of a nuclear attack to a sprawling bur...,0,1.0,0.998373
445,Hi! My friends and I are organizing a relief o...,0,1.0,0.997246


In [37]:
for row in most_wrong[:10].itertuples(): # loop through the top 10 rows (change the index to view different rows)
  _, text, target, pred, prob = row
  print(f"Target: {target}, Pred: {int(pred)}, Prob: {prob}")
  print(f"Text:\n{text}\n")
  print("----\n")

Target: 0, Pred: 1, Prob: 0.9999154806137085
Text:
. "killed" a Nigerian Nigerian Dauda Onoruoiza on Ukrainian plane with #FakeNews htt…

----

Target: 0, Pred: 1, Prob: 0.9998666048049927
Text:
if the shooting down of the plane was an accident (human error), why were the Iranians shooting mi… https://t.co/JesxuYkYVD

----

Target: 0, Pred: 1, Prob: 0.9996868968009949
Text:
Someone's having trouble staying awake 😴 #dogsoftwitter https://t.co/mRwS2tblHr

----

Target: 0, Pred: 1, Prob: 0.9995080828666687
Text:
19.00 obs from #Kirkwall Airport showed sustained wind speeds in the preceding hour of 74.1 km/h (46.0 mph) or Force 8 (…

----

Target: 0, Pred: 1, Prob: 0.9993846416473389
Text:
#BeInspired #nonfiction A British soldiers journey into the depths of #mentalillness after a number of traumatic life…

----

Target: 0, Pred: 1, Prob: 0.9990273118019104
Text:
At 8:07am local time two years ago today, human error, this text alert, escalating US-North Korea tensions, and the lack…

----


In [40]:
for row in most_wrong[-7:].itertuples():
  _, text, target, pred, prob = row
  print(f"Target: {target}, Pred: {int(pred)}, Prob: {prob}")
  print(f"Text:\n{text}\n")
  print("----\n")

Target: 1, Pred: 0, Prob: 4.332517164584715e-06
Text:
I just survived the London rush hour tube ride (which is basically like surviving the zombie apocalypse), but what… https://t.co/OC1WtHjXiw

----

Target: 1, Pred: 0, Prob: 3.8140676679176977e-06
Text:
Germany's Foreign Minister Heiko Maas says 'It must be assumed the airplane crash in Iran was a terrible accident.' https://…

----

Target: 1, Pred: 0, Prob: 2.41985912907694e-06
Text:
Tennis player quits after coughing fit at smoky qualifiers https://t.co/E1P3kgpLop It is outrageous that the…

----

Target: 1, Pred: 0, Prob: 2.3710051664238563e-06
Text:
Volunteers battle to remove oil from Paiva beach, Pernambuco state, Brazil. Oil has washed up on more than 130 beaches along a 2,…

----

Target: 1, Pred: 0, Prob: 2.1437356281239772e-06
Text:
HIJACK : REGENTS PARK. JHB. GP. GUN METAL GREY FORD FIESTA. XZT022GP.

----

Target: 1, Pred: 0, Prob: 2.1437356281239772e-06
Text:
HIJACK : REGENTS PARK. JHB. GP. GUN METAL GREY FORD FIESTA. X