In [1]:
import pandas as pd
import numpy as np
import re
import torch
import torch.nn as nn
torch.manual_seed(42)
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F
import nltk

In [2]:
data = pd.read_csv("D:/Datasets/nlp-getting-started/train.csv")
data

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [3]:
# remove the url links of the tweets
g = re.compile(r"https?://[\w.]*/?\w*\s*")  # the regular expression used to remove the urls from the tweet data.
data['text'] = data['text'].apply(lambda y: re.subn(g, '', string = y)[0])
# take the hashtags of the tweet and add them to a seperate column
def get_hashtags(x):
    g = re.compile(r"#[A-Za-z]*\s*")
    hash_tag_list = re.findall(g,string = x)
    for i,tag in enumerate(hash_tag_list):
        hash_tag_list[i] =  tag[1:].strip()
    return hash_tag_list
def get_mentions(x):
    mention_list = re.findall(re.compile('@[a-zA-Z0-9_]*\s*'),string = x)
    if len(mention_list) >=1:
        return mention_list
    else:
        return np.NaN
    
data['hashtag'] = data['text'].apply(lambda x : get_hashtags(x))
data['mentions'] = data['text'].apply(lambda x : get_mentions(x))
data

Unnamed: 0,id,keyword,location,text,target,hashtag,mentions
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,[earthquake],
1,4,,,Forest fire near La Ronge Sask. Canada,1,[],
2,5,,,All residents asked to 'shelter in place' are ...,1,[],
3,6,,,"13,000 people receive #wildfires evacuation or...",1,[wildfires],
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[Alaska, wildfires]",
...,...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,[],
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,[],"[@aria_ahrary , @TheTawniest ]"
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii.,1,[],
7611,10872,,,Police investigating after an e-bike collided ...,1,[],


In [4]:
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer(reduce_len=True, preserve_case=False, strip_handles= True)
# Now lets tokenize each tweet and put it in the
def  tokenize(x, tokenizer):
    tokens = tokenizer.tokenize(x)
    return tokens

data['tokens']= data['text'].apply(lambda x : tokenize(x,tokenizer))
data

Unnamed: 0,id,keyword,location,text,target,hashtag,mentions,tokens
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,[earthquake],,"[our, deeds, are, the, reason, of, this, #eart..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,[],,"[forest, fire, near, la, ronge, sask, ., canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,[],,"[all, residents, asked, to, ', shelter, in, pl..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,[wildfires],,"[13,000, people, receive, #wildfires, evacuati..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[Alaska, wildfires]",,"[just, got, sent, this, photo, from, ruby, #al..."
...,...,...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,[],,"[two, giant, cranes, holding, a, bridge, colla..."
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,[],"[@aria_ahrary , @TheTawniest ]","[the, out, of, control, wild, fires, in, calif..."
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii.,1,[],,"[m1, ., 94, [, 01:04, utc, ], ?, 5km, s, of, v..."
7611,10872,,,Police investigating after an e-bike collided ...,1,[],,"[police, investigating, after, an, e-bike, col..."


In [5]:
print(data['tokens'][:50])

0     [our, deeds, are, the, reason, of, this, #eart...
1      [forest, fire, near, la, ronge, sask, ., canada]
2     [all, residents, asked, to, ', shelter, in, pl...
3     [13,000, people, receive, #wildfires, evacuati...
4     [just, got, sent, this, photo, from, ruby, #al...
5     [#rockyfire, update, =, >, california, hwy, .,...
6     [#flood, #disaster, heavy, rain, causes, flash...
7     [i'm, on, top, of, the, hill, and, i, can, see...
8     [there's, an, emergency, evacuation, happening...
9     [i'm, afraid, that, the, tornado, is, coming, ...
10    [three, people, died, from, the, heat, wave, s...
11    [haha, south, tampa, is, getting, flooded, hah...
12    [#raining, #flooding, #florida, #tampabay, #ta...
13      [#flood, in, bago, myanmar, #we, arrived, bago]
14    [damage, to, school, bus, on, 80, in, multi, c...
15                                 [what's, up, man, ?]
16                                    [i, love, fruits]
17                                 [summer, is, 

In [6]:
from nltk.corpus import stopwords
stopwords = set(stopwords.words('English'))
print(stopwords)

{'they', 'off', 'and', 'theirs', 'have', 'its', 'wasn', "weren't", 'him', 'between', 'some', 'now', 'into', 'this', 'doesn', 'during', "mightn't", 'you', 'them', 'each', 'do', 'once', 'am', 'after', 'myself', 'where', 'again', 'at', 'y', 's', "wouldn't", 'if', 'when', "wasn't", "hadn't", 't', 'ours', 'being', 'don', 'shouldn', 'he', 'about', 'before', 'through', 'until', 'that', 'above', 'me', "that'll", 'i', 've', 'same', 'out', 'because', 'our', 'than', 'to', 'a', 'needn', 'over', 'himself', "mustn't", 'itself', 'under', 'any', 'ma', 'here', 'not', 'hasn', 'didn', 'then', "doesn't", 'how', 'she', 'm', 'themselves', 'against', "hasn't", 'does', 'other', 'were', "needn't", "you're", 'but', 'by', 'in', "she's", "couldn't", 'won', 'be', 'having', 'should', 'my', 'those', 'all', 'or', 'own', 'too', "don't", 'for', 're', 'aren', 'shan', 'his', 'had', 'most', 'mustn', 'ain', 'there', 'we', 'will', 'these', 'did', 'll', "shan't", 'doing', "shouldn't", 'such', 'hers', 'was', 'yourselves', "yo

In [7]:
def remove_stopwords(x, stopwords_set):
    for i,word in enumerate(x):
        if word in stopwords_set:
            del x[i]
    return x

data['tokens'] =  data['tokens'].apply(lambda x : remove_stopwords(x,stopwords))
data


Unnamed: 0,id,keyword,location,text,target,hashtag,mentions,tokens
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,[earthquake],,"[deeds, the, reason, this, #earthquake, may, a..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,[],,"[forest, fire, near, la, ronge, sask, ., canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,[],,"[residents, asked, ', shelter, place, ', being..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,[wildfires],,"[13,000, people, receive, #wildfires, evacuati..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[Alaska, wildfires]",,"[got, sent, photo, ruby, #alaska, smoke, #wild..."
...,...,...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,[],,"[two, giant, cranes, holding, bridge, collapse..."
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,[],"[@aria_ahrary , @TheTawniest ]","[out, control, wild, fires, california, even, ..."
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii.,1,[],,"[m1, ., 94, [, 01:04, utc, ], ?, 5km, of, volc..."
7611,10872,,,Police investigating after an e-bike collided ...,1,[],,"[police, investigating, an, e-bike, collided, ..."


In [8]:
print(data['tokens'][:50])

0     [deeds, the, reason, this, #earthquake, may, a...
1      [forest, fire, near, la, ronge, sask, ., canada]
2     [residents, asked, ', shelter, place, ', being...
3     [13,000, people, receive, #wildfires, evacuati...
4     [got, sent, photo, ruby, #alaska, smoke, #wild...
5     [#rockyfire, update, =, >, california, hwy, .,...
6     [#flood, #disaster, heavy, rain, causes, flash...
7     [i'm, top, the, hill, i, see, fire, the, woods...
8     [there's, emergency, evacuation, happening, in...
9     [i'm, afraid, the, tornado, coming, our, area,...
10          [three, people, died, the, heat, wave, far]
11    [haha, south, tampa, getting, flooded, hah, -,...
12    [#raining, #flooding, #florida, #tampabay, #ta...
13          [#flood, bago, myanmar, #we, arrived, bago]
14    [damage, school, bus, 80, multi, car, crash, #...
15                                     [what's, man, ?]
16                                       [love, fruits]
17                                     [summer, 

In [9]:
word_dict = {}
for entry in data['tokens']:
    for i,word in enumerate(entry):
        if word in word_dict:
            word_dict[word] += 1
        else:
            word_dict[word] = 1
print(len(word_dict))

16264


In [10]:
wordtoidx = {}
idxtoword = {}
for i, word in enumerate(word_dict.keys()):
    wordtoidx[word] =i
    idxtoword[i] = word

In [11]:
data['tokens']

0       [deeds, the, reason, this, #earthquake, may, a...
1        [forest, fire, near, la, ronge, sask, ., canada]
2       [residents, asked, ', shelter, place, ', being...
3       [13,000, people, receive, #wildfires, evacuati...
4       [got, sent, photo, ruby, #alaska, smoke, #wild...
                              ...                        
7608    [two, giant, cranes, holding, bridge, collapse...
7609    [out, control, wild, fires, california, even, ...
7610    [m1, ., 94, [, 01:04, utc, ], ?, 5km, of, volc...
7611    [police, investigating, an, e-bike, collided, ...
7612    [latest, :, homes, razed, northern, california...
Name: tokens, Length: 7613, dtype: object

In [12]:
data['tokens'].values

array([list(['deeds', 'the', 'reason', 'this', '#earthquake', 'may', 'allah', 'forgive', 'us']),
       list(['forest', 'fire', 'near', 'la', 'ronge', 'sask', '.', 'canada']),
       list(['residents', 'asked', "'", 'shelter', 'place', "'", 'being', 'notified', 'officers', '.', 'other', 'evacuation', 'shelter', 'place', 'orders', 'expected']),
       ...,
       list(['m1', '.', '94', '[', '01:04', 'utc', ']', '?', '5km', 'of', 'volcano', 'hawaii', '.']),
       list(['police', 'investigating', 'an', 'e-bike', 'collided', 'a', 'car', 'little', 'portugal', '.', 'e-bike', 'rider', 'suffered', 'serious', 'non-life', 'threatening', 'injuries', '.']),
       list(['latest', ':', 'homes', 'razed', 'northern', 'california', 'wildfire', '-', 'abc', 'news'])],
      dtype=object)

In [13]:
data['tokens']

0       [deeds, the, reason, this, #earthquake, may, a...
1        [forest, fire, near, la, ronge, sask, ., canada]
2       [residents, asked, ', shelter, place, ', being...
3       [13,000, people, receive, #wildfires, evacuati...
4       [got, sent, photo, ruby, #alaska, smoke, #wild...
                              ...                        
7608    [two, giant, cranes, holding, bridge, collapse...
7609    [out, control, wild, fires, california, even, ...
7610    [m1, ., 94, [, 01:04, utc, ], ?, 5km, of, volc...
7611    [police, investigating, an, e-bike, collided, ...
7612    [latest, :, homes, razed, northern, california...
Name: tokens, Length: 7613, dtype: object

In [14]:
# Now lets create the dataset of the words with the integers 
X_data = []
for entry in data['tokens']:
    new_entry = entry
    for i in  range(len(new_entry)):
        # print(new_entry[i])
        new_entry[i] = wordtoidx[new_entry[i]]
        # print(new_entry[i])
    X_data.append(np.array(new_entry, dtype = np.int64))
    # print(new_entry)

X_data
    


[array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=int64),
 array([ 9, 10, 11, 12, 13, 14, 15, 16], dtype=int64),
 array([17, 18, 19, 20, 21, 19, 22, 23, 24, 15, 25, 26, 20, 21, 27, 28],
       dtype=int64),
 array([29, 30, 31, 32, 26, 27, 33], dtype=int64),
 array([34, 35, 36, 37, 38, 39, 32, 40, 41, 42], dtype=int64),
 array([43, 44, 45, 46, 33, 47, 15, 48, 49, 50, 51, 52, 53, 54, 10, 55, 56,
        32], dtype=int64),
 array([57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], dtype=int64),
 array([70, 71,  1, 72, 73, 74, 10,  1, 75, 76], dtype=int64),
 array([77, 78, 26, 79, 80, 81, 82, 83], dtype=int64),
 array([70, 84,  1, 85, 86, 87, 88, 76], dtype=int64),
 array([89, 30, 90,  1, 91, 92, 93], dtype=int64),
 array([ 94,  95,  96,  97,  98,  99,  55, 100, 101, 102,  95,  96, 103,
        104, 105,  73, 104, 106, 107], dtype=int64),
 array([108, 107, 109, 110, 111, 112, 113, 114,  15, 115, 116, 117],
       dtype=int64),
 array([ 57, 118, 119, 120, 121, 118], dtype=int64),
 array([122,  42

In [15]:
y_data_pre = data['target'].values
y_data_pre = torch.from_numpy(y_data_pre)

In [16]:
target_data = torch.zeros(y_data_pre.shape[0], 2, dtype=torch.float32)
target_data[range(len(y_data_pre)),y_data_pre] = 1
target_data

tensor([[0., 1.],
        [0., 1.],
        [0., 1.],
        ...,
        [0., 1.],
        [0., 1.],
        [0., 1.]])

In [17]:
target_data[102]

tensor([1., 0.])

In [18]:
class BatchNorm1D():
    """ This class creates a Batch Normalization layer"""
    def __init__(self, dim, epsilon = 1e-5, momentum = 0.1):
        self.epsilon = epsilon
        self.momentum = momentum
        self.training = True
        # Initializing learnable paramters
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

        # Buffers trained with a momentum update
        self.running_mean = torch.zeros(dim)
        self.running_variance = torch.ones(dim)

    def __call__(self, X):
        if X.ndim == 2:
            dim = 0
        if X.ndim == 3:
            dim = (0,1)
        # Calculate the forward pass
        if self.training:
            xmean = X.mean(dim, keepdim=True)
            xvar = X.var(dim,keepdim = True, unbiased = True)
        else:
            xmean = self.running_mean
            xvar = self.running_variance

        xhat = (X-xmean)/ torch.sqrt(xvar+self.epsilon)
        self.out = self.gamma * xhat + self.beta

        # Now update the running mean buffers
        if self.training:
            with torch.no_grad():
                self.running_mean = (1-self.momentum) * self.running_mean + self.momentum * xmean
                self.running_variance = (1-self.momentum)* self.running_variance + self.momentum * xvar
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]

In [19]:
class DisasterModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(len(word_dict), 20)
        self.l1 = nn.Linear(20,128)
        self.bn1 = BatchNorm1D(128)
        # self.tanh1 = nn.Tanh()
        self.l2 = nn.Linear(128,128)
        self.bn2 = BatchNorm1D(128)
        # self.tanh2 = nn.Tanh()
        self.l3 = nn.Linear(128,64)
        self.bn3 = BatchNorm1D(64)
        # self.tanh3 = nn.Tanh()
        self.l4 = nn.Linear(64,2)
    
    def forward(self, xix):
        x = torch.zeros(32,20)
        for i in range(len(xix)):
            # print(xix[i])
            ix_entry = torch.from_numpy(xix[i])
            # print(ix_entry)
            embedding = torch.sum(self.embedding.weight[ix_entry], dim = 0, keepdim = True)
            # print(embedding.shape)
            x[i] = embedding

        # print(x)
        # print(x.shape)
        x = torch.tanh(self.bn1(self.l1(x)))
        x = torch.tanh(self.bn2(self.l2(x)))
        x = torch.tanh(self.bn3(self.l3(x)))
        self.out = self.l4(x)
        return self.out

model = DisasterModel()
model


DisasterModel(
  (embedding): Embedding(16264, 20)
  (l1): Linear(in_features=20, out_features=128, bias=True)
  (l2): Linear(in_features=128, out_features=128, bias=True)
  (l3): Linear(in_features=128, out_features=64, bias=True)
  (l4): Linear(in_features=64, out_features=2, bias=True)
)

In [20]:
parameters = sum([x.nelement() for x in model.parameters()])
parameters

352866

In [21]:
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)

In [22]:
train_x_data = X_data[:int(0.9*len(X_data))]
test_x_data = X_data[int(0.9*len(X_data)):]
train_y_data = target_data[:int(0.9*len(X_data))]
test_y_data = target_data[int(0.9*len(X_data)):]

In [30]:
epochs = 100
accuracy = 0
for epoch in range(epochs):
    for i in range((len(train_x_data))//32):
        batch_data = train_x_data[i*32:(i+1)*32]
        batch_labels = train_y_data[i*32:(i+1)*32].view(32,-1,)

        # print(batch_labels.shape)
        logits = model(batch_data)
        loss = F.cross_entropy(logits, batch_labels)
        accuracy += (((logits.max(dim = 1).indices == batch_labels.max(dim=1).indices).sum().item())/32)*100
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
         
    if epoch % 10 == 0:
        print(f"loss : {loss.item()} | Accuracy : {accuracy/10*32}")
        accuracy = 0

loss : 0.22240331768989563 | Accuracy : 67670.0
loss : 0.22981202602386475 | Accuracy : 676280.0
loss : 0.23564784228801727 | Accuracy : 675920.0
loss : 0.5286725163459778 | Accuracy : 676440.0
loss : 0.31426844000816345 | Accuracy : 676490.0
loss : 0.24011477828025818 | Accuracy : 676400.0
loss : 0.4281781315803528 | Accuracy : 676470.0
loss : 0.2169116735458374 | Accuracy : 676340.0
loss : 0.3102492094039917 | Accuracy : 676540.0
loss : 0.25614768266677856 | Accuracy : 676600.0


In [24]:
ablaze = model.embedding.weight[wordtoidx['ablaze']]

In [25]:
fire = model.embedding.weight[wordtoidx['fire']]

In [26]:
torch.cdist(ablaze.unsqueeze(0), fire.unsqueeze(0))

tensor([[7.0570]], grad_fn=<CdistBackward0>)

In [27]:
# from torchmetrics.classification import BinaryAccuracy()
accuracy = 0
with torch.no_grad():
    for i in range((len(test_x_data))//32):
        batch_data = test_x_data[i*32:(i+1)*32]
        batch_labels = test_y_data[i*32:(i+1)*32].view(32,-1,)

        # print(batch_labels.shape)
        logits = model(batch_data)
        # logits = torch.exp(logits)
        # print(logits)
        
        loss = F.cross_entropy(logits, batch_labels)
        # optimizer.zero_grad()
        # loss.backward()
        # optimizer.step()
        
        accuracy += ((logits.max(dim = 1).indices == test_y_data[:32].max(dim=1).indices).sum().item()) 
        
        print(loss.item())
    print(f"Accuracy is {(accuracy/736)*100}")

3.588651418685913
3.180645227432251
2.8449416160583496
1.9167383909225464
2.662792682647705
1.1606980562210083
1.6568455696105957
1.6729085445404053
2.4846506118774414
3.7518067359924316
4.688756465911865
3.5694451332092285
2.4716033935546875
3.2828598022460938
3.9243111610412598
2.3103365898132324
2.460498809814453
1.865082859992981
2.574251413345337
4.417128562927246
3.6976840496063232
1.0328078269958496
1.9749455451965332
Accuracy is 47.28260869565217


In [154]:
logits.shape

torch.Size([32, 2])

In [28]:
(logits.max(dim = 1).indices == test_y_data[:32].max(dim=1).indices).sum()

tensor(15)

In [156]:
(23/32)*100

71.875

In [149]:
logits.max(dim=1).indices

tensor([1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
        1, 1, 0, 0, 1, 0, 0, 1])

In [150]:
test_y_data[:32].max(dim=1).indices

tensor([0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0])