Classifying Disater Tweets

In [1]:
import pandas as pd
import numpy as np
import re
import torch
import torch.nn as nn
torch.manual_seed(42)
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F

In [2]:
data = pd.read_csv("D:/Datasets/nlp-getting-started/train.csv")

In [3]:
data

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [4]:
# remove the url links of the tweets
g = re.compile(r"https?://[\w.]*/?\w*\s*")  # the regular expression used to remove the urls from the tweet data.
data['text'] = data['text'].apply(lambda y: re.subn(g, '', string = y)[0])
data


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii.,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [5]:
# take the hashtags of the tweet and add them to a seperate column
def get_hashtags(x):
    g = re.compile(r"#[A-Za-z]*\s*")
    hash_tag_list = re.findall(g,string = x)
    for i,tag in enumerate(hash_tag_list):
        hash_tag_list[i] =  tag[1:].strip()
    return hash_tag_list
def get_mentions(x):
    mention_list = re.findall(re.compile('@[a-zA-Z0-9_]*\s*'),string = x)
    if len(mention_list) >=1:
        return mention_list
    else:
        return np.NaN
    
data['hashtag'] = data['text'].apply(lambda x : get_hashtags(x))
data['mentions'] = data['text'].apply(lambda x : get_mentions(x))
data

Unnamed: 0,id,keyword,location,text,target,hashtag,mentions
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,[earthquake],
1,4,,,Forest fire near La Ronge Sask. Canada,1,[],
2,5,,,All residents asked to 'shelter in place' are ...,1,[],
3,6,,,"13,000 people receive #wildfires evacuation or...",1,[wildfires],
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[Alaska, wildfires]",
...,...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,[],
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,[],"[@aria_ahrary , @TheTawniest ]"
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii.,1,[],
7611,10872,,,Police investigating after an e-bike collided ...,1,[],


In [6]:

def clean(entry):
    entry = entry.lower()
    entry = entry.replace('\n', '')
    entry = re.subn(re.compile('=>'), '', string=entry)[0]
    entry = re.subn(re.compile('!!+'), '', string=entry)[0]
    entry = re.subn(re.compile('#[^a-z]+'), '', string = entry)[0]
    entry = re.subn(re.compile('\.'), '', string=entry)[0]
    entry = re.subn(re.compile('@[a-zA-Z0-9_]*\s*'), '', string = entry)[0]
    entry = entry.replace('*','')
    
    entry_split = entry.split()
    for i, word in enumerate(entry_split):
        if word.startswith("'"):
            entry_split[i] = word[1:]
        elif word.endswith("'"):
            entry_split[i] = word[:-1]
    entry = ' '.join(entry_split)
    return entry

data['text'] = data['text'].apply(lambda x : clean(x))




We would use a mention token for tweets that have mentions in them.
That would be @ in our vocabulary.

In [7]:
# Lets create a vocabulary
vocab = set()
vocab_dict = {}
for entry in data['text']:
    word_list = entry.split(' ')
    for word in word_list:
        # word = word.replace('\n', '')
        # word = re.subn(re.compile('!!+'), '', string=word)[0]
        # word = re.subn(re.compile('#[^a-z]+'), '', string = word)[0]
        if word in vocab_dict:
            vocab_dict[word] += 1
        else:
            vocab_dict[word] = 1
        vocab.add(word)
print(len(vocab))


18874


In [2]:
import nltk

In [8]:
word_dict = {}
id_to_word = {}
for i,word in enumerate(vocab_dict.keys()):
    word_dict[word] = i
    id_to_word[i] = word

word_dict

{'our': 0,
 'deeds': 1,
 'are': 2,
 'the': 3,
 'reason': 4,
 'of': 5,
 'this': 6,
 '#earthquake': 7,
 'may': 8,
 'allah': 9,
 'forgive': 10,
 'us': 11,
 'all': 12,
 'forest': 13,
 'fire': 14,
 'near': 15,
 'la': 16,
 'ronge': 17,
 'sask': 18,
 'canada': 19,
 'residents': 20,
 'asked': 21,
 'to': 22,
 'shelter': 23,
 'in': 24,
 'place': 25,
 'being': 26,
 'notified': 27,
 'by': 28,
 'officers': 29,
 'no': 30,
 'other': 31,
 'evacuation': 32,
 'or': 33,
 'orders': 34,
 'expected': 35,
 '13,000': 36,
 'people': 37,
 'receive': 38,
 '#wildfires': 39,
 'california': 40,
 'just': 41,
 'got': 42,
 'sent': 43,
 'photo': 44,
 'from': 45,
 'ruby': 46,
 '#alaska': 47,
 'as': 48,
 'smoke': 49,
 'pours': 50,
 'into': 51,
 'a': 52,
 'school': 53,
 '#rockyfire': 54,
 'update': 55,
 'hwy': 56,
 '20': 57,
 'closed': 58,
 'both': 59,
 'directions': 60,
 'due': 61,
 'lake': 62,
 'county': 63,
 '-': 64,
 '#cafire': 65,
 '#flood': 66,
 '#disaster': 67,
 'heavy': 68,
 'rain': 69,
 'causes': 70,
 'flash': 71

In [9]:
X_data_pre = data['text']
y_data_pre = data['target']

In [10]:
X_data = X_data_pre
y_data = y_data_pre
X_data

0       our deeds are the reason of this #earthquake m...
1                   forest fire near la ronge sask canada
2       all residents asked to shelter in place are be...
3       13,000 people receive #wildfires evacuation or...
4       just got sent this photo from ruby #alaska as ...
                              ...                        
7608    two giant cranes holding a bridge collapse int...
7609    the out of control wild fires in california ev...
7610             m194 [01:04 utc]?5km s of volcano hawaii
7611    police investigating after an e-bike collided ...
7612    the latest: more homes razed by northern calif...
Name: text, Length: 7613, dtype: object

In [11]:
# Lets convert each tweet into the corresponding integer representation and make the training set

# print(X_data_new.shape)
def entry_convert(entry):
    # print(type(entry))
    entry_split = entry.split()
    # print(entry_split)
    for i,word in enumerate(entry_split):
        entry_split[i] = word_dict[word]
    return np.array(entry_split, dtype = np.int64)
X_data = X_data.apply(lambda v: entry_convert(v))
X_data

0              [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
1                            [13, 14, 15, 16, 17, 18, 19]
2       [12, 20, 21, 22, 23, 24, 25, 2, 26, 27, 28, 29...
3                        [36, 37, 38, 39, 32, 34, 24, 40]
4       [41, 42, 43, 6, 44, 45, 46, 47, 48, 49, 45, 39...
                              ...                        
7608    [224, 4221, 4602, 4603, 52, 4561, 1732, 51, 46...
7609    [3, 192, 5, 1873, 1319, 412, 24, 40, 657, 24, ...
7610             [9937, 18106, 18107, 603, 5, 9939, 9940]
7611    [233, 4270, 1392, 88, 6252, 6234, 264, 52, 140...
7612    [3, 13788, 649, 4605, 15386, 28, 1904, 40, 290...
Name: text, Length: 7613, dtype: object

In [12]:
y_data = torch.tensor(y_data.astype(np.int64))

y_data

tensor([1, 1, 1,  ..., 1, 1, 1])

In [13]:
len(y_data)

7613

In [31]:
target_data = torch.zeros(y_data.shape[0], 2, dtype=torch.float32)

In [32]:
target_data[range(len(y_data)),y_data] = 1

In [33]:
target_data[102]

tensor([1., 0.])

In [17]:
# y_data = y_data.clone().view(1,-1)
# y_data

In [64]:
class BatchNorm1D():
    """ This class creates a Batch Normalization layer"""
    def __init__(self, dim, epsilon = 1e-5, momentum = 0.1):
        self.epsilon = epsilon
        self.momentum = momentum
        self.training = True
        # Initializing learnable paramters
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

        # Buffers trained with a momentum update
        self.running_mean = torch.zeros(dim)
        self.running_variance = torch.ones(dim)

    def __call__(self, X):
        if X.ndim == 2:
            dim = 0
        if X.ndim == 3:
            dim = (0,1)
        # Calculate the forward pass
        if self.training:
            xmean = X.mean(dim, keepdim=True)
            xvar = X.var(dim,keepdim = True, unbiased = True)
        else:
            xmean = self.running_mean
            xvar = self.running_variance

        xhat = (X-xmean)/ torch.sqrt(xvar+self.epsilon)
        self.out = self.gamma * xhat + self.beta

        # Now update the running mean buffers
        if self.training:
            with torch.no_grad():
                self.running_mean = (1-self.momentum) * self.running_mean + self.momentum * xmean
                self.running_variance = (1-self.momentum)* self.running_variance + self.momentum * xvar
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]

##### Defining the model

In [82]:
class DisasterModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(len(word_dict), 20)
        self.l1 = nn.Linear(20,128)
        self.bn1 = BatchNorm1D(128)
        # self.tanh1 = nn.Tanh()
        self.l2 = nn.Linear(128,128)
        self.bn2 = BatchNorm1D(128)
        # self.tanh2 = nn.Tanh()
        self.l3 = nn.Linear(128,64)
        self.bn3 = BatchNorm1D(64)
        # self.tanh3 = nn.Tanh()
        self.l4 = nn.Linear(64,2)
    
    def forward(self, xix):
        x = torch.zeros(32,20)
        for i in range(len(xix)):
            # print(xix[i])
            ix_entry = torch.from_numpy(xix[i])
            # print(ix_entry)
            embedding = torch.sum(self.embedding.weight[ix_entry], dim = 0, keepdim = True)
            # print(embedding.shape)
            x[i] = embedding

        # print(x)
        # print(x.shape)
        x = torch.tanh(self.bn1(self.l1(x)))
        # x = torch.tanh(self.bn2(self.l2(x)))
        x = torch.tanh(self.bn3(self.l3(x)))
        self.out = self.l4(x)
        return self.out

model = DisasterModel()
model


DisasterModel(
  (embedding): Embedding(18874, 20)
  (l1): Linear(in_features=20, out_features=128, bias=True)
  (l2): Linear(in_features=128, out_features=128, bias=True)
  (l3): Linear(in_features=128, out_features=64, bias=True)
  (l4): Linear(in_features=64, out_features=2, bias=True)
)

In [20]:
b  = torch.randn((5,4))
x = torch.tensor([1,2,3], dtype=torch.int64)
b[x]

tensor([[ 0.3834,  1.0620,  0.4433,  1.2012],
        [ 0.9260,  0.5059,  0.5502,  1.5115],
        [ 0.5053, -1.1794,  0.4182,  0.6096]])

In [83]:
parameters = sum([x.nelement() for x in model.parameters()])
parameters

405066

In [84]:
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)

In [85]:
train_x_data = X_data[:int(0.9*len(X_data))]
test_x_data = X_data[int(0.9*len(X_data)):]

In [86]:
epochs = 50
for epoch in range(epochs):
    for i in range((len(train_x_data.values))//32):
        batch_data = X_data.values[i*32:(i+1)*32]
        batch_labels = target_data[i*32:(i+1)*32].view(32,-1,)

        # print(batch_labels.shape)
        logits = model(batch_data)
        loss = F.cross_entropy(logits, batch_labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    if epoch % 10 == 0:
        print(loss.item())

0.7313444018363953
0.261134535074234
0.34552183747291565
0.27206602692604065
0.2340467870235443


In [81]:
with torch.no_grad():
    for i in range((len(test_x_data.values))//32):
        batch_data = X_data.values[i*32:(i+1)*32]
        batch_labels = target_data[i*32:(i+1)*32].view(32,-1,)

        # print(batch_labels.shape)
        logits = model(batch_data)
        # logits = torch.exp(logits)
        # print(logits)
        
        loss = F.cross_entropy(logits, batch_labels)
        # optimizer.zero_grad()
        # loss.backward()
        # optimizer.step()
        print(loss)
        break
        
    # if epoch % 10 == 0:
        print(loss.item())

tensor(0.0001)


In [67]:
with torch.no_grad():
    dev_data, dev_labels = next(iter(dev_dataloader))

    dev_logits = model(dev_data)

    dev_probs = torch.softmax(dev_logits, dim = 1)
    print(len(dev_probs))
    


    

16


In [68]:
print(dev_probs)

tensor([[1.6707e-04, 9.9983e-01],
        [7.6953e-04, 9.9923e-01],
        [1.6092e-04, 9.9984e-01],
        [1.4233e-04, 9.9986e-01],
        [7.6953e-04, 9.9923e-01],
        [4.4240e-01, 5.5760e-01],
        [5.4918e-06, 9.9999e-01],
        [3.9603e-04, 9.9960e-01],
        [2.0885e-03, 9.9791e-01],
        [1.5038e-01, 8.4962e-01],
        [4.2081e-05, 9.9996e-01],
        [6.5492e-03, 9.9345e-01],
        [1.9650e-03, 9.9804e-01],
        [7.6953e-04, 9.9923e-01],
        [9.7717e-06, 9.9999e-01],
        [5.8694e-03, 9.9413e-01]])


In [69]:
(dev_probs.max(dim = 1).indices == dev_labels).sum()

tensor(3)