In [None]:
!pip install transformers

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.optim import Adam
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer, BertModel, BertTokenizer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd

# One TSV file per disaster event. Exactly one assignment below is left
# uncommented; swap the comment marker to analyse a different event.
#filename = "srilanka_floods_final_data.tsv"
#filename = "hurricane_harvey_final_data.tsv"
#filename = "hurricane_irma_final_data.tsv"
filename = "iraq_iran_earthquake_final_data.tsv"
#filename = "hurricane_maria_final_data.tsv"
#filename = "california_wildfires_final_data.tsv"
#filename = "mexico_earthquake_final_data.tsv"

# The file is tab-separated; the trailing expression previews the first rows
# as the cell output.
df = pd.read_csv(filename, delimiter="\t")
df.head()

Unnamed: 0,tweet_id,image_id,text_info,text_info_conf,image_info,image_info_conf,text_human,text_human_conf,image_human,image_human_conf,image_damage,image_damage_conf,tweet_text,image_url,image_path
0,929989361453621249,929989361453621249_0,informative,1.0,informative,0.6513,injured_or_dead_people,1.0,other_relevant_information,0.6513,,,RT @PressTV: UPDATE: Death toll from Iran’s qu...,http://pbs.twimg.com/media/DOfT-eRWkAAJ_kB.jpg,data_image/iraq_iran_earthquake/13_11_2017/929...
1,929989388863508480,929989388863508480_0,not_informative,0.7535,not_informative,1.0,,,,,,,RT @CAFOD: We pray for all those affected by t...,http://pbs.twimg.com/media/DOftf6AW0AAzjkE.jpg,data_image/iraq_iran_earthquake/13_11_2017/929...
2,929989414826024960,929989414826024960_0,informative,1.0,not_informative,1.0,injured_or_dead_people,1.0,not_relevant_or_cant_judge,1.0,,,RT @ReutersWorld: JUST IN: Death toll reaches ...,http://pbs.twimg.com/media/DOf76fBUMAAqpv_.jpg,data_image/iraq_iran_earthquake/13_11_2017/929...
3,929989443913691136,929989443913691136_0,informative,1.0,informative,0.6031,other_relevant_information,1.0,injured_or_dead_people,0.6031,,,RT @LaylaAlhussein: A magnitude 7.2 earthquake...,http://pbs.twimg.com/media/DOeoQ75XUAEpZwN.jpg,data_image/iraq_iran_earthquake/13_11_2017/929...
4,929989443913691136,929989443913691136_1,informative,1.0,informative,1.0,other_relevant_information,1.0,rescue_volunteering_or_donation_effort,1.0,,,RT @LaylaAlhussein: A magnitude 7.2 earthquake...,http://pbs.twimg.com/media/DOeoQ75WAAIsov_.jpg,data_image/iraq_iran_earthquake/13_11_2017/929...


In [None]:
# Per-row boolean masks for the text annotation and the image annotation.
cond1 = df['text_info'].eq('informative')
cond2 = df['image_info'].eq('informative')
cond3 = df['text_info'].eq('not_informative')
cond4 = df['image_info'].eq('not_informative')

# Keep a row only when both annotations agree: either both say
# 'informative' or both say 'not_informative'.
both_informative = cond1 & cond2
both_not_informative = cond3 & cond4

df_clean = df.loc[both_informative | both_not_informative]
print(df_clean.head())


In [None]:
# Load the tokenizer that belongs to the 'albert-base-v2' checkpoint.
# AutoTokenizer resolves the tokenizer class from the checkpoint itself;
# BertTokenizer expects a BERT WordPiece vocabulary, which an ALBERT
# checkpoint does not provide, so the two must not be mixed.
tokenizer = AutoTokenizer.from_pretrained('albert-base-v2')

# Maps the string classes found in the `text_info` column to the integer
# class ids used as training targets.
labels = {'not_informative': 0, 'informative': 1}

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing each tokenized tweet with its integer label.

    The module-level `tokenizer` and `labels` objects are read when an
    instance is built: every text is tokenized eagerly in `__init__` and the
    results are kept in memory.
    """

    def __init__(self, df, max_length=512):
        """Tokenize every row of `df` up front.

        Args:
            df: Table-like object indexable by column name. The
                'text_info' column holds label strings that must be keys of
                `labels` (a KeyError is raised otherwise); the 'tweet_text'
                column holds the texts to tokenize.
            max_length: Length every text is padded or truncated to.
                Defaults to 512, the value that was previously hard-coded;
                a smaller value reduces memory and compute for short texts.
        """
        self.labels = [labels[label] for label in df['text_info']]
        # Each entry is the tokenizer output for one text, requested as
        # PyTorch tensors. Presumably each tensor carries a leading
        # dimension of 1 (consumers squeeze dim 1) -- TODO confirm.
        self.texts = [
            tokenizer(
                text,
                padding='max_length',
                max_length=max_length,
                truncation=True,
                return_tensors="pt",
            )
            for text in df['tweet_text']
        ]

    def classes(self):
        """Return the list of integer labels, one per sample."""
        return self.labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label stored at `idx` wrapped in a NumPy array."""
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        """Return the tokenizer output stored at `idx`."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the `(tokenized_text, label)` pair stored at `idx`."""
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [None]:
class BertClassifier(nn.Module):
    """Pretrained transformer encoder followed by dropout and a linear head.

    `forward` returns raw, unnormalized class scores (logits). No activation
    is applied to them: `nn.CrossEntropyLoss` expects raw logits, and a ReLU
    on the output would clamp every score to be non-negative and stop
    gradients whenever all scores for a sample are negative.
    """

    def __init__(self, dropout=0.5, model_name='albert-base-v2', num_classes=2):
        """Build the encoder and the classification head.

        Args:
            dropout: Dropout probability applied to the pooled encoder output.
            model_name: Checkpoint passed to `AutoModel.from_pretrained`.
                AutoModel selects the architecture that matches the
                checkpoint, so the ALBERT default is loaded as an ALBERT
                model rather than being forced into a BERT architecture.
                The chosen model must return a pooled output as the second
                element of its output tuple.
            num_classes: Number of output scores produced per sample.
        """
        super().__init__()

        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder's own configuration instead of
        # assuming a hidden width of 768.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_id, mask):
        """Return the class logits for a batch.

        Args:
            input_id: Token ids, forwarded to the encoder as `input_ids`.
            mask: Attention mask, forwarded to the encoder as `attention_mask`.

        Returns:
            The output of the linear head applied to the (dropout-regularised)
            pooled encoder output.
        """
        # With return_dict=False the encoder returns a tuple; only the second
        # element (the pooled output) is used.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)

        return self.linear(dropout_output)

In [None]:
print("len df_clean", len(df_clean))

np.random.seed(112)

# Shuffle once with a fixed random_state, then cut the shuffled rows into
# 80% train / 10% validation / 10% test by position. Plain positional slices
# are used instead of np.split, which on a DataFrame goes through the
# deprecated DataFrame.swapaxes.
df_shuffled = df_clean.sample(frac=1, random_state=42)
train_end = int(.8 * len(df_shuffled))
val_end = int(.9 * len(df_shuffled))

df_train = df_shuffled.iloc[:train_end]
df_val = df_shuffled.iloc[train_end:val_end]
df_test = df_shuffled.iloc[val_end:]

print(len(df_train),len(df_val), len(df_test))

#print(df_val.iloc[0])
#print("\n\n\n")
#print(df_test.iloc[0])


In [None]:
from torch.optim import Adam
from tqdm import tqdm

def train(model, train_data, val_data, learning_rate, epochs, batch_size=2):
    """Fine-tune `model` and print loss/accuracy for every epoch.

    Args:
        model: Module called as `model(input_id, mask)` that returns one row
            of class logits per sample.
        train_data: Training rows, wrapped in `Dataset`.
        val_data: Validation rows, wrapped in `Dataset`.
        learning_rate: Learning rate for the Adam optimizer.
        epochs: Number of passes over the training data.
        batch_size: Samples per batch for both loaders (default 2, the value
            that was previously hard-coded).

    Returns:
        None. Metrics are printed; `model` is updated in place and is left in
        evaluation mode after the final validation pass.
    """
    train_set, val_set = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=batch_size)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    # Denominators for the per-sample averages; max() avoids a division by
    # zero when a split is empty.
    n_train = max(len(train_set), 1)
    n_val = max(len(val_set), 1)

    for epoch_num in range(epochs):

        print("Current Epoch ...", epoch_num)

        # Enable dropout for the optimisation pass; it is switched off again
        # below before validation.
        model.train()
        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device).long()
            # Drop the singleton dimension at position 1 from both tensors so
            # the mask has the same layout as the token ids.
            mask = train_input['attention_mask'].squeeze(1).to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            batch_loss = criterion(output, train_label)
            # The criterion returns the batch mean; weight it by the batch
            # size so the running total divides cleanly by the sample count.
            total_loss_train += batch_loss.item() * train_label.size(0)
            total_acc_train += (output.argmax(dim=1) == train_label).sum().item()

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        # Disable dropout so validation metrics are deterministic.
        model.eval()
        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():

            for val_input, val_label in val_dataloader:

                val_label = val_label.to(device).long()
                mask = val_input['attention_mask'].squeeze(1).to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item() * val_label.size(0)
                total_acc_val += (output.argmax(dim=1) == val_label).sum().item()

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / n_train: .3f} \
                | Train Accuracy: {total_acc_train / n_train: .3f} \
                | Val Loss: {total_loss_val / n_val: .3f} \
                | Val Accuracy: {total_acc_val / n_val: .3f}')
                  
# Hyperparameters for the fine-tuning run.
EPOCHS = 10
LR = 2e-5

# Fresh classifier with its default settings.
model = BertClassifier()

train(model, df_train, df_val, learning_rate=LR, epochs=EPOCHS)


In [None]:
def evaluate(model, test_data, batch_size=2):
    """Print the accuracy of `model` on `test_data`.

    Args:
        model: Module called as `model(input_id, mask)` that returns one row
            of class logits per sample.
        test_data: Test rows, wrapped in `Dataset`.
        batch_size: Samples per batch (default 2, the value that was
            previously hard-coded).

    Returns:
        None. The accuracy is printed; `model` is left in evaluation mode.
    """
    test_set = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=batch_size)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Switch off dropout: without this the predictions are randomly perturbed
    # and the reported accuracy changes from run to run.
    model.eval()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

            test_label = test_label.to(device)
            # Drop the singleton dimension at position 1 from both tensors so
            # the mask has the same layout as the token ids.
            mask = test_input['attention_mask'].squeeze(1).to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            total_acc_test += (output.argmax(dim=1) == test_label).sum().item()

    # max() avoids a division by zero when the test split is empty.
    print(f'Test Accuracy: {total_acc_test / max(len(test_set), 1): .3f}')


In [None]:
# Report accuracy of the fine-tuned model on the held-out test split.
evaluate(model=model, test_data=df_test)

Test Accuracy:  0.760
