In [6]:
# python -m spacy download en_core_web_sm    (run once)

In [1]:
import pandas as pd

# Read train.csv into a DataFrame and preview its first rows.
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [9]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
# spaCy-backed tokenizer using the en_core_web_sm pipeline.
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

def df_iterator_content(df):
    """Yield the tokenized 'text' entry of every row of ``df``, in row order.

    Each yielded item is whatever the module-level ``tokenizer`` returns for
    that row's text.
    """
    for text in df['text']:
        yield tokenizer(text)

# Build the vocabulary from the tokenized 'text' column; "<unk>" is the only
# special token and min_freq=5 is the frequency threshold for keeping a token.
vocab = build_vocab_from_iterator(df_iterator_content(df), specials=["<unk>"], min_freq=5)
# Lookups of tokens that are not in the vocabulary fall back to the "<unk>" index.
vocab.set_default_index(vocab["<unk>"])
vocab_size = len(vocab)
print(vocab_size)

3240


In [10]:
import torch
from torch.utils.data import Dataset

class TwitterDesasterDataset(Dataset):
    """Map-style dataset yielding ``(token_ids, label)`` pairs from a DataFrame.

    The 'text' column is tokenized with the module-level ``tokenizer`` and
    converted to ids with the module-level ``vocab``; the 'target' column
    supplies the label.

    Args:
        df: DataFrame with 'text' and 'target' columns.
        word_count: Exact length of every returned id sequence.
        vocab_size: Stored on the instance; not used by the dataset itself.
    """

    def __init__(self, df, word_count=500, vocab_size=10000):
        self.df = df
        self.word_count = word_count
        self.vocab_size = vocab_size

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(tensor of word_count token ids, int label)`` for row ``idx``."""
        row = self.df.iloc[idx]
        label = int(row["target"])

        # Cut overly long sequences, then right-pad short ones with index 0.
        token_ids = vocab(tokenizer(row["text"]))[:self.word_count]
        missing = self.word_count - len(token_ids)
        token_ids = token_ids + [0] * missing

        return torch.tensor(token_ids), label

# Quick check: build a dataset with 30-id sequences and inspect the first sample.
twitter_dataset = TwitterDesasterDataset(df, word_count=30, vocab_size=vocab_size)
x,y=twitter_dataset[0]
print(x.shape)
print(y)

torch.Size([30])
1


In [17]:
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

# Seed PyTorch's global random number generator.
torch.manual_seed(1)

class MyLSTM(nn.Module):
    """LSTM classifier over fixed-length sequences of token ids.

    Token ids are embedded and fed through a batch-first LSTM. After dropout,
    the hidden states of all time steps are flattened into one vector per
    sample and mapped by a single linear layer to ``out_size`` scores, which
    are returned as log-probabilities.

    Args:
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        vocab_size: Number of rows in the embedding table.
        out_size: Number of output classes.
        word_count: Sequence length the linear layer is sized for; inputs
            must have exactly this many tokens per sample.
        dropout: Dropout probability, used both inside the LSTM and on its output.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, out_size, word_count=50, dropout=0.2, num_layers=2):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True,
        )
        self.dropout = nn.Dropout(p=dropout)
        # Every time step's hidden state feeds the classifier, hence the
        # hidden_dim * word_count input width.
        self.hidden2output = nn.Linear(hidden_dim * word_count, out_size)

    def forward(self, xb):
        """Return log-probabilities of shape (batch, out_size) for id batch ``xb``."""
        batch_size = xb.shape[0]
        states, _ = self.lstm(self.word_embeddings(xb))
        # Flatten (batch, seq, hidden) -> (batch, seq * hidden) rather than
        # keeping only the last time step.
        flat_states = self.dropout(states).reshape(batch_size, -1)
        scores = self.hidden2output(flat_states)
        return F.log_softmax(scores, dim=1)


In [20]:
def evaluate(model, dataloader):
    """Compute the classification accuracy of ``model`` over ``dataloader``.

    The model is switched to eval mode (and left there) and gradients are
    disabled for the duration of the pass. Each batch is moved to the device
    the model's parameters live on, so a model on CUDA can be evaluated with
    a dataloader that yields CPU tensors.

    Args:
        model: Module mapping a batch of inputs to per-class scores of shape
            (batch, num_classes).
        dataloader: Iterable of ``(text, label)`` tensor pairs.

    Returns:
        Fraction of correctly predicted labels as a float, or 0.0 when the
        dataloader yields no samples.
    """
    model.eval()
    # A model without parameters has no device of its own; leave batches as-is.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    total_acc, total_count = 0, 0
    with torch.no_grad():
        for text, label in dataloader:
            if model_device is not None:
                text = text.to(model_device)
                label = label.to(model_device)
            predicted_label = model(text)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)

    # Guard against an empty dataloader instead of raising ZeroDivisionError.
    if total_count == 0:
        return 0.0
    return total_acc / total_count

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [22]:
# Hyperparameters
# Model sizes: embedding width, number of output classes, LSTM hidden width.
embed_dim = 64
num_class = 2
hidden_dim = 128
# Fixed sequence length in tokens; used for both the dataset and the model below.
word_count = 300
EPOCHS = 30 # number of epochs
LR = 0.01  # learning rate
# patience / factor handed to the learning-rate scheduler
scheduler_patience=7
scheduler_factor=0.2
# weight decay handed to the optimizer
weight_decay=1e-3
BATCH_SIZE = 64 # batch size for training
# dropout probability handed to MyLSTM
dropout=0.5

# Smoke test: push one small batch through a freshly built model.
model = MyLSTM(embed_dim, hidden_dim, vocab_size, num_class, word_count=word_count, dropout=dropout).to(device)
dataset = TwitterDesasterDataset(df, word_count=word_count, vocab_size=vocab_size)
loader = torch.utils.data.DataLoader(dataset, batch_size=5, shuffle=True)

xb, yb = next(iter(loader))
print("yb", yb)
print("xb", xb.shape)
# The model lives on `device`, so the batch has to be moved there as well;
# without this the forward pass fails whenever CUDA is selected.
model(xb.to(device))

yb tensor([0, 0, 0, 0, 0])
xb torch.Size([5, 300])


tensor([[-0.6471, -0.7415],
        [-0.6332, -0.7570],
        [-0.6749, -0.7117],
        [-0.6709, -0.7159],
        [-0.5939, -0.8033]], grad_fn=<LogSoftmaxBackward0>)

In [23]:
# Split the dataset 80/20 into training and validation subsets.
train_size = int(len(dataset) * 0.8)
valid_size = len(dataset) - train_size
train_dataset, valid_dataset = torch.utils.data.random_split(dataset, [train_size, valid_size])

# import torch DataLoader
from torch.utils.data import DataLoader

model = MyLSTM(embed_dim, hidden_dim, vocab_size, num_class, word_count=word_count, dropout=dropout).to(device)

# Reshuffle the training samples every epoch so batches differ between epochs;
# the validation order is irrelevant and stays fixed.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)

# MyLSTM.forward already returns log-probabilities (log_softmax), so the
# matching criterion is NLLLoss. CrossEntropyLoss expects raw logits and would
# apply log_softmax a second time.
loss_func = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=weight_decay)
# Lower the learning rate by `scheduler_factor` once the monitored metric has
# stopped improving for `scheduler_patience` epochs.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=scheduler_factor, patience=scheduler_patience)

