In [None]:
import numpy as np 
import pandas as pd 
import os
import spacy
import string
import re
import numpy as np
from spacy.symbols import ORTH
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 

In [None]:
# Load the training split of the tweet-sentiment-extraction data.
train_df = pd.read_csv('../input/tweet-sentiment-extraction/train.csv')
# Force both text columns to plain strings so the string-splitting tokenizers below
# can be applied to every row.
# NOTE(review): presumably this guards against missing values being read as floats — confirm.
train_df['text'] = train_df['text'].astype(str)
train_df['selected_text'] = train_df['selected_text'].astype(str)

In [None]:
# Pattern for HTML line-break tags such as <br>, <br/> or < BR />, in any letter case.
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)


def sub_br(x):
    """Return the string `x` with every HTML line-break tag replaced by a newline."""
    cleaned = re_br.sub("\n", x)
    return cleaned

# Load the English spaCy pipeline; only its tokenizer is used by spacy_tok below.
# The 'en' shortcut name was removed in spaCy v3, so the full package name is tried
# first and the legacy shortcut is kept as a fallback for older installations.
try:
    my_tok = spacy.load('en_core_web_sm')
except OSError:
    my_tok = spacy.load('en')


def spacy_tok(x):
    """Tokenize the string `x` with the spaCy tokenizer and return the token texts as a list."""
    return [tok.text for tok in my_tok.tokenizer(x)]

def split_text(x):
    """Split the string `x` on single space characters.

    Consecutive spaces are not collapsed, so they produce empty-string tokens.
    """
    pieces = x.split(' ')
    return pieces

In [None]:
def tokenize_text(x):
    """Tokenize the 'text' field of row `x` by splitting it on single spaces.

    Intended to be used with ``DataFrame.apply(..., axis=1)``.
    """
    # Alternative (disabled): [tok.text for tok in my_tok.tokenizer(x['text'])]
    raw_text = x['text']
    return split_text(raw_text)
def tokenize_selected_text(x):
    """Tokenize the 'selected_text' field of row `x` by splitting it on single spaces.

    Intended to be used with ``DataFrame.apply(..., axis=1)``.
    """
    # Alternative (disabled): [tok.text for tok in my_tok.tokenizer(x['selected_text'])]
    raw_selected = x['selected_text']
    return split_text(raw_selected)

In [None]:
# Add one column of token lists per text column (row-wise apply of the tokenizers above).
train_df['tokenized_text'] = train_df.apply(tokenize_text, axis=1)
train_df['tokenized_selected_text'] = train_df.apply(tokenize_selected_text, axis=1)

In [None]:
# Quick look at the first rows to check the new token-list columns.
train_df.head()

In [None]:
def get_starting_index(x):
    """Return the index of the token in the text at which the selected span begins.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide 'tokenized_text' and 'tokenized_selected_text', both
        sequences of string tokens.

    Returns
    -------
    int
        * The first position where the selected tokens occur as a contiguous,
          token-aligned run inside the text tokens.
        * If there is no token-aligned match (for example the selection starts
          in the middle of a word), the index of the token that contains the
          first character of the selection, located in the space-joined text.
        * 0 if either token list is empty or the selection cannot be found.
    """
    tokens = list(x['tokenized_text'])
    selected = list(x['tokenized_selected_text'])
    if not tokens or not selected:
        return 0

    span_len = len(selected)
    # Exact, token-aligned match. A window that runs past the end of the text
    # can never be equal to the selection, so those start positions are skipped.
    for i in range(len(tokens) - span_len + 1):
        if tokens[i:i + span_len] == selected:
            return i

    # Fallback for selections that are not aligned with token boundaries:
    # find the character offset in the space-joined text and count the spaces
    # before it, which equals the index of the token containing that offset.
    joined_text = ' '.join(tokens)
    char_pos = joined_text.find(' '.join(selected))
    if char_pos < 0:
        return 0
    return joined_text.count(' ', 0, char_pos)
def get_ending_index(x):
    """Return the exclusive end token index of the selected span.

    Computed as the row's 'starting_idx' plus the number of tokens in
    'tokenized_selected_text'.
    """
    start = x['starting_idx']
    span_len = len(x['tokenized_selected_text'])
    return start + span_len
        

In [None]:
# Regression targets: token index where the selected span starts and (exclusive) ends.
# Order matters: get_ending_index reads the 'starting_idx' column created on the first line.
train_df['starting_idx'] = train_df.apply(get_starting_index, axis=1)
train_df['ending_idx'] = train_df.apply(get_ending_index, axis=1)

In [None]:
# Inspect the first ten rows to sanity-check the computed start/end indices.
train_df.head(10)

In [None]:
# Inputs are the token lists; targets are the start and end token indices.
X = train_df['tokenized_text'].to_numpy()
starting_idx, ending_idx = train_df['starting_idx'].to_list(), train_df['ending_idx'].to_list()

from sklearn.model_selection import train_test_split
# 80/20 train/validation split with a fixed seed; the three sequences are split consistently,
# so row i of X_val corresponds to y_start_val[i] and y_end_val[i].
X_train, X_val, y_start_train, y_start_val, y_end_train, y_end_val = train_test_split(X, starting_idx, ending_idx, test_size=0.2, random_state=42)

In [None]:
# Debugging aid: print every tokenized tweet that contains the exact token 'Doctor'.
for t in X:
    if 'Doctor' in t:
        print(t)

In [None]:
# Display the training inputs (array of token lists) for a visual check.
X_train

In [None]:
# Count token frequencies over the training split only (validation tokens are not counted).
counts = Counter()
for text in X_train:
    counts.update(text)
# Number of distinct tokens before pruning.
len(counts.keys())

In [None]:
# Drop tokens seen fewer than 5 times; iterating over a list() snapshot of the keys
# makes it safe to delete from `counts` inside the loop.
for word in list(counts):
    if counts[word] < 5:
        del counts[word]
# Number of distinct tokens that remain after pruning.
len(counts.keys())

In [None]:
# Build the token -> integer id mapping and the parallel id -> token list `words`.
# Id 0 is used for padding ("") and id 1 for out-of-vocabulary tokens ("UNK");
# the remaining ids follow the iteration order of `counts`.
# NOTE(review): if "" or "UNK" is itself a key of `counts`, the loop overwrites its
# entry in vocab2index with a new id and appends a duplicate to `words` — confirm
# whether that is intended.
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [None]:
# Number of tokens in each training example.
X_train_len = [len(t) for t in X_train]

In [None]:
# 95th percentile of the training sequence lengths, used to choose the maximum sequence length N.
# NOTE(review): the original note here said N=40, but encode_sentence and TweetDataset both
# default to N=30 and are used without overriding it — confirm which value is intended.
np.percentile(X_train_len, 95)

## Dataset

In [None]:
def encode_sentence(x, vocab2index, N=30, padding_start=True):
    """Convert a token sequence into a fixed-length array of vocabulary ids.

    Parameters
    ----------
    x : iterable of str
        Tokens of one sentence.
    vocab2index : dict
        Maps tokens to integer ids; tokens that are not keys are mapped to
        ``vocab2index["UNK"]``.
    N : int
        Length of the returned array. Only the first N tokens are kept.
    padding_start : bool
        If true, the ids are written at the beginning of the array and the
        zeros fill the tail. If false, the ids are right-aligned and the zeros
        come first.

    Returns
    -------
    numpy.ndarray
        int32 array of length N, zero-filled wherever no token id was written.
    """
    token_ids = np.array([vocab2index.get(tok, vocab2index["UNK"]) for tok in x])
    n_keep = min(N, len(token_ids))
    encoded = np.zeros(N, dtype=np.int32)
    if not padding_start:
        # Right-align: zeros first, then the kept ids.
        encoded[N - n_keep:] = token_ids[:n_keep]
    else:
        # Left-align: kept ids first, then zeros.
        encoded[:n_keep] = token_ids[:n_keep]
    return encoded

In [None]:
class TweetDataset(Dataset):
    """Dataset of encoded tweets paired with their span start and end targets.

    Every token sequence in `X` is encoded once, at construction time, with
    `encode_sentence` and the module-level `vocab2index`; `N` and
    `padding_start` are forwarded to `encode_sentence` unchanged.
    """

    def __init__(self, X, y_start, y_end, N=30, padding_start=True):
        encoded = []
        for tokens in X:
            encoded.append(encode_sentence(tokens, vocab2index, N, padding_start))
        self.X = encoded
        self.y_start = y_start
        self.y_end = y_end

    def __len__(self):
        """Number of examples, taken from the length of the start targets."""
        return len(self.y_start)

    def __getitem__(self, idx):
        """Return the tuple (encoded tokens, start target, end target) at position `idx`."""
        sample = (self.X[idx], self.y_start[idx], self.y_end[idx])
        return sample

In [None]:
# Wrap both splits; N and padding_start are left at the TweetDataset defaults.
train_ds = TweetDataset(X_train, y_start_train, y_end_train)
val_ds = TweetDataset(X_val, y_start_val, y_end_val)

In [None]:
batch_size = 10
# Reshuffle the training examples every epoch so mini-batches differ between epochs.
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
# The validation loader must keep the dataset order: val_metrics maps each batch
# position back to the raw tokens in X_val by its running index.
val_dl = DataLoader(val_ds, batch_size=batch_size)

In [None]:
# Pull one batch from the training loader and display the encoded inputs as a sanity check.
x, y1, y2 = next(iter(train_dl))
x

In [None]:
def loss_fn(start_logits, end_logits, start_positions, end_positions):
    """Return the sum of the mean-squared errors of the start and end predictions.

    Despite the `*_logits` names, the predictions are compared to the target
    positions with MSE, i.e. they are treated as regression outputs.
    """
    return F.mse_loss(start_logits, start_positions) + F.mse_loss(end_logits, end_positions)

In [None]:
class TweetGRU(torch.nn.Module):
    """GRU encoder with a two-output linear head for span start/end regression.

    The input ids are embedded (id 0 is the padding index), passed through
    dropout with p=0.5 and a batch-first GRU; the final hidden state of the
    last GRU layer is projected to two values per example.
    """

    def __init__(self, vocab_size, embedding_dim=50, hidden_dim=50):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(0.5)
        self.linear = nn.Linear(hidden_dim, 2)

    def forward(self, x):
        """Return the pair (start prediction, end prediction) for a batch of id sequences.

        `x` is expected to hold integer token ids, batch dimension first; each
        returned tensor has one value per example.
        """
        embedded = self.dropout(self.embeddings(x))
        _, final_hidden = self.gru(embedded)
        # final_hidden[-1] is the last layer's hidden state after the final time step.
        span = self.linear(final_hidden[-1])
        return span[..., 0], span[..., 1]

In [None]:
def update_optimizer(optimizer, lr):
    """Set the learning rate of every parameter group of `optimizer` to `lr` (in place)."""
    for group in optimizer.param_groups:
        group["lr"] = lr

In [None]:
def train_epocs(model, optimizer, train_dl, val_dl, epochs=10):
    """Train `model` for `epochs` epochs, printing train/validation metrics after each one.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(x)`` and expected to return (start prediction, end prediction).
    optimizer : torch.optim.Optimizer
        Optimizer that updates the model's parameters.
    train_dl, val_dl : iterable
        Loaders yielding ``(x, y_start, y_end)`` batches.
    epochs : int
        Number of passes over `train_dl`.

    Batches are moved to whatever device the model's parameters live on, so the
    same loop works on GPU and on CPU. After every epoch the sample-weighted
    mean training loss is printed together with the validation loss and Jaccard
    score returned by `val_metrics`.
    """
    # Follow the model's device instead of assuming CUDA is available.
    device = next(model.parameters()).device
    for _ in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y_start, y_end in train_dl:
            x = x.long().to(device)
            y_start = y_start.float().to(device)
            y_end = y_end.float().to(device)
            batch_n = y_start.shape[0]

            optimizer.zero_grad()
            y_start_pred, y_end_pred = model(x)
            loss = loss_fn(y_start_pred, y_end_pred, y_start, y_end)
            loss.backward()
            optimizer.step()

            # Weight by batch size so a smaller final batch does not skew the mean.
            sum_loss += loss.item() * batch_n
            total += batch_n
        val_loss, val_jacc = val_metrics(model, val_dl)
        # An empty training loader yields NaN instead of a ZeroDivisionError.
        train_loss = sum_loss / total if total else float("nan")
        print("train loss %.3f val loss %.3f and jaccard score %.3f" % (train_loss, val_loss, val_jacc))

In [None]:
def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is
    ``|A ∩ B| / |A ∪ B|`` over the resulting word sets.

    When neither string contains any word the union is empty; the two sets are
    then identical and 1.0 is returned instead of raising ZeroDivisionError.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    if not a and not b:
        return 1.0
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [None]:
def val_metrics(model, valid_dl, tokenized_texts=None):
    """Evaluate `model` on `valid_dl` and return (mean loss, mean Jaccard score).

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(x)`` and expected to return (start prediction, end prediction).
    valid_dl : iterable
        Loader yielding ``(x, y_start, y_end)`` batches. It must iterate in the
        same order as `tokenized_texts`, because examples are matched to their
        raw tokens by running position.
    tokenized_texts : sequence of token sequences, optional
        Raw tokens of the validation examples. Defaults to the module-level
        `X_val`, which keeps the original two-argument call working.

    Returns
    -------
    (float, float)
        Sample-weighted mean loss and the mean word-level Jaccard score between
        the predicted span and the reference span, both rebuilt from the raw
        tokens. (NaN, NaN) if the loader yields no examples.

    Notes
    -----
    Predicted positions are rounded to the nearest integer (rather than
    truncated toward zero) and clamped at 0 before slicing.
    """
    if tokenized_texts is None:
        tokenized_texts = X_val
    # Follow the model's device instead of assuming CUDA is available.
    device = next(model.parameters()).device

    model.eval()
    total = 0
    sum_loss = 0.0
    jacc_score_list = []
    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for x, y_start, y_end in valid_dl:
            x = x.long().to(device)
            y_start = y_start.float().to(device)
            y_end = y_end.float().to(device)
            y_start_pred, y_end_pred = model(x)
            loss = loss_fn(y_start_pred, y_end_pred, y_start, y_end)
            batch_n = y_start.shape[0]
            sum_loss += loss.item() * batch_n

            # Transfer each tensor to the host once per batch instead of once per element.
            pred_starts = y_start_pred.cpu().tolist()
            pred_ends = y_end_pred.cpu().tolist()
            true_starts = y_start.cpu().tolist()
            true_ends = y_end.cpu().tolist()

            for i in range(batch_n):
                tokens = list(tokenized_texts[total + i])
                pred_lo = max(0, int(round(pred_starts[i])))
                pred_hi = max(0, int(round(pred_ends[i])))
                pred_str = " ".join(tokens[pred_lo:pred_hi])
                selected_text = " ".join(tokens[int(true_starts[i]):int(true_ends[i])])
                jacc_score_list.append(jaccard(pred_str, selected_text))

            total += batch_n

    if total == 0:
        return float("nan"), float("nan")
    return sum_loss / total, np.mean(jacc_score_list)

In [None]:
vocab_size = len(words)
print(vocab_size)
# Use the GPU when one is available and fall back to the CPU otherwise, so that
# building the model does not fail on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# 500-dimensional embeddings and a 500-unit GRU hidden state.
model = TweetGRU(vocab_size, 500, 500).to(device)

# Optimize only the parameters that require gradients.
parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = torch.optim.Adam(parameters, lr=0.001)

In [None]:
# First training run: 10 epochs, printing train/validation metrics after each epoch.
train_epocs(model, optimizer, train_dl, val_dl, epochs=10)

In [None]:
# Continue training the same model for 30 more epochs.
# The learning rate is set to 0.001, the same value the optimizer was created with,
# so this call does not change it unless the value here is edited.
update_optimizer(optimizer, lr=0.001)
train_epocs(model, optimizer, train_dl, val_dl, epochs=30)