In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from torchvision import transforms

torch.manual_seed(1)

<torch._C.Generator at 0x7f17fc4ff250>

In [2]:
class EmojiDataset(Dataset):
    def __init__(self, csv_file, transform=None):
        self.tweets_frame = pd.read_csv(csv_file)
        self.transform = transform

    def __len__(self):
        return len(self.landmarks_frame)

    def __getitem__(self, index):
        if torch.is_tensor(index):
            index = index.tolist()

        sentence = str(self.tweets_frame.iloc[index, 0])
        words = np.array(sentence.split(" "))
        emoji = self.tweets_frame.iloc[index, 1]

        sample = {'words': words, 'emoji':emoji}
        if self.transform:
            sample = self.transform(sample)

        return sample

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [29]:

from collections import defaultdict

emojis_we_want = {
    ':red_heart:': 0, 
    ':face_with_tears_of_joy:': 0, 
    ':loudly_crying_face:': 0, 
    ':smiling_face_with_heart-eyes:': 0, 
    ':fire:': 0,
    ':folded_hands:': 0,
    ':weary_face:': 0,
    ':person_shrugging:': 0,
    ':two_hearts:': 0,
    ':sparkles:': 0
  }
training_data = EmojiDataset(csv_file='/content/drive/MyDrive/Colab Notebooks/py_dev_clean.csv')
train_sample = []
done = 0
i = 0

num_in_emoji = 30000

for sample in training_data:
    if sample['emoji'] in emojis_we_want.keys() and emojis_we_want[sample['emoji']] < num_in_emoji:
        train_sample.append(sample)
        emojis_we_want[sample['emoji']] += 1

    if sum(list(emojis_we_want.values())) >= len(emojis_we_want.keys())*num_in_emoji:
      break



In [30]:
import random

random.shuffle(train_sample)

cut = int(0.2*len(train_sample))
test_sample = train_sample[:cut]
train_sample = train_sample[cut:]

In [31]:
print(len(train_sample))
print(len(test_sample))

250000
50000


In [32]:
word_to_ix = {}
# For each words-list (sentence) and tags-list in each tuple of training_data
for sample in train_sample:
    for word in sample['words']:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)  # Assign each word with a unique index

for sample in test_sample:
    for word in sample['words']:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)  # Assign each word with a unique index
print(len(word_to_ix))

85557


In [33]:
tag_to_ix = {}
# For each words-list (sentence) and tags-list in each tuple of training_data
for sample in train_sample:
    if sample['emoji'] not in tag_to_ix:
        tag_to_ix[sample['emoji']] = len(tag_to_ix)  # Assign each word with a unique index
print(tag_to_ix)

{':fire:': 0, ':loudly_crying_face:': 1, ':red_heart:': 2, ':face_with_tears_of_joy:': 3, ':weary_face:': 4, ':smiling_face_with_heart-eyes:': 5, ':person_shrugging:': 6, ':sparkles:': 7, ':folded_hands:': 8, ':two_hearts:': 9}


In [34]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [35]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.emojis = prepare_sequence(emojis, tag_to_ix)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim*2, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        lstm_out = torch.relu(lstm_out)

        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [36]:
EMBEDDING_DIM = 100
HIDDEN_DIM = 64
emojis = tag_to_ix.keys()
emojis_tensor = prepare_sequence(emojis, tag_to_ix)
emoji_embeddings = nn.Embedding(len(emojis), EMBEDDING_DIM)

emoji_embeds = emoji_embeddings(emojis_tensor)




In [37]:
device = 'cuda' if torch.cuda.is_available()==True else 'cpu'
device = torch.device(device)
print(device)

cuda


In [38]:
device = 'cuda' if torch.cuda.is_available()==True else 'cpu'
device = torch.device(device)
print(device)

model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
model = model.to(device)

loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
model.train()


for epoch in range(3):
    epoch_loss = 0
    samples_seen = 0

    partial_loss = 0
    for sample in train_sample:
        sentence = sample['words']
        tag = sample['emoji']

        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence([tag]*len(sentence), tag_to_ix)

        # Step 3. Run our forward pass.
        sentence_in = sentence_in.to(device)
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()

        targets = targets.to(device)
        loss = loss_function(tag_scores, targets)
        epoch_loss += loss.item()
        partial_loss += loss.item()
        loss.backward()
        optimizer.step()
        samples_seen += 1
        if samples_seen % 1000 == 0:
            print(epoch, samples_seen, epoch_loss/samples_seen, partial_loss/1000)
            partial_loss = 0

        

# See what the scores are after training

cuda
0 1000 2.3060647532939913 2.3060647532939913
0 2000 2.30477021920681 2.3034756851196287
0 3000 2.3041515522797904 2.3029142184257507
0 4000 2.3023261347413064 2.2968498821258545
0 5000 2.301332301330566 2.2973569676876067
0 6000 2.3007160632014276 2.2976348725557325
0 7000 2.299792316743306 2.2942498379945757
0 8000 2.29835746216774 2.2883134801387786
0 9000 2.296960373653306 2.285783665537834
0 10000 2.2954829972028734 2.282186609148979
0 11000 2.294522835363041 2.2849212169647215
0 12000 2.2929376913905144 2.2755011076927185
0 13000 2.2914291738088313 2.2733269628286363
0 14000 2.2901544205035482 2.2735826275348665
0 15000 2.288796290620168 2.269782472252846
0 16000 2.287399386227131 2.2664458203315734
0 17000 2.286263515851077 2.2680895898342133
0 18000 2.2843997136553127 2.252715076327324
0 19000 2.2833092019557952 2.263679991364479
0 20000 2.281680224853754 2.2507296599149704
0 21000 2.279895883582887 2.2442090581655503
0 22000 2.2782879212769593 2.244520712852478
0 23000 2.2

KeyboardInterrupt: ignored

In [39]:
import scipy.stats as stats
import random
# {':red_heart:': 0, ':face_with_tears_of_joy:': 1, ':smiling_face_with_heart-eyes:': 2, ':loudly_crying_face:': 3, ':fire:': 4}
ix_to_emoji = {v:k for k,v in tag_to_ix.items()}
x = 0

i = 0

random.shuffle(train_sample)

true_pos = 0
total = 0
model.eval()

from collections import defaultdict

prediction_nums = {}
for emoji in tag_to_ix.keys():
  prediction_nums[emoji] = defaultdict(int)

print(prediction_nums)
with torch.no_grad():
  for sample in test_sample:
    sentence = sample['words']
    # print(sentence, sample['emoji'])
    inputs = prepare_sequence(sentence, word_to_ix).to(device)
    tag_scores = model(inputs)
    # print(tag_scores)
    # print(np.argsort(-tag_scores.cpu().numpy()))
    
    scores = torch.argmax(tag_scores, dim=1)
    # print(scores)
    predicted = ix_to_emoji[stats.mode(scores.cpu().numpy())[0][0]]

    if predicted == sample['emoji']:
      prediction_nums[sample['emoji']]['true_pos'] += 1

      for temp_emoji in prediction_nums.keys():
        if temp_emoji != predicted: prediction_nums[temp_emoji]['true_neg'] += 1
      true_pos += 1
    
    else:
      prediction_nums[predicted]['false_pos'] += 1
      prediction_nums[sample['emoji']]['false_neg'] += 1

    total += 1
    # print(predicted, sample['emoji'])
    if i%10000 == 0:
      print(true_pos/total)
    i+=1

print(prediction_nums)

{':fire:': defaultdict(<class 'int'>, {}), ':loudly_crying_face:': defaultdict(<class 'int'>, {}), ':red_heart:': defaultdict(<class 'int'>, {}), ':face_with_tears_of_joy:': defaultdict(<class 'int'>, {}), ':weary_face:': defaultdict(<class 'int'>, {}), ':smiling_face_with_heart-eyes:': defaultdict(<class 'int'>, {}), ':person_shrugging:': defaultdict(<class 'int'>, {}), ':sparkles:': defaultdict(<class 'int'>, {}), ':folded_hands:': defaultdict(<class 'int'>, {}), ':two_hearts:': defaultdict(<class 'int'>, {})}
1.0
0.39686031396860316
0.4014799260036998
0.3997200093330222
0.40023999400015
{':fire:': defaultdict(<class 'int'>, {'true_neg': 16630, 'true_pos': 3353, 'false_pos': 4795, 'false_neg': 1630}), ':loudly_crying_face:': defaultdict(<class 'int'>, {'true_neg': 18524, 'false_pos': 3566, 'false_neg': 3638, 'true_pos': 1459}), ':red_heart:': defaultdict(<class 'int'>, {'true_neg': 18692, 'false_neg': 3671, 'true_pos': 1291, 'false_pos': 1978}), ':face_with_tears_of_joy:': defaultdic

In [43]:
for emoji in prediction_nums.keys():
  prediction_nums[emoji]['precision'] = (prediction_nums[emoji]['true_pos'])/(prediction_nums[emoji]['true_pos']+prediction_nums[emoji]['false_pos'])
  prediction_nums[emoji]['recall'] = (prediction_nums[emoji]['true_pos'])/(prediction_nums[emoji]['true_pos']+prediction_nums[emoji]['false_neg'])
  f1 = 2*((prediction_nums[emoji]['precision']*prediction_nums[emoji]['recall'])/(prediction_nums[emoji]['precision']+prediction_nums[emoji]['recall']))
  print(emoji, "prec:", prediction_nums[emoji]['precision'], "recall:", prediction_nums[emoji]['recall'])
  print("f1:", f1)

:fire: prec: 0.4115120274914089 recall: 0.6728878185831828
f1: 0.5106998705353744
:loudly_crying_face: prec: 0.2903482587064677 recall: 0.2862468118501079
f1: 0.28828294803398535
:red_heart: prec: 0.39492199449372895 recall: 0.26017734784361146
f1: 0.3136921394727251
:face_with_tears_of_joy: prec: 0.29909879272232615 recall: 0.35243438188739734
f1: 0.3235835172921266
:weary_face: prec: 0.5394238059135709 recall: 0.2832968345610193
f1: 0.3714919723273724
:smiling_face_with_heart-eyes: prec: 0.3476831091180867 recall: 0.23381584238037798
f1: 0.2796009135713427
:person_shrugging: prec: 0.40863188266410194 recall: 0.686527974146637
f1: 0.5123219534252769
:sparkles: prec: 0.5413877054169203 recall: 0.35199841709537
f1: 0.42661870503597116
:folded_hands: prec: 0.47203077416252603 recall: 0.5965161028964958
f1: 0.5270221904080172
:two_hearts: prec: 0.3666580109062581 recall: 0.28082736674622116
f1: 0.3180538348913166
