In [1]:
from torchtext import *
from torchtext.data import *
from torch.utils.data import DataLoader

import nltk
nltk.download('punkt')
from nltk import word_tokenize

import torch
import torch.nn as nn
import torch.optim as optim

import time
import matplotlib.pyplot as plt

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# emotions_list = ['Joy', 'Trust', 'Fear', 'Surprise', 'Sadness', 'Disgust', 'Anger', 'Anticipation', 'Neutral']
emotions_list = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust', 'neutral']

In [3]:
# 5.3.1
txt_field = data.Field(tokenize=word_tokenize, lower=True, include_lengths=True, batch_first=True)
label_field = data.Field(sequential=False, use_vocab=False, batch_first=True)

# dataset_fields = [('text', txt_field), ('Joy', label_field), 
#                 ('Trust', label_field), ('Fear', label_field), ('Surprise', label_field), 
#                 ('Sadness', label_field), ('Disgust', label_field), ('Anger', label_field), 
#                 ('Anticipation', label_field), ('Neutral', label_field)]

dataset_fields = [('text', txt_field), ('anger', label_field), 
                ('anticipation', label_field), ('disgust', label_field), ('fear', label_field), 
                ('joy', label_field), ('sadness', label_field), ('surprise', label_field), 
                ('trust', label_field), ('neutral', label_field)]


# train, test= TabularDataset.splits(path='./', train='train.csv', test='valid.csv', format='csv', 
#     fields = dataset_fields, skip_header=True)

train, test= TabularDataset.splits(path='./', train='train-80000.csv', test='valid-80000.csv', format='csv', 
    fields = dataset_fields, skip_header=True)


label_field.build_vocab(train)
txt_field.build_vocab(train,vectors=vocab.Vectors("glove.840B.300d.txt"),max_size=20000)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

train_iter, test_iter = data.BucketIterator.splits((train, test), batch_size=32, sort_key=lambda x: len(x.text), sort_within_batch=True, device=device)

In [4]:
# for batch in train_iter:
#     print(batch.__dict__.keys())
#     print(batch.fear)
#     break

In [5]:
class BatchWrapper:
    def __init__(self, dl, x_var, y_vars):
        self.dl, self.x_var, self.y_vars = dl, x_var, y_vars # 传入自变量x列表和因变量y列表

    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var) # 在这个封装中只有一个自变量

            if self.y_vars is not None: # 把所有因变量cat成一个向量
                temp = [getattr(batch, feat).unsqueeze(1) for feat in self.y_vars]
                y = torch.cat(temp, dim=1).float()
            else:
                y = torch.zeros((1))

            yield (x, y)

    def __len__(self):
        return len(self.dl)

train_dl = BatchWrapper(train_iter, "text", emotions_list)
test_dl = BatchWrapper(test_iter, "text", emotions_list)

In [6]:
for batch in train_dl:
    print(batch)
    break

((tensor([[    5,    19,    29,   679,    68,    19],
        [  177,  3049,   141,   763,    11,  3996],
        [  108,   158,    20,  2059,     3,   433],
        [  196,   322,    28,   927,    59,     9],
        [   28,  3535,  3463,    25,   138,     9],
        [  440,     5,    19,   785,  1573,     9],
        [    3,  1884,   336,  4247,   677,     2],
        [ 1271,     2,  1084,     2,   130,     2],
        [    4,   158,    24,     3,  4591,     2],
        [  103,     6,   416,     3,   357,    15],
        [  361,    33,     6,     8,   880,     2],
        [15597,     9,  1000,     9,   774,    63],
        [ 1634,   731,     5,   661,   175,   331],
        [   33,   241,   140,    83,    89,     5],
        [ 1357,    25,  2568,     4,  1430,     2],
        [   54,    12,   290,     9,  1073,    19],
        [    5,   649,    22,   262,     6,  1884],
        [  125,   653,    50,    94,    20,   135],
        [  205,   903,   307,  1186,   213,     5],
        [ 

In [7]:
input_dim = len(txt_field.vocab)
embedding_dim = 64
hidden_dim = 128
output_dim = 9
dropout = 0.5
input_dim

20002

In [8]:
class LSTM(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text, text_len):
        embedded = self.embedding(text)
        output, (last_hidden_state, last_cell_state) = self.rnn(embedded)
        linear_input = last_hidden_state[-1]
        return self.fc(self.dropout(linear_input))

In [9]:
def count_parameters(model):
    temp = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'The model architecture:\n\n', model)
    print(f'\nThe model has {temp:,} trainable parameters')

model = LSTM(input_dim, embedding_dim, hidden_dim, output_dim, dropout)

In [10]:
count_parameters(model)

The model architecture:

 LSTM(
  (embedding): Embedding(20002, 64)
  (rnn): LSTM(64, 128, batch_first=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=128, out_features=9, bias=True)
)

The model has 1,380,617 trainable parameters


In [11]:
def test_accuracy(tensor, y):
    preds = torch.round(torch.sigmoid(tensor))
    correct = (preds == y).float()
    return correct.sum() / len(correct) * 100


def save_checkpoint(save_path, model, optimizer, val_loss):
    if save_path==None:
        return
    save_path = save_path 
    state_dict = {'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'val_loss': val_loss}
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')


def load_checkpoint(model, optimizer, save_path):
    state_dict = torch.load(save_path)
    model.load_state_dict(state_dict['model_state_dict'])
    optimizer.load_state_dict(state_dict['optimizer_state_dict'])
    val_loss = state_dict['val_loss']
    print(f'Model loaded from <== {save_path}')
    return val_loss


def TRAIN(net, train_iter, valid_iter, num_epochs, eval_every, total_step, criterion, optimizer, val_loss, device, save_name):
    
    running_loss = 0.0
    running_corrects = 0
    running_num = 0
    global_step = 0
    train_loss = []
    valid_loss = []
    count = []
    if val_loss==None:
        best_val_loss = float("Inf")  
    else: 
        best_val_loss=val_loss

    since = time.time()
    for epoch in range(num_epochs):  # loop over the dataset multiple times
        
        for batch in train_iter: ## batch is ((text, text_len_array), 2d_labels), text_len_array hv all same value thx to BucketIterator
            
            net.train()

            '''Training of the model'''
            # Forward pass
            text = batch[0][0] 
            text_len = batch[0][1][0].item()
            label = batch[1]
            
            outputs = net(text, text_len).squeeze(1)
            loss = criterion(outputs, label)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            global_step += 1

            running_loss += loss.item()

            '''Evaluating the model every x steps'''
            if global_step % eval_every == 0:
                with torch.no_grad():
                    net.eval()
                    val_running_loss = 0.0
                    for val_batch in valid_iter:
                        val_text = val_batch[0][0]
                        val_text_len = val_batch[0][1][0].item()
                        val_label = val_batch[1]
                        
                        val_outputs = net(val_text, val_text_len).squeeze(1)
                        val_loss = criterion(val_outputs, val_label)
                        val_running_loss += val_loss.item()

                    average_train_loss = running_loss / eval_every
                    average_val_loss = val_running_loss / len(valid_iter)
                    train_loss.append(average_train_loss)
                    valid_loss.append(average_val_loss)
                    count.append(global_step)

                    print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                          .format(epoch+1, num_epochs, global_step, total_step, average_train_loss, average_val_loss))

                    running_loss = 0.0
                    
                    if average_val_loss < best_val_loss:
                        best_val_loss = average_val_loss
                        save_checkpoint(save_name, net, optimizer, best_val_loss)

    print('Finished Training, training time: %.4f' % (time.time() - since))

    print('training loss: %.4f\nvalidation loss: %.4f' % (train_loss[-1], valid_loss[-1]))
    plot_x = count
    p1, = plt.plot(plot_x, train_loss, color='red', linewidth=1, label='training loss')
    p2, = plt.plot(plot_x, valid_loss,  color='blue', linewidth=1, label='validation loss')
    plt.legend(handles=[p1, p2], loc='center left', bbox_to_anchor=(1, 0.5))
    plt.ylabel('loss')
    plt.xlabel('number of evaluation')
    plt.title('loss over number of evaluation')
    plt.show()

    return train_loss, valid_loss


def eval(model, iterator):
    
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            text = batch[0][0]
            text_len = batch[0][1][0].item()
            label = batch[1]

            outputs = model(text, text_len).squeeze(1)
            acc = test_accuracy(outputs, label) / 9

            epoch_acc += acc.item()

    avg_acc = epoch_acc / len(iterator)
    print(f'Test accuracy: {avg_acc}%')

In [12]:
model = LSTM(input_dim, embedding_dim, hidden_dim, output_dim, dropout).to(device)
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss().to(device)
num_epochs = 15
eval_every = 1000
total_step = len(train_iter)*num_epochs
best_val_loss = None
save_path = f'tweet_net.pt'

# train_loss, valid_loss = TRAIN(model, train_dl, test_dl, num_epochs, eval_every, total_step, criterion, optimizer, 
#       best_val_loss, device, save_path)

#eval(model2, test_iter)

In [13]:
load_checkpoint(model, optimizer, f'tweet_net.pt')

Model loaded from <== tweet_net.pt


0.18419405172020198

In [14]:
eval(model, test_dl)

Test accuracy: 94.16180645751953%


In [27]:
import pandas as pd
gordon_df = pd.read_csv('onlyengandsentimenttotext.csv', encoding='utf8', dtype=str, escapechar='\\')


def predict_sentence_sentiment(model, sentence):
    model.eval()
    tokenized = word_tokenize(sentence)
    indexed = [txt_field.vocab.stoi[t] for t in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed)
    tensor = tensor.unsqueeze(0)
    length_tensor = torch.LongTensor(length)
    output = model(tensor.to(device), length_tensor)
    prediction = torch.round(torch.sigmoid(output)).int()
    return prediction.cpu().numpy()[0]

def predict_sentiment(row):
    row[emotions_list] = predict_sentence_sentiment(model, row['text'])
    return row

for e in emotions_list:
    gordon_df[e] = 0
gordon_df = gordon_df.apply(predict_sentiment, axis=1)
gordon_df.to_csv('onlyengandsentimenttotext_sentiment.csv', header=True, index=False, encoding='utf-8')

In [34]:
gordon_df

Unnamed: 0,date,user,text,anger,anticipation,disgust,fear,joy,sadness,surprise,trust,neutral
0,2020-03-13 12:10:38,thagnome70065,Nice read. Trump passes coronavirus test wi...,0,0,0,1,0,0,0,0,0
1,2020-03-13 12:10:39,drewthompson116,Stocks set to surge following worst day since ...,0,0,0,1,0,1,1,1,0
2,2020-03-13 12:10:40,jorgenseptember,Maxine Waters Turns Into A Political Crisis....,0,0,1,0,0,1,1,0,0
3,2020-03-13 12:10:40,padilloh,Katie Porter just saved countless lives. Than...,0,0,0,0,0,0,0,0,1
4,2020-03-13 12:10:40,_D_S_J_,Meanwhile in Egypt they made someone dress u...,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1232755,2020-03-28 06:58:30,ViralDVora,Even a normal Sneezing and Cough a Normal Fev...,0,0,0,0,0,0,0,0,1
1232756,2020-03-28 06:58:31,LicPravinw,In the news we heard that 80 lakh mobile users...,0,1,0,1,0,1,0,1,0
1232757,2020-03-28 06:58:32,steffid06,As a diaspora member your communication matte...,0,0,0,0,1,0,0,1,0
1232758,2020-03-28 06:58:32,ArcadiaT3,If u take a few letters from PANDEMIC you get ...,0,0,0,0,0,0,0,0,1


In [33]:
gordon_df.astype(bool).sum(axis=0)

date            1232760
user            1232760
text            1232760
anger            235652
anticipation     305780
disgust          162761
fear             352080
joy              247016
sadness          274793
surprise         159612
trust            360131
neutral          500139
dtype: int64

In [35]:
def filterTime(row):
    row['date'] = row['date'].split(" ")[0]
    return row

gordon2_df = gordon_df.apply(filterTime, axis=1)
gordon2_df

Unnamed: 0,date,user,text,anger,anticipation,disgust,fear,joy,sadness,surprise,trust,neutral
0,2020-03-13,thagnome70065,Nice read. Trump passes coronavirus test wi...,0,0,0,1,0,0,0,0,0
1,2020-03-13,drewthompson116,Stocks set to surge following worst day since ...,0,0,0,1,0,1,1,1,0
2,2020-03-13,jorgenseptember,Maxine Waters Turns Into A Political Crisis....,0,0,1,0,0,1,1,0,0
3,2020-03-13,padilloh,Katie Porter just saved countless lives. Than...,0,0,0,0,0,0,0,0,1
4,2020-03-13,_D_S_J_,Meanwhile in Egypt they made someone dress u...,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1232755,2020-03-28,ViralDVora,Even a normal Sneezing and Cough a Normal Fev...,0,0,0,0,0,0,0,0,1
1232756,2020-03-28,LicPravinw,In the news we heard that 80 lakh mobile users...,0,1,0,1,0,1,0,1,0
1232757,2020-03-28,steffid06,As a diaspora member your communication matte...,0,0,0,0,1,0,0,1,0
1232758,2020-03-28,ArcadiaT3,If u take a few letters from PANDEMIC you get ...,0,0,0,0,0,0,0,0,1


In [36]:
arr = ['date','anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust', 'neutral']
gordon3_df = gordon2_df[arr].groupby('date').sum()
gordon3_df

Unnamed: 0_level_0,anger,anticipation,disgust,fear,joy,sadness,surprise,trust,neutral
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-03-13,9992,11916,7640,14874,9529,12332,6913,14240,20499
2020-03-14,11355,13887,8731,16817,11373,13892,7747,16151,23305
2020-03-15,14891,18755,11037,23054,14876,18595,9785,21757,32102
2020-03-16,16612,22422,11843,25685,18098,19918,11450,26385,37570
2020-03-17,11277,15316,8079,16990,12359,13252,7743,17793,24796
2020-03-18,18587,25495,12948,28922,20250,22154,13250,29719,39431
2020-03-19,18156,23174,12388,26491,18698,20438,11867,27151,37206
2020-03-20,18927,25313,13233,27947,20466,21748,12866,29756,40225
2020-03-21,17109,22358,11690,25898,18340,19774,11668,26624,37513
2020-03-22,18550,23255,12664,27062,18776,20888,12074,26968,42151


In [38]:
gordon3_df.to_csv('onlyengandsentimenttotext_sentiment_date.csv', header=True, index=True, encoding='utf-8')

In [40]:
gordon4_df = gordon3_df.divide(gordon3_df.sum(axis=1),axis=0)
gordon4_df

Unnamed: 0_level_0,anger,anticipation,disgust,fear,joy,sadness,surprise,trust,neutral
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-03-13,0.092574,0.1104,0.070783,0.137805,0.088285,0.114254,0.064048,0.131931,0.18992
2020-03-14,0.092124,0.112666,0.070835,0.136437,0.09227,0.112707,0.062852,0.131034,0.189075
2020-03-15,0.09033,0.113769,0.066951,0.139847,0.090239,0.112798,0.059356,0.131979,0.194732
2020-03-16,0.087439,0.118021,0.062337,0.135196,0.095261,0.104841,0.060269,0.138881,0.197755
2020-03-17,0.088374,0.120027,0.063313,0.133145,0.096854,0.103852,0.060679,0.139438,0.194318
2020-03-18,0.088192,0.120969,0.061436,0.13723,0.096083,0.105117,0.062869,0.141011,0.187093
2020-03-19,0.092837,0.118495,0.063343,0.135456,0.095608,0.104505,0.060679,0.138831,0.190245
2020-03-20,0.089923,0.120263,0.06287,0.132777,0.097234,0.103325,0.061127,0.141371,0.19111
2020-03-21,0.089588,0.117074,0.061213,0.13561,0.096034,0.103543,0.061097,0.139412,0.19643
2020-03-22,0.091656,0.114903,0.062573,0.133713,0.092772,0.103208,0.059658,0.133249,0.208268


In [None]:
# filter RT@user:
# filter @user
# filter #hashtag
# filter emoji
# filter non-eng
# pass data to model
# change format of time
# calculate percentage of each emotion
# calculate relationship with new infected count
# 