In [160]:
import sys
import os
sys.path.append("..")
from globals import ROOT_DIR
from data_providers import TextDataProvider
import argparse
import configparser
from torch import optim
from experiment_builder import ExperimentBuilder
from data_providers import *
import os
from models.cnn import *
from models.multilayer_perceptron import multi_layer_perceptron
import tweepy
from text_utils import *

config = configparser.ConfigParser()
config.read('../config.ini')
path_data = os.path.join(ROOT_DIR, config['DEFAULT']['PATH_DATA'])
path_labels = os.path.join(ROOT_DIR, config['DEFAULT']['PATH_LABELS'])
from models.model_pytorch import TransformerModel, load_openai_pretrained_model, DEFAULT_CONFIG, DoubleHeadModel


In [2]:
path_data = 'data/founta_data.npy'
data = np.load(os.path.join(ROOT_DIR, path_data))
data = data[()]

In [3]:
def convert_to_feature_embeddings(x_embed, key='embedding'):
    if key == 'tokens': #  for tdidf
        return [' '.join(x[key]) for x in x_embed]
    return [x[key] for x in x_embed]

def process_outputs(outputs, experiment_flag=1):
    """
    Cleans text, creates context tweets for reply experiment, and tokenizes
    :param outputs: tweet data / label
    :param experiment_flag: denotes what round of experiments this is, 1) tweet, 2) tweet + context tweet 3) reply net
    :return:
    """
    replies = np.load(os.path.join(ROOT_DIR, 'data/reply_data.npy'))
    replies = replies[()]

    outputs_processed = []
    for output in outputs:
        # add context tweet
        status_id = str(output['in_reply_to_status_id'])
        if status_id in replies:
            output['context_tweet'] = replies[status_id]
        else:
            output['context_tweet'] = ' '.join([' '] * TWEET_SENTENCE_SIZE)  # will be a random embedding

        #  tokenize / clean
        if experiment_flag == 1:
            output['tokens'] = output['tweet'].translate(str.maketrans('', '', string.punctuation)).lower()
        elif experiment_flag == 2:
            output['tokens'] = output['context_tweet'].translate(str.maketrans('', '', string.punctuation)).lower() + \
                               output['tweet'].translate(str.maketrans('', '', string.punctuation)).lower()

        output['tokens'] = output['tokens'].split(' ')
        outputs_processed.append(output)
    return outputs_processed


def extract_tweets(label_data, data, experiment_flag):
    print("=== Extracting tweets from JSON ===")
    labels = []
    labels_map = {'hateful': 0, 'abusive': 1, 'normal': 2, 'spam': 3}
    error_count = 0
    outputs = []

    for key, value in data.items():

        if int(value['id_str']) not in label_data:
            error_count += 1
            continue
        output = {}
        output['tweet'] = value['text']
        output['label'] = labels_map[label_data[int(value['id_str'])]]
        labels.append(output['label'])
        output['retweet_count'] = value['retweet_count']
        output['retweeted'] = int(value['retweeted'])
        output['in_reply_to_status_id'] = value['in_reply_to_status_id'] if value[
                                                                                'in_reply_to_status_id'] is not None else -1
        output['favorite_count'] = value['favorite_count']
        output['label_string'] = label_data[int(value['id_str'])]
        outputs.append(output)
    outputs_processed = process_outputs(outputs, experiment_flag)
    return outputs_processed, labels



In [4]:
label_data = pd.read_csv(os.path.join(ROOT_DIR,'data/labels.csv'), header='infer', index_col=0, squeeze=True).to_dict()
outputs, labels = extract_tweets(label_data, data, 1)


=== Extracting tweets from JSON ===


In [5]:
x_train, y_train, x_valid, y_valid, x_test, y_test = split_data(outputs, labels, 28)

[Sizes] Training set: 64.00%, Validation set: 16.00%, Test set: 20.00%


In [6]:
bpe_path = os.path.join(ROOT_DIR,'models/model/vocab_40000.bpe')
encoder_path = os.path.join(ROOT_DIR,'models/model/encoder_bpe_40000.json')

encoder_path, bpe_path 

('/Users/ashemagalhaes/PycharmProjects/hate_speech/models/model/encoder_bpe_40000.json',
 '/Users/ashemagalhaes/PycharmProjects/hate_speech/models/model/vocab_40000.bpe')

In [7]:
text_encoder = TextEncoder(encoder_path, bpe_path)
text_encoder

<text_utils.TextEncoder at 0x1a4566e780>

In [8]:
encoder = text_encoder.encoder
n_vocab = len(text_encoder.encoder)

In [30]:
text_encoder.encode(['this is a sentence', 'this is also a sentence'])

                                                                                

[[616, 544, 246, 5958], [616, 544, 1359, 246, 5958]]

In [32]:
x_train_enc = text_encoder.encode([x['tweet'] for x in x_train])
x_valid_enc = text_encoder.encode([x['tweet'] for x in x_valid])
x_test_enc = text_encoder.encode([x['tweet'] for x in x_test])


                                                                                

In [79]:
def wrap_data(batch_size, seed, x_train, y_train, x_valid, y_valid, x_test, y_test):
    train_set = DataProvider(inputs=x_train, targets=y_train, seed=seed)
    train_data_local = torch.utils.data.DataLoader(train_set,
                                                   batch_size=batch_size,
                                                   num_workers=2,
                                                   sampler=ImbalancedDatasetSampler(train_set))

    valid_set = DataProvider(inputs=x_valid, targets=y_valid, seed=seed)
    valid_data_local = torch.utils.data.DataLoader(valid_set,
                                                   batch_size=batch_size,
                                                   num_workers=2,
                                                   shuffle=False)

    test_set = DataProvider(inputs=x_test, targets=y_test, seed=seed)
    test_data_local = torch.utils.data.DataLoader(test_set,
                                                  batch_size=batch_size,
                                                  num_workers=2,
                                                  shuffle=False)

    return train_data_local, valid_data_local, test_data_local


In [80]:
data = {
    'x_train': x_train_enc, 
    'y_train': y_train, 
    'x_valid': x_valid_enc, 
    'y_valid': y_valid, 
     'x_test': x_test_enc, 
    'y_test': y_test, 
}

In [81]:
train_data, valid_data, test_data = wrap_data(2048, 28, **data)

In [141]:
def run_train_iter(model, device, optimizer, criterion, x, y, stats, experiment_key='train'):
    """
    Receives the inputs and targets for the model and runs a training iteration. Returns loss and accuracy metrics.
    :param x: The inputs to the model. A numpy array of shape batch_size, channels, height, width
    :param y: The targets for the model. A numpy array of shape batch_size, num_classes
    :return: the loss and accuracy for this batch
    """
    # sets model to training mode
    # (in case batch normalization or other methods have different procedures for training and evaluation)
    model.train()
#     x = torch.tensor(x, dtype=torch.long).to(device)
    x = [np.array(item) for item in x]
    x = torch.FloatTensor(x)
    print(torch.FloatTensor(x).shape)
    y = y.to(device)
    optimizer.zero_grad()  # set all weight grads from previous training iters to 0
    out = model.forward(x)  # forward the data in the model
    print(out.shape)
    
    out = F.max_pool1d(out, out.shape[-1])
    out = out.view(out.shape[0], -1)
    
    out = nn.Linear(in_features=out.shape[1], 
                                    out_features=4,
                                    bias=False)(out)     
    print(out.shape)
    
    
    # loss = F.cross_entropy(input=out, target=y)  # compute loss
    loss = criterion(out, y)
    loss.backward()  # backpropagate to compute gradients for current iter loss

    optimizer.step()  # update network parameters
    _, predicted = torch.max(out.data, 1)  # get argmax of predictions
    accuracy = np.mean(list(predicted.eq(y.data).cpu()))  # compute accuracy
    stats['{}_acc'.format(experiment_key)].append(accuracy)
    stats['{}_loss'.format(experiment_key)].append(loss.data.detach().cpu().numpy())

# def run_evaluation_iter(model, device, optimizer, criterion, x, y, stats, experiment_key='valid'):
#     """
#     Receives the inputs and targets for the model and runs an evaluation iterations. Returns loss and accuracy metrics.
#     :param x: The inputs to the model. A numpy array of shape batch_size, channels, height, width
#     :param y: The targets for the model. A numpy array of shape batch_size, num_classes
#     :return: the loss and accuracy for this batch
#     """
#     with torch.no_grad():
#         model.eval()  # sets the system to validation mode
#         x = np.array(x)
#         print(x)
# #         x = x.to(device)
# #         y = y.to(device)
        
#         out = model.forward(torch.FloatTensor(x))  # forward the data in the model
       
        
#         loss = criterion(out, y)
        
#         # loss = F.cross_entropy(out, y)  # compute loss
#         _, predicted = torch.max(out.data, 1)  # get argmax of predictions
        
#         accuracy = np.mean(list(predicted.eq(y.data).cpu()))
#         stats['{}_acc'.format(experiment_key)].append(accuracy)  # compute accuracy
#         stats['{}_loss'.format(experiment_key)].append(loss.data.detach().cpu().numpy())
        

In [142]:
def save_model(model, model_save_dir, model_save_name, model_idx):
    """
    Save the network parameter state and current best val epoch idx and best val accuracy.
    :param model_save_name: Name to use to save model without the epoch index
    :param model_idx: The index to save the model with.
    :param best_validation_model_idx: The index of the best validation model to be stored for future use.
    :param best_validation_model_acc: The best validation accuracy to be stored for use at test time.
    :param model_save_dir: The directory to store the state at.
    :param state: The dictionary containing the system state.

    """
    # Save state each epoch
    path = os.path.join(model_save_dir, "{}_{}".format(model_save_name, str(model_idx)))
    torch.save(model.state_dict(), f=path)
    

def load_model(model, model_save_dir, model_save_name, model_idx):
    """
    Load the network parameter state and the best val model idx and best val acc to be compared with the future val accuracies, in order to choose the best val model
    :param model_save_dir: The directory to store the state at.
    :param model_save_name: Name to use to save model without the epoch index
    :param model_idx: The index to save the model with.
    """
    path = os.path.join(model_save_dir, "{}_{}".format(model_save_name, str(model_idx)))
    checkpoint = torch.load(f=path)
    # freeze parameters
    model.load_state_dict(checkpoint)
    for parameter in model.parameters():
        parameter.requires_grad = False
    return model 

In [164]:
from collections import OrderedDict, defaultdict
import tqdm

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), weight_decay=1e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100, eta_min=1e-4)
device = torch.device('cpu')
train_stats = OrderedDict()
num_epochs = 2

for epoch_idx in range(num_epochs):
    epoch_start_time = time.time()
    epoch_stats = defaultdict(list)
    with tqdm.tqdm(total=len(train_data)) as pbar_train:  # create a progress bar for training
        for idx, (x, y) in enumerate(train_data):  # get data batches
            run_train_iter(model, device, optimizer, criterion, x=x, y=y, stats=epoch_stats)  # take a training iter step
#             pbar_train.update(1)
#             pbar_train.set_description("loss: {:.4f}, accuracy: {:.4f}".format(epoch_stats['train_loss'][-1],
#                                                                                epoch_stats['train_acc'][-1]))

#     with tqdm.tqdm(total=len(valid_data)) as pbar_val:  # create a progress bar for validation
#         for x, y in valid_data:  # get data batches
#             run_evaluation_iter(model, device, optimizer, criterion, x=x, y=y, stats=epoch_stats)  # run a validation iter
#             pbar_val.update(1)  # add 1 step to the progress bar
#             pbar_val.set_description("loss: {:.4f}, accuracy: {:.4f}".format(epoch_stats['valid_loss'][-1],
#                                                                              epoch_stats['valid_acc'][-1]))
     
    
    
#     save_model(model, '', 'testing', epoch_idx)

  0%|          | 0/18 [00:00<?, ?it/s]

torch.Size([10, 2048])





RuntimeError: The size of tensor a (10) must match the size of tensor b (4) at non-singleton dimension 3

In [163]:
model = DoubleHeadModel(DEFAULT_CONFIG, len(encoder),'multiple_choice',40990, 4)
load_openai_pretrained_model(model.transformer)

Loading weights...


In [154]:
DEFAULT_CONFIG

{'n_embd': 768,
 'n_head': 12,
 'n_layer': 12,
 'embd_pdrop': 0.1,
 'attn_pdrop': 0.1,
 'resid_pdrop': 0.1,
 'afn': 'gelu',
 'clf_pdrop': 0.1,
 'n_class': 4}