In [10]:
import sys
sys.path.append("..")
from comet_ml import Experiment
import argparse
import configparser
from torch import optim

from experiment_builder import ExperimentBuilder
from globals import ROOT_DIR
from data_providers import *
import os

from models.fc_linear_tdidf import fc_linear_tdidf
from models.cnn import *
# Parse the shared settings file. The path is relative, so it resolves against
# the kernel's current working directory.
config = configparser.ConfigParser()
# read() returns the list of files it successfully parsed and silently skips
# files it cannot open, so an empty list displayed here means config.ini was
# not found.
config.read('../config.ini')


['../config.ini']

In [11]:
# Show the dataset path stored under PATH_DATA in the DEFAULT section;
# extract_data joins this value onto ROOT_DIR.
config['DEFAULT']['PATH_DATA']

'data/80k_tweets.json'

In [14]:
def extract_data(embedding_key, embedding_level_key, seed, verbose=True):
    """Load the configured dataset and generate embeddings at the requested level.

    The data and label paths are read from the ``DEFAULT`` section of the
    module-level ``config`` (``PATH_DATA`` / ``PATH_LABELS``) and joined onto
    ``ROOT_DIR`` before being handed to ``TextDataProvider``.

    Args:
        embedding_key: Forwarded to ``generate_word_level_embeddings``; only
            used when ``embedding_level_key == 'word'``.
        embedding_level_key: ``'word'`` selects word-level embeddings and
            ``'char'`` selects character-level embeddings. Any other value
            selects ``generate_tdidf_embeddings``.
        seed: Forwarded unchanged to whichever generator is selected
            (presumably controls the data split -- TODO confirm in
            ``TextDataProvider``).
        verbose: When true (the default, matching the previous unconditional
            behaviour), print the sizes of the three splits.

    Returns:
        Whatever the selected generator returns. When ``verbose`` is true it
        must be indexable by ``'x_train'``, ``'x_valid'`` and ``'x_test'``.
    """
    path_data = os.path.join(ROOT_DIR, config['DEFAULT']['PATH_DATA'])
    path_labels = os.path.join(ROOT_DIR, config['DEFAULT']['PATH_LABELS'])
    data_provider = TextDataProvider(path_data, path_labels)

    if embedding_level_key == 'word':
        output = data_provider.generate_word_level_embeddings(embedding_key, seed)
    elif embedding_level_key == 'char':
        output = data_provider.generate_char_level_embeddings(seed)
    else:
        # NOTE(review): every key other than 'word'/'char' -- including typos --
        # ends up here. Kept as-is because callers may rely on the fallback.
        output = data_provider.generate_tdidf_embeddings(seed)

    if verbose:
        print("[Sizes] Training set: {}, Validation set: {}, Test set: {}".format(
            len(output['x_train']),
            len(output['x_valid']),
            len(output['x_test'])))
    return output


In [15]:
# Word-level embeddings with the 'fasttext' embedding key; 28 is the seed
# argument of extract_data.
data = extract_data('fasttext','word', 28)

=== Extracting annotations ===
=== Extracting tweets from JSON ===
[Stats] Removed 3/58358 labels
[Stats] Average tweet length is 17 words
[Stats] Average tweet length is 121 characters
[Stats] Average favorite count is 15
[Stats] Average retweet count is 146
[Stats] Average follower count is 710
[Sizes] Training set: 64.00%, Validation set: 16.00%, Test set: 20.00%
[Model] Using fasttext embeddings
[Sizes] Training set: 64.00%, Validation set: 16.00%, Test set: 20.00%
[Sizes] Training set: 37348, Validation set: 9338, Test set: 11672


In [16]:
def wrap_data(batch_size, seed, x_train, y_train, x_valid, y_valid, x_test, y_test, num_workers=2):
    """Wrap the train/validation/test splits in ``DataLoader`` objects.

    Each split is first wrapped in a ``DataProvider`` built from its inputs,
    its targets and ``seed``, then in a ``torch.utils.data.DataLoader``.

    Args:
        batch_size: Batch size used by all three loaders.
        seed: Passed to every ``DataProvider``.
        x_train, y_train: Training inputs and targets.
        x_valid, y_valid: Validation inputs and targets.
        x_test, y_test: Test inputs and targets.
        num_workers: Number of loader worker processes for every loader.
            Defaults to 2, the value that was previously hard-coded.

    Returns:
        A ``(train_loader, valid_loader, test_loader)`` tuple. The training
        loader draws its indices from ``ImbalancedDatasetSampler(train_set)``
        (presumably to rebalance classes -- confirm against the sampler's
        implementation). The validation and test loaders use ``shuffle=False``.
    """
    # Settings shared by all three loaders, kept in one place so they cannot drift.
    loader_kwargs = {'batch_size': batch_size, 'num_workers': num_workers}

    train_set = DataProvider(inputs=x_train, targets=y_train, seed=seed)
    train_data_local = torch.utils.data.DataLoader(train_set,
                                                   sampler=ImbalancedDatasetSampler(train_set),
                                                   **loader_kwargs)

    valid_set = DataProvider(inputs=x_valid, targets=y_valid, seed=seed)
    valid_data_local = torch.utils.data.DataLoader(valid_set,
                                                   shuffle=False,
                                                   **loader_kwargs)

    test_set = DataProvider(inputs=x_test, targets=y_test, seed=seed)
    test_data_local = torch.utils.data.DataLoader(test_set,
                                                  shuffle=False,
                                                  **loader_kwargs)

    return train_data_local, valid_data_local, test_data_local

In [17]:
# Batch size 64, seed 28. `**data` unpacks the dict returned by extract_data
# into wrap_data's x_*/y_* keyword parameters, so its keys must match them.
train_data, valid_data, test_data = wrap_data(64, 28, **data)


In [19]:
# Peek at the targets of the first test batch only; `break` stops the loop
# after a single batch has been printed.
for x, y in test_data:  # sample batch
    print(y)
    break

tensor([2, 2, 2, 3, 2, 2, 3, 2, 2, 0, 3, 2, 2, 2, 2, 2, 2, 3, 3, 2, 1, 2, 3, 2,
        2, 2, 1, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 0, 2, 2, 1, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 1, 2, 3, 0, 3, 2])


In [20]:
# Same peek as the previous cell, repeated. The test loader is built with
# shuffle=False, and the printed targets match the previous cell's output.
# NOTE(review): presumably kept as a determinism check -- otherwise this
# duplicate cell can be removed.
for x, y in test_data:  # sample batch
    print(y)
    break

tensor([2, 2, 2, 3, 2, 2, 3, 2, 2, 0, 3, 2, 2, 2, 2, 2, 2, 3, 3, 2, 1, 2, 3, 2,
        2, 2, 1, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 0, 2, 2, 1, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 1, 2, 3, 0, 3, 2])
