In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import DistilBertTokenizer
from sklearn.model_selection import train_test_split

# Fraction of the rows held out for the test split; passed as `test_size`
# to train_test_split in the split cell.
test_fraction = 0.0125
# Passed as `random_state` to train_test_split so the split is repeatable.
seed = 17

In [None]:
# Load the raw CSV. Column names are supplied explicitly, so the first row of
# the file is read as data rather than as a header.
df = pd.read_csv(
    './data/sentiment.csv',
    encoding='ISO-8859-1',
    names=["target", "ids", "date", "flag", "user", "text"],
)

possible_labels = df.target.unique()

# Map each raw target value to a contiguous class index. Indices are assigned
# in sorted order of the raw value, not in order of first appearance in the
# file, so the mapping does not depend on how the CSV rows happen to be ordered.
label_dict = {label: index for index, label in enumerate(sorted(possible_labels))}

# label 0 == negative, 1 == positive
# NOTE(review): this holds when the negative class has the smaller raw target
# value (e.g. 0 = negative, 4 = positive) -- confirm against the dataset.
df['label'] = df.target.map(label_dict)

# Keep only the text and the encoded label.
df = df.drop(columns=['date', 'flag', 'user', 'ids', 'target'])
df.head(10)

In [None]:
# Load the pretrained 'distilbert-base-uncased' tokenizer with lower-casing enabled.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

In [None]:
# Split texts and labels into train/test arrays: `test_fraction` of the rows go
# to the test side, and `seed` is used as the random state of the split.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'].values, df['label'].values,
    random_state=seed, test_size=test_fraction,
)

In [None]:
# Tokenize the training texts into fixed-length (256-token) PyTorch tensors.
# `padding='max_length'` together with an explicit `truncation=True` replaces
# the deprecated `pad_to_max_length=True`: every sequence is still padded or
# cut to exactly `max_length`, but without relying on the deprecated argument
# or on truncation being switched on implicitly.
encoded_data_train = tokenizer.batch_encode_plus(
    X_train,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    truncation=True,
    max_length=256,
    return_tensors='pt'
)

# Persist the encoded inputs and the labels so later steps can load them
# without re-tokenizing.
torch.save(encoded_data_train['input_ids'], 'input_ids_train.pt')
torch.save(encoded_data_train['attention_mask'], 'attention_mask_train.pt')
torch.save(torch.tensor(y_train), 'y_train.pt')
print('*******************************')
print('*            DONE             *')
print('*******************************')

In [None]:
# Tokenize the held-out test texts into fixed-length (256-token) PyTorch tensors.
# `padding='max_length'` together with an explicit `truncation=True` replaces
# the deprecated `pad_to_max_length=True`: every sequence is still padded or
# cut to exactly `max_length`, but without relying on the deprecated argument
# or on truncation being switched on implicitly.
encoded_data_test = tokenizer.batch_encode_plus(
    X_test,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    truncation=True,
    max_length=256,
    return_tensors='pt'
)

# Persist the encoded inputs and the labels so later steps can load them
# without re-tokenizing.
torch.save(encoded_data_test['input_ids'], 'input_ids_test.pt')
torch.save(encoded_data_test['attention_mask'], 'attention_mask_test.pt')
# NOTE(review): 'y_tests.pt' does not follow the '*_test.pt' naming of the two
# files above. The name is kept unchanged because code that loads it may
# depend on it -- confirm before renaming.
torch.save(torch.tensor(y_test), 'y_tests.pt')
print('*******************************')
print('*            DONE             *')
print('*******************************')