In [1]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

In [2]:
# Tab-separated dataset files, in the order the loading cell indexes them:
# 0 = train, 1 = valid, 2 = test, 3 = augm_helper, 4 = train_augmented.
PATH = [
    "../dataset/train.tsv",
    "../dataset/valid.tsv",
    "../dataset/test.tsv",
    "../dataset/augm_helper.tsv",
    "../dataset/train_augmented.tsv",
]

In [3]:
# Read each of the five files in PATH as a tab-separated table; the unpacking
# order mirrors the order of PATH (train, valid, test, augm_helper, train_augmented).
train, valid, test, augm1, augm2 = (
    pd.read_csv(tsv_path, sep="\t") for tsv_path in PATH
)

In [4]:
# Pool the tweet text of all five frames into one Series; it is the corpus the
# vectorizer's vocabulary is fitted on.
# `Series.append` was removed in pandas 2.0, so the pieces are combined with
# `pd.concat`; `ignore_index=True` yields the same fresh 0..n-1 index that the
# former `reset_index(drop=True)` produced.
data = pd.concat(
    [
        frame.preprocessed_tweet_text_no_link
        for frame in (train, valid, test, augm1, augm2)
    ],
    ignore_index=True,
)
data.shape

(66962,)

In [5]:
# Text vectorizer: lower-cases and strips punctuation, splits on whitespace,
# keeps a vocabulary of at most 26000 tokens and emits integer token ids with
# an output sequence length of 128.
vectorizer = TextVectorization(
    max_tokens=26000,
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    output_mode="int",
    output_sequence_length=128,
)

# NOTE(review): `data` also contains the validation and test text, so the
# vocabulary is fitted on every split - confirm this is intended.
ds = tf.data.Dataset.from_tensor_slices(data)
vectorizer.adapt(ds.batch(64))

2022-01-14 11:49:47.998681: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# Show the first four entries of the fitted vocabulary.
vocabulary = vectorizer.get_vocabulary()
vocabulary[:4]

['', '[UNK]', 'the', 'of']

In [7]:
# q1 training text: the original training tweets followed by both augmentation
# sets, in that order (the q1 label cells concatenate in the same order).
train_data_aug_q1 = [
    *train.preprocessed_tweet_text_no_link.to_list(),
    *augm1.preprocessed_tweet_text_no_link.to_list(),
    *augm2.preprocessed_tweet_text_no_link.to_list(),
]
train_data = pd.DataFrame(
    train_data_aug_q1,
    columns=["preprocessed_tweet_text_no_link"],
)

In [8]:
# Vectorize the plain training text and the q1 training text held in `train_data`.
# NOTE(review): `train_vectors_text` is reassigned in the q2-4 section before
# it is ever saved - confirm the un-augmented vectors are still needed.
train_text = train.preprocessed_tweet_text_no_link.to_numpy()
train_text_q1_aug = train_data.preprocessed_tweet_text_no_link.to_numpy()

train_vectors_text = vectorizer(train_text)
train_vectors_text_q1_aug = vectorizer(train_text_q1_aug)

In [9]:
# Vectorize the validation and test text with the same fitted vectorizer.
valid_text = valid["preprocessed_tweet_text_no_link"].to_numpy()
test_text = test["preprocessed_tweet_text_no_link"].to_numpy()

valid_vectors_text = vectorizer(valid_text)
test_vectors_text = vectorizer(test_text)

In [10]:
# Persist the vectorized text for q1 (np.save appends the ".npy" extension).
# np.save does not create missing directories, so make sure the output folder
# exists first; otherwise a fresh checkout fails here with FileNotFoundError.
os.makedirs("vectors", exist_ok=True)

np.save("vectors/X_train_q1_aug", train_vectors_text_q1_aug)
np.save("vectors/X_valid", valid_vectors_text)
np.save("vectors/X_test", test_vectors_text)

In [11]:
# Encode the q1 labels as integers: yes -> 1, no -> 0.
q1_label_ids = {"yes": 1, "no": 0}

Y_train = train["q1_label"].map(q1_label_ids)
# augm1's labels are taken as-is, without the yes/no mapping.
# NOTE(review): presumably this column is already numeric - confirm.
Y_augm1 = augm1["q1_label"]
Y_augm2 = augm2["q1_label"].map(q1_label_ids)
Y_valid = valid["q1_label"].map(q1_label_ids)
Y_test = test["q1_label"].map(q1_label_ids)

In [12]:
# Training labels are concatenated as train, augm1, augm2 - the same order in
# which the q1 training text was assembled.
q1_train_labels = Y_train.to_list() + Y_augm1.to_list() + Y_augm2.to_list()

np.save("vectors/Y_train_q1", np.array(q1_train_labels))
np.save("vectors/Y_valid_q1", Y_valid.to_numpy())
np.save("vectors/Y_test_q1", Y_test.to_numpy())

### Prepare data for questions 2, 3 and 4 (q2–q4)

In [13]:
# q2-4 training text: the original training tweets followed by augm2 only.
train_data_aug_q24 = [
    *train.preprocessed_tweet_text_no_link.to_list(),
    *augm2.preprocessed_tweet_text_no_link.to_list(),
]
train_data = pd.DataFrame(
    train_data_aug_q24,
    columns=["preprocessed_tweet_text_no_link"],
)

In [14]:
# Vectorize the q2-4 training text plus the validation and test text.
text_column = "preprocessed_tweet_text_no_link"

train_vectors_text = vectorizer(train_data[text_column].to_numpy())
valid_vectors_text = vectorizer(valid[text_column].to_numpy())
test_vectors_text = vectorizer(test[text_column].to_numpy())

In [15]:
# Persist the q2-4 feature matrices (np.save appends the ".npy" extension).
for npy_path, matrix in (
    ("vectors/X_train_q2-4", train_vectors_text),
    ("vectors/X_valid_q2-4", valid_vectors_text),
    ("vectors/X_test_q2-4", test_vectors_text),
):
    np.save(npy_path, matrix)

In [16]:
# Missing q2-q4 labels in the training frame become the placeholder "unk".
label_columns = ["q2_label", "q3_label", "q4_label"]
train[label_columns] = train[label_columns].fillna("unk")

In [17]:
# Missing q2-q4 labels in the augmented frame become the placeholder "unk".
for column in ("q2_label", "q3_label", "q4_label"):
    augm2[column] = augm2[column].fillna("unk")

In [18]:
# Encode the training q2-q4 labels as integers: unk -> 0, yes -> 1, no -> 2.
# The columns are overwritten in place, so run this cell only once per load:
# a second pass would map the integers (no longer dict keys) to NaN.
label_to_id = {"unk": 0, "yes": 1, "no": 2}
for column in ("q2_label", "q3_label", "q4_label"):
    train[column] = train[column].map(label_to_id)

In [19]:
# Encode the augmented q2-q4 labels as integers: unk -> 0, yes -> 1, no -> 2.
# The columns are overwritten in place, so run this cell only once per load.
label_to_id = {"unk": 0, "yes": 1, "no": 2}
for column in ("q2_label", "q3_label", "q4_label"):
    augm2[column] = augm2[column].map(label_to_id)

In [20]:
# Class balance of the encoded q4 training labels (0 = unk, 1 = yes, 2 = no).
train["q4_label"].value_counts()

2    3745
0    1972
1     688
Name: q4_label, dtype: int64

In [21]:
# Validation q2-q4 labels: fill missing values with "unk", then encode as
# integers (unk -> 0, yes -> 1, no -> 2).
label_columns = ["q2_label", "q3_label", "q4_label"]
label_to_id = {"unk": 0, "yes": 1, "no": 2}

valid[label_columns] = valid[label_columns].fillna("unk")
for column in label_columns:
    valid[column] = valid[column].map(label_to_id)

In [22]:
# Inspect which raw values occur in the test q4 labels before encoding them.
test["q4_label"].unique()

array(['no', 'yes', nan, 'Unnamed: 5'], dtype=object)

In [23]:
# Sanity check: (rows, columns) of the test frame.
test.shape

(1775, 18)

In [24]:
# The test label columns can hold stray 'Unnamed: N' placeholder strings
# instead of yes/no; treat those as unknown.
test.q2_label = test.q2_label.replace('Unnamed: 3', 'unk')
test.q3_label = test.q3_label.replace('Unnamed: 4', 'unk')
test.q4_label = test.q4_label.replace('Unnamed: 5', 'unk')
# Missing q4 labels are real NaN values, which replace('nan', ...) never
# matches (it only matches the literal string 'nan'). Keep the string
# replacement for backward compatibility and fill the actual NaNs explicitly.
test.q4_label = test.q4_label.replace('nan', 'unk').fillna('unk')

In [25]:
# Test q2-q4 labels: fill missing values with "unk", then encode as integers
# (unk -> 0, yes -> 1, no -> 2).
label_columns = ["q2_label", "q3_label", "q4_label"]
label_to_id = {"unk": 0, "yes": 1, "no": 2}

test[label_columns] = test[label_columns].fillna("unk")
for column in label_columns:
    test[column] = test[column].map(label_to_id)

In [26]:
# Encoded q2-q4 training labels as one array with a column per question.
train_q2_4 = train.loc[:, ["q2_label", "q3_label", "q4_label"]].to_numpy()

In [27]:
# Encoded q2-q4 labels of the augmented frame, same column order as the training labels.
augm_q2_4 = augm2.loc[:, ["q2_label", "q3_label", "q4_label"]].to_numpy()

In [28]:
# Stack the label rows as train followed by augm2 - the same order in which
# the q2-4 training text was assembled.
train_result = np.concatenate([train_q2_4, augm_q2_4], axis=0)

In [29]:
# Persist the q2-4 label matrices (np.save appends the ".npy" extension).
q2_4_columns = ["q2_label", "q3_label", "q4_label"]
valid_q2_4 = valid[q2_4_columns].to_numpy()
test_q2_4 = test[q2_4_columns].to_numpy()

np.save("vectors/Y_train_q2-4", train_result)
np.save("vectors/Y_valid_q2-4", valid_q2_4)
np.save("vectors/Y_test_q2-4", test_q2_4)