In [1]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

In [2]:
# Input TSV files, in the order: train, valid, test, augmentation helper.
PATH = [
    "../dataset/train.tsv",
    "../dataset/valid.tsv",
    "../dataset/test.tsv",
    "../dataset/augm_helper.tsv",
]

In [3]:
# Read the four tab-separated files listed in PATH (train, valid, test,
# augmentation helper) into DataFrames, in that order.
train, valid, test, augm = (
    pd.read_csv(tsv_path, sep="\t") for tsv_path in PATH
)

In [4]:
# Stack the tweet text of every split (plus the augmentation helper) into a
# single Series with a fresh 0..n-1 index; it is used to fit the vocabulary.
# pd.concat replaces Series.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; ignore_index=True is the equivalent of the former
# reset_index(drop=True).
data = pd.concat(
    [frame.preprocessed_tweet_text_no_link for frame in (train, valid, test, augm)],
    ignore_index=True,
)
data.shape

(15722,)

In [5]:
# Integer-encoding text vectorizer: lower-cases, strips punctuation, splits on
# whitespace, keeps at most 26000 vocabulary entries and emits sequences of
# length 128.
# NOTE(review): `data` concatenates train, valid, test and the augmentation
# helper, so the vocabulary is fitted on test text as well - confirm this is
# intended.
vectorizer = TextVectorization(
    max_tokens=26000,
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    output_mode="int",
    output_sequence_length=128,
)
ds = tf.data.Dataset.from_tensor_slices(data)
# Learn the vocabulary from the corpus, streamed in batches of 64 texts.
vectorizer.adapt(ds.batch(64))

2022-01-14 08:01:53.781533: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# Peek at the first four entries of the learned vocabulary.
vectorizer.get_vocabulary()[:4]

['', '[UNK]', 'the', 'of']

In [7]:
# Encode the tweet text of each labelled split with the fitted vectorizer.
train_vectors_text, valid_vectors_text, test_vectors_text = (
    vectorizer(split.preprocessed_tweet_text_no_link.to_numpy())
    for split in (train, valid, test)
)

In [8]:
# Persist the encoded text of each split as .npy files (np.save appends the
# extension). np.save does not create missing parent directories, so make sure
# the output folder exists before writing - otherwise a run from a fresh
# checkout fails with FileNotFoundError.
os.makedirs("vectors", exist_ok=True)
np.save("vectors/X_train_all", train_vectors_text)
np.save("vectors/X_valid_all", valid_vectors_text)
np.save("vectors/X_test_all", test_vectors_text)

In [9]:
# Binary-encode the q1 answer in every split: "yes" -> 1, "no" -> 0.
# NOTE(review): Series.map turns any value outside the mapping (including
# missing values) into NaN - confirm q1_label only ever holds "yes"/"no".
train["q1_label"] = train["q1_label"].map({"yes": 1, "no": 0})
valid["q1_label"] = valid["q1_label"].map({"yes": 1, "no": 0})
test["q1_label"] = test["q1_label"].map({"yes": 1, "no": 0})

In [10]:
# Missing q2-q4 answers in the training split become the explicit "unk" class.
train[["q2_label", "q3_label", "q4_label"]] = train[
    ["q2_label", "q3_label", "q4_label"]
].fillna("unk")

In [11]:
# Encode the three-way q2-q4 answers of the training split as integers:
# "unk" -> 0, "yes" -> 1, "no" -> 2.
train[["q2_label", "q3_label", "q4_label"]] = train[
    ["q2_label", "q3_label", "q4_label"]
].apply(lambda answers: answers.map({"unk": 0, "yes": 1, "no": 2}))

In [12]:
# Class balance of the encoded q4 label in the training split.
train.q4_label.value_counts()

2    3745
0    1972
1     688
Name: q4_label, dtype: int64

In [13]:
# Validation split: treat missing q2-q4 answers as "unk", then encode
# "unk" -> 0, "yes" -> 1, "no" -> 2.
valid[["q2_label", "q3_label", "q4_label"]] = (
    valid[["q2_label", "q3_label", "q4_label"]]
    .fillna("unk")
    .apply(lambda answers: answers.map({"unk": 0, "yes": 1, "no": 2}))
)

In [14]:
# Inspect the distinct raw q4 values in the test split before encoding them.
test.q4_label.unique()

array(['no', 'yes', nan, 'Unnamed: 5'], dtype=object)

In [15]:
# The test split carries stray placeholder strings in its label columns
# ('Unnamed: 5' is visible in q4; 'Unnamed: 3' / 'Unnamed: 4' are presumably
# the q2 / q3 counterparts); fold them into the "unk" class.
# Missing answers are real NaN values, which replace('nan', ...) can never
# match: it only looks for the literal string 'nan', and read_csv already
# parses that text as NaN. fillna is the call that actually handles them, and
# it is applied to all three columns for consistency.
test["q2_label"] = test["q2_label"].replace("Unnamed: 3", "unk").fillna("unk")
test["q3_label"] = test["q3_label"].replace("Unnamed: 4", "unk").fillna("unk")
test["q4_label"] = test["q4_label"].replace("Unnamed: 5", "unk").fillna("unk")

In [16]:
# Test split: treat missing q2-q4 answers as "unk", then encode
# "unk" -> 0, "yes" -> 1, "no" -> 2.
test[["q2_label", "q3_label", "q4_label"]] = (
    test[["q2_label", "q3_label", "q4_label"]]
    .fillna("unk")
    .apply(lambda answers: answers.map({"unk": 0, "yes": 1, "no": 2}))
)

In [17]:
# Persist the four encoded label columns of each split as .npy files (np.save
# appends the extension). np.save does not create missing parent directories,
# so make sure the output folder exists before writing - otherwise a run from
# a fresh checkout fails with FileNotFoundError.
os.makedirs("vectors", exist_ok=True)
np.save("vectors/Y_train_all", train[["q1_label", "q2_label", "q3_label", "q4_label"]].to_numpy())
np.save("vectors/Y_valid_all", valid[["q1_label", "q2_label", "q3_label", "q4_label"]].to_numpy())
np.save("vectors/Y_test_all", test[["q1_label", "q2_label", "q3_label", "q4_label"]].to_numpy())

In [18]:
# Sanity check: display the encoded training label matrix (one row per tweet,
# columns q1..q4).
train[["q1_label","q2_label","q3_label","q4_label"]].to_numpy()

array([[0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       ...,
       [1, 2, 1, 2],
       [1, 2, 1, 2],
       [1, 2, 1, 2]])