In [None]:
!pip install datasets

# Classes

In [None]:
class CustomDataset(object):
    """Text-classification dataset read from a document file and a label file.

    The methods build on each other: call ``all_data()`` first, then
    ``encode_labels()``, then ``train_val_split()``; each later method reads
    attributes that the earlier ones set on the instance.
    """

    def __init__(self, doc_path, label_path, word_level_tokenize=False):
        """
        Read the raw documents and label rows of a user-customised dataset.

        Expected input format:
        Doc file: txt file where each row is a document in the whole dataset.
        Label file: txt file where each row holds the label information of one
        document, formatted as index<TAB>split<TAB>label. Row order corresponds
        to the order of the documents in the doc file.

        Args:
            doc_path (str), path to where all the documents in the dataset are stored.
            label_path (str), path to where all the labels in the dataset are stored.
            word_level_tokenize (boolean), to indicate whether tokenized docs should be output. This is necessary for ELMo model but not other pretrained models. Default to be False

        """
        # Context managers make sure both files are closed after reading.
        with open(doc_path, 'r') as raw_in:
            self.raw_texts = raw_in.read().split('\n')
        with open(label_path, 'r') as label_file:
            self.raw_labels = label_file.read().split('\n')
        self.original_split = ['train', 'test']
        self.word_level_tokenize = word_level_tokenize

    def all_data(self):
        """
        Split documents and labels into train and test parts.

        Labels are mapped to integer ids in order of first appearance; the
        id of a label is its position in the returned ``label_mapping`` list.
        A row goes to the train part when its split field contains 'train',
        otherwise to the test part.

        Returns:
            (texts_train, texts_test, labels_train, labels_test, label_mapping),
            where the texts are token lists when ``word_level_tokenize`` is
            set and raw strings otherwise.
        """
        self.raw_texts_train = []
        self.raw_labels_train = []
        self.raw_texts_test = []
        self.raw_labels_test = []
        label_ids = {}  # label string -> integer id, in first-seen order
        for i, row in enumerate(self.raw_labels):
            if not row.strip():
                # A trailing newline in the label file yields an empty last
                # row; it carries no label, so skip it instead of crashing.
                continue
            fields = row.split('\t')
            split = fields[1]
            label = fields[2]
            label_id = label_ids.setdefault(label, len(label_ids))

            if 'train' in split:
                self.raw_texts_train.append(self.raw_texts[i])
                self.raw_labels_train.append(label_id)
            else:
                self.raw_texts_test.append(self.raw_texts[i])
                self.raw_labels_test.append(label_id)
        # dicts preserve insertion order, so position == assigned id.
        self.label_mapping = list(label_ids)

        if self.word_level_tokenize:
            self.tokenized_texts_train = self.initial_clean_up(self.raw_texts_train)
            self.tokenized_texts_test = self.initial_clean_up(self.raw_texts_test)
            return self.tokenized_texts_train, self.tokenized_texts_test, self.raw_labels_train, self.raw_labels_test, self.label_mapping
        return self.raw_texts_train, self.raw_texts_test, self.raw_labels_train, self.raw_labels_test, self.label_mapping

    def initial_clean_up(self, docs):
        """
        Strip punctuation from each document and tokenize it into words.

        Every character that is neither a word character nor whitespace is
        removed before tokenizing. No lower-casing is applied.

        Args:
            docs (iterable of str), documents to clean.

        Returns:
            list with one token list per document.
        """
        return [word_tokenize(re.sub(r'[^\w\s]', '', doc)) for doc in docs]

    def encode_labels(self):
        """
        Fit a LabelEncoder on the train labels and encode train and test labels.

        Requires ``all_data()`` to have been called first.

        Returns:
            (label_encoded_train, label_encoded_test)
        """
        self.lEnc = LabelEncoder()
        self.lEnc.fit(self.raw_labels_train)
        self.label_encoded_train = self.lEnc.transform(self.raw_labels_train)
        self.label_encoded_test = self.lEnc.transform(self.raw_labels_test)
        return self.label_encoded_train, self.label_encoded_test

    def train_val_split(self, val_size=0.1, random_state=42):
        """
        Hold out part of the train data as a validation set.

        Requires ``all_data()`` and ``encode_labels()`` to have been called.

        Args:
            val_size, passed to ``train_test_split`` as ``test_size``.
            random_state, passed to ``train_test_split`` as ``random_state``.

        Returns:
            (X_train, X_val, y_train, y_val)
        """
        if self.word_level_tokenize:
            texts = self.tokenized_texts_train
        else:
            texts = self.raw_texts_train
        X_train, X_val, y_train, y_val = train_test_split(texts, self.label_encoded_train, test_size=val_size, random_state=random_state)
        return X_train, X_val, y_train, y_val

In [None]:
from datasets import load_dataset
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

class BBCNewsDataset(object):
    """Wrapper around the "SetFit/bbc-news" dataset used in the PEACH experiment."""

    def __init__(self, word_level_tokenize=False):
        """
        Load the BBCNews dataset through ``load_dataset``.

        Args:
            word_level_tokenize (boolean), whether ``all_data`` should return
            tokenized documents instead of raw strings. This is necessary for
            the ELMo model but not other pretrained models. Defaults to False.
        """
        self.dataset = load_dataset("SetFit/bbc-news")
        self.original_split = ['train', 'test']
        self.word_level_tokenize = word_level_tokenize

    def all_data(self):
        """
        Collect texts and labels of the train and test splits.

        Returns:
            (texts_train, texts_test, labels_train, labels_test); the texts are
            token lists when ``word_level_tokenize`` is set, raw texts otherwise.
        """
        train_part = self.dataset['train']
        self.raw_texts_train = train_part['text']
        self.raw_labels_train = train_part['label']
        test_part = self.dataset['test']
        self.raw_texts_test = test_part['text']
        self.raw_labels_test = test_part['label']

        if not self.word_level_tokenize:
            return (
                self.raw_texts_train,
                self.raw_texts_test,
                self.raw_labels_train,
                self.raw_labels_test,
            )

        self.tokenized_texts_train = self.initial_clean_up(self.raw_texts_train)
        self.tokenized_texts_test = self.initial_clean_up(self.raw_texts_test)
        return (
            self.tokenized_texts_train,
            self.tokenized_texts_test,
            self.raw_labels_train,
            self.raw_labels_test,
        )

    def initial_clean_up(self, docs):
        """Remove non-word, non-whitespace characters from each doc, then word-tokenize it."""
        return [word_tokenize(re.sub(r'[^\w\s]', '', text)) for text in docs]

    def encode_labels(self):
        """
        Fit a LabelEncoder on the train labels, then encode train and test labels.

        Returns:
            (label_encoded_train, label_encoded_test)
        """
        encoder = LabelEncoder()
        self.lEnc = encoder
        encoder.fit(self.raw_labels_train)
        self.label_encoded_train = encoder.transform(self.raw_labels_train)
        self.label_encoded_test = encoder.transform(self.raw_labels_test)
        return self.label_encoded_train, self.label_encoded_test

    def train_val_split(self, val_size=0.1, random_state=42):
        """
        Split the train data into train and validation parts with ``train_test_split``.

        Args:
            val_size, forwarded as ``test_size``.
            random_state, forwarded as ``random_state``.

        Returns:
            (X_train, X_val, y_train, y_val)
        """
        if self.word_level_tokenize:
            features = self.tokenized_texts_train
        else:
            features = self.raw_texts_train
        train_x, val_x, train_y, val_y = train_test_split(
            features,
            self.label_encoded_train,
            test_size=val_size,
            random_state=random_state,
        )
        return train_x, val_x, train_y, val_y



In [None]:
class MRDataset(object):
    """MR dataset used in the PEACH experiment, read from a doc file and a label file.

    The methods build on each other: call ``all_data()`` first, then
    ``encode_labels()``, then ``train_val_split()``.
    """

    def __init__(self, doc_path, label_path, word_level_tokenize=False):
        """
        Read the raw documents and label rows of the MR dataset.

        The doc file holds one document per row; the label file holds one
        index<TAB>split<TAB>label row per document, in the same order.

        Args:
            doc_path (str), path to where all the documents in the dataset are stored.
            label_path (str), path to where all the labels in the dataset are stored.
            word_level_tokenize (boolean), to indicate whether tokenized docs should be output. This is necessary for ELMo model but not other pretrained models. Default to be False

        """
        # Context managers make sure both files are closed after reading.
        with open(doc_path, 'r') as raw_in:
            self.raw_texts = raw_in.read().split('\n')
        with open(label_path, 'r') as label_file:
            self.raw_labels = label_file.read().split('\n')
        self.original_split = ['train', 'test']
        self.word_level_tokenize = word_level_tokenize

    def all_data(self):
        """
        Split documents and labels into train and test parts.

        Labels are mapped to integer ids in order of first appearance; the
        id of a label is its position in the returned ``label_mapping`` list.
        A row goes to the train part when its split field contains 'train',
        otherwise to the test part.

        Returns:
            (texts_train, texts_test, labels_train, labels_test, label_mapping),
            where the texts are token lists when ``word_level_tokenize`` is
            set and raw strings otherwise.
        """
        self.raw_texts_train = []
        self.raw_labels_train = []
        self.raw_texts_test = []
        self.raw_labels_test = []
        label_ids = {}  # label string -> integer id, in first-seen order
        for i, row in enumerate(self.raw_labels):
            if not row.strip():
                # A trailing newline in the label file yields an empty last
                # row; it carries no label, so skip it instead of crashing.
                continue
            fields = row.split('\t')
            split = fields[1]
            label = fields[2]
            label_id = label_ids.setdefault(label, len(label_ids))

            if 'train' in split:
                self.raw_texts_train.append(self.raw_texts[i])
                self.raw_labels_train.append(label_id)
            else:
                self.raw_texts_test.append(self.raw_texts[i])
                self.raw_labels_test.append(label_id)
        # dicts preserve insertion order, so position == assigned id.
        self.label_mapping = list(label_ids)

        if self.word_level_tokenize:
            self.tokenized_texts_train = self.initial_clean_up(self.raw_texts_train)
            self.tokenized_texts_test = self.initial_clean_up(self.raw_texts_test)
            return self.tokenized_texts_train, self.tokenized_texts_test, self.raw_labels_train, self.raw_labels_test, self.label_mapping
        return self.raw_texts_train, self.raw_texts_test, self.raw_labels_train, self.raw_labels_test, self.label_mapping

    def initial_clean_up(self, docs, max_tokens=128):
        """
        Strip punctuation, tokenize each document and cap its length.

        Every character that is neither a word character nor whitespace is
        removed before tokenizing. No lower-casing is applied.

        Args:
            docs (iterable of str), documents to clean.
            max_tokens (int or None), keep at most this many leading tokens
            per document; None keeps all tokens. Default 128.

        Returns:
            list with one token list per document.
        """
        # Slicing is a no-op for documents that are already short enough.
        return [
            word_tokenize(re.sub(r'[^\w\s]', '', doc))[:max_tokens]
            for doc in docs
        ]

    def encode_labels(self):
        """
        Fit a LabelEncoder on the train labels and encode train and test labels.

        Requires ``all_data()`` to have been called first.

        Returns:
            (label_encoded_train, label_encoded_test)
        """
        self.lEnc = LabelEncoder()
        self.lEnc.fit(self.raw_labels_train)
        self.label_encoded_train = self.lEnc.transform(self.raw_labels_train)
        self.label_encoded_test = self.lEnc.transform(self.raw_labels_test)
        return self.label_encoded_train, self.label_encoded_test

    def train_val_split(self, val_size=0.1, random_state=42):
        """
        Hold out part of the train data as a validation set.

        Requires ``all_data()`` and ``encode_labels()`` to have been called.

        Args:
            val_size, passed to ``train_test_split`` as ``test_size``.
            random_state, passed to ``train_test_split`` as ``random_state``.

        Returns:
            (X_train, X_val, y_train, y_val)
        """
        if self.word_level_tokenize:
            texts = self.tokenized_texts_train
        else:
            texts = self.raw_texts_train
        X_train, X_val, y_train, y_val = train_test_split(texts, self.label_encoded_train, test_size=val_size, random_state=random_state)
        return X_train, X_val, y_train, y_val


In [None]:
class IMDBDataset(object):
    """Wrapper around the "imdb" dataset used in the PEACH experiment."""

    def __init__(self, word_level_tokenize=False):
        """
        Load the IMDB dataset through ``load_dataset``.

        Args:
            word_level_tokenize (boolean), whether ``all_data`` should return
            tokenized documents instead of raw strings. This is necessary for
            the ELMo model but not other pretrained models. Defaults to False.
        """
        self.dataset = load_dataset("imdb")
        self.original_split = ['train', 'test']
        self.word_level_tokenize = word_level_tokenize

    def all_data(self):
        """
        Collect texts and labels of the train and test splits.

        Returns:
            (texts_train, texts_test, labels_train, labels_test); the texts are
            token lists when ``word_level_tokenize`` is set, raw texts otherwise.
        """
        train_part = self.dataset['train']
        self.raw_texts_train = train_part['text']
        self.raw_labels_train = train_part['label']
        test_part = self.dataset['test']
        self.raw_texts_test = test_part['text']
        self.raw_labels_test = test_part['label']

        labels = (self.raw_labels_train, self.raw_labels_test)
        if not self.word_level_tokenize:
            return (self.raw_texts_train, self.raw_texts_test) + labels

        self.tokenized_texts_train = self.initial_clean_up(self.raw_texts_train)
        self.tokenized_texts_test = self.initial_clean_up(self.raw_texts_test)
        return (self.tokenized_texts_train, self.tokenized_texts_test) + labels

    def initial_clean_up(self, docs):
        """Remove non-word, non-whitespace characters from each doc, then word-tokenize it."""
        cleaned = (re.sub(r'[^\w\s]', '', text) for text in docs)
        return [word_tokenize(text) for text in cleaned]

    def encode_labels(self):
        """
        Fit a LabelEncoder on the train labels, then encode train and test labels.

        Returns:
            (label_encoded_train, label_encoded_test)
        """
        encoder = LabelEncoder()
        self.lEnc = encoder
        encoder.fit(self.raw_labels_train)
        self.label_encoded_train = encoder.transform(self.raw_labels_train)
        self.label_encoded_test = encoder.transform(self.raw_labels_test)
        return self.label_encoded_train, self.label_encoded_test

    def train_val_split(self, val_size=0.1, random_state=42):
        """
        Split the train data into train and validation parts with ``train_test_split``.

        Args:
            val_size, forwarded as ``test_size``.
            random_state, forwarded as ``random_state``.

        Returns:
            (X_train, X_val, y_train, y_val)
        """
        features = (
            self.tokenized_texts_train
            if self.word_level_tokenize
            else self.raw_texts_train
        )
        train_x, val_x, train_y, val_y = train_test_split(
            features,
            self.label_encoded_train,
            test_size=val_size,
            random_state=random_state,
        )
        return train_x, val_x, train_y, val_y



In [None]:
class MSRPDataset(object):
    """Wrapper around the "HHousen/msrp" sentence-pair dataset used in the PEACH experiment.

    The methods build on each other: call ``all_data()`` first, then
    ``encode_labels()``, then ``train_val_split()``.
    """

    def __init__(self, word_level_tokenize=False):
        """
        Load the MSRP dataset through ``load_dataset``.

        Args:
            word_level_tokenize (boolean), to indicate whether tokenized docs should be output. This is necessary for ELMo model but not other pretrained models. Default to be False

        """
        self.dataset = load_dataset("HHousen/msrp")
        self.original_split = ['train', 'test']
        self.word_level_tokenize = word_level_tokenize

    def _joined_pairs(self, split_name):
        """Return every sentence pair of a split joined as 'sentence1 sentence2'."""
        part = self.dataset[split_name]
        return [first + ' ' + second for first, second in zip(part['sentence1'], part['sentence2'])]

    def all_data(self):
        """
        Build one text per sentence pair for the train and test splits.

        Each text is the two sentences of a pair joined by a single space.

        Returns:
            (texts_train, texts_test, labels_train, labels_test); the texts are
            token lists when ``word_level_tokenize`` is set, raw strings otherwise.
        """
        self.raw_texts_train = self._joined_pairs('train')
        self.raw_labels_train = self.dataset['train']['label']

        self.raw_texts_test = self._joined_pairs('test')
        self.raw_labels_test = self.dataset['test']['label']

        if self.word_level_tokenize:
            self.tokenized_texts_train = self.initial_clean_up(self.raw_texts_train)
            self.tokenized_texts_test = self.initial_clean_up(self.raw_texts_test)
            return self.tokenized_texts_train, self.tokenized_texts_test, self.raw_labels_train, self.raw_labels_test
        return self.raw_texts_train, self.raw_texts_test, self.raw_labels_train, self.raw_labels_test

    def initial_clean_up(self, docs):
        """
        Strip punctuation from each document and tokenize it into words.

        Every character that is neither a word character nor whitespace is
        removed before tokenizing. No lower-casing is applied.

        Returns:
            list with one token list per document.
        """
        return [word_tokenize(re.sub(r'[^\w\s]', '', doc)) for doc in docs]

    def encode_labels(self):
        """
        Fit a LabelEncoder on the train labels and encode train and test labels.

        This class loads no validation split, so only the train and test
        encodings are returned.

        Returns:
            (label_encoded_train, label_encoded_test)
        """
        self.lEnc = LabelEncoder()
        self.lEnc.fit(self.raw_labels_train)
        self.label_encoded_train = self.lEnc.transform(self.raw_labels_train)
        self.label_encoded_test = self.lEnc.transform(self.raw_labels_test)
        return self.label_encoded_train, self.label_encoded_test

    def train_val_split(self, val_size=0.1, random_state=42):
        """
        Hold out part of the train data as a validation set.

        Requires ``all_data()`` and ``encode_labels()`` to have been called.

        Args:
            val_size, passed to ``train_test_split`` as ``test_size``.
            random_state, passed to ``train_test_split`` as ``random_state``.

        Returns:
            (X_train, X_val, y_train, y_val)
        """
        if self.word_level_tokenize:
            texts = self.tokenized_texts_train
        else:
            texts = self.raw_texts_train
        X_train, X_val, y_train, y_val = train_test_split(texts, self.label_encoded_train, test_size=val_size, random_state=random_state)
        return X_train, X_val, y_train, y_val


In [None]:
class SICKDataset(object):
    """Wrapper around the "sick" sentence-pair dataset used in the PEACH experiment.

    The methods build on each other: call ``all_data()`` first, then
    ``encode_labels()``, then ``train_val_split()``.
    """

    def __init__(self, word_level_tokenize=False):
        """
        Load the SICK dataset through ``load_dataset``.

        Args:
            word_level_tokenize (boolean), to indicate whether tokenized docs should be output. This is necessary for ELMo model but not other pretrained models. Default to be False

        """
        self.dataset = load_dataset("sick")
        self.original_split = ['train', 'test']
        self.word_level_tokenize = word_level_tokenize

    def _joined_pairs(self, split_name):
        """Return every sentence pair of a split joined as 'sentence_A sentence_B'."""
        part = self.dataset[split_name]
        return [first + ' ' + second for first, second in zip(part['sentence_A'], part['sentence_B'])]

    def all_data(self):
        """
        Build one text per sentence pair for the train, validation and test splits.

        Each text is the two sentences of a pair joined by a single space.

        Returns:
            (texts_train, texts_test, texts_val, labels_train, labels_test,
            labels_val); the texts are token lists when ``word_level_tokenize``
            is set, raw strings otherwise. Note the test part precedes the
            validation part.
        """
        self.raw_texts_train = self._joined_pairs('train')
        self.raw_labels_train = self.dataset['train']['label']

        self.raw_texts_val = self._joined_pairs('validation')
        self.raw_labels_val = self.dataset['validation']['label']

        self.raw_texts_test = self._joined_pairs('test')
        self.raw_labels_test = self.dataset['test']['label']

        if self.word_level_tokenize:
            self.tokenized_texts_train = self.initial_clean_up(self.raw_texts_train)
            self.tokenized_texts_test = self.initial_clean_up(self.raw_texts_test)
            self.tokenized_texts_val = self.initial_clean_up(self.raw_texts_val)
            return self.tokenized_texts_train, self.tokenized_texts_test, self.tokenized_texts_val, self.raw_labels_train, self.raw_labels_test, self.raw_labels_val
        return self.raw_texts_train, self.raw_texts_test, self.raw_texts_val, self.raw_labels_train, self.raw_labels_test, self.raw_labels_val

    def initial_clean_up(self, docs):
        """
        Strip punctuation from each document and tokenize it into words.

        Every character that is neither a word character nor whitespace is
        removed before tokenizing. No lower-casing is applied.

        Returns:
            list with one token list per document.
        """
        return [word_tokenize(re.sub(r'[^\w\s]', '', doc)) for doc in docs]

    def encode_labels(self):
        """
        Fit a LabelEncoder on the train labels and encode all three label sets.

        Requires ``all_data()`` to have been called first.

        Returns:
            (label_encoded_train, label_encoded_test, label_encoded_val)
        """
        self.lEnc = LabelEncoder()
        self.lEnc.fit(self.raw_labels_train)
        self.label_encoded_train = self.lEnc.transform(self.raw_labels_train)
        self.label_encoded_val = self.lEnc.transform(self.raw_labels_val)
        self.label_encoded_test = self.lEnc.transform(self.raw_labels_test)
        return self.label_encoded_train, self.label_encoded_test, self.label_encoded_val

    def train_val_split(self, val_size=0.1, random_state=42):
        """
        Return the train and validation parts using the dataset's own validation split.

        No random splitting happens here; ``val_size`` and ``random_state``
        are accepted only so the signature matches the other dataset classes
        and are ignored. Requires ``all_data()`` and ``encode_labels()`` to
        have been called.

        Returns:
            (X_train, X_val, y_train, y_val)
        """
        if self.word_level_tokenize:
            # all_data() stores the token lists under tokenized_texts_*.
            return self.tokenized_texts_train, self.tokenized_texts_val, self.label_encoded_train, self.label_encoded_val
        return self.raw_texts_train, self.raw_texts_val, self.label_encoded_train, self.label_encoded_val


In [None]:
import pandas as pd
class SST2Dataset(object):
    """SST2 dataset used in the PEACH experiment, read from three TSV files.

    The methods build on each other: call ``all_data()`` first, then
    ``encode_labels()``, then ``train_val_split()``.
    """

    def __init__(self, train_path, val_path, test_path, word_level_tokenize=False):
        """
        Read the train, validation and test files of the SST2 dataset.

        Each file is read as a header-less, tab-separated table; ``all_data``
        takes column 0 as the text and column 1 as the label.

        Args:
            train_path (str), path to where all the documents and labels in the training dataset are stored.
            val_path (str), path to where all the documents and labels in the validation dataset are stored.
            test_path (str), path to where all the documents and labels in the testing dataset are stored.
            word_level_tokenize (boolean), to indicate whether tokenized docs should be output. This is necessary for ELMo model but not other pretrained models. Default to be False

        """
        # Read from the paths the caller passed in, not fixed file names.
        self.train_in_df = pd.read_csv(train_path, sep='\t', header=None)
        self.val_in_df = pd.read_csv(val_path, sep='\t', header=None)
        self.test_in_df = pd.read_csv(test_path, sep='\t', header=None)
        self.original_split = ['train', 'test', 'val']
        self.word_level_tokenize = word_level_tokenize

    def all_data(self):
        """
        Extract texts and labels of the train, validation and test tables.

        Returns:
            (texts_train, texts_test, texts_val, labels_train, labels_test,
            labels_val); the texts are token lists when ``word_level_tokenize``
            is set, raw values otherwise. Note the test part precedes the
            validation part.
        """
        self.raw_texts_train = self.train_in_df[0].tolist()
        self.raw_labels_train = self.train_in_df[1].tolist()
        self.raw_texts_val = self.val_in_df[0].tolist()
        self.raw_labels_val = self.val_in_df[1].tolist()
        self.raw_texts_test = self.test_in_df[0].tolist()
        self.raw_labels_test = self.test_in_df[1].tolist()
        if self.word_level_tokenize:
            self.tokenized_texts_train = self.initial_clean_up(self.raw_texts_train)
            self.tokenized_texts_test = self.initial_clean_up(self.raw_texts_test)
            self.tokenized_texts_val = self.initial_clean_up(self.raw_texts_val)
            return self.tokenized_texts_train, self.tokenized_texts_test, self.tokenized_texts_val, self.raw_labels_train, self.raw_labels_test, self.raw_labels_val
        return self.raw_texts_train, self.raw_texts_test, self.raw_texts_val, self.raw_labels_train, self.raw_labels_test, self.raw_labels_val

    def initial_clean_up(self, docs):
        """
        Strip punctuation from each document and tokenize it into words.

        Every character that is neither a word character nor whitespace is
        removed before tokenizing. No lower-casing is applied.

        Returns:
            list with one token list per document.
        """
        return [word_tokenize(re.sub(r'[^\w\s]', '', doc)) for doc in docs]

    def encode_labels(self):
        """
        Fit a LabelEncoder on the train labels and encode all three label sets.

        Requires ``all_data()`` to have been called first.

        Returns:
            (label_encoded_train, label_encoded_test, label_encoded_val)
        """
        self.lEnc = LabelEncoder()
        self.lEnc.fit(self.raw_labels_train)
        self.label_encoded_train = self.lEnc.transform(self.raw_labels_train)
        self.label_encoded_val = self.lEnc.transform(self.raw_labels_val)
        self.label_encoded_test = self.lEnc.transform(self.raw_labels_test)
        return self.label_encoded_train, self.label_encoded_test, self.label_encoded_val

    def train_val_split(self, val_size=0.1, random_state=42):
        """
        Return the train and validation parts using the dataset's own validation file.

        No random splitting happens here; ``val_size`` and ``random_state``
        are accepted only so the signature matches the other dataset classes
        and are ignored. Requires ``all_data()`` and ``encode_labels()`` to
        have been called.

        Returns:
            (X_train, X_val, y_train, y_val)
        """
        if self.word_level_tokenize:
            # all_data() stores the token lists under tokenized_texts_*.
            return self.tokenized_texts_train, self.tokenized_texts_val, self.label_encoded_train, self.label_encoded_val
        return self.raw_texts_train, self.raw_texts_val, self.label_encoded_train, self.label_encoded_val

In [None]:
class TRECDataset(object):
    """Wrapper around the "trec" dataset used in the PEACH experiment."""

    def __init__(self, word_level_tokenize=False):
        """
        Load the TREC dataset through ``load_dataset``.

        Args:
            word_level_tokenize (boolean), whether ``all_data`` should return
            tokenized documents instead of raw strings. This is necessary for
            the ELMo model but not other pretrained models. Defaults to False.
        """
        self.dataset = load_dataset("trec")
        self.original_split = ['train', 'test']
        self.word_level_tokenize = word_level_tokenize

    def all_data(self):
        """
        Collect texts and coarse labels of the train and test splits.

        Returns:
            (texts_train, texts_test, labels_train, labels_test); the texts are
            token lists when ``word_level_tokenize`` is set, raw texts otherwise.
        """
        train_part = self.dataset['train']
        self.raw_texts_train = train_part['text']
        self.raw_labels_train = train_part['coarse_label']
        test_part = self.dataset['test']
        self.raw_texts_test = test_part['text']
        self.raw_labels_test = test_part['coarse_label']

        if not self.word_level_tokenize:
            return (
                self.raw_texts_train,
                self.raw_texts_test,
                self.raw_labels_train,
                self.raw_labels_test,
            )

        self.tokenized_texts_train = self.initial_clean_up(self.raw_texts_train)
        self.tokenized_texts_test = self.initial_clean_up(self.raw_texts_test)
        return (
            self.tokenized_texts_train,
            self.tokenized_texts_test,
            self.raw_labels_train,
            self.raw_labels_test,
        )

    def initial_clean_up(self, docs):
        """Remove non-word, non-whitespace characters from each doc, then word-tokenize it."""
        result = []
        for text in docs:
            stripped = re.sub(r'[^\w\s]', '', text)
            result.append(word_tokenize(stripped))
        return result

    def encode_labels(self):
        """
        Fit a LabelEncoder on the train labels, then encode train and test labels.

        Returns:
            (label_encoded_train, label_encoded_test)
        """
        encoder = LabelEncoder()
        self.lEnc = encoder
        encoder.fit(self.raw_labels_train)
        self.label_encoded_train = encoder.transform(self.raw_labels_train)
        self.label_encoded_test = encoder.transform(self.raw_labels_test)
        return self.label_encoded_train, self.label_encoded_test

    def train_val_split(self, val_size=0.1, random_state=42):
        """
        Split the train data into train and validation parts with ``train_test_split``.

        Args:
            val_size, forwarded as ``test_size``.
            random_state, forwarded as ``random_state``.

        Returns:
            (X_train, X_val, y_train, y_val)
        """
        if self.word_level_tokenize:
            features = self.tokenized_texts_train
        else:
            features = self.raw_texts_train
        train_x, val_x, train_y, val_y = train_test_split(
            features,
            self.label_encoded_train,
            test_size=val_size,
            random_state=random_state,
        )
        return train_x, val_x, train_y, val_y




In [None]:
class TwentyNGDataset(object):
    """20ng dataset used in the PEACH experiment, read from a doc file and a label file.

    The methods build on each other: call ``all_data()`` first, then
    ``encode_labels()``, then ``train_val_split()``.
    """

    def __init__(self, doc_path, label_path, word_level_tokenize=False):
        """
        Read the raw documents and label rows of the 20ng dataset.

        The doc file holds one document per row; the label file holds one
        index<TAB>split<TAB>label row per document, in the same order.

        Args:
            doc_path (str), path to where all the documents in the dataset are stored.
            label_path (str), path to where all the labels in the dataset are stored.
            word_level_tokenize (boolean), to indicate whether tokenized docs should be output. This is necessary for ELMo model but not other pretrained models. Default to be False

        """
        # Context managers make sure both files are closed after reading.
        with open(doc_path, 'r') as raw_in:
            self.raw_texts = raw_in.read().split('\n')
        with open(label_path, 'r') as label_file:
            self.raw_labels = label_file.read().split('\n')
        self.original_split = ['train', 'test']
        self.word_level_tokenize = word_level_tokenize

    def all_data(self):
        """
        Split documents and labels into train and test parts.

        Labels are mapped to integer ids in order of first appearance; the
        id of a label is its position in the returned ``label_mapping`` list.
        A row goes to the train part when its split field contains 'train',
        otherwise to the test part.

        Returns:
            (texts_train, texts_test, labels_train, labels_test, label_mapping),
            where the texts are token lists when ``word_level_tokenize`` is
            set and raw strings otherwise.
        """
        self.raw_texts_train = []
        self.raw_labels_train = []
        self.raw_texts_test = []
        self.raw_labels_test = []
        label_ids = {}  # label string -> integer id, in first-seen order
        for i, row in enumerate(self.raw_labels):
            if not row.strip():
                # A trailing newline in the label file yields an empty last
                # row; it carries no label, so skip it instead of crashing.
                continue
            fields = row.split('\t')
            split = fields[1]
            label = fields[2]
            label_id = label_ids.setdefault(label, len(label_ids))

            if 'train' in split:
                self.raw_texts_train.append(self.raw_texts[i])
                self.raw_labels_train.append(label_id)
            else:
                self.raw_texts_test.append(self.raw_texts[i])
                self.raw_labels_test.append(label_id)
        # dicts preserve insertion order, so position == assigned id.
        self.label_mapping = list(label_ids)

        if self.word_level_tokenize:
            self.tokenized_texts_train = self.initial_clean_up(self.raw_texts_train)
            self.tokenized_texts_test = self.initial_clean_up(self.raw_texts_test)
            return self.tokenized_texts_train, self.tokenized_texts_test, self.raw_labels_train, self.raw_labels_test, self.label_mapping
        return self.raw_texts_train, self.raw_texts_test, self.raw_labels_train, self.raw_labels_test, self.label_mapping

    def initial_clean_up(self, docs, max_tokens=128):
        """
        Strip punctuation, tokenize each document and cap its length.

        Every character that is neither a word character nor whitespace is
        removed before tokenizing. No lower-casing is applied.

        Args:
            docs (iterable of str), documents to clean.
            max_tokens (int or None), keep at most this many leading tokens
            per document; None keeps all tokens. Default 128.

        Returns:
            list with one token list per document.
        """
        # Slicing is a no-op for documents that are already short enough.
        return [
            word_tokenize(re.sub(r'[^\w\s]', '', doc))[:max_tokens]
            for doc in docs
        ]

    def encode_labels(self):
        """
        Fit a LabelEncoder on the train labels and encode train and test labels.

        Requires ``all_data()`` to have been called first.

        Returns:
            (label_encoded_train, label_encoded_test)
        """
        self.lEnc = LabelEncoder()
        self.lEnc.fit(self.raw_labels_train)
        self.label_encoded_train = self.lEnc.transform(self.raw_labels_train)
        self.label_encoded_test = self.lEnc.transform(self.raw_labels_test)
        return self.label_encoded_train, self.label_encoded_test

    def train_val_split(self, val_size=0.1, random_state=42):
        """
        Hold out part of the train data as a validation set.

        Requires ``all_data()`` and ``encode_labels()`` to have been called.

        Args:
            val_size, passed to ``train_test_split`` as ``test_size``.
            random_state, passed to ``train_test_split`` as ``random_state``.

        Returns:
            (X_train, X_val, y_train, y_val)
        """
        if self.word_level_tokenize:
            texts = self.tokenized_texts_train
        else:
            texts = self.raw_texts_train
        X_train, X_val, y_train, y_val = train_test_split(texts, self.label_encoded_train, test_size=val_size, random_state=random_state)
        return X_train, X_val, y_train, y_val