### utils

In [1]:
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.model_selection import train_test_split
import re
import codecs

In [2]:
def _read_file_datasets(file_path, sep=" "):
    """Parse a delimited text file into a list of rows.

    Every line is split on ``sep``. A field accepted by ``is_number`` is
    converted to ``float``; any other field is kept as a string with its
    surrounding whitespace removed.

    Args:
        file_path: Path of the UTF-8 text file to read.
        sep: Delimiter handed to ``str.split`` (a single space by default).

    Returns:
        A list with one entry per line of the file, each entry being a list
        of floats and strings.
    """
    with open(file_path, encoding="utf-8") as handle:
        raw_lines = handle.readlines()

    rows = []
    for raw_line in raw_lines:
        row = []
        for field in raw_line.split(sep):
            if is_number(field):
                row.append(float(field.strip()))
            else:
                row.append(field.strip())
        rows.append(row)
    return rows

In [3]:
def is_number(string):
    """Report whether ``string`` can be converted by ``float()``.

    Args:
        string: Value to probe, normally a text field read from a file.

    Returns:
        True when ``float(string)`` succeeds, False otherwise. Anything
        ``float`` accepts counts, including text padded with whitespace and
        the special spellings "nan" and "inf".
    """
    try:
        float(string)
    except (TypeError, ValueError):
        # ValueError: text that does not spell a number.
        # TypeError: a value float() cannot take at all (None, a list, ...);
        # a yes/no predicate should answer False rather than raise.
        return False
    return True

## OpenWebText

In [4]:
import datasets

In [None]:
from datasets import load_dataset

# Load the "openwebtext" dataset, using the current directory as cache_dir.
# NOTE(review): presumably this triggers a large download on first run; confirm before executing.
dataset = load_dataset("openwebtext", cache_dir="./")

## Static Word Embeddings Evaluation

### ws353

In [4]:
file_path = "./ws353simrel/wordsim353_annotator2.txt"

In [5]:
data = _read_file_datasets(file_path, sep="\t")

In [6]:
data[0]

['s', 'love', 'sex', 6.77]

### MEN

In [7]:
file_path = "./MEN/MEN_dataset_natural_form_full"

In [8]:
data = _read_file_datasets(file_path)

In [9]:
data[0]

['sun', 'sunlight', 50.0]

### SIMLEX

In [10]:
file_path = "./SIMLEX/SimLex-999.txt"

In [11]:
data = _read_file_datasets(file_path, sep="\t")

In [12]:
data[0]

['old', 'new', 'A', 1.58, 2.72, 2.81, 2.0, 7.25, 1.0, 0.41]

### RW

In [13]:
file_path = "./RW/rw.txt"

In [14]:
data = _read_file_datasets(file_path, sep="\t")

In [15]:
data[0]

['squishing', 'squirt', 5.88, 7.0, 7.0, 6.0, 1.0, 4.0, 6.0, 6.0, 7.0, 2.0, 4.0]

### RG

In [16]:
file_path = "./RG/rg65.csv"

In [17]:
data = _read_file_datasets(file_path, sep=";")

In [18]:
data[0]

['cord', 'smile', 0.02]

### Mturk

In [19]:
file_path = "./Mturk/EN-MTurk-287.txt"

In [20]:
data = _read_file_datasets(file_path, sep="\t")

In [21]:
data[0]

['episcopal', 'russia', 2.75]

## CONTEXT-DEPENDENT WORD EMBEDDINGS

### WiC dataset

In [22]:
train_file = "./WiC/train/train.data.txt"

In [23]:
train_data = _read_file_datasets(train_file, sep="\t")

In [24]:
train_data[0]

['carry',
 'V',
 '2-1',
 'You must carry your camping gear .',
 'Sound carries well over water .']

### SCWS

In [25]:
train_file = "SCWS/ratings.txt"
train_data = _read_file_datasets(train_file, sep="\t")

In [26]:
train_data[0]

[1.0,
 'Brazil',
 'n',
 'nut',
 'n',
 'gap in income between blacks and other non-whites is relatively small compared to the large gap between whites and non-whites . Other factors such as illiteracy and education level show the same patterns . Unlike in the US where African Americans were united in the civil rights struggle , in <b> Brazil </b> the philosophy of whitening has helped divide blacks from other non-whites and prevented a more active civil rights movement . Though Afro-Brazilians make up half the population there are very few black politicians . The city of Salvador , Bahia for instance is 80 % Afro-Brazilian but has never',
 'of the neck , bridge , and pickups , there are features which are found in almost every guitar . The photo below shows the different parts of an electric guitar . The headstock ( 1 ) contains the metal machine heads , which are used for tuning ; the <b> nut </b> ( 1.4 ) , a thin fret-like strip of metal , plastic , graphite or bone which the strings 

## Document Classification

### 20newsgroup dataset

In [27]:
class newsgroupDATASET(Dataset):
    """Map-style dataset pairing each news text with its label."""

    def __init__(self, news, labels):
        """Keep references to the texts and labels, both indexed by position."""
        self.labels = labels
        self.news = news

    def __len__(self):
        """Return the size of the dataset, taken from the number of labels."""
        return len(self.labels)

    def __getitem__(self, index):
        """Return the ``(example, label)`` pair stored at ``index``."""
        return self.news[index], self.labels[index]

In [28]:
from sklearn.datasets import fetch_20newsgroups

# Options common to both splits; only `subset` differs between the two calls.
_newsgroup_fetch_options = dict(
    download_if_missing=False,
    return_X_y=True,
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)

newsgroup_train_X, newsgroup_train_y = fetch_20newsgroups(subset="train", **_newsgroup_fetch_options)
newsgroup_test_X, newsgroup_test_y = fetch_20newsgroups(subset="test", **_newsgroup_fetch_options)

In [29]:
# Each batch is a tuple of news texts plus a tensor of labels.
train_loader = DataLoader(
    dataset=newsgroupDATASET(newsgroup_train_X, newsgroup_train_y),
    batch_size=32,
    shuffle=False,
)
test_loader = DataLoader(
    dataset=newsgroupDATASET(newsgroup_test_X, newsgroup_test_y),
    batch_size=32,
    shuffle=False,
)

In [30]:
# Sanity check: look at the first batch only, then stop iterating.
for x, y in train_loader:
    print("Length news batch:", len(x))
    print("Targets:", y)
    break

Length news batch: 32
Targets: tensor([17,  0, 17, 11, 10, 15,  4, 17, 13, 12,  1,  6, 13, 15,  4, 11, 11, 10,
        12, 19, 12, 12, 16, 16, 10, 15,  6, 11, 10,  8, 11,  4])


### WOS-11967

In [31]:
class WOSDATASET(Dataset):
    """Map-style dataset pairing each document with its label."""

    def __init__(self, documents, labels):
        """Keep references to the documents and labels, both indexed by position."""
        self.labels = labels
        self.documents = documents

    def __len__(self):
        """Return the size of the dataset, taken from the number of labels."""
        return len(self.labels)

    def __getitem__(self, index):
        """Return the ``(example, label)`` pair stored at ``index``."""
        return self.documents[index], self.labels[index]

In [32]:
def _generate_examples(input_file, label_file, label_level_1_file, label_level_2_file):
        """Yields examples."""
        with open(input_file, encoding="utf-8") as f:
            input_data = f.readlines()
        with open(label_file, encoding="utf-8") as f:
            label_data = f.readlines()
        with open(label_level_1_file, encoding="utf-8") as f:
            label_level_1_data = f.readlines()
        with open(label_level_2_file, encoding="utf-8") as f:
            label_level_2_data = f.readlines()
        for i in range(len(input_data)):
            yield i, {
                "input_data": input_data[i],
                "label": label_data[i],
                "label_level_1": label_level_1_data[i],
                "label_level_2": label_level_2_data[i],
            }
            
def _read_data(input_file, label_file, label_level_1_file, label_level_2_file):
    with open(input_file, encoding="utf-8") as f:
        input_data = f.readlines()
    with open(label_file, encoding="utf-8") as f:
        label_data = f.readlines()
        label_data = list(map(lambda s: int(s.strip()), label_data))
    with open(label_level_1_file, encoding="utf-8") as f:
        label_level_1_data = f.readlines()
    with open(label_level_2_file, encoding="utf-8") as f:
        label_level_2_data = f.readlines()
    return input_data, label_data, label_level_1_data, label_level_2_data

In [33]:
# Locations of the WOS-11967 text file (X) and label files (Y, YL1, YL2).
dir_path = "./WOS11967/"
input_file, label_file, label_level_1_file, label_level_2_file = (
    dir_path + file_name for file_name in ("X.txt", "Y.txt", "YL1.txt", "YL2.txt")
)

In [34]:
examples = _generate_examples(input_file, label_file, label_level_1_file, label_level_2_file)

In [35]:
# Take only the first (index, example) pair from the generator and print its
# label exactly as it was read from the file.
for example in examples:
    print(example[1]["label"])
    break

10



In [36]:
input_data, label_data, label_level_1_data, label_level_2_data = _read_data(input_file, label_file, label_level_1_file, label_level_2_file)

In [37]:
# Hold out 20% of the documents for testing; `stratify` keeps the class
# proportions of label_data in both parts. random_state is fixed so that the
# partition is identical on every run; without it each execution draws a
# different split and downstream results cannot be reproduced.
input_data_train, input_data_test, label_data_train, label_data_test = train_test_split(
    input_data,
    label_data,
    test_size=0.2,
    stratify=label_data,
    random_state=1,
)

In [38]:
# Each batch is a tuple of documents plus a tensor of labels.
train_loader = DataLoader(
    dataset=WOSDATASET(input_data_train, label_data_train),
    batch_size=32,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=WOSDATASET(input_data_test, label_data_test),
    batch_size=32,
    shuffle=True,
)

In [39]:
# Sanity check: look at the first batch only (size, first document, labels),
# then stop iterating.
for x, y in train_loader:
    print("Length documents batch:", len(x))
    print("Example : \n", x[0])
    print("Targets:", y)
    break

Length documents batch: 32
Example : 
 A high-altitude (>1,500 m asl) integrated participatory watershed development programme was implemented between 2004 and 2008 in the West Khasi Hills, Meghalaya, North-Eastern Indian Himalaya. The aim was to assess and refine practices for integrating crop, fish and livestock production systems. Soil and water conservation measures, with the active participation of local inhabitants, included the construction and renovation of ponds, jalkunds (micro rainwater-harvesting structures) and bench and half-moon terraces. Impact analysis revealed that 4.3 million litres of water were harvested and enhanced potato and rice crop productivity by 30% to 40% and 45% to 50% respectively. Farmers are now able to earn net incomes of around $56.8 and $8.9 per month from community dairy units and fish ponds respectively.

Targets: tensor([21, 10, 12,  2,  6,  5, 24,  2, 29,  1, 28, 19, 24,  4,  9, 12, 18,  7,
         8, 16,  4,  1, 19, 29,  0, 18, 27,  8,  9, 18,

### TREC-6 dataset

In [40]:
class TREC6DATASET(Dataset):
    """Map-style dataset pairing each question with its label."""

    def __init__(self, questions, labels):
        """Keep references to the questions and labels, both indexed by position."""
        self.labels = labels
        self.questions = questions

    def __len__(self):
        """Return the size of the dataset, taken from the number of labels."""
        return len(self.labels)

    def __getitem__(self, index):
        """Return the ``(example, label)`` pair stored at ``index``."""
        return self.questions[index], self.labels[index]

In [41]:
train_file = "TREC-6/train.txt"
test_file = "TREC-6/test.txt"

In [42]:
def convert_data(data_name):
    """Read a ``<label> <text>`` file into cleaned texts and integer labels.

    Every non-blank line is expected to start with a single-digit class label
    followed by one separator character and then the text.

    Args:
        data_name: Path of the latin-1 encoded file to read.

    Returns:
        A ``(features, lbl)`` pair of equal-length lists: the cleaned texts
        (label prefix removed) and the matching integer labels.

    Raises:
        ValueError: If a non-blank line does not start with a digit.
    """
    features = []
    lbl = []
    # Built-in open() breaks lines on "\n", "\r" and "\r\n" only, whereas
    # iterating a codecs.open() stream also splits on characters such as
    # "\x85", which latin-1 text may contain in the middle of a line.
    with open(data_name, 'r', encoding="latin-1") as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # A blank (e.g. trailing) line carries no label: skip it
                # instead of failing in int().
                continue
            # Label and text are both taken from the stripped line so that
            # leading whitespace cannot shift the label position.
            y = int(stripped[0])
            # After cleaning, the text starts two characters in
            # (label + separating space).
            features.append(clean_str(stripped)[2:])
            lbl.append(y)
    return features, lbl


def clean_str(string):
    """Normalise ``string`` with an ordered series of regex substitutions.

    Characters outside ``A-Za-z0-9(),!?'`` and the backtick become spaces,
    common English contractions ('s, 've, n't, 're, 'd, 'll) are split off
    with a leading space, the punctuation marks ``, ! ( ) ?`` are surrounded
    by spaces, and runs of whitespace are collapsed to a single space.

    Args:
        string: Text to clean.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # Order matters: later rules operate on the output of earlier ones.
    substitutions = (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " \'s"),
        (r"\'ve", " \'ve"),
        (r"n\'t", " n\'t"),
        (r"\'re", " \'re"),
        (r"\'d", " \'d"),
        (r"\'ll", " \'ll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r"\?", " ? "),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in substitutions:
        string = re.sub(pattern, replacement, string)
    return string.strip()


In [43]:
# Parse the train and test files into cleaned texts and integer labels.
train_input, train_output = convert_data(train_file)
test_input, test_output = convert_data(test_file)

In [44]:
# Each batch is a tuple of questions plus a tensor of labels.
# The test questions must be paired with test_output: pairing them with
# train_output gives every test question a wrong label and makes the dataset
# report the training-set length, so iterating it indexes past the end of
# test_input.
train_loader = DataLoader(TREC6DATASET(train_input, train_output), batch_size=32, shuffle=True)
test_loader = DataLoader(TREC6DATASET(test_input, test_output), batch_size=32, shuffle=True)

In [45]:
# Sanity check: look at the first batch only (size, first question, labels),
# then stop iterating.
for x, y in train_loader:
    print("Length questions batch:", len(x))
    print("Example : \n", x[0])
    print("Targets:", y)
    break

Length questions batch: 32
Example : 
 What city did the Mormons establish as their headquarters in 1847 ?
Targets: tensor([4, 0, 3, 1, 1, 4, 5, 0, 3, 0, 3, 3, 0, 0, 3, 0, 0, 0, 1, 4, 0, 1, 0, 3,
        4, 0, 3, 5, 4, 5, 1, 0])


### SST dataset

In [46]:
class SSTDATASET(Dataset):
    """Map-style dataset pairing each review with its label."""

    def __init__(self, reviews, labels):
        """Keep references to the reviews and labels, both indexed by position."""
        self.labels = labels
        self.reviews = reviews

    def __len__(self):
        """Return the size of the dataset, taken from the number of labels."""
        return len(self.labels)

    def __getitem__(self, index):
        """Return the ``(example, label)`` pair stored at ``index``."""
        return self.reviews[index], self.labels[index]

In [47]:
train_file = "SST/train.txt"
test_file = "SST/test.txt"

In [48]:
# Parse the train and test files into cleaned texts and integer labels.
train_input, train_output = convert_data(train_file)
test_input, test_output = convert_data(test_file)

In [49]:
# Each batch is a tuple of reviews plus a tensor of labels.
# The test reviews must be paired with test_output: pairing them with
# train_output gives every test review a wrong label and makes the dataset
# report the training-set length, so iterating it indexes past the end of
# test_input.
train_loader = DataLoader(SSTDATASET(train_input, train_output), batch_size=32, shuffle=False)
test_loader = DataLoader(SSTDATASET(test_input, test_output), batch_size=32, shuffle=False)

In [50]:
# Sanity check: look at the first batch only (size, first review, labels),
# then stop iterating.
for x, y in train_loader:
    print("Length reviews batch:", len(x))
    print("Example : \n", x[0])
    print("Targets:", y)
    break

Length reviews batch: 32
Example : 
 a stirring , funny and finally transporting re imagining of beauty and the beast and 1930s horror films
Targets: tensor([1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
        0, 1, 1, 0, 1, 0, 0, 1])
