Download datasets

In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then build a PyDrive client from the
# application-default credentials.
# NOTE(review): the order of these four calls presumably matters (user
# authentication before the default credentials are requested) - confirm
# before reordering.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand the application-default credentials to PyDrive.
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client used below to create Drive file handles.
drive = GoogleDrive(gauth)

In [0]:
# Handle for the Drive file with this hard-coded ID; its content is
# downloaded as 'Load_GLUE.ipynb' in the following cell.
load_data = drive.CreateFile({'id':'1AlbO4pMgbCAZO2XyGyKXsdWszyj-xMHx'})

In [0]:
# Save the content of the Drive file to a local file named 'Load_GLUE.ipynb'
load_data.GetContentFile('Load_GLUE.ipynb')

# Run the downloaded notebook with the IPython %run magic; its progress
# messages ("Downloading and extracting ...") appear as this cell's output.
%run Load_GLUE.ipynb # this notebook downloads and extracts data

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
Downloading and extracting CoLA...
	Completed!
Downloading and extracting SST...
	Completed!
Downloading and extracting QQP...
	Completed!
Downloading and extracting STS...
	Completed!
Downloading and extracting MNLI...
	Completed!
Downloading and extracting SNLI...
	Completed!
Downloading and extracting QNLI...
	Completed!
Downloading and extracting RTE...
	Completed!
Downloading and extracting WNLI...
	Completed!
Processing MRPC...
	Completed!
Downloading and extracting diagnostic...
	Completed!


In [0]:
import csv

import pandas as pd

# Names of the datasets to load; each one has a train.tsv in its own sub-folder.
data_names = ['CoLA', 'MNLI', 'MRPC', 'QNLI', 'QQP', 'RTE', 'SST-2', 'STS-B', 'WNLI']
# 'SNLI' also included, but not explicitly stated in GLUE benchmark --> not regarded until now

# Folder on the mounted Drive that contains one sub-folder per dataset.
DATA_DIR = "/content/drive/My Drive/Master Thesis"
# Upper bound on the number of rows kept per dataset (to work with less data).
SAMPLE_SIZE = 1000
# Seed for every random draw in this cell, so a re-run keeps the same rows.
SEED = 42

# Extra read_csv arguments for the datasets that need explicit column names.
read_overrides = {
    # for CoLA dataset, column names need to be added
    'CoLA': {'names': ['sentence_source', 'label', 'label_notes', 'sentence']},
    # header=0 replaces the first line of the file with `names`.
    # The previous header=1 treated the SECOND line as the header and
    # therefore silently dropped the first data row.
    # NOTE(review): assumes MRPC/train.tsv starts with a header line - confirm.
    'MRPC': {'header': 0,
             'names': ['Quality', '#1 ID', '#2 ID', '#1 String', '#2 String']},
}

# define dictionary to save column names for each dataset
column_names = {}
# to store all datasets in one dict
datasets = {}

# Load the datasets into pandas dataframes.
for name in data_names:
    # A real tab separator with QUOTE_NONE splits every line on tabs without
    # any quote handling. The former two-character separator '\\t' was
    # interpreted as a regular expression, which forces the slow Python
    # parser and makes `quotechar` ineffective.
    datasets[name] = pd.read_csv(
        "{0}/{1}/train.tsv".format(DATA_DIR, name),
        sep='\t', quoting=csv.QUOTE_NONE,
        **read_overrides.get(name, {}))
    # Report the number of sentences.
    print('Number of training sentences: {:,}\n'.format(datasets[name].shape[0]))
    # Display 10 random rows from the data.
    print(datasets[name].sample(10, random_state=SEED))
    # save column names for each dataset in dict
    column_names[name] = datasets[name].columns

### in order to work with less data, sample data from datasets
for name in data_names:
    # Keep at most SAMPLE_SIZE rows; smaller datasets are only shuffled.
    n_rows = min(datasets[name].shape[0], SAMPLE_SIZE)
    datasets[name] = datasets[name].sample(n=n_rows, random_state=SEED)



Number of training sentences: 8,551

     sentence_source  ...                                           sentence
5166            ks08  ...   I believe it to be the switch that is defective.
3616            ks08  ...  It was several young students that the policem...
6840            m_02  ...                   I forwarded Winifred the letter.
1764            r-67  ...  Wilt is taller than I imagine anybody would ev...
4729            ks08  ...  This bed was surely slept in by a huge guy las...
4110            ks08  ...                           We were glad what to do.
7851            ad03  ...                                 Can he will do it?
2611            l-93  ...   Lora buttered at the toast with unsalted butter.
2412            l-93  ...   The contractor will build a house for $ 100,000.
2136            l-93  ...            Carla poured the pitcher with lemonade.

[10 rows x 4 columns]
Number of training sentences: 392,702

         index  promptID  ...         label1     gold_



Number of training sentences: 104,743

       index  ...           label
62248  62248  ...      entailment
67755  67755  ...      entailment
72783  72783  ...  not_entailment
59721  59721  ...  not_entailment
73525  73525  ...      entailment
5974    5974  ...      entailment
33510  33510  ...      entailment
99620  99620  ...      entailment
30793  30793  ...      entailment
25190  25190  ...  not_entailment

[10 rows x 4 columns]




Number of training sentences: 363,870

            id  ... is_duplicate
91751   307118  ...          0.0
330375    2560  ...          0.0
231484  136722  ...          1.0
89709   232263  ...          0.0
92481   309648  ...          1.0
14643   202442  ...          1.0
353922   89259  ...          0.0
82387   237335  ...          1.0
285688   55780  ...          0.0
180898  310011  ...          0.0

[10 rows x 6 columns]
Number of training sentences: 2,490

      index  ...           label
1569   1569  ...  not_entailment
1896   1896  ...      entailment
2125   2125  ...      entailment
0         0  ...  not_entailment
761     761  ...  not_entailment
2183   2183  ...      entailment
101     101  ...  not_entailment
2485   2485  ...  not_entailment
787     787  ...      entailment
1338   1338  ...      entailment

[10 rows x 4 columns]




Number of training sentences: 67,349

                                                sentence  label
59109                     lumpy as two-day old porridge       0
63673                 this is n't worth sitting through       0
17112  much about the film , including some of its ca...      1
34171                first-class , thoroughly involving       1
13130                            a dark-as-pitch comedy       0
55117  a delightful surprise because despite all the ...      1
58453  walk out of the good girl with mixed emotions --       0
43275                      's undeniably hard to follow       0
64707  anything except that the chelsea hotel today i...      0
18506            and that is where ararat went astray .       0




Number of training sentences: 5,749

      index  ... score
2944   2944  ...  2.75
4433   4433  ...  4.00
4194   4194  ...  0.60
1109   1109  ...  1.20
3816   3816  ...  1.60
2341   2341  ...  2.00
3211   3211  ...  3.40
1797   1797  ...  0.00
2421   2421  ...  2.20
2136   2136  ...  4.40

[10 rows x 10 columns]
Number of training sentences: 635

     index  ... label
140    140  ...     0
156    156  ...     1
434    434  ...     1
70      70  ...     1
397    397  ...     0
47      47  ...     0
473    473  ...     1
589    589  ...     0
39      39  ...     1
69      69  ...     0

[10 rows x 4 columns]




In [0]:
# save colnames of respective datasets in dictionary
# Map each dataset name to the array of its column labels.
colnames = {name: datasets[name].columns.values for name in data_names}

print(colnames)

{'CoLA': array(['sentence_source', 'label', 'label_notes', 'sentence'],
      dtype=object), 'MNLI': array(['index', 'promptID', 'pairID', 'genre', 'sentence1_binary_parse',
       'sentence2_binary_parse', 'sentence1_parse', 'sentence2_parse',
       'sentence1', 'sentence2', 'label1', 'gold_label'], dtype=object), 'MRPC': array(['Quality', '#1 ID', '#2 ID', '#1 String', '#2 String'],
      dtype=object), 'QNLI': array(['index', 'question', 'sentence', 'label'], dtype=object), 'QQP': array(['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate'],
      dtype=object), 'RTE': array(['index', 'sentence1', 'sentence2', 'label'], dtype=object), 'SST-2': array(['sentence', 'label'], dtype=object), 'STS-B': array(['index', 'genre', 'filename', 'year', 'old_index', 'source1',
       'source2', 'sentence1', 'sentence2', 'score'], dtype=object), 'WNLI': array(['index', 'sentence1', 'sentence2', 'label'], dtype=object)}


In [0]:
# Print the distinct label values of every dataset.
# The label column has a different name in some datasets, so each known
# label column is mapped to the tag that is shown in the printed headline.
label_tags = {
    'label': '',
    'gold_label': ' (gold_label)',      # MNLI contains variable gold_label
    'is_duplicate': ' (is_duplicate)',  # QQP contains variable is_duplicate
    'Quality': ' (Quality)',            # MRPC contains variable Quality
}

for name in data_names:
    # Walk the columns in file order and report every label column found.
    for column in datasets[name].columns.values:
        if column in label_tags:
            print(name + " labels" + label_tags[column]
                  + " contain following possible values of labels: ")
            print(pd.unique(datasets[name][column]))

# unique values of labels: [0 1] and ['not_entailment' 'entailment']
# unique values of gold_label (MNLI): ['neutral' 'entailment' 'contradiction']
# unique values of is_duplicate (QQP): [ 0.  1. nan]
# unique values of Quality (MRPC): [0 1]
# STS-B contains scores
# STS-B contains scores

CoLA labels contain following possible values of labels: 
[1 0]
MNLI labels (gold_label) contain following possible values of labels: 
['neutral' 'entailment' 'contradiction']
MRPC labels (is_duplicate) contain following possible values of labels: 
[0 1]
QNLI labels contain following possible values of labels: 
['not_entailment' 'entailment']
QQP labels (is_duplicate) contain following possible values of labels: 
[ 0.  1. nan]
RTE labels contain following possible values of labels: 
['not_entailment' 'entailment']
SST-2 labels contain following possible values of labels: 
[0 1]
WNLI labels contain following possible values of labels: 
[1 0]


In [0]:
import numpy as np

# Encode the string labels of MNLI, QNLI and RTE as integers, so that the
# model can work with them (model cannot work with strings).
labels = {}
mnli_qnli_rte = ['MNLI', 'QNLI', 'RTE']

# Integer code of every string label, per label column.
# NOTE(review): 'contradiction' is encoded as -1; confirm that the model or
# loss used later accepts negative class ids.
label_encodings = {
    # QNLI & RTE
    'label': {'not_entailment': 0, 'entailment': 1},
    # MNLI
    'gold_label': {'neutral': 0, 'entailment': 1, 'contradiction': -1},
}

# Build one integer array per dataset.
for name in mnli_qnli_rte:
    frame = datasets[name]
    for column in frame.columns.values:
        if column not in label_encodings:
            continue
        coded = frame[column].map(label_encodings[column])
        # Labels without a code come back as NaN. Fail loudly instead of
        # storing an arbitrary number as a label (the former element-wise fill
        # of an np.empty array left such positions uninitialised).
        unknown = coded.isna()
        if unknown.any():
            raise ValueError(
                "{0}: column '{1}' contains labels without an integer code: {2}"
                .format(name, column, list(pd.unique(frame.loc[unknown, column]))))
        labels[name] = coded.to_numpy(dtype=int)
    # check if the length of the label array equals the length of the dataset
    print(len(labels[name]) == frame.shape[0])  # output: TRUE

# print unique labels and respective count of each unique label
for name in mnli_qnli_rte:
    print(np.unique(labels[name], return_counts=True))

# add new column to dataset (with coded labels)
for name in mnli_qnli_rte:
    datasets[name]['label_coded'] = labels[name]
    print(np.unique(datasets[name].label_coded))
    print(datasets[name].sample(5))

# rename columns of MRPC (problem with # in python)
datasets['MRPC'].columns = ['Quality', 'ID1', 'ID2', 'String1', 'String2']

# save colnames after encoding labels (so that encoded labels are also included)
colnames = {name: datasets[name].columns.values for name in data_names}

print(colnames)

True
True
True
(array([-1,  0,  1]), array([130903, 130900, 130899]))
(array([0, 1]), array([52366, 52377]))
(array([0, 1]), array([1241, 1249]))
[-1  0  1]
         index  promptID   pairID  ...         label1     gold_label label_coded
184965  184965    136669  136669e  ...     entailment     entailment           1
168825  168825    107256  107256n  ...        neutral        neutral           0
334210  334210    124235  124235n  ...        neutral        neutral           0
30901    30901     53651   53651n  ...        neutral        neutral           0
249052  249052     51002   51002c  ...  contradiction  contradiction          -1

[5 rows x 13 columns]
[0 1]
       index  ... label_coded
45426  45426  ...           0
90646  90646  ...           1
51473  51473  ...           1
14572  14572  ...           1
73156  73156  ...           0

[5 rows x 5 columns]
[0 1]
      index  ... label_coded
5         5  ...           1
1992   1992  ...           0
413     413  ...           1
1162

In [0]:
import math  # for operating with NaNs

# Extract the sentences & labels of the training sets as numpy ndarrays.
sent_quest_1 = {}    # for saving all first sentences & questions
sent_quest_2 = {}    # for saving all second sentences & questions
labels = {}          # for saving all labels & scores

# Column names that hold the first text, the second text and the labels.
first_text_columns = ('sentence1', 'question1', 'question', 'String1')
second_text_columns = ('sentence2', 'question2', 'String2')
label_columns = ('label_coded', 'Quality', 'score')


def _non_text_positions(values):
    """Return the positions of all entries of `values` that are not strings."""
    return [pos for pos, value in enumerate(values) if not isinstance(value, str)]


# print all colnames for each dataset in an ordered manner
for name in data_names:
    print('\n' + name + ' colnames:')
    for column in colnames[name]:
        print(column)

# Go through the colnames of every dataset (defined earlier) and store each
# text / label column in the matching dictionary.
for name in data_names:
    for column in colnames[name]:
        if column in first_text_columns:
            sent_quest_1[name] = datasets[name][column].values
        elif column in second_text_columns:
            sent_quest_2[name] = datasets[name][column].values
        elif column in label_columns:
            labels[name] = datasets[name][column].values

# Special cases that the loop above does not cover:
# QNLI (question, sentence) and CoLA & SST-2 (both: only sentence)
sent_quest_1['CoLA'] = datasets['CoLA'].sentence.values
sent_quest_1['SST-2'] = datasets['SST-2'].sentence.values
sent_quest_2['QNLI'] = datasets['QNLI'].sentence.values

# print keys of labels
print('\n' + 'Keys of labels (to find out which are missing)')
print(labels.keys())

# label columns that are not encoded (CoLA, SST-2, WNLI)
labels['CoLA'] = datasets['CoLA'].label.values
labels['SST-2'] = datasets['SST-2'].label.values
labels['WNLI'] = datasets['WNLI'].label.values

# QQP: keep the raw values for now. The column can contain NaN, and NaN
# cannot be represented as int, so the cast to int happens only after the
# NaN rows have been removed further below.
labels['QQP'] = datasets['QQP'].is_duplicate.values

print('\n' + 'Keys of labels')
print(labels.keys())

# print dictionary keys
print('\n' + 'Keys of sent_quest_1')
print(sent_quest_1.keys())
print('\n' + 'Keys of sent_quest_2')
print(sent_quest_2.keys())

##### Delete NaNs for QQP & MNLI!!! ######
# print length of QQP sentences to compare later
print('\n' + 'Number of elements in QQP: ' + str(len(sent_quest_1['QQP'])))
# Positions of the missing labels. They have to be located BEFORE the cast to
# int: after the cast no value is NaN any more, so nothing would be removed.
nans_qqp = np.flatnonzero(pd.isnull(labels['QQP'])).tolist()
# number of NANs
print('\n' + 'Number of NaNs in QQP: ' + str(len(nans_qqp)))
# delete NaNs
sent_quest_1['QQP'] = np.delete(sent_quest_1['QQP'], nans_qqp)
sent_quest_2['QQP'] = np.delete(sent_quest_2['QQP'], nans_qqp)
# Builtin int instead of the np.int alias, which newer NumPy no longer has.
labels['QQP'] = np.delete(labels['QQP'], nans_qqp).astype(int)
# print length of QQP sentences to compare
print('\n' + 'Number of elements in QQP: ' + str(len(sent_quest_1['QQP'])))

### MNLI
print('\n' + 'Number of elements in MNLI: ' + str(len(sent_quest_1['MNLI'])))
# Positions where the first or the second sentence is not a string (NaN).
nans_mnli = sorted(set(_non_text_positions(sent_quest_1['MNLI']))
                   | set(_non_text_positions(sent_quest_2['MNLI'])))
# Number of NaNs
print('\n' + 'Number of NaNs in MNLI: ' + str(len(nans_mnli)))
# delete NaNs
sent_quest_1['MNLI'] = np.delete(sent_quest_1['MNLI'], nans_mnli)
sent_quest_2['MNLI'] = np.delete(sent_quest_2['MNLI'], nans_mnli)
labels['MNLI'] = np.delete(labels['MNLI'], nans_mnli)
print('\n' + 'Number of elements in MNLI: ' + str(len(sent_quest_1['MNLI'])))


CoLA colnames:
sentence_source
label
label_notes
sentence

MNLI colnames:
index
promptID
pairID
genre
sentence1_binary_parse
sentence2_binary_parse
sentence1_parse
sentence2_parse
sentence1
sentence2
label1
gold_label
label_coded

MRPC colnames:
Quality
ID1
ID2
String1
String2

QNLI colnames:
index
question
sentence
label
label_coded

QQP colnames:
id
qid1
qid2
question1
question2
is_duplicate

RTE colnames:
index
sentence1
sentence2
label
label_coded

SST-2 colnames:
sentence
label

STS-B colnames:
index
genre
filename
year
old_index
source1
source2
sentence1
sentence2
score

WNLI colnames:
index
sentence1
sentence2
label

Keys of labels (to find out which are missing)
dict_keys(['MNLI', 'MRPC', 'QNLI', 'RTE', 'STS-B'])

Keys of labels
dict_keys(['MNLI', 'MRPC', 'QNLI', 'RTE', 'STS-B', 'CoLA', 'SST-2', 'WNLI', 'QQP'])

Keys of sent_quest_1
dict_keys(['MNLI', 'MRPC', 'QNLI', 'QQP', 'RTE', 'STS-B', 'WNLI', 'CoLA', 'SST-2'])

Keys of sent_quest_2
dict_keys(['MNLI', 'MRPC', 'QQP', 'RTE',