### Libraries

In [30]:
import os
import sys

import pandas as pd
import numpy as np

from urllib import request
import zipfile
from zipfile import ZipFile

from typing import List, Dict, Callable

from tqdm import tqdm

### Download / Extraction of the dataset

In [4]:
def download_dataset(download_path: str, url: str):
    """
    Downloads the file at ``url`` to ``download_path`` unless that path
    already exists.

    The payload is first written to a temporary ``<download_path>.part`` file
    and only renamed into place once the transfer has finished. An interrupted
    download therefore never leaves a truncated file at ``download_path`` that
    a later run would mistake for a complete one.

    :param download_path: destination file path
    :param url: URL of the resource to fetch

    :raises: whatever ``urllib.request.urlretrieve`` raises on failure; the
             partial file is removed before the error propagates.
    """
    if os.path.exists(download_path):
        return

    partial_path = download_path + ".part"
    print("Downloading dataset...")
    try:
        request.urlretrieve(url, partial_path)
        # Atomic on the same filesystem: the final path appears only when complete.
        os.replace(partial_path, download_path)
    except BaseException:
        # Also covers KeyboardInterrupt (kernel interrupt) in the middle of a transfer.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise
    print("Download complete!")

def extract_dataset(download_path: str, extract_path: str):
    """
    Unpacks every member of the zip archive at ``download_path`` into the
    ``extract_path`` directory, reporting start and end on stdout.

    :param download_path: path of the zip archive to read
    :param extract_path: directory the archive content is extracted into
    """
    print("Extracting dataset... (it may take a while...)")
    archive = ZipFile(download_path)
    try:
        archive.extractall(path=extract_path)
    finally:
        # Always release the file handle, even if extraction fails.
        archive.close()
    print("Extraction completed!")

In [5]:
# Show where the notebook is running: every dataset path below is built from it.
print(f"Current work directory: {os.getcwd()}")
dataset_folder = os.path.join(os.getcwd(), "Datasets")

# Create the target folder on the first run.
if not os.path.exists(dataset_folder):
    os.makedirs(dataset_folder)

# Zipped corpus, fetched from the nltk_data GitHub repository.
url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip"
dataset_path = os.path.join(dataset_folder, "dependency_treebank.zip")
print(dataset_path)

# The download is skipped when the archive is already on disk; extraction runs every time.
download_dataset(dataset_path, url)
extract_dataset(dataset_path, dataset_folder)

Current work directory: /content
/content/Datasets/dependency_treebank.zip
Downloading dataset...
Download complete!
Extracting dataset... (it may take a while...)
Extraction completed!


### Dataframe structuring

Build the complete dataset as a single DataFrame, using a combined phrase/word index to preserve the ordering of sentences and of the words within them.

In [60]:
def encode_dataset(dataset_name: str) -> pd.DataFrame:
    """
    Parses every file of ``<cwd>/Datasets/<dataset_name>`` into one token-level
    DataFrame, pickles it and returns it.

    Each non-blank line of a file is split on tabs: field 0 is the word and
    field 1 its POS tag. A blank line closes the current sentence. The document
    id is the integer between the first ``_`` and the following ``.`` of the
    file name (e.g. ``wsj_0001.dp`` -> ``1``).

    The result is written to
    ``<cwd>/Datasets/Dataframes/<dataset_name>/<dataset_name>.pkl``.

    :param dataset_name: name of the folder (inside ``Datasets``) to encode

    :return: DataFrame with columns ``doc``, ``phrase_word``
             (``"<sentence idx>_<word idx>"``, each zero-padded to 2 digits),
             ``word`` and ``POS``, ordered by document and, inside a document,
             by position in the file.

    :raises RuntimeError: if a file cannot be parsed (chained to the original error)
    """
    columns = ["doc", "phrase_word", "word", "POS"]
    dataframe_rows = []
    folder = os.path.join(os.getcwd(), "Datasets", dataset_name)
    for filename in tqdm(os.listdir(folder)):
        file_path = os.path.join(folder, filename)
        if not os.path.isfile(file_path):
            continue
        try:
            file_id = int(filename.split("_")[1].split(".")[0])
            with open(file_path, mode='r', encoding='utf-8') as text_file:
                lines = text_file.read().splitlines()

            phrase_idx = 0
            word_idx = 0
            for line in lines:
                if line.strip():
                    fields = line.split("\t")
                    dataframe_rows.append({
                        "doc": file_id,
                        "phrase_word": str(phrase_idx).zfill(2) + "_" + str(word_idx).zfill(2),
                        "word": fields[0],
                        "POS": fields[1],
                    })
                    word_idx += 1
                else:
                    # A blank line starts the next sentence.
                    phrase_idx += 1
                    word_idx = 0
        except Exception as e:
            # Fail loudly with a real error instead of sys.exit(0), which
            # signals success and terminates the whole interpreter.
            raise RuntimeError('Failed to process %s. Reason: %s' % (file_path, e)) from e

    folder = os.path.join(os.getcwd(), "Datasets", "Dataframes", dataset_name)
    os.makedirs(folder, exist_ok=True)

    # Passing `columns` keeps the schema even when no row was collected.
    df = pd.DataFrame(dataframe_rows, columns=columns)
    # Rows of a document are appended in file order, so a *stable* sort on
    # `doc` alone preserves that order. Sorting on the string `phrase_word`
    # would misplace indices >= 100 ("100_00" sorts before "10_00").
    df = df.sort_values(by="doc", kind="mergesort")
    dataframe_path = os.path.join(folder, dataset_name + ".pkl")
    df.to_pickle(dataframe_path)
    return df

In [61]:
print("Encoding dataset...")
# Parse the extracted treebank files into one token-level DataFrame (also pickled to disk).
df = encode_dataset(dataset_name='dependency_treebank')
print("Encoding completed!")
print(df)

Encoding dataset...


100%|██████████| 199/199 [00:00<00:00, 929.97it/s]


Encoding completed!
       doc phrase_word     word  POS
11421    1       00_00   Pierre  NNP
11422    1       00_01   Vinken  NNP
11423    1       00_02        ,    ,
11424    1       00_03       61   CD
11425    1       00_04    years  NNS
...    ...         ...      ...  ...
62768  199       02_10  quarter   NN
62769  199       02_11       of   IN
62770  199       02_12     next   JJ
62771  199       02_13     year   NN
62772  199       02_14        .    .

[94084 rows x 4 columns]


### Text preprocessing/normalization

Most of the work is already done, since the text is tokenized when the first DataFrame is created. Removing uppercase letters, periods and commas may not be beneficial for this task.

- TODO: discuss later

In [62]:
import re
from functools import reduce

# Characters that get replaced by a space: / ( ) { } [ ] | @ ; -
# Raw string: `\[`, `\]` and `\|` are invalid escapes in a normal string literal
# and emit a DeprecationWarning/SyntaxWarning; the compiled pattern is unchanged.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@;-]')


In [56]:
def lower(text: str) -> str:
    """
    Returns a lower-cased copy of ``text``.
    """
    lowered = text.lower()
    return lowered

def replace_special_characters(text: str) -> str:
    """
    Substitutes every character matched by ``REPLACE_BY_SPACE_RE``
    (parentheses, brackets, slashes, ...) with a single space.
    """
    cleaned = REPLACE_BY_SPACE_RE.sub(repl=' ', string=text)
    return cleaned

def strip_text(text: str) -> str:
    """
    Returns ``text`` without leading and trailing whitespace
    (spaces, tabs, newlines, carriage returns).
    """
    stripped = text.strip()
    return stripped

In [63]:
# Default pre-processing steps, applied left to right. Stripping comes last so
# that spaces introduced by the special-character replacement at either end of
# the text are trimmed as well.
PREPROCESSING_PIPELINE = [lower, replace_special_characters, strip_text]

def text_prepare(text: str,
                 filter_methods: List[Callable[[str], str]] = None) -> str:
    """
    Runs ``text`` through a sequence of pre-processing functions.

    Each function receives the output of the previous one, so the order of
    ``filter_methods`` matters.

    :param text: the string to pre-process
    :param filter_methods: functions to apply, in order; when ``None`` the
                           module-level ``PREPROCESSING_PIPELINE`` is used

    :return: the pre-processed string
    """
    if filter_methods is None:
        filter_methods = PREPROCESSING_PIPELINE

    processed = text
    for step in filter_methods:
        processed = step(processed)
    return processed

In [64]:
print('Pre-processing text...')
print()
# Replace each token of the 'word' column with its pre-processed version.
# Note: this overwrites the column of `df` in place.
df['word'] = df['word'].apply(lambda txt: text_prepare(txt))


print("Pre-processing completed!")

Pre-processing text...

Pre-processing completed!


In [65]:
# Inspect the 'word' column after pre-processing.
print(df['word'])

11421     pierre
11422     vinken
11423          ,
11424         61
11425      years
          ...   
62768    quarter
62769         of
62770       next
62771       year
62772          .
Name: word, Length: 94084, dtype: object


### Train/Test split

Split the data into three different DataFrames (train, validation, test), as requested.

In [37]:
def split_dataframe(df: pd.DataFrame,
                    train_last_doc: int = 100,
                    val_last_doc: int = 150):
    """
    Splits a token-level DataFrame into train / validation / test parts
    according to the value of its ``doc`` column.

    With the default boundaries the split is: documents 1-100 for training,
    101-150 for validation and 151 onwards for test.

    :param df: DataFrame with a ``doc`` column holding the document id
    :param train_last_doc: last document id (inclusive) of the training split
    :param val_last_doc: last document id (inclusive) of the validation split

    :return: tuple ``(df_train, df_val, df_test)``; each is an independent copy
             that keeps the original index of its rows

    :raises ValueError: if ``train_last_doc`` is greater than ``val_last_doc``
    """
    if train_last_doc > val_last_doc:
        raise ValueError("train_last_doc must not be greater than val_last_doc")

    doc_ids = df['doc']
    # The three conditions are complementary, so every row with a comparable
    # doc id lands in exactly one split.
    # .copy() lets callers add or modify columns on a split without pandas'
    # SettingWithCopyWarning.
    df_train = df[doc_ids <= train_last_doc].copy()
    df_val = df[(doc_ids > train_last_doc) & (doc_ids <= val_last_doc)].copy()
    df_test = df[doc_ids > val_last_doc].copy()

    print("DataFrame splitted!")

    return df_train, df_val, df_test

In [67]:
# Partition the pre-processed tokens by document id into train / validation / test.
df_train,df_val,df_test = split_dataframe(df)


DataFrame splitted!


### Vocabulary creation


In [40]:
from collections import OrderedDict

def build_vocabulary(df: pd.DataFrame):
    """
    Builds the word vocabulary of a dataset.

    Words are numbered from 0 in order of first appearance in the ``word``
    column of ``df``.

    :param df: dataset whose ``word`` column provides the tokens (pandas.DataFrame)
    :return:
      - idx_to_word: OrderedDict mapping vocabulary index -> word
      - word_to_idx: OrderedDict mapping word -> vocabulary index
      - word_listing: list of the unique words, in index order
    """
    idx_to_word = OrderedDict()
    word_to_idx = OrderedDict()

    for token in df["word"]:
        if token in word_to_idx:
            continue
        # The next free index equals the number of words registered so far.
        next_idx = len(word_to_idx)
        word_to_idx[token] = next_idx
        idx_to_word[next_idx] = token

    return idx_to_word, word_to_idx, list(idx_to_word.values())


In [68]:
# The vocabulary is built on train + validation only; the test split is left out.
df_train_val = pd.concat([df_train,df_val])

idx_to_word, word_to_idx, word_listing = build_vocabulary(df_train_val)
print(f'[Debug] Index -> Word vocabulary size: {len(idx_to_word)}')
print(f'[Debug] Word -> Index vocabulary size: {len(word_to_idx)}')
# NOTE(review): build_vocabulary numbers words from 0, so `np.arange(10) + 1` shows
# indices 1..10 and skips the first word; it also needs at least 11 entries.
# Confirm whether index 0 is meant to be reserved (e.g. for padding).
print(f'[Debug] Some words: {[(idx_to_word[idx], idx) for idx in np.arange(10) + 1]}')

[Debug] Index -> Word vocabulary size: 9898
[Debug] Word -> Index vocabulary size: 9898
[Debug] Some words: [('vinken', 1), (',', 2), ('61', 3), ('years', 4), ('old', 5), ('will', 6), ('join', 7), ('the', 8), ('board', 9), ('as', 10)]


### GloVe embeddings

Loading GloVe model from gensim library

In [25]:
import gensim
import gensim.downloader as gloader

def load_embedding_model(model_type: str,
                         embedding_dimension: int = 50) -> gensim.models.keyedvectors.KeyedVectors:
    """
    Fetches a pre-trained word embedding model through gensim's downloader.

    :param model_type: embedding family, one of ``word2vec``, ``glove`` or
                       ``fasttext`` (case and surrounding spaces are ignored)
    :param embedding_dimension: embedding size; only used to pick the GloVe model

    :return
        - pre-trained word embedding model (gensim KeyedVectors object)

    :raises AttributeError: if ``model_type`` is not a supported family
    :raises ValueError: re-raised from the downloader when the model name is invalid
    """
    requested = model_type.strip().lower()
    known_models = {
        'word2vec': "word2vec-google-news-300",
        'glove': "glove-wiki-gigaword-{}".format(embedding_dimension),
        'fasttext': "fasttext-wiki-news-subwords-300",
    }

    if requested not in known_models:
        raise AttributeError("Unsupported embedding model type! Available ones: word2vec, glove, fasttext")

    try:
        return gloader.load(known_models[requested])
    except ValueError as e:
        # Raised by the downloader for an unknown name, e.g. an unsupported GloVe size.
        print("Invalid embedding model name! Check the embedding dimension:")
        print("Word2Vec: 300")
        print("Glove: 50, 100, 200, 300")
        print('FastText: 300')
        raise e

In [26]:
# Load the 50-dimensional GloVe model via the gensim downloader.
embedding_model = load_embedding_model(model_type="glove", embedding_dimension=50)



Checking OOV terms

In [69]:
def check_OOV_terms(embedding_model: gensim.models.keyedvectors.KeyedVectors,
                    word_listing: List[str]):
    """
    Finds the out-of-vocabulary (OOV) terms: words of the dataset vocabulary
    that the pre-trained embedding model does not contain.

    Works with both gensim APIs: ``key_to_index`` (gensim >= 4.0, where the
    ``vocab`` attribute was removed) and the legacy ``vocab`` mapping.

    :param embedding_model: pre-trained word embedding model (gensim wrapper)
    :param word_listing: dataset specific vocabulary (list)

    :return
        - list of the distinct OOV terms (in no particular order)
    """
    embedding_vocabulary = getattr(embedding_model, "key_to_index", None)
    if embedding_vocabulary is None:
        # gensim < 4.0
        embedding_vocabulary = embedding_model.vocab

    # Membership tests on the mapping avoid copying the whole embedding
    # vocabulary into a new set.
    return [word for word in set(word_listing) if word not in embedding_vocabulary]

In [70]:
# Share of the train+validation vocabulary that has no pre-trained embedding.
oov_terms = check_OOV_terms(embedding_model, word_listing)
oov_percentage = float(len(oov_terms)) * 100 / len(word_listing)
print(f"Total OOV terms: {len(oov_terms)} ({oov_percentage:.2f}%)")

Total OOV terms: 874 (8.83%)


In [71]:
# Dump the full OOV list for manual inspection (the output is long).
print(oov_terms)

['', '2,303,328', 'freudtoy', 'four year old', 'three year', 'pre tax', 'moleculon', 'makato', '1738.1', '1992 1999', 'full time', 'secilia', 'synergistics', 'tire kickers', 'tiphook', 'ac 130u', 'senate house', 'alurralde', 'pre approved', 'red blooded', 'when issued', 'one country', 'derel', 'longer term', 'cost sharing', 'triple a', 'sogo shosha', 'product design', 'stock manipulation', 'non u.s.', 'three lawyer', 'anti takeover', 'money losing', 'ensrud', 'top selling', 'old house', 'index fund', 'food industry', 'eight month', 'car safety', 'index related', 'walbrecher', 'collective bargaining', '361,376', 'white collar', 'church goers', 'propagandizes', 'a d', 'coca cola', 'more efficient', 'shokubai', 'price support', 'subskills', '36 day', '35564.43', 'nih appointed', 'besuboru', 'reagan bush', 'trading company', '11,762', '50 state', 'two year old', '1.5755', '37 a share', 'short wave', 'water authority', 'chicago style', 'bermuda based', 'band wagon', '20 point', '5\\ 8', '23

### Sentences split

In [10]:
def build_sequences(df):
    """
    Groups a token-level DataFrame into sentences.

    Tokens are read in row order from the ``word`` and ``POS`` columns. A
    sentence is closed by a ``"."``, ``"?"`` or ``"!"`` token, which is kept
    as its last element. Tokens left over after the last terminator are
    emitted as a final sentence instead of being discarded.

    :param df: DataFrame with ``word`` and ``POS`` columns, one token per row

    :return: DataFrame with one row per sentence and two columns:
        - ``sentence``: list of the words of the sentence
        - ``pos_tags``: list of the matching POS tags
    """
    terminators = {"?", "!", "."}

    sentences = []
    tag_sequences = []
    sentence = []
    pos = []

    for word, tag in zip(df["word"], df["POS"]):
        sentence.append(word)
        pos.append(tag)

        if word in terminators:
            sentences.append(sentence)
            tag_sequences.append(pos)
            sentence = []
            pos = []

    if sentence:
        # Trailing tokens without a closing terminator.
        sentences.append(sentence)
        tag_sequences.append(pos)

    # The dict keys must match `columns`: a mismatching key makes pandas fill
    # that column with NaN instead of the collected sentences.
    new_data = {"sentence": sentences, "pos_tags": tag_sequences}
    return pd.DataFrame(new_data, columns=["sentence", "pos_tags"])

In [11]:
# Turn each token-level split into a sentence-level frame (one row per sentence).
df_train_s = build_sequences(df_train)
df_val_s = build_sequences(df_val)
df_test_s = build_sequences(df_test)