# Notebook to test preprocessing of the Stanford Sentiment Treebank dataset

In [1]:
import numpy as np
import string
import unicodedata

# Fetch the treebank archive into /tmp, unpack it under ./data, then delete the archive.
# curl: -f fails on HTTP errors, -sS hides the progress meter but still prints errors.
# unzip: -q is quiet, -o overwrites existing files without prompting, -d sets the target dir.
!curl -fsS https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip -o /tmp/trainDevTestTrees_PTB.zip
!unzip -q -o -d ./data /tmp/trainDevTestTrees_PTB.zip
!rm -f /tmp/trainDevTestTrees_PTB.zip

In [2]:
# Ordered (pattern, replacement) pairs that undo PTB-style tokenization.
# Order matters: the longer variant of a pattern must be tried before its
# shorter form ("-LRB- " before "-LRB-", "`` " before "``"), and the escaped
# slash must be rewritten before stray backslashes are deleted.
_REPLACEMENTS = (
    (" ,", ","), (" .", "."), (" '", "'"),
    ("\\/", "/"), (" ;", ";"), (" :", ":"),
    (" %", "%"), (u"æ", "ae"), (u"Æ", "AE"),
    (u"œ", "oe"), (u"Œ", "OE"), ("-LRB- ", "("),
    (" -RRB-", ")"), ("-LRB-", "("), ("-RRB-", ")"),
    (" n't", "n't"), ("`` ", '"'), ("``", '"'),
    ("''", '"'), ("` ", "'"), ("$ ", "$"),
    (" !", "!"), ("\\", ""), (" ?", "?"),
)

# Every character that may appear in the normalized output.
_ALLOWED_CHARS = frozenset(string.ascii_letters + ' .,;:\'-#!/"=&$?|_')


def normalize_sentence(s):
    """Detokenize a PTB-style sentence and restrict it to a small ASCII alphabet.

    The substitutions in ``_REPLACEMENTS`` are applied in order (re-attaching
    punctuation and contractions, expanding ligatures, turning bracket and
    quote tokens into plain characters). The result is then NFD-decomposed
    and every character outside ``_ALLOWED_CHARS`` is dropped.

    Note that digits, parentheses and '%' are not in the whitelist, so they
    are removed from the output even though some substitutions produce them.

    Args:
        s: The sentence as a string of space-separated tokens.

    Returns:
        The normalized sentence, containing only ASCII letters and the
        punctuation listed in ``_ALLOWED_CHARS``.
    """
    for to_replace, replacement in _REPLACEMENTS:
        s = s.replace(to_replace, replacement)
    # NFD splits an accented letter into its base letter plus a combining
    # mark (category 'Mn'). The whitelist is pure ASCII, so the marks are
    # discarded by the membership test and no separate category check is needed.
    # Accent-stripping technique: https://stackoverflow.com/a/518232/2809427
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if c in _ALLOWED_CHARS
    )

def get_corpus(path, eol=' '):
    """Build a plain-text corpus from a file of parenthesised sentiment trees.

    Each line is split on whitespace; every chunk that ends with ')' is taken
    as a leaf token (its trailing ')' characters are stripped), while chunks
    that do not end with ')' are ignored. The tokens of a line are joined
    with spaces, passed through ``normalize_sentence`` and followed by ``eol``.
    Assumes one PTB-style tree per line -- TODO confirm against the data.

    Every chunk of the line is inspected, so a tree consisting of a single
    leaf (e.g. "(2 Hello)") still contributes its word.

    Args:
        path: Path of the tree file to read (decoded as UTF-8).
        eol: String appended after every sentence, including the last one.

    Returns:
        The concatenation of all normalized sentences, each followed by ``eol``.
    """
    sentences = []
    # Explicit encoding: the result must not depend on the platform's locale.
    with open(path, 'r', encoding='utf-8') as fp:
        # Iterate the file lazily instead of loading every line up front.
        for line in fp:
            tokens = [
                chunk.rstrip(')')
                for chunk in line.split()
                if chunk.endswith(')')
            ]
            sentences.append(normalize_sentence(" ".join(tokens)) + eol)
    # A single join avoids the quadratic cost of repeated string concatenation.
    return "".join(sentences)

In [3]:
def write_to_path(corpus, path):
    """Write ``corpus`` to ``path`` as UTF-8 text, replacing any existing file.

    Args:
        corpus: The text to write.
        path: Destination file path; it is opened in 'w' mode, so existing
            content is overwritten.
    """
    # Explicit encoding keeps the bytes on disk independent of the locale.
    with open(path, 'w', encoding='utf-8') as fp:
        fp.write(corpus)

In [4]:
# Convert each tree file into its plain-text corpus file (train, dev -> val, test).
for trees_path, corpus_path in [
    ("./data/trees/train.txt", "./data/train_corpus.txt"),
    ("./data/trees/dev.txt", "./data/val_corpus.txt"),
    ("./data/trees/test.txt", "./data/test_corpus.txt"),
]:
    write_to_path(get_corpus(trees_path), corpus_path)

In [5]:
# Check: preview the beginning of the generated training corpus.
# The text is bound to `preview` rather than `input`, so the builtin `input()`
# is not shadowed for every later cell in the notebook.
PREVIEW_CHARS = 500
with open("./data/train_corpus.txt", 'r', encoding='utf-8') as fp:
    # read(n) returns at most n characters, so short files need no special case
    # and the whole corpus is never loaded just to print its head.
    preview = fp.read(PREVIEW_CHARS)
print(preview)

The Rock is destined to be the st Century's new "Conan" and that he's going to make a splash even greater than Arnold Schwarzenegger, Jean-Claud Van Damme or Steven Segal. The gorgeously elaborate continuation of "The Lord of the Rings" trilogy is so huge that a column of words can not adequately describe co-writer/director Peter Jackson's expanded vision of J.R.R. Tolkien's Middle-earth. Singer/composer Bryan Adams contributes a slew of songs -- a few potential hits, a few more simply intrusive
