## Q1 Data Preprocessing

In [1]:
# Read every corpus file into a dict {file path: raw text of that file}.
import glob
import os
import re

# os.path.join uses the platform's separator; a hard-coded backslash in the
# pattern is a directory separator only on Windows.
read_files = glob.glob(os.path.join("corpus", "*.txt"))
corpus = {}
for path in read_files:
    # NOTE(review): no explicit encoding is given, so the platform default is
    # used — confirm the encoding of the corpus files.
    with open(path, "r") as infile:
        corpus[path] = infile.read()

In [42]:
#pip install stanfordcorenlp first
# Create the CoreNLP wrapper object `nlp` used by the preprocessing cells that follow.
# NOTE(review): the first argument is presumably the (relative) path to the unpacked
# CoreNLP distribution and memory='8g' presumably the JVM heap size — confirm
# against the stanfordcorenlp documentation.
from stanfordcorenlp import StanfordCoreNLP
nlp = StanfordCoreNLP(r'stanford-corenlp-full-2018-10-05', memory='8g')

In [3]:
# Split each file's raw text into sentences with the `ssplit` annotator.
# Every sentence is stored as the `word` fields of its tokens joined by single
# spaces: corpus_s = {file: [sentence string, ...]}.
props={'annotators': 'ssplit','pipelineLanguage':'en','outputFormat':'json'}
import json
corpus_s = {}
for path, raw_text in corpus.items():
    annotation = json.loads(nlp.annotate(raw_text, properties=props))
    corpus_s[path] = [
        ' '.join(token['word'] for token in sentence['tokens'])
        for sentence in annotation['sentences']
    ]

In [4]:
# Replace every punctuation character with a space, then collapse runs of two
# or more spaces into one: corpus_cl = {file: [cleaned sentence string, ...]}.
import string
re_p = re.compile('[%s]' % re.escape(string.punctuation))
corpus_cl = {
    path: [re.sub(r' {2,}',' ', re_p.sub(' ', sentence)) for sentence in sentences]
    for path, sentences in corpus_s.items()
}

In [5]:
# Number of words in each cleaned sentence: word_count = {file: [count, ...]}.
# A word is any non-empty piece left after splitting on single spaces.
word_count = dict.fromkeys(corpus)
for path, sentences in corpus_cl.items():
    word_count[path] = [
        len([piece for piece in sentence.split(' ') if piece])
        for sentence in sentences
    ]

In [6]:
# Keep only the sentences with at most 50 words. The counts come from the
# punctuation-stripped text (word_count), but the sentences kept are the
# un-stripped ones from corpus_s.
corpus_rdtu = dict.fromkeys(corpus_cl)
for path, sentences in corpus_s.items():
    corpus_rdtu[path] = [
        sentence
        for idx, sentence in enumerate(sentences)
        if word_count[path][idx] <=50
    ]

In [7]:
# POS-tag every retained sentence: pos_tag = {file: [nlp.pos_tag(sentence), ...]}.
pos_tag = {
    path: [nlp.pos_tag(sentence) for sentence in sentences]
    for path, sentences in corpus_rdtu.items()
}

In [8]:
# Constituency-parse every retained sentence: c_parsing = {file: [nlp.parse(sentence), ...]}.
c_parsing = {
    path: [nlp.parse(sentence) for sentence in sentences]
    for path, sentences in corpus_rdtu.items()
}

In [9]:
# Dependency-parse every retained sentence:
# d_parsing = {file: [nlp.dependency_parse(sentence), ...]}.
d_parsing = {
    path: [nlp.dependency_parse(sentence) for sentence in sentences]
    for path, sentences in corpus_rdtu.items()
}

In [10]:
#token{file1:[[sentence1_token_1,sentence1_token_2,...],[sentence2],...], file2:[[s1][s2]...],...}
# Tokenize every retained sentence; the token lists are indexed later to look
# up the words behind dependency indices.
tkn = {
    path: [nlp.word_tokenize(sentence) for sentence in sentences]
    for path, sentences in corpus_rdtu.items()
}

In [11]:
# `nlp` is not used by any later cell shown here, so it is closed once the parsing cells are done.
# NOTE(review): close() presumably shuts down the background CoreNLP process — confirm.
nlp.close()

In [12]:
#save the intermediate files
# Pickle the four parser results so the analysis sections can reload them
# instead of re-running the parser.
import pickle
for pkl_name, parsed in (
    ('pos_tag.pkl', pos_tag),
    ('c_parsing.pkl', c_parsing),
    ('d_parsing.pkl', d_parsing),
    ('tkn.pkl', tkn),
):
    with open(pkl_name, 'wb+') as f:
        pickle.dump(parsed, f, pickle.HIGHEST_PROTOCOL)

In [16]:
#Convert to output format postag
#pos_tag_text{file1:[sentence1_string, sentence2_string, ...], file2:[s1, s2, ...], ...}
# Each tagged token is rendered as its fields joined by '/'; the tokens of one
# sentence are then joined by single spaces.
pos_tag_text = {
    path: [
        ' '.join('/'.join(token) for token in sentence)
        for sentence in tagged_sentences
    ]
    for path, tagged_sentences in pos_tag.items()
}

In [17]:
#Convert to output format d_parser
#d_parser_op{file1:[[sentence1_dep_1,sentence1_dep_2,...],[sentence2],...], file2:[[s1][s2]...],...}
# Each dependency triple (relation, governor index, dependent index) is rendered
# as "(relation, governor_word-index, dependent_word-index)". Indices are used as
# 1-based positions into the sentence's token list in `tkn`; a governor index
# that is not positive is printed as ROOT.
# NOTE(review): this assumes the indices of each dependency list line up with
# the token list stored at the same position in `tkn` — verify.
d_parser_op = dict.fromkeys(d_parsing)
for path, parsed_sentences in d_parsing.items():
    d_parser_op[path] = []
    for idx, triples in enumerate(parsed_sentences):
        rendered = []
        for relation, governor, dependent in triples:
            if governor >0:
                governor_word = tkn[path][idx][governor-1]
                dependent_word = tkn[path][idx][dependent-1]
                rendered.append('(%s, %s-%s, %s-%s)' % (relation, governor_word, governor, dependent_word, dependent))
            else:
                dependent_word = tkn[path][idx][dependent-1]
                rendered.append('(%s, ROOT-%s, %s-%s)' % (relation, governor, dependent_word, dependent))
        d_parser_op[path].append(rendered)

In [18]:
#output
# Write one report per corpus file. For every sentence it contains the
# POS-tagged text, the constituency parse and the dependency list, followed by
# a blank line.
import os

# open() does not create missing directories, so make sure 'output' exists.
os.makedirs('output', exist_ok=True)
for key, value in pos_tag_text.items():
    # key[-8:] keeps the last 8 characters of the corpus path as the file name.
    # NOTE(review): this assumes 8-character corpus file names — confirm.
    # os.path.join replaces the hard-coded backslash, which is a directory
    # separator only on Windows.
    with open(os.path.join('output', key[-8:]), 'w+') as f:
        for i, tagged_sentence in enumerate(value):
            f.write('%s \n %s \n %s\n\n' % (tagged_sentence, c_parsing[key][i], d_parser_op[key][i]))

## Q1.1 Average Verb Count

In [70]:
#load the intermediate files
# NOTE: pickle.load can execute arbitrary code; only load .pkl files that this
# notebook produced itself.
import pickle
# 'rb' is sufficient for loading; 'rb+' additionally requested write access
# and therefore failed on read-only files.
with open('pos_tag.pkl', 'rb') as f:
    pos_tag = pickle.load(f)

In [19]:
# Average number of verbs per sentence over the whole corpus.
# A token counts as a verb when its POS tag (second item of the token) starts with 'V'.
c_v = 0         # total number of verb tokens
c_s = 0         # total number of sentences
verb_tags = []  # tag of every verb token, in iteration order
for sentences in pos_tag.values():
    for sentence in sentences:
        for token in sentence:
            tag = token[1]
            # startswith() is safe on an empty tag, where tag[0] raised IndexError.
            if tag.startswith('V'):
                verb_tags.append(tag)
                c_v += 1
    c_s += len(sentences)
# Guard the division so an empty corpus reports 0.0 instead of raising ZeroDivisionError.
v_avg_count = c_v/c_s if c_s else 0.0
print('average number of verbs: %s' % v_avg_count)

average number of verbs: 3.6505163928744713


In [20]:
### Checked against the Penn Treebank tag set: every tag that starts with 'V' is a verb tag.
### The distinct tags that were counted as verbs are printed here:
print('verb_tags: %s' % set(verb_tags))

verb_tags: {'VBD', 'VB', 'VBN', 'VBG', 'VBZ', 'VBP'}


## Q1.2 Number of Sentences Parsed

In [34]:
#load the intermediate files
# NOTE: pickle.load can execute arbitrary code; only load .pkl files that this
# notebook produced itself.
import pickle
# 'rb' is sufficient for loading; 'rb+' additionally requested write access
# and therefore failed on read-only files.
with open('d_parsing.pkl', 'rb') as f:
    d_parsing = pickle.load(f)

In [35]:
# Count parsed sentences as the number of 'ROOT' relations found across all
# dependency parses.
s = sum(
    1
    for parsed_sentences in d_parsing.values()
    for relations in parsed_sentences
    for relation in relations
    if relation[0] == 'ROOT'
)

In [36]:
# `s` holds the number of 'ROOT' relations tallied in the previous cell.
print("The total number of sentences parsed: %s" % s)

The total number of sentences parsed: 14427


## Q1.3 Total Number of Prepositions

In [71]:
#load the intermediate files
# NOTE: pickle.load can execute arbitrary code; only load .pkl files that this
# notebook produced itself.
import pickle
# 'rb' is sufficient for loading; 'rb+' additionally requested write access
# and therefore failed on read-only files.
with open('d_parsing.pkl', 'rb') as f:
    d_parsing = pickle.load(f)
with open('tkn.pkl', 'rb') as f:
    tkn = pickle.load(f)

In [69]:
# Per-file number of dependency relations labelled 'case':
# prep_count = {file: count}.
# NOTE(review): this treats every 'case' relation as a preposition — confirm
# that this matches the intended definition.
prep_count = {
    path: sum(
        1
        for relations in parsed_sentences
        for relation in relations
        if relation[0] == 'case'
    )
    for path, parsed_sentences in d_parsing.items()
}

In [70]:
#output
# One "<last 8 characters of the key>: <count>" line per entry of prep_count.
with open('prep_count.txt', 'w+') as f:
    f.writelines(
        '%s: %s \n' % (path[-8: ], count)
        for path, count in prep_count.items()
    )

In [81]:
# Collect, for every 'case' relation, the token at the relation's dependent
# index (1-based position into the sentence's token list).
prep = [
    tkn[path][idx][relation[2]-1]
    for path, parsed_sentences in d_parsing.items()
    for idx, relations in enumerate(parsed_sentences)
    for relation in relations
    if relation[0] == 'case'
]

In [89]:
# Tally how often each collected word is the dependent of a 'case' relation.
# This rebinds prep_count: from here on it maps word -> count.
prep_count = dict.fromkeys(set(prep), 0)
for path, parsed_sentences in d_parsing.items():
    for idx, relations in enumerate(parsed_sentences):
        for relation in relations:
            if relation[0] != 'case':
                continue
            prep_count[tkn[path][idx][relation[2]-1]] += 1

In [104]:
# Report the three words with the highest counts in prep_count.
from collections import Counter
k = Counter(prep_count)
# most_common(3) returns up to three (word, count) pairs, highest count first
high = k.most_common(3)
top_words = [word for word, _count in high]
print("The most common 3 prepositions are '%s', '%s', '%s'." % (top_words[0], top_words[1], top_words[2]))

The most common 3 prepositions are 'of', 'in', 'to'.


## Q1.4 Errors 

#### Constituent Parser:
Ambiguity: Two common kinds of ambiguity are attachment ambiguity and coordination ambiguity. A sentence has an attachment ambiguity if a particular constituent can be attached to the parse tree at more than one place. In coordination ambiguity, different sets of phrases can be conjoined by a conjunction like "and". The problem can be solved by a probabilistic parser: such a parser computes the probability of each interpretation and chooses the most probable one.

Lack of lexical preferences: The problem can be solved by modifying the probabilistic model of the parser to
allow for lexicalized rules.

#### Dependency Parser:
Computational limitations: Transition-based approaches can only produce projective trees, hence the parse of any sentence with a non-projective structure will necessarily contain some errors. The problem can be solved by more flexible graph-based parsing approaches.

Over-reliance on verbs: Every node needs a head, and the whole parser relies on verbs. The dependency grammar approach abstracts away from word-order information, representing only the information that is necessary for the parse.