# Context-free Grammars

We will use some utilities from [NLTK](http://www.nltk.org/). This code is meant for demos and I would not use it in any kind of production environment. Even in Python, it is better to reimplement some of these algorithms to ensure correctness and speed.

In [25]:
import nltk
from nltk import CFG, PCFG
from nltk.parse import pchart
import sys, re

We need some utility functions to make it easier to work with the NLTK Context-free grammar (CFG) API functions.

In [26]:
def strip_comment(s):
    """Remove ``#`` comments from a string containing an nltk CFG.

    Everything from a ``#`` up to the end of its line is deleted; the line
    break itself is kept, so line structure is unchanged.  A ``#`` that sits
    inside a single- or double-quoted terminal (for example ``'#'``) is part
    of the grammar, not a comment, and is left untouched.

    Args:
        s: Grammar text, possibly spanning several lines.

    Returns:
        The grammar text with all comments removed.
    """
    # First alternative: a quoted terminal confined to one line (captured so
    # it can be put back verbatim).  Second alternative: a comment running to
    # the end of the line.  Scanning left to right, a quote that appears
    # *after* a '#' is swallowed by the comment alternative, as expected.
    pattern = r"""('[^'\n]*'|"[^"\n]*")|#.*?$"""
    return re.sub(
        pattern,
        lambda match: match.group(1) or "",
        s,
        flags=re.MULTILINE,
    )

def print_grammar(grammar):
    """Write a grammar's start symbol and its productions to stderr.

    Args:
        grammar: Any object exposing ``start()`` and ``productions()``,
            such as the nltk grammars built in this notebook.
    """
    # Accessors are called lazily, one per line, so the "Start:" line is
    # already written before productions() is evaluated.
    fields = (
        ("Start:", grammar.start),
        ("Productions:", grammar.productions),
    )
    for label, accessor in fields:
        print(label, accessor(), file=sys.stderr)

def print_all_trees(parser, inp):
    """Parse a whitespace-separated sentence and print every tree found.

    Each tree is preceded by its 1-based position in the parser's output,
    printed on its own line as ``<position>:``.

    Args:
        parser: Object with a ``parse(tokens)`` method returning an iterable
            of trees.
        inp: The sentence to parse; it is tokenized with ``str.split()``.
    """
    tokens = inp.split()
    position = 0
    for tree in parser.parse(tokens):
        position += 1
        print(str(position) + ":")
        print(tree)

In [27]:
# The sentence every grammar in this notebook is asked to parse.
inp = 'natural language learning course'

# A single nonterminal N that either rewrites to one of the four words or
# combines two adjacent N's; this makes the noun compound ambiguous.
grammar = CFG.fromstring("""
N -> N N | 'natural' | 'language' | 'learning' | 'course'
""")
print_grammar(grammar)

# The chart parser enumerates every bracketing the grammar licenses.
parser = nltk.ChartParser(grammar)
print_all_trees(parser, inp)

1:
(N (N (N (N natural) (N language)) (N learning)) (N course))
2:
(N (N (N natural) (N (N language) (N learning))) (N course))
3:
(N (N natural) (N (N (N language) (N learning)) (N course)))
4:
(N (N natural) (N (N language) (N (N learning) (N course))))
5:
(N (N (N natural) (N language)) (N (N learning) (N course)))


Start: N
Productions: [N -> N N, N -> 'natural', N -> 'language', N -> 'learning', N -> 'course']


In [28]:
# Same ambiguous grammar as before, now with a probability slot on every
# rule.  All five rules share the left-hand side N, so a uniform
# distribution gives each one probability 1/5.
grammar_string = """
N -> N N [{}] 
N -> 'natural' [{}] 
N -> 'language' [{}] 
N -> 'learning' [{}]
N -> 'course' [{}]
""".format(*([1/5] * 5))
print(grammar_string)

# Every parse of the four-word input applies N -> N N three times and
# each of the four word rules once.  Because all rules are equally
# likely, every parse tree has the same probability, which we can
# compute by hand for this PCFG:
print((1/5)**3 * (1/5)**4)

inp = 'natural language learning course'

grammar = PCFG.fromstring(grammar_string)
print_grammar(grammar)

# The probabilistic chart parser prints each tree with its probability.
parser = pchart.InsideChartParser(grammar)
print_all_trees(parser, inp)


N -> N N [0.2] 
N -> 'natural' [0.2] 
N -> 'language' [0.2] 
N -> 'learning' [0.2]
N -> 'course' [0.2]

1.2800000000000006e-05
1:
(N
  (N natural)
  (N (N language) (N (N learning) (N course)))) (p=1.28e-05)
2:
(N
  (N natural)
  (N (N (N language) (N learning)) (N course))) (p=1.28e-05)
3:
(N
  (N (N natural) (N language))
  (N (N learning) (N course))) (p=1.28e-05)
4:
(N
  (N (N natural) (N (N language) (N learning)))
  (N course)) (p=1.28e-05)
5:
(N
  (N (N (N natural) (N language)) (N learning))
  (N course)) (p=1.28e-05)


Start: N
Productions: [N -> N N [0.2], N -> 'natural' [0.2], N -> 'language' [0.2], N -> 'learning' [0.2], N -> 'course' [0.2]]


In [37]:
# Span-indexed version of the grammar: N_i is the i-th word and N_i_j
# covers words i through j, so each rule names exactly one split point.
# The trailing '#' notes are removed by strip_comment before parsing.
grammar_string = """
N_1_4 -> N_1   N_2_4
N_1_4 -> N_1_3 N_4
N_1_4 -> N_1_2 N_3_4
N_1_2 -> N_1   N_2   # natural language
N_2_3 -> N_2   N_3   # language learning
N_3_4 -> N_3   N_4   # learning course
N_1_3 -> N_1_2 N_3   # natural language learning
N_1_3 -> N_1   N_2_3 # natural language learning
N_2_4 -> N_2_3 N_4   # language learning course
N_2_4 -> N_2   N_3_4 # language learning course
N_1 -> 'natural' 
N_2 -> 'language' 
N_3 -> 'learning'
N_4 -> 'course'
"""
inp = 'natural language learning course'

# Show the comment-free grammar text, then build the grammar from it.
print(strip_comment(grammar_string))
grammar = CFG.fromstring(strip_comment(grammar_string))
print_grammar(grammar)

parser = nltk.ChartParser(grammar)
print_all_trees(parser, inp)


N_1_4 -> N_1   N_2_4
N_1_4 -> N_1_3 N_4
N_1_4 -> N_1_2 N_3_4
N_1_2 -> N_1   N_2   
N_2_3 -> N_2   N_3   
N_3_4 -> N_3   N_4   
N_1_3 -> N_1_2 N_3   
N_1_3 -> N_1   N_2_3 
N_2_4 -> N_2_3 N_4   
N_2_4 -> N_2   N_3_4 
N_1 -> 'natural' 
N_2 -> 'language' 
N_3 -> 'learning'
N_4 -> 'course'

1:
(N_1_4
  (N_1_3 (N_1_2 (N_1 natural) (N_2 language)) (N_3 learning))
  (N_4 course))
2:
(N_1_4
  (N_1_3 (N_1 natural) (N_2_3 (N_2 language) (N_3 learning)))
  (N_4 course))
3:
(N_1_4
  (N_1 natural)
  (N_2_4 (N_2_3 (N_2 language) (N_3 learning)) (N_4 course)))
4:
(N_1_4
  (N_1 natural)
  (N_2_4 (N_2 language) (N_3_4 (N_3 learning) (N_4 course))))
5:
(N_1_4
  (N_1_2 (N_1 natural) (N_2 language))
  (N_3_4 (N_3 learning) (N_4 course)))


Start: N_1_4
Productions: [N_1_4 -> N_1 N_2_4, N_1_4 -> N_1_3 N_4, N_1_4 -> N_1_2 N_3_4, N_1_2 -> N_1 N_2, N_2_3 -> N_2 N_3, N_3_4 -> N_3 N_4, N_1_3 -> N_1_2 N_3, N_1_3 -> N_1 N_2_3, N_2_4 -> N_2_3 N_4, N_2_4 -> N_2 N_3_4, N_1 -> 'natural', N_2 -> 'language', N_3 -> 'learning', N_4 -> 'course']


In [40]:
# Probabilistic template for the span-indexed grammar.  Nonterminals with
# a single expansion get a fixed probability of 1.0; the seven '{}' slots
# belong to the nonterminals that have several expansions and are filled
# in below (three for N_1_4, two for N_1_3, two for N_2_4).
grammar_string = """
N_1_4 -> N_1   N_2_4   [{}]
N_1_4 -> N_1_3 N_4     [{}]
N_1_4 -> N_1_2 N_3_4   [{}]
N_1_2 -> N_1   N_2     [1.0] # natural language
N_2_3 -> N_2   N_3     [1.0] # language learning
N_3_4 -> N_3   N_4     [1.0] # learning course
N_1_3 -> N_1_2 N_3     [{}]  # natural language learning
N_1_3 -> N_1   N_2_3   [{}]  # natural language learning
N_2_4 -> N_2_3 N_4     [{}]  # language learning course
N_2_4 -> N_2   N_3_4   [{}]  # language learning course
N_1 -> 'natural'       [1.0]
N_2 -> 'language'      [1.0]
N_3 -> 'learning'      [1.0]
N_4 -> 'course'        [1.0]
"""

# For each left-hand side with several right-hand sides, spread the
# probability mass evenly over its possible expansions.
N_1_4_probs = [1/3, 1/3, 1/3]
N_1_3_probs = [1/2, 1/2]
N_2_4_probs = [1/2, 1/2]

# Concatenating the lists yields (1/3, 1/3, 1/3, 1/2, 1/2, 1/2, 1/2),
# which fills the template slots in top-to-bottom order.
prob_grammar_string = grammar_string.format(
    *(N_1_4_probs + N_1_3_probs + N_2_4_probs)
)

inp = 'natural language learning course'

print(strip_comment(prob_grammar_string))
grammar = PCFG.fromstring(strip_comment(prob_grammar_string))
print_grammar(grammar)

# trace(3) makes the parser print its chart edges while it works.
parser = pchart.InsideChartParser(grammar)
parser.trace(3)
print_all_trees(parser, inp)


N_1_4 -> N_1   N_2_4   [0.3333333333333333]
N_1_4 -> N_1_3 N_4     [0.3333333333333333]
N_1_4 -> N_1_2 N_3_4   [0.3333333333333333]
N_1_2 -> N_1   N_2     [1.0] 
N_2_3 -> N_2   N_3     [1.0] 
N_3_4 -> N_3   N_4     [1.0] 
N_1_3 -> N_1_2 N_3     [0.5]  
N_1_3 -> N_1   N_2_3   [0.5]  
N_2_4 -> N_2_3 N_4     [0.5]  
N_2_4 -> N_2   N_3_4   [0.5]  
N_1 -> 'natural'       [1.0]
N_2 -> 'language'      [1.0]
N_3 -> 'learning'      [1.0]
N_4 -> 'course'        [1.0]

  |[-] . . .| [0:1] 'natural'                        [1.0]
  |. [-] . .| [1:2] 'language'                       [1.0]
  |. . [-] .| [2:3] 'learning'                       [1.0]
  |. . . [-]| [3:4] 'course'                         [1.0]
  |. . . [-]| [3:4] 'course'                         [1.0]
  |. . . [-]| [3:4] N_4 -> 'course' *                [1.0]
  |. . . > .| [3:3] N_4 -> * 'course'                [1.0]
  |. . [-] .| [2:3] 'learning'                       [1.0]
  |. . [-] .| [2:3] N_3 -> 'learning' *              [1.0]
  |. 

Start: N_1_4
Productions: [N_1_4 -> N_1 N_2_4 [0.333333], N_1_4 -> N_1_3 N_4 [0.333333], N_1_4 -> N_1_2 N_3_4 [0.333333], N_1_2 -> N_1 N_2 [1.0], N_2_3 -> N_2 N_3 [1.0], N_3_4 -> N_3 N_4 [1.0], N_1_3 -> N_1_2 N_3 [0.5], N_1_3 -> N_1 N_2_3 [0.5], N_2_4 -> N_2_3 N_4 [0.5], N_2_4 -> N_2 N_3_4 [0.5], N_1 -> 'natural' [1.0], N_2 -> 'language' [1.0], N_3 -> 'learning' [1.0], N_4 -> 'course' [1.0]]


In [42]:
# Re-weight the same template with a skewed distribution: nearly all of
# the N_1_4 mass goes to "N_1_3 N_4", and nearly all of the N_1_3 mass
# goes to "N_1_2 N_3".  N_2_4 stays uniform.
N_1_4_probs = [1/30, 28/30, 1/30]
N_1_3_probs = [19/20, 1/20]
N_2_4_probs = [1/2, 1/2]

# Slots are filled in template order: N_1_4, then N_1_3, then N_2_4.
prob_grammar_string = grammar_string.format(
    *(N_1_4_probs + N_1_3_probs + N_2_4_probs)
)
grammar = PCFG.fromstring(strip_comment(prob_grammar_string))

inp = 'natural language learning course'

# trace(3) makes the parser print its chart edges while it works.
parser = pchart.InsideChartParser(grammar)
parser.trace(3)
print_all_trees(parser, inp)

  |[-] . . .| [0:1] 'natural'                        [1.0]
  |. [-] . .| [1:2] 'language'                       [1.0]
  |. . [-] .| [2:3] 'learning'                       [1.0]
  |. . . [-]| [3:4] 'course'                         [1.0]
  |. . . [-]| [3:4] 'course'                         [1.0]
  |. . . [-]| [3:4] N_4 -> 'course' *                [1.0]
  |. . . > .| [3:3] N_4 -> * 'course'                [1.0]
  |. . [-] .| [2:3] 'learning'                       [1.0]
  |. . [-] .| [2:3] N_3 -> 'learning' *              [1.0]
  |. . [-> .| [2:3] N_3_4 -> N_3 * N_4               [1.0]
  |. . [---]| [2:4] N_3_4 -> N_3 N_4 *               [1.0]
  |. . > . .| [2:2] N_3_4 -> * N_3 N_4               [1.0]
  |. . > . .| [2:2] N_3 -> * 'learning'              [1.0]
  |. [-] . .| [1:2] 'language'                       [1.0]
  |. [-] . .| [1:2] N_2 -> 'language' *              [1.0]
  |. [-> . .| [1:2] N_2_3 -> N_2 * N_3               [1.0]
  |. [---] .| [1:3] N_2_3 -> N_2 N_3 *               [1.

In [72]:
from collections import defaultdict

# Lexical rules: one head-annotated nonterminal per word.  The {n}, {l},
# {p}, {c} slots are filled with per-head probabilities further down.
lexical_grammar_string = """
N_n -> 'natural' [{n}]
N_l -> 'language' [{l}]
N_p -> 'learning' [{p}]
N_c -> 'course' [{c}]
"""
recursive_grammar_list = []
# One single-letter head symbol per word.  list() splits the string into
# its characters; str.split() would return ['nlpc'], a single item.
head_options = list('nlpc')
# head_count[h] = number of recursive rules whose left-hand side is N_h.
head_count = defaultdict(int)
for head_1 in head_options:
    for head_2 in head_options:
        # For every ordered pair of children, emit two rules: the parent
        # takes its head from the left child (rule1) or from the right
        # child (rule2).  The doubled braces leave a literal '{head}' slot
        # for the later .format(**probs) call.
        # NOTE: when head_1 == head_2 the two rules are textually
        # identical, so that production is listed (and counted) twice.
        rule1 = 'N_{} -> N_{} N_{} [{{{}}}]'.format(head_1, head_1, head_2, head_1)
        rule2 = 'N_{} -> N_{} N_{} [{{{}}}]'.format(head_2, head_1, head_2, head_2)
        recursive_grammar_list.extend([rule1, rule2])
        head_count[head_1] += 1
        head_count[head_2] += 1
grammar_string = \
    "N -> N_n [0.25] | N_l [0.25] | N_p [0.25] | N_c [0.25]" + \
    lexical_grammar_string + \
    "\n".join(recursive_grammar_list)

# Uniform distribution over all listed rules with the same left-hand side:
# the recursive rules counted above plus the single lexical rule (the +1).
probs = { h: (1/(head_count[h]+1)) for h in head_options }
prob_grammar_string = grammar_string.format(**probs)
print(prob_grammar_string)

grammar = PCFG.fromstring(strip_comment(prob_grammar_string))
parser = pchart.InsideChartParser(grammar)
# trace(3) makes the parser print its chart edges while it works.
parser.trace(3)
inp = 'natural language learning course'.strip()
print_all_trees(parser, inp)


N -> N_n [0.25] | N_l [0.25] | N_p [0.25] | N_c [0.25]
N_n -> 'natural' [0.1111111111111111]
N_l -> 'language' [0.1111111111111111]
N_p -> 'learning' [0.1111111111111111]
N_c -> 'course' [0.1111111111111111]
N_n -> N_n N_n [0.1111111111111111]
N_n -> N_n N_n [0.1111111111111111]
N_n -> N_n N_l [0.1111111111111111]
N_l -> N_n N_l [0.1111111111111111]
N_n -> N_n N_p [0.1111111111111111]
N_p -> N_n N_p [0.1111111111111111]
N_n -> N_n N_c [0.1111111111111111]
N_c -> N_n N_c [0.1111111111111111]
N_l -> N_l N_n [0.1111111111111111]
N_n -> N_l N_n [0.1111111111111111]
N_l -> N_l N_l [0.1111111111111111]
N_l -> N_l N_l [0.1111111111111111]
N_l -> N_l N_p [0.1111111111111111]
N_p -> N_l N_p [0.1111111111111111]
N_l -> N_l N_c [0.1111111111111111]
N_c -> N_l N_c [0.1111111111111111]
N_p -> N_p N_n [0.1111111111111111]
N_n -> N_p N_n [0.1111111111111111]
N_p -> N_p N_l [0.1111111111111111]
N_l -> N_p N_l [0.1111111111111111]
N_p -> N_p N_p [0.1111111111111111]
N_p -> N_p N_p [0.1111111111111111]


In [33]:
from IPython.core.display import HTML


def css_styling():
    """Load the notebook's stylesheet and wrap it for display.

    Reads ``../css/notebook.css`` (resolved against the current working
    directory) and passes its contents to ``HTML``.

    Returns:
        An ``HTML`` object built from the file's text.

    Raises:
        OSError: If the stylesheet cannot be opened or read.
    """
    # The context manager closes the file handle even if read() fails.
    with open("../css/notebook.css", "r") as css_file:
        styles = css_file.read()
    return HTML(styles)
css_styling()