In [1]:
import nltk

In [2]:
# nltk.download('all', halt_on_error=False)

In [3]:
from nltk.corpus import europarl_raw

# Read the file 'ep-00-01-17.de' from NLTK's German Europarl reader as one raw string.
german_text = europarl_raw.german.raw(fileids='ep-00-01-17.de')
# Report the length of that string and preview its first 100 characters.
print("Total characters in the corpus: ", len(german_text))
print("First 100 characters in the corpus\n", german_text[:100])

Total characters in the corpus:  157171
First 100 characters in the corpus
  
Wiederaufnahme der Sitzungsperiode Ich erkläre die am Freitag , dem 17. Dezember unterbrochene Sit


In [4]:
# Remove the downloaded nltk_data directory (this does not uninstall the nltk package itself)
# import os
# import shutil
# import nltk

# # Create the path
# # nltk_data_path = os.path.join(user_home, users, username, 
# #                               appdata, roaming, nltk_data)
# nltk_data_path = r'C:\Users\Baha Tegar\AppData\Roaming\nltk_data'


# # Check if the directory exists
# if os.path.exists(nltk_data_path):
#     # Remove the directory and its contents
#     shutil.rmtree(nltk_data_path)
#     print(f"Removed nltk_data directory at {nltk_data_path}")
# else:
#     print("nltk_data directory does not exist")


# Parts of Speech (POS) Tagging

> POS tagging assigns a part-of-speech label to each word.

> Parts of Speech are specific lexical categories to which words are assigned based on their syntactic context and role.


NOTE:  
> The origin of the word ‘lexical’ is believed to be the Greek word ‘lexis’, which roughly means ‘vocabulary’: the total stock of words found in a given language.

> Today, anything described as ‘lexical’ generally relates to the study of the words used in a language and of the stable relationships among them.

> The term "lexical meaning" refers to the inherent meaning of a word or lexical unit as it is understood in a language, independent of the context in which it is used.

- Lexical units include words, phrases, idioms, and expressions that have specific meanings in a language.

Example:

Word: "Apple"

- Denotation: A round fruit with red, green, or yellow skin and a whitish interior.
- Connotation: Health, knowledge (as in the story of Adam and Eve), and technology (Apple Inc.).

Word: "Snake"

- Denotation: A long, limbless reptile with a scaly skin.
- Connotation: Danger, deceit (as in calling someone a "snake"), and treachery.

Idiom: "Break the ice"

- Denotation: To do something to initiate conversation in a social setting.
- Connotation: Starting a conversation or activity to ease tension.

NOTE:
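- The `pattern` examples below use Penn Treebank tags (DT, JJ, NN, ...); the NLTK example requests the coarser universal tagset via `tagset='universal'` (DET, ADJ, NOUN, ...). Read more: https://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/Penn-Treebank-Tagset.pdf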

In [16]:
import nltk

sentence = 'The brown fox is quick and he is jumping over the lazy dog'
print(sentence)

# Split the sentence into word tokens, then tag every token.
# tagset='universal' produces coarse tags (DET, ADJ, NOUN, ...) rather than Penn Treebank tags.
# NOTE(review): presumably needs the NLTK tokenizer/tagger data to be downloaded first — confirm.
tokens = nltk.word_tokenize(sentence)
tagged_sent = nltk.pos_tag(tokens, tagset='universal')
print(tagged_sent)

The brown fox is quick and he is jumping over the lazy dog
[('The', 'DET'), ('brown', 'ADJ'), ('fox', 'NOUN'), ('is', 'VERB'), ('quick', 'ADJ'), ('and', 'CONJ'), ('he', 'PRON'), ('is', 'VERB'), ('jumping', 'VERB'), ('over', 'ADP'), ('the', 'DET'), ('lazy', 'ADJ'), ('dog', 'NOUN')]


In [18]:
# Using pattern module
from pattern.en import tag

try:
    tagged_sent = tag(sentence)
except RuntimeError:
    # Since PEP 479 (Python 3.7+), a StopIteration leaking out of a generator
    # is re-raised as RuntimeError. pattern is reported to trip over this on
    # its first call only, so retry once. A second failure is allowed to
    # propagate instead of silently printing a stale `tagged_sent` left over
    # from an earlier cell.
    tagged_sent = tag(sentence)

print(tagged_sent)

[('The', 'DT'), ('brown', 'JJ'), ('fox', 'NN'), ('is', 'VBZ'), ('quick', 'JJ'), ('and', 'CC'), ('he', 'PRP'), ('is', 'VBZ'), ('jumping', 'VBG'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')]


# Shallow Parsing

The algorithm:
- POS Tagging
> Break the sentence into its smallest tokens (words) and tag each one with its part of speech.

- Chunks identified
> The parsing identifies contiguous or non-contiguous sequences of words that form meaningful units, such as noun phrases (NP), verb phrases (VP), prepositional phrases (PP), etc.

- Group them together
> Group the chunks into higher-level phrases, and repeat until the whole sentence is covered.

## Limitations:
- Lack of Deep Syntactic Structure: Shallow parsing does not provide detailed syntactic relationships or hierarchical structures.

- Ambiguity Handling: It may struggle with resolving syntactic ambiguities that require deeper context understanding.

- Context Sensitivity: Shallow parsing relies heavily on surface patterns and may not capture nuanced grammatical or semantic relationships.

In [15]:
from pattern.en import parsetree

# NOTE(review): 'dong' looks like a typo for 'dog' (the earlier cells use
# 'dog'). It is left unchanged here because the recorded outputs in this
# notebook were produced with this exact string.
sentence = 'The brown fox is quick and he is jumping over the lazy dong'

try:
    tree = parsetree(sentence)
except RuntimeError:
    # Since PEP 479 (Python 3.7+), a StopIteration leaking out of a generator
    # is re-raised as RuntimeError. pattern is reported to trip over this on
    # its first call only, so retry once. A second failure is allowed to
    # propagate rather than leaving `tree` undefined or stale.
    tree = parsetree(sentence)

tree

[Sentence('The/DT/B-NP/O brown/JJ/I-NP/O fox/NN/I-NP/O is/VBZ/B-VP/O quick/JJ/B-ADJP/O and/CC/O/O he/PRP/B-NP/O is/VBZ/B-VP/O jumping/VBG/I-VP/O over/IN/B-PP/O the/DT/O/O lazy/JJ/B-ADJP/O dong/IN/B-PP/O')]

NOTES:
- The I- prefix => the word is inside a chunk.
- The B- prefix => the word is the beginning of a chunk.
- The O tag => the word does not belong to any chunk.

The B- prefix marks the first word of every chunk; this is what separates two adjacent chunks of the same type when there is no O tag between them.

In [None]:
# Show the constituents of the first parsed sentence
# (presumably a mix of chunk objects and loose words — confirm against pattern's docs).
tree[0].constituents()

In [None]:
# Show the chunk list of every parsed sentence
print("All chunks:")
for sentence_tree in tree:
    print(sentence_tree.chunks)

print()
# Then break each chunk down into its (word, POS tag) pairs
every_chunk = (c for s in tree for c in s.chunks)
for chunk in every_chunk:
    tagged_words = [(word.string, word.type) for word in chunk.words]
    print(chunk.type, '->', tagged_words)
        

        

**Generic Functions Shallow Parse**

In [9]:
# Create some generic functions to parse and visualize shallow parsed sentence trees

from pattern.en import parsetree, Chunk
from nltk.tree import Tree

# Create a shallow parsed sentence tree
def create_sentence_tree(sentence, lemmatize=False):
    """Parse `sentence` with pattern's parsetree and return its first element.

    Args:
        sentence: Text handed to parsetree unchanged.
        lemmatize: Forwarded to parsetree as its `lemmata` option.

    Returns:
        Element [0] of the parsetree result; any further elements are dropped.
        Relation tagging (`relations=True`) is always requested.
    """
    parse_options = {'relations': True, 'lemmata': lemmatize}
    parsed_text = parsetree(sentence, **parse_options)
    first_sentence = parsed_text[0]
    return first_sentence

# Get various constituents of the parse tree
def get_sentence_tree_constituents(sentence_tree):
    """Return the result of `sentence_tree.constituents()` unchanged."""
    constituents = sentence_tree.constituents()
    return constituents

# Process the shallow parsed tree into an easy to understand format
def process_sentence_tree(sentence_tree):
    """Flatten a parsed sentence into a list of (label, [(word, tag), ...]) pairs.

    A constituent whose exact type is Chunk contributes its chunk type as the
    label and one (string, type) pair per word in the chunk. Any other
    constituent contributes the label "-" and a single (string, type) pair.
    """
    processed_tree = []
    for constituent in get_sentence_tree_constituents(sentence_tree):
        # Exact type comparison on purpose: subclasses of Chunk take the "-" branch.
        if type(constituent) == Chunk:
            label = constituent.type
            tagged_words = [(word.string, word.type) for word in constituent.words]
        else:
            label = "-"
            tagged_words = [(constituent.string, constituent.type)]
        processed_tree.append((label, tagged_words))
    return processed_tree

# Build the nltk Tree shared by the print and visualize helpers
def _build_sentence_nltk_tree(sentence_tree):
    """Convert a parsed sentence into an nltk Tree rooted at 'S'.

    Each (label, tagged_words) entry from process_sentence_tree becomes a
    subtree named after the label; each (word, tag) pair inside it becomes a
    leaf subtree named after the tag that holds the word.
    """
    phrase_nodes = [
        Tree(label, [Tree(tag, [word]) for word, tag in tagged_words])
        for label, tagged_words in process_sentence_tree(sentence_tree)
    ]
    return Tree('S', phrase_nodes)


# Print the sentence tree using nltk's Tree syntax
def print_sentence_tree(sentence_tree):
    """Print the bracketed nltk Tree form of `sentence_tree`; returns None."""
    print(_build_sentence_nltk_tree(sentence_tree))


# Visualize the sentence tree using nltk's Tree syntax
def visualize_sentence_tree(sentence_tree):
    """Build the nltk Tree for `sentence_tree` and call its draw() method.

    NOTE(review): Tree.draw() presumably opens a GUI window and blocks until
    it is closed — confirm before running in a headless environment.
    """
    _build_sentence_nltk_tree(sentence_tree).draw()

In [10]:
# raw shallow parsed tree
# NOTE(review): `sentence` is a notebook global assigned by an earlier cell; the
# recorded output was produced with the 'lazy dong' variant of the sentence.
t = create_sentence_tree(sentence)
t

Sentence('The/DT/B-NP/O/NP-SBJ-1 brown/JJ/I-NP/O/NP-SBJ-1 fox/NN/I-NP/O/NP-SBJ-1 is/VBZ/B-VP/O/VP-1 quick/JJ/B-ADJP/O/O and/CC/O/O/O he/PRP/B-NP/O/NP-SBJ-2 is/VBZ/B-VP/O/VP-2 jumping/VBG/I-VP/O/VP-2 over/IN/B-PP/O/O the/DT/O/O/O lazy/JJ/B-ADJP/O/O dong/IN/B-PP/O/O')

In [11]:
# processed shallow parsed tree
# Each entry pairs a chunk label ('-' for a constituent that is not a Chunk)
# with the (word, POS tag) tuples it contains.
pt = process_sentence_tree(t)
pt

[('NP', [('The', 'DT'), ('brown', 'JJ'), ('fox', 'NN')]),
 ('VP', [('is', 'VBZ')]),
 ('ADJP', [('quick', 'JJ')]),
 ('-', [('and', 'CC')]),
 ('NP', [('he', 'PRP')]),
 ('VP', [('is', 'VBZ'), ('jumping', 'VBG')]),
 ('PP', [('over', 'IN')]),
 ('-', [('the', 'DT')]),
 ('ADJP', [('lazy', 'JJ')]),
 ('PP', [('dong', 'IN')])]

In [12]:
# print shallow parsed tree in an easy to understand format
# using nltk's Tree syntax: one bracketed (LABEL ...) group per constituent under a root S

print_sentence_tree(t)

(S
  (NP (DT The) (JJ brown) (NN fox))
  (VP (VBZ is))
  (ADJP (JJ quick))
  (- (CC and))
  (NP (PRP he))
  (VP (VBZ is) (VBG jumping))
  (PP (IN over))
  (- (DT the))
  (ADJP (JJ lazy))
  (PP (IN dong)))


In [13]:
# visualize the shallow parsed tree
# NOTE(review): this ends in nltk's Tree.draw(), which presumably needs a GUI display — confirm before running headless.
visualize_sentence_tree(t)

# Dependency-based Parsing

> Using dependency-based grammars to analyze and infer both structure and semantic dependencies and relationships between tokens in a sentence.

**Using spacy**

In [39]:
# sentence = 'The brown fox is quick and he is jumping over the lazy dog'

# # Load dependencies
# import spacy
# parser = spacy.load('en_core_web_sm')
# parsed_sent = parser(sentence)

# # Generate dependency parser output
# dependency_pattern = '{left} <--- {word}[{w_type}] ---> {right}\n------'
# for token in parsed_sent:
#     print(dependency_pattern.format(word=token.orth_,
#                                     w_type=token.dep_,
#                                     left=[t.orth_ for t in token.lefts],
#                                     right=[t.orth_ for t in token.rights]))
    

**Using Stanford Dependency Parser**

In [51]:
# from nltk.parse.stanford import StanfordDependencyParser
# import os

# sdp = StanfordDependencyParser(path_to_jar=r'C:\Users\Baha Tegar\Desktop\Preparation\TextAnalytics\stanford-parser-4.2.0\stanford-parser-full-2020-11-17\stanford-parser.jar',
#                                path_to_models_jar=r'C:\Users\Baha Tegar\Desktop\Preparation\TextAnalytics\stanford-parser-4.2.0\stanford-parser-full-2020-11-17\stanford-parser-4.2.0-models.jar')

# result = list(sdp.raw_parse(sentence))

# result[0]

# Constituency-based Parsing

>  It involves analyzing the syntactic structure of a sentence by breaking it down into its constituent parts or phrases. Each constituent is a group of words that functions as a single unit within a hierarchical structure, typically represented as a tree.

There are various types of parsing algorithms, including the following:
- Recursive Descent parsing
- Shift Reduce parsing
- Chart parsing
- Bottom-up parsing
- Top-down parsing
- PCFG parsing