### This project illustrates how NLP preprocessing and parsing techniques can be used to uncover the prevalent grammatical structures of a text. These structures may later be used for sentiment analysis by feeding data on the most common ones to a machine learning model.

In [4]:
from collections import Counter

"""Please note that I am not the author of these functions. I have found 
   these functionsin a NLP topic in codeacademy website."""
# function that pulls chunks out of chunked sentence and finds the most common chunks
def np_chunk_counter(chunked_sentences, top_n=30):
    """Count noun-phrase ('NP') chunks and return the most frequent ones.

    Args:
        chunked_sentences: iterable of chunked sentences. Each item must
            provide ``subtrees(filter=...)``, and the subtrees it yields
            must provide ``label()`` and be iterable over their children.
        top_n: number of most frequent chunks to return. Defaults to 30,
            the value that used to be hard-coded; pass ``None`` to get
            every distinct chunk.

    Returns:
        A list of ``(chunk, count)`` pairs ordered from most to least
        frequent, where ``chunk`` is the tuple of a subtree's children.
    """
    # tuple() turns each subtree into a value usable as a Counter key;
    # counting straight from the generator avoids an intermediate list
    chunk_counter = Counter(
        tuple(subtree)
        for chunked_sentence in chunked_sentences
        for subtree in chunked_sentence.subtrees(filter=lambda t: t.label() == 'NP')
    )

    return chunk_counter.most_common(top_n)

# function that pulls chunks out of chunked sentence and finds the most common chunks
def vp_chunk_counter(chunked_sentences, top_n=30):
    """Count verb-phrase ('VP') chunks and return the most frequent ones.

    Args:
        chunked_sentences: iterable of chunked sentences. Each item must
            provide ``subtrees(filter=...)``, and the subtrees it yields
            must provide ``label()`` and be iterable over their children.
        top_n: number of most frequent chunks to return. Defaults to 30,
            the value that used to be hard-coded; pass ``None`` to get
            every distinct chunk.

    Returns:
        A list of ``(chunk, count)`` pairs ordered from most to least
        frequent, where ``chunk`` is the tuple of a subtree's children.
    """
    # tuple() turns each subtree into a value usable as a Counter key;
    # counting straight from the generator avoids an intermediate list
    chunk_counter = Counter(
        tuple(subtree)
        for chunked_sentence in chunked_sentences
        for subtree in chunked_sentence.subtrees(filter=lambda t: t.label() == 'VP')
    )

    return chunk_counter.most_common(top_n)


In [5]:
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize

def word_sentence_tokenize(text):
    """Split ``text`` into sentences, then split each sentence into words.

    A ``PunktSentenceTokenizer`` is constructed from ``text`` itself and
    used to segment that same text; every resulting sentence is then passed
    through ``word_tokenize``.

    Returns a list holding one ``word_tokenize`` result per sentence, in
    the order the sentences appear.
    """
    punkt = PunktSentenceTokenizer(text)
    return [word_tokenize(sentence) for sentence in punkt.tokenize(text)]

In [6]:
from nltk import pos_tag, RegexpParser


# --- load the source text, lower-cased ---
with open('The illiad.txt', encoding='utf-8') as text_file:
    text = text_file.read().lower()

# --- tokenize into sentences, each one a sequence of word tokens ---
word_tokenized_text = word_sentence_tokenize(text)

# sample: show the tokenized sentence at index 1
single_word_tokenized_sentence = word_tokenized_text[1]
print(single_word_tokenized_sentence)

# --- part-of-speech tag every tokenized sentence ---
pos_tagged_text = [pos_tag(tokens) for tokens in word_tokenized_text]

# sample: the same sentence, now with its tags
single_pos_sentence = pos_tagged_text[1]
print(single_pos_sentence)

# --- chunk grammars ---
# NP: optional <DT>, any number of <JJ.?> tags, then one <NN>
np_chunk_grammar = "NP: {<DT>?<JJ.?>*<NN>}"
# VP: the NP pattern followed by a <VB.*> tag and an optional <RB.?> tag
vp_chunk_grammar = "VP: {<DT>?<JJ.?>*<NN><VB.*><RB.?>?}"

# one RegexpParser per grammar
np_chunk_parser = RegexpParser(np_chunk_grammar)
vp_chunk_parser = RegexpParser(vp_chunk_grammar)

# --- chunk every tagged sentence with both parsers ---
np_chunked_text, vp_chunked_text = [], []
for tagged_sentence in pos_tagged_text:
    np_chunked_text.append(np_chunk_parser.parse(tagged_sentence))
    vp_chunked_text.append(vp_chunk_parser.parse(tagged_sentence))

# To inspect the raw chunked output, print np_chunked_text here
# (left disabled: it dumps every parsed sentence).

# --- report the most frequent chunks of each kind ---
most_common_np_chunks = np_chunk_counter(np_chunked_text)
print('Most common NP chunks: \n\n', most_common_np_chunks)

most_common_vp_chunks = vp_chunk_counter(vp_chunked_text)
print('Most common VP chunks: \n\n', most_common_vp_chunks)


['1899', 'contents', 'introduction', '.']
[('1899', 'CD'), ('contents', 'NNS'), ('introduction', 'NN'), ('.', '.')]
Most common NP chunks: 

 [((('hector', 'NN'),), 321), ((('i', 'NN'),), 276), ((('jove', 'NN'),), 257), ((('troy', 'NN'),), 208), ((('vain', 'NN'),), 195), ((('war', 'NN'),), 192), ((('son', 'NN'),), 167), ((('the', 'DT'), ('plain', 'NN')), 157), ((('the', 'DT'), ('field', 'NN')), 154), ((('thou', 'NN'),), 153), ((('the', 'DT'), ('ground', 'NN')), 138), ((('death', 'NN'),), 134), ((('hand', 'NN'),), 132), ((('greece', 'NN'),), 126), ((('heaven', 'NN'),), 125), ((('fate', 'NN'),), 123), ((('thee', 'NN'),), 122), ((('breast', 'NN'),), 121), ((('the', 'DT'), ('trojan', 'NN')), 120), ((('the', 'DT'), ('god', 'NN')), 119), ((('the', 'DT'), ('war', 'NN')), 117), ((('the', 'DT'), ('greeks', 'NN')), 116), ((('blood', 'NN'),), 113), ((('homer', 'NN'),), 112), ((('the', 'DT'), ('king', 'NN')), 105), ((('force', 'NN'),), 102), ((('rage', 'NN'),), 100), ((('care', 'NN'),), 98), ((('m