# Elements in Text Example

This example is based on Chapter 1, "Language and Computation," of Bengfort, Bilbro & Ojeda's *Applied Text Analysis with Python*. Their example detects gender in text; because that example doesn't resonate with me, I instead explore detecting the classical elements (earth, air, water, fire) in text.

In [None]:
import nltk

from collections import Counter

# Labels returned by the classifier; UNKNOWN marks text with no element words.
EARTH, AIR, FIRE, WATER = 'earth', 'air', 'fire', 'water'
UNKNOWN = 'unknown'

# Hand-curated vocabularies: brainstormed terms, synonyms found by searching,
# and morphological variants (for example plurals). Every entry is lowercase.
AIR_WORDS = set(
    'air breeze breezy cloud clouds fog gray grey light mist '
    'smoke smog white wind windy'.split()
)

EARTH_WORDS = set(
    'brown dirt earth earthy green ground grounded grounding '
    'root roots soil tree trees worm worms'.split()
)

FIRE_WORDS = set(
    'active fire flames hot roast red orange yellow sun warm warmth'.split()
)

WATER_WORDS = set(
    'blue downpour drops flow moist ocean rain river sprinkle '
    'stream water wave waves'.split()
)


def elementalize(words):
    """Return the element whose word list overlaps most with ``words``.

    Each element is scored by how many *distinct* entries of its word set
    appear in ``words`` (a set intersection), so a token repeated several
    times in ``words`` still counts once.

    Args:
        words: Iterable of tokens. Tokens are compared to the word sets
            exactly as given; the word sets contain only lowercase entries.

    Returns:
        One of ``EARTH``, ``AIR``, ``WATER`` or ``FIRE``, or ``UNKNOWN``
        when no element word is present. When several elements tie for the
        highest score, the first in the order earth, air, water, fire wins.
    """
    # Insertion order matters: it defines the tie-breaking order below.
    element_counts = {
        EARTH: len(EARTH_WORDS.intersection(words)),
        AIR: len(AIR_WORDS.intersection(words)),
        WATER: len(WATER_WORDS.intersection(words)),
        FIRE: len(FIRE_WORDS.intersection(words)),
    }

    # If we don't find any element words, then we can't make any predictions.
    if not any(element_counts.values()):
        return UNKNOWN

    # max() returns the first maximal key it encounters, so ties resolve in
    # the dict's insertion order.
    return max(element_counts, key=element_counts.get)

def count_elements(sentences):
    """Tally sentences and tokens per predicted element.

    Args:
        sentences: Iterable of tokenized sentences; each sentence must
            support ``len()`` and is passed to ``elementalize`` unchanged.

    Returns:
        A ``(num_sentences, num_words)`` pair of ``Counter`` objects keyed by
        the label ``elementalize`` returned: the first counts sentences, the
        second sums the token counts of those sentences.
    """
    sentence_tally = Counter()
    token_tally = Counter()

    for tokens in sentences:
        label = elementalize(tokens)
        sentence_tally.update([label])
        # Credit every token of the sentence to the sentence's label.
        token_tally.update({label: len(tokens)})

    return sentence_tally, token_tally


def parse_element(text):
    """Print the share of tokens attributed to each element in ``text``.

    The text is split into sentences and then into lowercased word tokens
    with NLTK. Each sentence is labelled by ``count_elements``, and one line
    is printed per label: the percentage of all tokens that fall in
    sentences with that label, plus the number of such sentences.

    Args:
        text: Raw text to analyse.

    Returns:
        None. The report is written to standard output.
    """
    tokenized_sentences = []
    for raw_sentence in nltk.sent_tokenize(text):
        tokens = nltk.word_tokenize(raw_sentence)
        tokenized_sentences.append([token.lower() for token in tokens])

    sentence_counts, word_counts = count_elements(tokenized_sentences)
    grand_total = sum(word_counts.values())

    print('Frequency-based score of how much of an Element (Air, Water, Fire, Earth) is in Text')
    for element in word_counts:
        percent = (word_counts[element] / grand_total) * 100
        print(f'{percent}% {element} ({sentence_counts[element]} sentences)')


# Demo input: a random paragraph generated with
# https://randomwordgenerator.com/paragraph.php
sample_text = (
    'The day had begun on a bright note. '
    'The sun finally peeked through the rain for the first time in a week, '
    'and the birds were singing in its warmth. '
    'There was no way to anticipate what was about to happen. '
    'It was a worst-case scenario and there was no way out of it.'
)

# Print the per-element breakdown for the demo paragraph.
parse_element(sample_text)