# Elements in Text Example

This example is based on Chapter 1: Language and Computation from Bengfort, Bilbro & Ojeda's Applied Text Analysis with Python. Their example detected gender in text, and because that example doesn't resonate with me, I wanted to instead explore detecting elements in text (earth, air, water, fire).

In [2]:
import nltk

from collections import Counter

EARTH = 'earth'
AIR = 'air'
FIRE = 'fire'
WATER = 'water'
UNKNOWN = 'unknown'

EARTH_WORDS = {'earth', 'ground', 'soil', 'dirt', 'green', 'brown'}
AIR_WORDS = {'air', 'light', 'smoke', 'fog', 'smog', 'cloud', 'mist', 'white', 'grey'}
WATER_WORDS = {'blue', 'water', 'ocean', 'river', 'stream', 'flow'}
FIRE_WORDS = {'fire', 'hot', 'red', 'orange', 'yellow', 'active'}


def elementalize(words):
    earth_len = len(EARTH_WORDS.intersection(words))
    air_len = len(AIR_WORDS.intersection(words))
    water_len = len(WATER_WORDS.intersection(words))
    fire_len = len(FIRE_WORDS.intersection(words))

    # TODO: Calculate the max and return that instead
    print(f'Earth: {earth_len} Air: {air_len} Water: {water_len} Fire: {fire_len}')

    return WATER

def count_elements(sentences):
    num_sentences = Counter()
    num_words = Counter()

    for sentence in sentences:
        element = elementalize(sentence)
        num_sentences[element] += 1
        num_words[element] += len(sentence)

    return num_sentences, num_words

def parse_element(text):
    sentences = [
        [word.lower() for word in nltk.word_tokenize(sentence)]
        for sentence in nltk.sent_tokenize(text)
    ]

    num_sentences, num_words = count_elements(sentences)
    total = sum(num_words.values())

    for element, count in num_words.items():
        percent = (count / total) * 100
        nsents = num_sentences[element]

        print(f'{percent}% {element} ({nsents} sentences)')

sample_text = 'The day had begun on a bright note. The sun finally peeked through the rain for the first time in a week, and the birds were sinf=ging in its warmth. There was no way to anticipate what was about to happen. It was a worst-case scenario and there was no way out of it.'

parse_element(sample_text)

Earth: 0 Air: 0 Water: 0 Fire: 0
Earth: 0 Air: 0 Water: 0 Fire: 0
Earth: 0 Air: 0 Water: 0 Fire: 0
Earth: 0 Air: 0 Water: 0 Fire: 0
100.0% water (4 sentences
