In [20]:
##============ Dependencies and libraries ============##
from dotenv import load_dotenv
import os

import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import spacy


# Load environment variables from the .env file
load_dotenv()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cezarykubinski/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/cezarykubinski/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/cezarykubinski/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/cezarykubinski/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/cezarykubinski/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/cezarykubinski/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

### Keyword preparation

In [21]:
# Base keywords
base_keywords = [
    # Water-related keywords
    "water", "hydro", "aquatic", "h2o", "liquid", "drinking water", "wastewater", 
    "water treatment", "desalination", "water recycling", "irrigation", "hydroelectric", 
    "water supply", "water management", "water conservation", "municipal water", "water rights",

    # Trading and market-related keywords
    "water market", "water trading", "commodity", "stock market", "water stocks", 
    "water futures", "water options", "trading volume", "market trends", "water prices",

    # Delivering and processing-related keywords
    "water delivery", "water supply chain", "water transportation", "pipelines", 
    "water treatment plants", "desalination plants", "water recycling facilities", 
    "bottling", "packaging", "distribution", "water utility", "public utility", "wastewater treatment", "sewage treatment", "water conservation", "efficiency", "smart water systems", "leak detection", "AI in water", "IoT in water", "water automation", "digital water",

    # Events that impact water and demand
    "drought", "flood", "water scarcity", "climate change", "weather patterns", 
    "natural disasters", "water pollution", "government regulations", "water policy", 
    "infrastructure development", "water crisis", "water infrastructure investment", "extreme weather", "water contamination", "microplastics", "lead in water", "water regulation", "environmental policy", "clean water act", "infrastructure bill", 

    # Water companies and stocks
    "american water works", "aqua america", "california water service group", 
    "connecticut water company", "mueller water products", "nalco holding company", 
    "pentair ltd.", "sjw group", "veolia environnement", "xylem inc.", 
    "profit", "revenue", "growth", "investment", "market share", "dividends", "water utility stocks", "water sector investment", "infrastructure spending", "environmental regulations", "public utility", "ESG", "Environmental, Social, Governance",

    # Additional keywords
    "water quality", "water security", "water sustainability", "water efficiency", 
    "water conservation", "water infrastructure", "water technology", "water innovation", 
    "water management systems", "water industry trends",
    "drought management", "water shortage", "water regulation", "aquifer levels", "irrigation", "EPA water standards", "conservation funds", "hydrology", "water utility", "water quality", "European water policy", "river basin management", "freshwater resources", 

    # Water stocks
    "AWK", "WTR", "CWT", "CTWS", "MWA", "NLC", "PNR", "SJW", "VEOEY", "XYL"
]

# Text - Generated by meta-llama/Llama-3.3-70B-Instruct
input_text = """
Here's a comprehensive overview of the factors that impact water availability, demand, processing methods, and prices, as well as the events and processes that can affect the stock prices of water companies:

**Factors Impacting Water Availability:**

1. **Climate Change**: Changes in temperature and precipitation patterns can alter water availability, leading to droughts or floods.
2. **Population Growth**: Increasing population puts pressure on existing water resources, leading to scarcity and competition for water.
3. **Agricultural Demand**: Irrigation for agriculture is a significant user of water resources, and changes in agricultural practices or crop yields can impact water availability.
4. **Industrial Demand**: Industrial processes, such as manufacturing and mining, require significant amounts of water, which can strain local water resources.
5. **Water Infrastructure**: Aging or inadequate water infrastructure, such as pipes and treatment plants, can lead to water losses and contamination.
6. **Water Pollution**: Pollution from industrial, agricultural, or domestic sources can contaminate water sources, making them unusable.
7. **Geological Events**: Earthquakes, landslides, or other geological events can disrupt water infrastructure and affect water availability.

**Factors Impacting Water Demand:**

1. **Population Growth**: Increasing population leads to higher demand for water for drinking, sanitation, and hygiene.
2. **Economic Growth**: Economic growth can lead to increased industrial and agricultural demand for water.
3. **Urbanization**: Urbanization can lead to increased demand for water for municipal uses, such as drinking water and sanitation.
4. **Agricultural Practices**: Changes in agricultural practices, such as the adoption of water-intensive crops, can increase demand for water.
5. **Climate Change**: Changes in temperature and precipitation patterns can alter water demand, with warmer temperatures leading to increased demand for water for cooling and irrigation.

**Factors Impacting Water Processing Methods:**

1. **Technological Advancements**: Advances in water treatment technologies, such as desalination and water recycling, can improve the efficiency and effectiveness of water processing.
2. **Regulatory Requirements**: Changes in regulatory requirements, such as stricter water quality standards, can impact the methods used for water processing.
3. **Energy Costs**: Changes in energy costs can impact the economic viability of different water processing methods, such as desalination.
4. **Water Quality**: Changes in water quality, such as the presence of contaminants, can require changes in water processing methods.

**Factors Impacting Water Prices:**

1. **Supply and Demand**: Imbalances in supply and demand can lead to changes in water prices.
2. **Energy Costs**: Changes in energy costs can impact the cost of water processing and distribution.
3. **Regulatory Requirements**: Changes in regulatory requirements, such as stricter water quality standards, can increase the cost of water processing and distribution.
4. **Infrastructure Costs**: The cost of maintaining and upgrading water infrastructure can impact water prices.
5. **Climate Change**: Changes in climate can alter the cost of water processing and distribution, with warmer temperatures leading to increased energy costs for cooling and pumping.

**Events and Processes that Can Impact Water Company Stocks:**

1. **Droughts and Water Scarcity**: Droughts and water scarcity can increase demand for water treatment and conservation services, benefiting companies that provide these services.
2. **Regulatory Changes**: Changes in regulatory requirements, such as stricter water quality standards, can increase demand for water treatment and testing services.
3. **Technological Advancements**: Advances in water treatment technologies can disrupt traditional business models and create new opportunities for companies that adopt these technologies.
4. **Mergers and Acquisitions**: Consolidation in the water industry can create new opportunities for companies that are acquired or that acquire other companies.
5. **Climate Change**: Companies that provide services related to climate change mitigation and adaptation, such as water conservation and efficiency services, may benefit from increased demand for these services.
6. **Economic Trends**: Economic trends, such as changes in interest rates or commodity prices, can impact the stock prices of water companies.
7. **Company-Specific Events**: Company-specific events, such as changes in management or the announcement of new projects, can impact the stock price of individual water companies.
8. **Natural Disasters**: Natural disasters, such as hurricanes or floods, can impact the stock prices of water companies that operate in affected areas.
9. **Government Policies**: Government policies, such as subsidies or tax credits for water conservation, can impact the stock prices of water companies that provide related services.
10. **Industry Trends**: Industry trends, such as the adoption of new technologies or changes in consumer behavior, can impact the stock prices of water companies.

Some examples of events that can impact the stock prices of water companies include:

* The announcement of a new desalination plant or water treatment facility
* Changes in regulatory requirements, such as stricter water quality standards
* The acquisition of a water company by a larger company
* The announcement of a new contract or partnership with a major customer
* The release of a new product or technology related to water treatment or conservation
* A natural disaster, such as a hurricane or flood, that impacts the operations of a water company

I hope this helps! Let me know if you have any further questions.
"""

In [22]:
# Initialize spaCy
nlp = spacy.load("en_core_web_sm")

# Method 1: Expand keywords with synonyms using NLTK
def generate_keywords_with_synonyms(base_keywords):
    expanded_keywords = set(base_keywords)
    for keyword in base_keywords:
        for syn in wordnet.synsets(keyword):
            for lemma in syn.lemmas():
                expanded_keywords.add(lemma.name().replace('_', ' '))
    return list(expanded_keywords)

# Method 2: Extract keywords from text using NLTK
def extract_keywords_from_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords and punctuation
    stop_words = set(stopwords.words("english"))
    tokens = [token for token in tokens if token.isalpha() and token.lower() not in stop_words]

    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Extract keywords (nouns)
    keywords = []
    for token in tokens:
        if nltk.pos_tag([token])[0][1].startswith("NN"):
            keywords.append(token)

    # Extract key phrases (sequences of nouns and adjectives)
    key_phrases = []
    for i in range(len(tokens) - 1):
        if nltk.pos_tag([tokens[i]])[0][1].startswith("NN") and nltk.pos_tag([tokens[i + 1]])[0][1].startswith("NN"):
            key_phrases.append(tokens[i] + " " + tokens[i + 1])

    return list(keywords + key_phrases)

keywords_from_synonyms = generate_keywords_with_synonyms(base_keywords)
keywords_with_synonyms = base_keywords + keywords_from_synonyms

keywords_from_text = extract_keywords_from_text(input_text)

# Generate keywords and phrases
keywords_and_phrases = keywords_with_synonyms + keywords_from_text

keywords_and_phrases = sorted(set(keywords_and_phrases))

print(keywords_and_phrases)
print(len(keywords_and_phrases))


['AI in water', 'AWK', 'Acquisitions', 'Acquisitions Consolidation', 'Advancements', 'Advancements Advances', 'Advances', 'Advances water', 'Availability', 'Availability Climate', 'CTWS', 'CWT', 'Change', 'Change Changes', 'Change Companies', 'Changes', 'Changes Changes', 'Changes climate', 'Changes energy', 'Changes temperature', 'Changes water', 'Climate', 'Climate Change', 'Companies', 'Companies provide', 'Company', 'Company Stocks', 'Consolidation', 'Consolidation water', 'Costs', 'Costs Changes', 'Costs cost', 'Demand', 'Demand Imbalances', 'Demand Industrial', 'Demand Irrigation', 'Demand Population', 'Disasters', 'Droughts', 'Droughts Water', 'Droughts water', 'EPA water standards', 'ESG', 'Earthquakes', 'Earthquakes landslide', 'Energy', 'Energy Costs', 'Environmental, Social, Governance', 'European water policy', 'Events', 'Events Earthquakes', 'Events Processes', 'Events event', 'Factors', 'Government', 'Government Policies', 'Government policy', 'Growth', 'H2O', 'Imbalances