In [2]:
##============ Dependencies and libraries ============##
from dotenv import load_dotenv
import os

import requests
from collections import Counter
from prawcore.exceptions import RequestException
import praw

import pandas as pd
import time
import json

import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from deep_translator import GoogleTranslator

from transformers import pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Load environment variables from the .env file
load_dotenv()

# Retrieve credentials from environment variables
REDDIT_APP_ID = os.getenv('REDDIT_APP_ID')
REDDIT_SECRET = os.getenv('REDDIT_SECRET')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cezarykubinski/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/cezarykubinski/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/cezarykubinski/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Keyword preparation

In [4]:
##============ Load json dictionary ============##

# Path to the JSON file
file_path = "waterCompanies.json"

# Load JSON data into a Python dictionary
with open(file_path, "r", encoding="utf-8") as file:
    json_data = json.load(file)

# Access the loaded JSON data
print(json_data)


{'company_names': {'major_water_companies': {'American_Water_Works': {'en': ['American Water Works', 'AWK'], 'es': ['American Water Works']}, 'Veolia_Environnement': {'en': ['Veolia Environnement', 'VE'], 'fr': ['Veolia Environnement'], 'es': ['Veolia Medio Ambiente', 'Veolia'], 'de': ['Veolia Umwelt']}, 'Essential_Utilities': {'en': ['Essential Utilities', 'Aqua America', 'WTRG'], 'es': ['Servicios Esenciales']}, 'Xylem_Inc': {'en': ['Xylem Inc.', 'XYL'], 'fr': ['Xylem France'], 'de': ['Xylem Deutschland']}, 'California_Water_Service_Group': {'en': ['California Water Service Group', 'CWT'], 'es': ['Grupo de Servicios de Agua de California']}, 'Suez_SA': {'fr': ['Suez Environnement', 'SEV'], 'es': ['Suez Medio Ambiente'], 'de': ['Suez Umwelt']}, 'Global_Water_Resources': {'en': ['Global Water Resources', 'GWRS'], 'es': ['Recursos Hídricos Globales']}}}, 'general_industry': {'water_supply': {'en': ['water utility', 'water supply', 'municipal water', 'public utility', 'water management',

In [5]:
##============ Extract keywords ============##


# Function to extract all keywords from the JSON dictionary
def extract_keywords(keywords_dict):
    keywords = []
    
    # Extract company names
    for company_group in keywords_dict.get('company_names', {}).values():
        for company in company_group.values():
            for lang_keywords in company.values():
                keywords.extend(lang_keywords)
    
    # Extract general industry terms
    for category in keywords_dict.get('general_industry', {}).values():
        for lang_keywords in category.values():
            keywords.extend(lang_keywords)

    # Extract event and issue-related terms
    for category in keywords_dict.get('events_and_issues', {}).values():
        for lang_keywords in category.values():
            keywords.extend(lang_keywords)

    # Extract financial and market sentiment-related terms
    for category in keywords_dict.get('financial_and_market_sentiment', {}).values():
        for lang_keywords in category.values():
            keywords.extend(lang_keywords)

    # Extract technical and innovation-related terms
    for category in keywords_dict.get('technical_and_innovation', {}).values():
        for lang_keywords in category.values():
            keywords.extend(lang_keywords)

    # Extract regional-specific terms (optional)
    for region in keywords_dict.get('regional', {}).values():
        for lang_keywords in region.get('regional_terms', []):
            keywords.append(lang_keywords)

    # Remove duplicates
    return list(set(keywords))

keywords = extract_keywords(json_data)

print(keywords)


['revenu', 'Infrastrukturgesetz', 'Dürre', 'AI in water', 'crecimiento', 'Global Water Resources', 'Klimawandel', 'Investitionen im Wassersektor', 'Wasserressourcen', 'inversión', 'política ambiental', 'suministro de agua', 'Gewinn', 'dividendos', 'Wasserversorgung', 'IA dans l’eau', 'Wasserautomatisierung', 'water infrastructure', 'Social y Gobernanza)', 'water pollution', 'Lecksuche', 'drought', 'detección de fugas', 'water crisis', 'Wasserinfrastruktur', 'infraestructura de agua', 'water regulation', 'ressources en eau', 'Recursos Hídricos Globales', 'water recycling', 'ESG (Ambiental', 'Essential Utilities', 'gasto en infraestructura', 'Pénurie d’eau', 'sostenibilidad', 'gestión del agua', 'approvisionnement en eau', 'Wasserknappheit', 'infrastructure investments', 'Aqua America', 'dessalement', 'loi sur l’eau propre', 'IoT dans l’eau', 'météo extrême', 'Wasserverschmutzung', 'contaminación del agua', 'reciclaje de agua', 'Xylem France', 'IA en agua', 'Blei im Wasser', 'Xylem Deuts

In [6]:
##============ Enrich keywords with translations from deep ============##

def batch_translate(keywords, target_language):
    joined_words = "; ".join(keywords)  # Combine words with a separator
    translated = GoogleTranslator(source='auto', target=target_language).translate(joined_words)
    return translated.split("; ")  # Split back into individual words

languages = ['en', 'fr', 'de', 'es', 'zh-CN', 'pt', 'pa']
batch_translated_keywords = {
    lang: batch_translate(keywords, lang) for lang in languages
}
print(batch_translated_keywords)


{'en': ['revenue', 'Infrastrukturgesetz', 'Dürre', 'AI in water', 'growth', 'Global Water Resources', 'Klimawandel', 'Investitionen im Wassersektor', 'Wasserressourcen', 'inversion', 'environmental policy', 'suministro de agua', 'Gewinn', 'dividends', 'Wasserversorgung', 'AI in the water', 'Wasserautomatisierung', 'water infrastructure', 'Social and Governance)', 'water pollution', 'Lecksuche', 'drought', 'leak detection', 'water crisis', 'Wasserinfrastruktur', 'water infrastructure', 'water regulation', 'resources in water', 'Global Hídricos Recursos', 'water recycling', 'ESG (Environmental', 'Essential Utilities', 'infrastructure gas', 'Water supply', 'sustainability', 'water management', 'water supply', 'Wasterknappheit', 'infrastructure investments', 'Aqua America', 'dessalement', "loi sur l'eau propre", "IoT dans l'eau", 'météo extreme water recycling', 'Xylem France', 'water management', 'water management', '', 'ندرة المياه', 'WTRG', 'profit', 'IoT in water', 'المياه', 'public se

In [7]:
##============ Enrich keywords with synonyms from NLTK ============##

from nltk.corpus import wordnet

def generate_synonyms(word_list):
    synonyms = set()
    for word in word_list:
        synsets = wordnet.synsets(word)
        for syn in synsets:
            for lemma in syn.lemmas():
                synonyms.add(lemma.name())
    return list(synonyms)

# Generate synonyms for each language
augmented_keywords = {
    lang: generate_synonyms(batch_translated_keywords[lang]) for lang in batch_translated_keywords
}

# Combine original and augmented keywords
final_keywords = {
    lang: list(set(batch_translated_keywords[lang] + augmented_keywords[lang]))
    for lang in batch_translated_keywords
}

final_keywords


{'en': ['',
  'climatic change',
  'Infrastrukturgesetz',
  'Water service options',
  'Dürre',
  'AI in water',
  'ESG (Umwelt',
  'Global Water Resources',
  'Klimawandel',
  'Investitionen im Wassersektor',
  'investment_funds',
  'Wasserressourcen',
  'desalinization',
  'suministro de agua',
  'infrastructure gas',
  'Gewinn',
  'digitalis Wasser',
  'sexual_inversion',
  'entering',
  'secheresse',
  'beguile',
  'glut',
  'Efficiency',
  'shortage of water',
  'Wasserversorgung',
  'Wasserautomatisierung',
  'intelligent water systems',
  'lead into water',
  'emergence',
  'drain water',
  'bewitch',
  'water infrastructure',
  'water pollution',
  'Lecksuche',
  'drought',
  'spellbind',
  'water crisis',
  'gross',
  'Wasserinfrastruktur',
  "IoT dans l'eau",
  'effectiveness',
  'water utilities',
  'enduringness',
  'water regulation',
  'enchant',
  'water recycling',
  'Essential Utilities',
  'sustainability',
  'lucre',
  'profits',
  'ontogeny',
  'increment',
  'autom

In [8]:
##============ Clean keywords ============##

def clean_keywords(final_keywords):
    # Deduplicate and clean keywords within and across languages
    global_set = set()  # Track all unique words across languages
    cleaned_dict = {}

    for lang, words in final_keywords.items():
        # Flatten list into a single string to clean up inconsistencies
        if isinstance(words, list):
            cleaned_words = [
                re.sub(r'[“”‘’"\';]', '', word).strip()  # Remove extra quotes and delimiters
                .replace("\xa0", "")  # Remove \xa0
                for word in words
            ]
        else:
            cleaned_words = words.split(",")  # Handle string lists if needed

        # Deduplicate and normalize within the language
        cleaned_words = set(cleaned_words)
        # Remove any empty strings or excessively short tokens
        cleaned_words = {word for word in cleaned_words if len(word) > 1}

        # Add unique words to global set and maintain cross-language uniqueness
        unique_words = cleaned_words - global_set
        global_set.update(unique_words)

        cleaned_dict[lang] = list(unique_words)

    return cleaned_dict

# Example Usage
final_keywords_cleaned = clean_keywords(final_keywords)

# Print result
print(json.dumps(final_keywords_cleaned, ensure_ascii=False, indent=4))


{
    "en": [
        "climatic change",
        "Dürre",
        "AI in water",
        "Investitionen im Wassersektor",
        "suministro de agua",
        "Gewinn",
        "entering",
        "secheresse",
        "glut",
        "Efficiency",
        "lead into water",
        "bewitch",
        "water infrastructure",
        "drought",
        "enduringness",
        "water recycling",
        "sustainability",
        "do_good",
        "quintal",
        "Aqua America",
        "swamp",
        "inversion in the water sector",
        "Actions de services publics deau",
        "rising_tide",
        "microplastics",
        "water technologies",
        "increase",
        "eversion",
        "revenue",
        "leak detection",
        "crise de leau",
        "المياه",
        "water contamination",
        "KI im Wasser",
        "trance",
        "ley de agua limpia",
        "Wassereinsparung",
        "desalinisation",
        "clean water act",
        "water automat

In [3]:
# Initialize the transformer model for NLP-based filtering
classifier = pipeline("text-classification", model="bert-base-uncased")

# Function to optimize and clean keywords
def optimize_keywords(keywords):
    optimized = {}
    for lang, terms in keywords.items():
        terms = set(terms)  # Remove duplicates
        terms = [term for term in terms if len(term) <= 50]  # Exclude overly long terms
        optimized[lang] = terms
    return optimized

# Function to search subreddits by language and keywords
def search_by_language(language_terms, reddit, batch_size=5, delay=1):
    subreddit_data = []
    for lang, terms in language_terms.items():
        for i in range(0, len(terms), batch_size):
            batch = terms[i:i + batch_size]
            for term in batch:
                try:
                    subreddits = reddit.subreddits.search(term, limit=10)
                    for sub in subreddits:
                        print(f"Searching in language {lang} for keyword '{term}' in r/{sub}...")
                        subreddit_data.append({
                            "name": sub.display_name,
                            "subscribers": sub.subscribers,
                            "language": lang,
                            "description": sub.public_description,
                            "url": f"https://reddit.com{sub.url}"
                        })
                except RequestException as e:
                    print(f"Error fetching term '{term}' in language '{lang}': {e}")
                time.sleep(delay)  # Delay to prevent rate-limiting
    return subreddit_data

# Function to classify subreddit descriptions using NLP
def classify_subreddit_relevance(descriptions):
    results = {}
    for desc in descriptions:
        result = classifier(desc)
        results[desc] = result
    return results

# Save subreddit data to a Parquet file
def save_to_parquet(subreddit_data, filename="subreddits.parquet"):
    df = pd.DataFrame(subreddit_data)
    df.to_parquet(filename, engine='pyarrow', compression='snappy')

# Load subreddit data from a Parquet file
def load_from_parquet(filename="subreddits.parquet"):
    return pd.read_parquet(filename, engine='pyarrow')

# Function to discover related subreddits based on a given subreddit
def discover_related_subreddits(subreddit_name, reddit):
    related = []
    try:
        subreddit = reddit.subreddit(subreddit_name)
        for suggestion in subreddit.related_subreddits():
            related.append(suggestion.display_name)
    except Exception as e:
        print(f"Error finding related subreddits for {subreddit_name}: {e}")
    return related

# Function to search subreddits in multiple languages
def search_multilingual_subreddits(keywords, reddit, languages, batch_size=5, delay=1):
    subreddit_data = []
    # Optimize keywords before searching
    keywords = optimize_keywords(keywords)
    
    for lang, terms in keywords.items():
        for i in range(0, len(terms), batch_size):
            batch = terms[i:i + batch_size]
            for term in batch:
                try:
                    subreddits = reddit.subreddits.search(term, limit=10)
                    for sub in subreddits:
                        subreddit_data.append({
                            "name": sub.display_name,
                            "subscribers": sub.subscribers,
                            "language": lang,
                            "description": sub.public_description,
                            "url": f"https://reddit.com{sub.url}"
                        })
                except RequestException as e:
                    print(f"Error fetching term '{term}' in language '{lang}': {e}")
                time.sleep(delay)  # Delay to prevent rate-limiting
    return subreddit_data

# Perform the search and classification
subreddit_data = search_multilingual_subreddits(final_keywords_cleaned, reddit, languages)
descriptions = [sub['description'] for sub in subreddit_data]

# Classify relevance of each subreddit description
classification_results = classify_subreddit_relevance(descriptions)

# Save the data to a database
save_to_parquet(subreddit_data, "subreddits.parquet")

# Load the data back to check
loaded_data = load_from_parquet("subreddits.parquet")
print(loaded_data.head())


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

RuntimeError: At least one of TensorFlow 2.0 or PyTorch should be installed. To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ To install PyTorch, read the instructions at https://pytorch.org/.

### Data scraping

In [None]:
##============ Get subreddit posts ============##


# Authenticate with Reddit API
reddit = praw.Reddit(
    client_id=REDDIT_APP_ID,       # Replace with your client ID
    client_secret=REDDIT_SECRET,  # Replace with your client secret
    user_agent="WaterDataCollector",    # A custom user-agent
)

# Define Subreddits, Keywords, and Parameters
subreddits = [
    "worldnews", "news", "geopolitics", "environment", "sustainability", "climate", "globalclimatechange", "conservation", "drought", "wallstreetbets", "StockMarket",
    "investing", "stocks", "Infrastructure", "India", "Australia", "Europe", "engineering", "UrbanPlanning", "UnitedKingdom", "MiddleEast", "AgTech", "foodsecurity", "GlobalHealth", "desalination"
]

# Define Search Function with comments
def search_reddit_with_comments(subreddits, keywords, limit=50):
    collected_data = []
    for subreddit in subreddits:
        for keyword in keywords:
            print(f"Searching '{keyword}' in r/{subreddit}...")
            try:
                for post in reddit.subreddit(subreddit).search(keyword, sort="new", time_filter="all", limit=limit): # Search for specific time periods using the time_filter parameter ('all', 'day', 'week', 'month', 'year')
                    post.comments.replace_more(limit=0)  # Load all comments
                    comments = [comment.body for comment in post.comments.list()]  # Extract all comments

                    collected_data.append({
                        "Subreddit": subreddit,
                        "Keyword": keyword,
                        "Title": post.title,
                        "Author": post.author.name if post.author else "N/A",
                        "Upvotes": post.score,
                        "Comments": post.num_comments,
                        "Created At": time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(post.created_utc)),
                        "URL": post.url,
                        "Content": post.selftext[:500],  # Truncate content for storage
                        "Top Comments": comments[:10]  # Collect up to 10 top comments
                    })
            except Exception as e:
                print(f"Error fetching data from r/{subreddit}: {e}")
    return collected_data

# Execute Search and Save Data
data = search_reddit(subreddits, keywords, limit=100)
df = pd.DataFrame(data)
df.to_csv("reddit_data.csv", index=False)

print(f"Collected {len(df)} posts. Data saved to 'reddit_data.csv'.")

# Load the CSV file into a DataFrame
reddit_data = pd.read_csv('reddit_data.csv')

reddit_data


Searching '益处' in r/worldnews...
Searching 'Emiratos Árabes Unidos' in r/worldnews...
Searching 'Réglementation environnementale' in r/worldnews...
Searching 'ਪਾਣੀ ਦਾ ਲੂਣੀਕਰਨ' in r/worldnews...
Searching 'Soziales und Governance)' in r/worldnews...
Searching 'Systèmes d'eau intelligents' in r/worldnews...
Searching 'gestão de água' in r/worldnews...
Searching 'conservation funds' in r/worldnews...
Searching '水服务行动' in r/worldnews...
Searching '威立雅环境' in r/worldnews...
Searching 'Umweltpolitik' in r/worldnews...
Searching 'automatisation de l’eau' in r/worldnews...
Searching 'Reservas de agua' in r/worldnews...
Searching 'ਡੀਸਲੀਨੇਸ਼ਨ' in r/worldnews...
Searching 'eau numérique' in r/worldnews...
Searching 'market share' in r/worldnews...
Searching 'Tratamiento de aguas residuales' in r/worldnews...
Searching 'Xilema Alemania' in r/worldnews...
Searching 'ਪਾਣੀ ਦਾ ਕਾਨੂੰਨ' in r/worldnews...
Searching 'ਬੁਨਿਆਦੀ ਢਾਂਚਾ ਖਰਚ' in r/worldnews...
Searching 'Services essentiels' in r/worldnews...
Sea

### Data procesing

In [None]:
##============ Data Preprocessing ============##

def clean_text(text):
    text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
    text = re.sub(r"[^A-Za-z0-9\s]", "", text)  # Remove special characters
    return text.lower()


In [None]:
##============ Data Filtering ============##

filtered_data = df[(df['Upvotes'] > 50) & (df['Comments'] > 10)]


In [None]:
##============ Sentiment Analysis ============##
# Compare sentiment trends before and after major events

analyzer = SentimentIntensityAnalyzer()
sentiment = analyzer.polarity_scores(text)