In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from googletrans import Translator
import spacy
from gensim import corpora
from gensim.models import LdaModel
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
import warnings
warnings.filterwarnings('ignore')

In [1]:

"""
Professional NLP Pipeline for Rainfall-Related News Analysis in Eastern Nepal.
Scrapes articles from provided URLs, processes Nepali and English text, and performs
sentiment analysis, named entity recognition, topic modeling, and text summarization.
Prioritizes 18 rainfall stations for relevance to rainfall analysis.
"""

import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import re
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from googletrans import Translator
import spacy
from gensim import corpora
from gensim.models import LdaModel
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
import logging
import warnings
warnings.filterwarnings('ignore')

# Configure logging: timestamped INFO-level records for the whole process.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define file paths (relative to the working directory the notebook/script runs from).
# NOTE(review): PREPROCESSED_PATH is not referenced anywhere in the visible code — confirm
# whether it is still needed.
PREPROCESSED_PATH = '../Data/Preprocessed'
OUTPUT_PATH = '../Rainfall_app/Data'

# Ensure output directory exists (runs at import time; no error if it is already there).
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Download NLTK resources at import time: the VADER lexicon for sentiment scoring,
# the 'punkt' tokenizer data (presumably what word_tokenize relies on) and the stop-word lists.
nltk.download('vader_lexicon', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

# Rainfall stations whose names are searched for in article text.
# NOTE(review): the module docstring and the original comment say "18 rainfall stations",
# but this list contains 19 entries — confirm which entry (if any) should not be here.
RAINFALL_STATIONS = [
    'Okhaldhunga', 'Khotang Bazaar', 'Siraha', 'Rajbiraj', 'Barmajhiya', 'Chainpur (East)',
    'Pakhribas', 'Dhankuta', 'Biratnagar Airport', 'Tarhara', 'Dingla', 'Taplejung',
    'Ilam Tea Garden', 'Damak', 'Anarmani Birta', 'Chandri Gadhi', 'Phidim (Panchthar)',
    'Kanyam Tea Estate', 'Gaida (Kankai)'
]

class NLPPipeline:
    """A modular NLP pipeline for processing rainfall-related news articles in Eastern Nepal.

    ``process_articles`` runs the stages in this order: scrape, translate
    (Nepali to English), clean, sentiment scoring, named-entity extraction,
    LDA topic modeling and extractive summarization.
    """

    # Largest piece of text handed to the translator in a single request.
    # NOTE(review): chosen to stay below the per-request size limit commonly
    # reported for googletrans (around 5000 characters) — confirm against the
    # installed version.
    MAX_TRANSLATE_CHARS = 4500

    def __init__(self):
        """Initialize the pipeline with necessary components."""
        self.translator = Translator()
        self.sia = SentimentIntensityAnalyzer()
        self.stop_words = set(stopwords.words('english'))
        # The dependency parser is disabled; only entity recognition is used.
        self.nlp = spacy.load('en_core_web_sm', disable=['parser'])
        self.stations = RAINFALL_STATIONS
        self.logger = logging.getLogger(__name__)

    @staticmethod
    def _join_paragraphs(node):
        """Return the non-empty ``<p>`` texts found under ``node`` joined by single spaces."""
        stripped = (p.get_text(strip=True) for p in node.find_all('p'))
        return ' '.join(piece for piece in stripped if piece)

    @staticmethod
    def _split_for_translation(text, max_chars):
        """Split ``text`` into chunks of at most ``max_chars`` characters.

        Chunks end on sentence boundaries (``.``, ``!``, ``?`` or the Devanagari
        danda) whenever possible; a single sentence longer than ``max_chars`` is
        cut at the last space that fits, or hard-sliced if it has no space.
        Text that already fits is returned unchanged as a one-element list.
        """
        if len(text) <= max_chars:
            return [text]

        chunks = []
        current = ''
        for sentence in re.split(r'(?<=[।.!?])\s+', text):
            # Break up a sentence that cannot fit in one request on its own.
            while len(sentence) > max_chars:
                cut = sentence.rfind(' ', 0, max_chars + 1)
                if cut <= 0:
                    cut = max_chars
                if current:
                    chunks.append(current)
                    current = ''
                chunks.append(sentence[:cut])
                sentence = sentence[cut:].lstrip()
            if not sentence:
                continue
            candidate = f'{current} {sentence}' if current else sentence
            if len(candidate) <= max_chars:
                current = candidate
            else:
                # ``current`` is non-empty here: a lone sentence always fits.
                chunks.append(current)
                current = sentence
        if current:
            chunks.append(current)
        return chunks

    def scrape_article(self, url):
        """
        Scrape full text from a news article or social media post.

        The text is taken from the ``<p>`` elements of the first matching
        container (``<article>``, then content-like ``<div>`` classes, then
        post-like ``<div>`` classes). If no container matches, or the matched
        container yields no paragraph text, all ``<p>`` elements of the page
        are used instead.

        Args:
            url (str): URL of the article or post.

        Returns:
            str: Extracted text or None if scraping fails.
        """
        try:
            headers = {'User-Agent': 'Mozilla/5.0'}
            response = requests.get(url, timeout=10, headers=headers)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Try specific content selectors
            content = (
                soup.find('article') or
                soup.find('div', class_=re.compile('content|article-body|news-content|post-content')) or
                soup.find('div', class_=re.compile('post|fb-post|status'))
            )
            text = self._join_paragraphs(content) if content else ''
            if not text:
                # Fallback to all paragraphs (no container, or container without <p> text)
                text = self._join_paragraphs(soup)

            if not text.strip():
                self.logger.warning(f"No text extracted from {url}")
                return None
            return text
        except Exception as e:
            # Best-effort boundary: any network or parsing failure skips this URL.
            self.logger.error(f"Error scraping {url}: {e}")
            return None

    def translate_text(self, text, src='ne', dest='en'):
        """
        Translate text from source language to destination language.

        Text longer than ``MAX_TRANSLATE_CHARS`` is translated in
        sentence-aligned chunks and the translated chunks are joined with single
        spaces, so that one oversized request does not cause the whole article
        to fall back to its untranslated form. Shorter text is sent in a single
        request.

        Args:
            text (str): Text to translate.
            src (str): Source language code (default: 'ne' for Nepali).
            dest (str): Destination language code (default: 'en' for English).

        Returns:
            str: Translated text or original text if translation fails.
        """
        if not text:
            return ''
        try:
            translated_parts = []
            for chunk in self._split_for_translation(text, self.MAX_TRANSLATE_CHARS):
                translation = self.translator.translate(chunk, src=src, dest=dest)
                # Keep the source chunk when the service returns an empty result.
                translated_parts.append(translation.text if translation.text else chunk)
            return ' '.join(translated_parts)
        except Exception as e:
            self.logger.error(f"Error translating text: {e}")
            return text

    def preprocess_text(self, text, language='en'):
        """
        Preprocess text for NLP tasks.

        Lowercases the text, removes URLs, @mentions and #hashtags, tokenizes it
        and keeps only alphabetic tokens that are not English stop words.

        Args:
            text (str): Input text.
            language (str): Language code (default: 'en'). Currently unused; the
                English stop-word set is always applied.

        Returns:
            str: Cleaned text ('' for non-string or blank input).
        """
        if not isinstance(text, str) or not text.strip():
            return ''
        text = text.lower()
        text = re.sub(r'http\S+|www\S+|@\w+|#\w+', '', text)
        tokens = word_tokenize(text)
        tokens = [t for t in tokens if t.isalpha() and t not in self.stop_words]
        return ' '.join(tokens)

    def sentiment_analysis(self, texts):
        """
        Perform sentiment analysis using VADER.

        Args:
            texts (list): List of cleaned texts.

        Returns:
            pd.DataFrame: Sentiment scores (neg, neu, pos, compound), one row per
            input text; empty texts get all-zero scores.
        """
        score_keys = ('neg', 'neu', 'pos', 'compound')
        sentiments = []
        for text in texts:
            if text:
                scores = self.sia.polarity_scores(text)
                sentiments.append({key: scores[key] for key in score_keys})
            else:
                sentiments.append({'neg': 0, 'neu': 0, 'pos': 0, 'compound': 0})
        return pd.DataFrame(sentiments)

    def named_entity_recognition(self, texts):
        """
        Extract locations, events, and station mentions using spaCy.

        Locations are entities labelled GPE. Events are entities labelled EVENT
        or DATE, plus any entity whose text contains 'flood', 'landslide' or
        'rainfall'. Station mentions are the entries of ``self.stations`` that
        occur in the text as whole names, case-insensitively.

        Args:
            texts (list): List of texts (translated to English).

        Returns:
            pd.DataFrame: Locations, events, and station mentions per text.
        """
        # Lookarounds are used instead of \b: a trailing \b cannot match after
        # the ')' that ends names such as 'Chainpur (East)' unless a word
        # character follows, so those stations would never be detected.
        station_patterns = [
            (station, re.compile(rf'(?<!\w){re.escape(station)}(?!\w)', re.IGNORECASE))
            for station in self.stations
        ]
        entities = []
        for text in texts:
            if text:
                doc = self.nlp(text)
                locations = [ent.text for ent in doc.ents if ent.label_ == 'GPE']
                events = [
                    ent.text for ent in doc.ents
                    if ent.label_ in ['EVENT', 'DATE'] or
                    any(keyword in ent.text.lower() for keyword in ['flood', 'landslide', 'rainfall'])
                ]
                # Check for station mentions
                station_mentions = [
                    station for station, pattern in station_patterns
                    if pattern.search(text)
                ]
                entities.append({
                    'locations': locations,
                    'events': events,
                    'station_mention': station_mentions
                })
            else:
                entities.append({
                    'locations': [],
                    'events': [],
                    'station_mention': []
                })
        return pd.DataFrame(entities)

    def topic_modeling(self, texts, num_topics=3):
        """
        Perform topic modeling using LDA.

        Args:
            texts (list): List of cleaned texts.
            num_topics (int): Number of topics to extract.

        Returns:
            tuple: List of topics (dicts with 'topic' id and top-5 'keywords')
            and the dominant topic id per text (None for empty texts). On
            failure, or when no text is usable, returns ``([], [None, ...])``.
        """
        # Tokenize each text once; None marks an empty text so positions stay aligned.
        tokens_per_text = [word_tokenize(text) if text else None for text in texts]
        tokenized_texts = [tokens for tokens in tokens_per_text if tokens is not None]
        if not tokenized_texts:
            self.logger.warning("No valid texts for topic modeling.")
            return [], [None] * len(tokens_per_text)

        dictionary = corpora.Dictionary(tokenized_texts)
        corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_texts]
        try:
            lda_model = LdaModel(
                corpus, num_topics=num_topics, id2word=dictionary,
                passes=10, random_state=42, minimum_probability=0.0
            )
            topics = [
                {'topic': i, 'keywords': [word for word, _ in lda_model.show_topic(i, topn=5)]}
                for i in range(num_topics)
            ]
            dominant_topics = []
            for tokens in tokens_per_text:
                if tokens is None:
                    dominant_topics.append(None)
                    continue
                topic_dist = lda_model[dictionary.doc2bow(tokens)]
                # Pick the topic id with the highest probability for this text.
                dominant_topic = max(topic_dist, key=lambda x: x[1])[0] if topic_dist else None
                dominant_topics.append(dominant_topic)
            return topics, dominant_topics
        except Exception as e:
            self.logger.error(f"Error in topic modeling: {e}")
            return [], [None] * len(tokens_per_text)

    def summarize_text(self, text, sentences_count=3):
        """
        Summarize text using extractive summarization.

        Args:
            text (str): Input text.
            sentences_count (int): Number of sentences in summary.

        Returns:
            str: Summary text ('' for empty input or if summarization fails).
        """
        if not text:
            return ''
        try:
            parser = PlaintextParser.from_string(text, Tokenizer('english'))
            summarizer = LsaSummarizer()
            summary = summarizer(parser.document, sentences_count)
            return ' '.join([str(sentence) for sentence in summary])
        except Exception as e:
            self.logger.error(f"Error summarizing text: {e}")
            return ''

    def process_articles(self, articles):
        """
        Process articles through the NLP pipeline.

        Articles that cannot be scraped are skipped. As a side effect the LDA
        topic keywords are written to ``lda_topics.txt`` under ``OUTPUT_PATH``.

        Args:
            articles (list): List of dicts with url, source, date, language.

        Returns:
            pd.DataFrame: Processed data with NLP outputs (empty if nothing was
            scraped).
        """
        self.logger.info("Starting article processing...")
        article_data = []
        for article in articles:
            self.logger.info(f"Scraping {article['url']}...")
            text = self.scrape_article(article['url'])
            if text:
                language = article['language']
                # Only Nepali articles are translated; everything else is used as-is.
                translated_text = self.translate_text(text, src='ne', dest='en') if language == 'ne' else text
                article_data.append({
                    'text': text,
                    'translated_text': translated_text,
                    'clean_text': self.preprocess_text(translated_text, language='en'),
                    'source': article['source'],
                    'date': article['date'],
                    'type': 'article',
                    'language': article['language']
                })

        if not article_data:
            self.logger.error("No articles collected.")
            return pd.DataFrame()

        df = pd.DataFrame(article_data)
        self.logger.info("Performing sentiment analysis...")
        # NOTE(review): scores are computed on the cleaned text, which has stop
        # words and punctuation removed; VADER may rely on such cues (e.g.
        # negations) — consider scoring 'translated_text' instead, and confirm.
        sentiment_df = self.sentiment_analysis(df['clean_text'])
        df = pd.concat([df, sentiment_df], axis=1)

        self.logger.info("Performing named entity recognition...")
        ner_df = self.named_entity_recognition(df['translated_text'])
        df = pd.concat([df, ner_df], axis=1)

        self.logger.info("Performing topic modeling...")
        topics, dominant_topics = self.topic_modeling(df['clean_text'])
        df['dominant_topic'] = dominant_topics

        self.logger.info("Performing text summarization...")
        df['summary'] = df['translated_text'].apply(self.summarize_text)

        # Save topics
        with open(os.path.join(OUTPUT_PATH, 'lda_topics.txt'), 'w', encoding='utf-8') as f:
            for topic in topics:
                f.write(f"Topic {topic['topic']}: {', '.join(topic['keywords'])}\n")

        self.logger.info("Article processing completed.")
        return df

def main():
    """Run the pipeline over the built-in article list, save the results as CSV and print a sample."""
    pipeline = NLPPipeline()

    # One row per article: (url, source, date, language)
    article_rows = (
        ('https://www.facebook.com/okhaldhungakhabar/photos/ओखलढुंगामा-भारी-वर्षा-ओखलढुंगामा-पछिल्लो-२४-घण्टामा-भारी-वर्षा-भएको-छ-जल-तथा-मौस/1230986501622455/?_rdr', 'Okhaldhunga Khabar', '2024-10-01', 'ne'),
        ('https://janaprashasan.com/2023/08/25/41455/', 'Janaprashasan', '2023-08-25', 'ne'),
        ('https://emountaintv.com/ne/255134/', 'eMountain TV', '2024-09-30', 'ne'),
        ('https://khabarhub.com/2024/28/693305/', 'Khabarhub', '2024-09-28', 'en'),
        ('https://www.makalukhabar.com/2019/07/971905/', 'Makalu Khabar', '2019-07-12', 'ne'),
        ('https://ekantipur.com/madhesh-pradesh/2019/07/14/156309591531712873.html', 'Kantipur', '2019-07-14', 'ne'),
        ('https://www.enepalese.com/2019/07/238090.html', 'eNepalese', '2019-07-12', 'en'),
        ('https://kathmandupress.com/detail/48024', 'Kathmandu Press', '2024-09-30', 'ne'),
        ('https://maitrinews.com/2020/07/20/55953', 'Maitri News', '2020-07-20', 'ne'),
        ('https://www.prasashan.com/2024/10/01/599004/', 'Prasashan', '2024-10-01', 'ne'),
        ('https://hamrosanchar.com/archives/180678', 'Hamro Sanchar', '2024-09-30', 'ne'),
        ('https://nepalkhabar.com/society/133560-2022-6-10-7-49-21', 'Nepal Khabar', '2022-06-10', 'ne'),
        ('https://www.facebook.com/brok.curse0911/', 'Facebook Brok Curse', '2024-10-01', 'ne'),
        ('https://sangalokhabar.com/113458', 'Sangalo Khabar', '2024-09-30', 'ne'),
        ('https://nepalstatus.com/news/2706963715', 'Nepal Status', '2024-09-30', 'ne'),
        ('https://nepallive.com/story/222923', 'Nepal Live', '2024-09-30', 'ne'),
        ('https://www.setopati.com/social/74197', 'Setopati', '2017-08-13', 'ne'),
        ('https://nayapage.com/archives/588345', 'Naya Page', '2024-09-30', 'ne'),
        ('https://www.nayapatrikadaily.com/news-details/73227/2021-10-23', 'Naya Patrika', '2021-10-23', 'ne'),
        ('https://gorkhapatraonline.com/news/113627', 'Gorkhapatra', '2024-09-30', 'ne'),
        ('https://hamrakura.com/news-details/129237/video', 'Hamrakura', '2022-10-23', 'ne'),
        ('https://www.facebook.com/mfd.nepal.5', 'MFD Nepal', '2024-10-01', 'ne'),
        ('https://www.onlinekhabar.com/2023/06/1322861/बाढी-पहिरोले-ताप्लेजुङका', 'Online Khabar', '2023-06-18', 'ne'),
        ('https://www.bbc.com/nepali/news/2015/06/150611_taplejung', 'BBC Nepali', '2015-06-11', 'ne'),
        ('https://nagariknews.nagariknetwork.com/social-affairs/1209461-1687151534.html', 'Nagarik News', '2023-06-19', 'ne'),
        ('https://baahrakhari.com/detail/391043', 'Baahrakhari', '2024-09-30', 'ne'),
        ('https://ekantipur.com/ampnews/2017-07-25/20170725074750.html', 'Kantipur', '2017-07-25', 'ne'),
        ('https://newssanjal.com/content/2022/09/25/24460/', 'News Sanjal', '2022-09-25', 'ne'),
        ('https://kantipurtv.com/news/2024/10/03/1727921265.html', 'Kantipur TV', '2024-10-03', 'ne'),
        ('https://www.bbc.com/nepali/articles/cd1nz5zn1n1o', 'BBC Nepali', '2024-10-01', 'ne'),
    )
    # Expand each row into the dict shape process_articles expects.
    field_names = ('url', 'source', 'date', 'language')
    articles = [dict(zip(field_names, row)) for row in article_rows]

    # Run every stage of the pipeline over the articles.
    print("\n--- Processing Rainfall-Related Articles ---")
    nlp_df = pipeline.process_articles(articles)

    # Nothing could be scraped: stop before writing any output.
    if nlp_df.empty:
        print("No data to process. Exiting.")
        return

    # Persist the full result table.
    nlp_output_file = os.path.join(OUTPUT_PATH, 'nlp_results.csv')
    nlp_df.to_csv(nlp_output_file, index=False, encoding='utf-8')
    print(f"NLP results saved to {nlp_output_file}")

    # Show the first rows of the most informative columns.
    sample_columns = [
        'source', 'language', 'compound', 'locations', 'events',
        'station_mention', 'dominant_topic', 'summary',
    ]
    print("\nSample NLP Results:")
    print(nlp_df[sample_columns].head())


if __name__ == "__main__":
    main()

2025-04-28 15:23:46,120 - INFO - Starting article processing...
2025-04-28 15:23:46,121 - INFO - Scraping https://www.facebook.com/okhaldhungakhabar/photos/ओखलढुंगामा-भारी-वर्षा-ओखलढुंगामा-पछिल्लो-२४-घण्टामा-भारी-वर्षा-भएको-छ-जल-तथा-मौस/1230986501622455/?_rdr...



--- Processing Rainfall-Related Articles ---


2025-04-28 15:23:47,585 - INFO - Scraping https://janaprashasan.com/2023/08/25/41455/...
2025-04-28 15:23:50,878 - INFO - Scraping https://emountaintv.com/ne/255134/...
2025-04-28 15:23:53,321 - INFO - Scraping https://khabarhub.com/2024/28/693305/...
2025-04-28 15:23:54,954 - INFO - Scraping https://www.makalukhabar.com/2019/07/971905/...
2025-04-28 15:23:56,036 - INFO - Scraping https://ekantipur.com/madhesh-pradesh/2019/07/14/156309591531712873.html...
2025-04-28 15:24:01,859 - INFO - Scraping https://www.enepalese.com/2019/07/238090.html...
2025-04-28 15:24:02,660 - INFO - Scraping https://kathmandupress.com/detail/48024...
2025-04-28 15:24:03,650 - INFO - Scraping https://maitrinews.com/2020/07/20/55953...
2025-04-28 15:24:06,065 - ERROR - Error translating text: the JSON object must be str, bytes or bytearray, not NoneType
2025-04-28 15:24:06,081 - INFO - Scraping https://www.prasashan.com/2024/10/01/599004/...
2025-04-28 15:24:09,298 - INFO - Scraping https://hamrosanchar.com/ar

NLP results saved to ../Rainfall_app/Data/nlp_results.csv

Sample NLP Results:
          source language  compound  \
0  Janaprashasan       ne    0.4019   
1   eMountain TV       ne   -0.2732   
2      Khabarhub       en    0.0000   
3  Makalu Khabar       ne   -0.9773   
4       Kantipur       ne    0.8225   

                                           locations  \
0  [Lamjung, Syndunga, Syndung, Sindhunga, Kathma...   
1                         [Okhaldhu, Koshi, Gandaki]   
2                                                 []   
3  [Siraha, Lahan, Siraha, Kiraha, Janakppaldham,...   
4  [Siraha, hurricanes, gagan, Kandonha, Water, J...   

                                              events        station_mention  \
0                                            [Today]  [Okhaldhunga, Siraha]   
1  [today, the previous year, the previous year, ...                     []   
2                                                 []                     []   
3  [July 27, the last five days, 