In [5]:
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# Ensure the NLTK resources used by this cell are available locally.
# nltk.download() reports "already up-to-date" and does nothing further when
# a package is already installed, so re-running the cell is cheap.
nltk.download('vader_lexicon')  # lexicon loaded by SentimentIntensityAnalyzer
nltk.download('stopwords')      # word lists read by stopwords.words('english')
# word_tokenize() relies on the Punkt tokenizer models and raises LookupError
# when they are missing. Older NLTK releases look for 'punkt', recent ones for
# 'punkt_tab', so fetch both to work across versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# Define the URL of the webpage containing medical reviews
url = 'https://www.who.int/health-topics/coronavirus'

# Set the user-agent header (a browser-like value is sent instead of the
# default python-requests one)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'}

# Send an HTTP GET request to the URL with the custom user-agent header.
# requests has no default timeout, so without one a stalled server would hang
# the cell indefinitely.
response = requests.get(url, headers=headers, timeout=30)
# Fail loudly on 4xx/5xx responses instead of parsing an error page and
# silently ending up with zero reviews.
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')
# NOTE(review): the tag/class below is a placeholder selector; if the page has
# no matching <div>, `reviews` is empty and the loop that follows prints
# nothing. Adjust based on your webpage's structure.
reviews = soup.find_all('div', class_='review-container')

# Initialize NLTK's sentiment analyzer (VADER); reused for every review
sia = SentimentIntensityAnalyzer()

# Tokenize, stopword-filter and sentiment-score each scraped review, printing
# the results for every review in turn.

# Build the stopword set once: it is identical for every review, so there is
# no reason to reload and rebuild it on each iteration.
stop_words = set(stopwords.words('english'))

for review in reviews:
    # Extract the review text. find() returns None when the container has no
    # matching <p>, so skip such containers instead of crashing on `.text`.
    text_node = review.find('p', class_='review-text')  # Adjust based on your webpage's structure
    if text_node is None:
        continue
    review_text = text_node.text

    # Tokenize the review text
    tokens = word_tokenize(review_text)

    # Remove stopwords (case-insensitive comparison; original casing is kept)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

    # Calculate sentiment scores on the full, unfiltered text
    sentiment_scores = sia.polarity_scores(review_text)

    print("Review Text:")
    print(review_text)
    print("\nTokens:")
    print(filtered_tokens)
    print("\nSentiment Scores:")
    print(sentiment_scores)
    print("\n---\n")


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
