In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import numpy as np
from textblob import TextBlob
from geopy.geocoders import Nominatim
from lexrank import LexRank
from IPython.display import display
import folium
from folium.plugins import MarkerCluster
import dash
from dash import dcc, html, Dash
import dash_leaflet as dl
from dash.dependencies import Input, Output

In [5]:
base_url = "https://apnews.com/"
page = requests.get(base_url)
soup = BeautifulSoup(page.content, 'html.parser')

article_links = [base_url + link['href'] for link in soup.select('a.Link')]

article_data = pd.DataFrame(columns=['title', 'location', 'text'])

In [6]:
def extract_location(input_text):
    location_pattern = "^(.*?)\\s*\\(AP\\)"
    location_match = re.search(location_pattern, input_text, re.DOTALL)
    
    if location_match:
        extracted_location = location_match.group(1).strip()
        return extracted_location
    else:
        return "Not found"

In [7]:
# Initialize article_data as an empty DataFrame
article_data = []



for link in article_links:
    try:
        article_page = requests.get(link)
        article_soup = BeautifulSoup(article_page.content, 'html.parser')
        
        article_title = article_soup.find('h1').get_text()
        article_text = " ".join([p.get_text() for p in article_soup.find_all('p')])
        
        location_div = article_soup.find('div', class_='RichTextStoryBody RichTextBody')
        location_text = ""
        if location_div:
            location_text = location_div.find('p').get_text()

        extracted_location = extract_location(location_text)
        print("Extracted Location:", extracted_location)
        
        # Append data as a dictionary to article_data list
        article_data.append({'title': article_title, 'location': extracted_location, 'text': article_text})
        
        time.sleep(5)  # Delay between requests
    except Exception as e:
        print(f"Error: {str(e)} URL: {link}")

# Convert the list of dictionaries to a DataFrame
article_df = pd.DataFrame(article_data)

Extracted Location: Not found
Extracted Location: DETROIT
Extracted Location: TAMPA, Fla.
Extracted Location: CAPE CANAVERAL, Fla.
Extracted Location: NEW YORK
Extracted Location: RIO DE JANEIRO
Extracted Location: RIO DE JANEIRO
Extracted Location: RIO DE JANEIRO
Extracted Location: Juma Indigenous Territory, Amazonas, Brazil
Extracted Location: ALTO RIO GUAMA INDIGENOUS TERRITORY, Brazil
Extracted Location: DUBAI, United Arab Emirates
Extracted Location: KYIV, Ukraine
Extracted Location: KYIV, Ukraine
Extracted Location: WASHINGTON
Extracted Location: WASHINGTON
Extracted Location: TOMPKINSVILLE, Ky.
Extracted Location: TOMPKINSVILLE, Ky.
Extracted Location: INDIANAPOLIS
Extracted Location: INDIANAPOLIS
Extracted Location: LOS ANGELES
Extracted Location: LOS ANGELES
Extracted Location: MISSOULA, Mont.
Extracted Location: MISSOULA, Mont.
Extracted Location: SAN FRANCISCO
Extracted Location: SAN FRANCISCO
Extracted Location: NEWARK, N.J.
Extracted Location: ROME
Extracted Location: ROM

In [8]:
print(article_data)



In [9]:
# Convert article_data to a DataFrame
article_data = pd.DataFrame(article_data)

# Filter out rows with location as "Not found"
cleaned_article_data = article_data[article_data['location'] != "Not found"].copy()

# Remove duplicated rows
cleaned_article_data = cleaned_article_data.drop_duplicates()

In [10]:
# Create a new DataFrame to store clean text
cleaned_article_data['clean_text'] = cleaned_article_data['text'].str.lower()
cleaned_article_data['clean_text'] = cleaned_article_data['clean_text'].str.replace(r'[^\w\s]', '')
cleaned_article_data['clean_text'] = cleaned_article_data['clean_text'].str.replace(r'\s+', ' ', regex=True)


In [11]:
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob


In [12]:
# Remove stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
cleaned_article_data['clean_text'] = cleaned_article_data['clean_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe("spacytextblob")
def subjectivity_sentiment_score(text):
    doc = nlp(text)
    return doc._.blob.polarity, doc._.blob.subjectivity

# Calculate sentiment scores
cleaned_article_data['sentiment_score'], cleaned_article_data['subjectivity'] = zip(*cleaned_article_data['text'].apply(subjectivity_sentiment_score))

In [17]:
geolocator = Nominatim(timeout = 10, user_agent="myGeoLocator")
def geocode_city(city_name):
    location = geolocator.geocode(city_name)
    if location:
        return location.latitude, location.longitude
    else:
        return None, None

cleaned_article_data['latitude'], cleaned_article_data['longitude'] = zip(*cleaned_article_data['location'].apply(geocode_city))



KeyboardInterrupt: 

In [None]:
cleaned_article_data = cleaned_article_data.dropna()

In [14]:
# Define a function to assign marker color based on sentiment
def get_color(sentiment_score_clean):
    if sentiment_score_clean < 0:
        return 'red'
    else:
        return 'green'

In [16]:
m = folium.Map()
# Create markers and add them to the map
for index, row in cleaned_article_data.iterrows():
    marker = folium.Marker(
        location=[row['latitude'], row['longitude']],
        tooltip=row['title'],
        popup=row['text'],
        icon=folium.Icon(icon='info-sign', color=get_color(row['sentiment_score'])),
    )
    marker.add_to(m)

# Display the map
m

ValueError: Location values cannot contain NaNs.