In [2]:
# NOTE: `from datetime import datetime` must stay after `import datetime`;
# the later binding (the class) is the one the notebook code calls.
import datetime
from datetime import date
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from newspaper import Article
from newspaper import ArticleException

In [3]:
# Calendar years covered by data collection (both bounds inclusive)
FIRST_YEAR = 2013
LAST_YEAR = 2023
years = range(FIRST_YEAR, LAST_YEAR + 1)

In [4]:
# Seconds to wait for a listing page before giving up on it
_REQUEST_TIMEOUT_SECONDS = 30


def _parse_listing_date(time_text: str) -> date:
    """Convert the time text of a listing item into a ``date``.

    Text containing ``'ago'`` (a relative timestamp) is mapped to today's
    date; anything else must match ``"%b %d, %Y %H:%M"``.

    Raises:
        ValueError: if the text is neither relative nor in the expected format.
    """
    time_text = time_text.strip()
    if 'ago' in time_text:
        # Always a `date`, so the 'Date' column never mixes dates and strings.
        return date.today()
    return datetime.strptime(time_text, "%b %d, %Y %H:%M").date()


def _parse_listing_item(article_tag) -> dict:
    """Extract date, source, headline and absolute link from one listing ``<li>``.

    Raises:
        AttributeError, KeyError, TypeError, ValueError: if the item does not
            have the expected markup (missing tag/attribute, unparsable date).
    """
    details = article_tag.find('ul', attrs={'class': "details-list"})
    timestamp = details.find('time')['data-timestamp']
    # The displayed time is read from the <time> element whose `datetime`
    # attribute equals the timestamp found in the details list.
    time_text = article_tag.find('time', {'datetime': timestamp}).text
    title_tag = article_tag.find('h3', attrs={'class': "title"})
    return {
        'Date': _parse_listing_date(time_text),
        'Source': details.find('li').text,
        'Headline': title_tag['title'],
        'Link': 'https://in.investing.com' + title_tag.find('a')['href'],
    }


def _download_article_text(link: str) -> str:
    """Download and parse one article; return ``''`` if newspaper cannot fetch it."""
    article_instance = Article(link)
    try:
        article_instance.download()
        article_instance.parse()
    except ArticleException as e:
        # Keep the listing metadata even when the full text is unavailable.
        print(f"Could not fetch article {link}: {e}")
        return ''
    return article_instance.text


def web_scraper(url: str, pages: int = 50, output_dir: str = './data'):
    """Scrape a company's news listing and save the articles to a CSV file.

    Listing pages ``{url}/1`` .. ``{url}/{pages}`` are fetched once. For each
    listed article whose publication year is contained in the module-level
    ``years``, the full text is downloaded and a row with the columns
    ``Date, Source, Headline, Link, Content`` is recorded. The result is
    written to ``{output_dir}/news_data_{company_name}_{pages}.csv``, where
    ``company_name`` is the last path segment of ``url``.

    Failures are reported with ``print`` and skipped rather than aborting the
    run: unreachable pages, non-success HTTP statuses, listing items with
    unexpected markup, and articles whose text cannot be downloaded (stored
    with empty ``Content``).

    Args:
        url: Base URL of the news listing (a trailing slash is tolerated).
        pages: Number of listing pages to fetch, starting at page 1.
        output_dir: Directory for the CSV file; created if it does not exist.

    Returns:
        None. The scraped data is only written to disk.
    """
    base_url = url.rstrip('/')
    company_name = base_url.split('/')[-1]
    columns = ['Date', 'Source', 'Headline', 'Link', 'Content']
    # Collect plain dicts and build the DataFrame once at the end; growing a
    # DataFrame row by row with concat is quadratic.
    records = []

    for page_num in range(1, pages + 1):
        page_url = f'{base_url}/{page_num}'
        try:
            response = requests.get(page_url, timeout=_REQUEST_TIMEOUT_SECONDS)
        except requests.exceptions.RequestException as e:
            print(f"Error on page {page_num}: {e}")
            continue

        # Error pages (502 and any other 4xx/5xx) carry no usable listing.
        if not response.ok:
            print(f"Skipping page {page_num}: HTTP {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        articles = soup.find_all('li', {'class': "common-articles-item js-article-item"})

        for article_tag in articles:
            try:
                news_row = _parse_listing_item(article_tag)
            except (AttributeError, KeyError, TypeError, ValueError) as e:
                print(f"Skipping malformed article on page {page_num}: {e}")
                continue

            # Filter on the article's own date (not on today's date), and do
            # it before the expensive full-text download.
            if news_row['Date'].year not in years:
                continue

            news_row['Content'] = _download_article_text(news_row['Link'])
            records.append(news_row)

    # Passing `columns` keeps the header row even when nothing was collected.
    news_data = pd.DataFrame(records, columns=columns)

    # Export the data to a CSV file
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    news_data.to_csv(out_dir / f'news_data_{company_name}_{pages}.csv', index=False)

In [5]:
# investing.com news-listing URLs, one per company; web_scraper appends the
# page number to each of them.
_EQUITIES_ROOT = 'https://in.investing.com/equities'

meta_str = f'{_EQUITIES_ROOT}/facebook-inc-news'
apple_str = f'{_EQUITIES_ROOT}/apple-computer-inc-news'
amazon_str = f'{_EQUITIES_ROOT}/amazon-com-inc-news'
netflix_str = f'{_EQUITIES_ROOT}/netflix,-inc.-news'
google_str = f'{_EQUITIES_ROOT}/google-inc-c-news'

In [6]:
# Scrape each enabled company in turn; every web_scraper call writes one CSV.
# Uncomment an entry to include that company in the run.
targets = (
    # meta_str,
    # apple_str,
    # amazon_str,
    netflix_str,
    google_str,
)
for target_url in targets:
    web_scraper(target_url)