In [1]:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import pandas as pd
import requests
from datetime import datetime



# Define the URL and tickers

market_insiderUrl = 'https://markets.businessinsider.com/stocks/'
tickers = ['AMZN','TSLA','META','MSFT','GOOG','NVDA','AAPL']

news_tables = {}
for ticker in tickers:
    news_table=[]
    #change range to increase or decrease pages scraped, increasing or decreasing the time range of the web scraper
    for i in range(1,6):
        url = market_insiderUrl + ticker + '-stock/news?p='+str(i)



        # Use BeautifulSoup to parse the loaded page
        print(url)
        response = requests.get(url)
        response.raise_for_status()  # This will raise an exception for HTTP errors

        # Parse the HTML using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')
        news_table.extend(soup.findAll('a', class_='news-link'))
    news_tables[ticker] = news_table

print(len(news_tables['MSFT']))




https://markets.businessinsider.com/stocks/CRM-stock/news?p=1
https://markets.businessinsider.com/stocks/CRM-stock/news?p=2
https://markets.businessinsider.com/stocks/NFLX-stock/news?p=1
https://markets.businessinsider.com/stocks/NFLX-stock/news?p=2
https://markets.businessinsider.com/stocks/BA-stock/news?p=1
https://markets.businessinsider.com/stocks/BA-stock/news?p=2
https://markets.businessinsider.com/stocks/DIS-stock/news?p=1
https://markets.businessinsider.com/stocks/DIS-stock/news?p=2


KeyError: 'MSFT'

In [3]:
# Initialize empty lists to store parsed data
parsed_data = []
p_data = []

# Iterate over each ticker and its corresponding news table
for ticker, news_table in news_tables.items():
    print(ticker)
    # Iterate over each element in the news table
    for elem in news_table:
        try:
            articleInfo = ''
            # Check if the element exists
            if elem:
                # Extract the link from the element
                link = elem.get('href')
                newUrl = 'https://markets.businessinsider.com' + link 

                # Use requests library to get HTML content from the URL
                response = requests.get(newUrl)
                response.raise_for_status()  # Raise an exception for HTTP errors

                # Parse the HTML content using BeautifulSoup
                soup = BeautifulSoup(response.text, 'html.parser')
                
                # Extract title and date information from the HTML
                h1tag = soup.find('h1',class_='article-title')
                title = h1tag.text
                date = soup.find('span',class_='news-post-quotetime warmGrey')
                date = date.text
                date = date.split(',')
                date = date[0] + ','+ date[1];
                date_format = "%b. %d, %Y"
                date_object = datetime.strptime(date, date_format)
                iso_format_date = date_object.isoformat()

                # Extract content from the HTML, filtering out irrelevant phrases and tags
                content = soup.find('div', class_='col-xs-12 news-content no-padding')
                irrelevant_phrases = ["InvestorPlace - Stock Market News","Shutterstock","Source:","See also", "Read next", "Read also", "TipRanks has tracked 36,000", "Now Read", "Read now"]

                if content:
                    p_tags = content.find_all('p')
                    restIrelevant = False
                    pList=[]
                    for p in p_tags:
                        relevant = True
                        if(len(p.text.strip())==0):
                            continue;
                        if(len(p.text.strip().split())<7):
                            relevant=False
                        if(p.text.strip()=="InvestorPlace - Stock Market News, Stock Advice & Trading Tips"):
                            relevant=False
                        if(restIrelevant):
                            relevant = False
                        if 'InvestorPlace.com'.lower() in p.text.strip().lower():
                            relevant = False
                            restIrelevant=True
                        for phrase in irrelevant_phrases:
                            if phrase.lower() in p.text.strip().lower(): 
                                relevant = False
                                break
                        pList.append(p.text.strip())
                        p_data.append([ticker,link,p.text.strip(),relevant])
                    
                    articleInfo = ' '.join(p.text.strip() for p in p_tags if len(p.text.strip()) != 0)

                # Append the parsed data to the parsed_data list
                parsed_data.append([ticker, title,iso_format_date, link, articleInfo,pList])
                
        except Exception as e:
            print(f"An error occurred while fetching or parsing the article: {e}")
            continue  # Skip to the next loop iteration if an error occurs

# Create DataFrame from the parsed data
df = pd.DataFrame(parsed_data, columns=['ticker', 'title','date', 'link', 'articleInfo','paragraphList'])

# Create DataFrame from the paragraph data
p_df = pd.DataFrame(p_data, columns=['ticker','link','paragraph','relevant'])

# Print the first few rows of the DataFrame
print(df.head())


CRM
1
https://markets.businessinsider.com/news/stocks/new-buy-rating-for-salesforce-crm-the-technology-giant-1033257309
2
https://markets.businessinsider.com/news/etf/tech-layoffs-remote-work-push-office-vacancies-to-19-6-highest-since-1979-1033255026
3
https://markets.businessinsider.com/news/stocks/the-cathie-wood-stock-shuffle-4-names-shes-loving-3-shes-leaving-1033254760
4
https://markets.businessinsider.com/news/stocks/salesforce-s-strategic-and-financially-sound-acquisition-of-informatica-backed-by-analyst-s-buy-rating-1033255009
5
https://markets.businessinsider.com/news/stocks/analysts-offer-insights-on-technology-companies-xiaomi-otherxiacf-kenvue-inc-kvue-and-salesforce-crm-1033255010
6
https://markets.businessinsider.com/news/stocks/salesforce-com-a-strong-buy-on-integrated-cloud-services-and-economies-of-scale-1033253989
7
https://markets.businessinsider.com/news/stocks/buy-rating-affirmed-strategic-growth-and-financial-prudence-in-salesforce-s-potential-informatica-acquisi

45
https://markets.businessinsider.com/news/stocks/bigcommerce-is-likely-to-perform-much-worse-than-this-major-peer-says-bearish-analyst-1033194009
46
https://markets.businessinsider.com/news/stocks/wells-fargo-reaffirms-their-hold-rating-on-salesforce-crm-1033190579
47
https://markets.businessinsider.com/news/stocks/3-stocks-with-the-most-potential-to-upstage-the-magnificent-7-in-2024-1033189958
48
https://markets.businessinsider.com/news/stocks/3-stocks-that-could-benefit-most-from-the-ai-revolution-1033186611
49
https://markets.businessinsider.com/news/stocks/where-salesforce-stands-with-analysts-1033184964
50
https://markets.businessinsider.com/news/stocks/ai-winners-a-cheat-sheet-1033184705
51
https://markets.businessinsider.com/news/stocks/profiting-from-ai-current-and-upcoming-champions-1033174205
52
https://markets.businessinsider.com/news/stocks/protect-your-portfolio-3-stocks-to-own-in-an-unpredictable-market-1033165034
53
https://markets.businessinsider.com/news/stocks/7-dow

97
https://markets.businessinsider.com/news/stocks/analysts-conflicted-on-these-technology-names-salesforce-crm-fidelity-national-info-fis-and-okta-okta-1033116791
98
https://markets.businessinsider.com/news/stocks/7-stocks-set-to-bounce-back-post-layoffs-1033115340
99
https://markets.businessinsider.com/news/stocks/salesforce-com-inc-q4-earnings-summary-1033114883
100
https://markets.businessinsider.comhttps://seekingalpha.com/news/4073111-salesforce-stumbles-tepid-guidance-overshadows-q4-results?utm_source=businessinsider&utm_medium=referral&feed_item_type=news
An error occurred while fetching or parsing the article: HTTPSConnectionPool(host='markets.businessinsider.comhttps', port=443): Max retries exceeded with url: //seekingalpha.com/news/4073111-salesforce-stumbles-tepid-guidance-overshadows-q4-results?utm_source=businessinsider&utm_medium=referral&feed_item_type=news (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x127a4ef50>: Failed to establish a 

27
https://markets.businessinsider.com/news/stocks/ubs-keeps-their-buy-rating-on-netflix-nflx-1033248668
28
https://markets.businessinsider.comhttps://seekingalpha.com/news/4089249-earnings-week-ahead-bac-jnj-tsm-unh-abt-slb-ual-pg-nflx-and-more?utm_source=businessinsider&utm_medium=referral&feed_item_type=news
An error occurred while fetching or parsing the article: HTTPSConnectionPool(host='markets.businessinsider.comhttps', port=443): Max retries exceeded with url: //seekingalpha.com/news/4089249-earnings-week-ahead-bac-jnj-tsm-unh-abt-slb-ual-pg-nflx-and-more?utm_source=businessinsider&utm_medium=referral&feed_item_type=news (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x127e35b50>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))
29
https://markets.businessinsider.com/news/stocks/netflix-q1-subscriber-estimates-too-high-analyst-cautions-on-elevated-expectations-how-big-is-the-beat-mentality-1033245533
3

71
https://markets.businessinsider.com/news/stocks/3-consumer-stocks-to-snatch-up-while-the-market-snoozes-1033182333
72
https://markets.businessinsider.comhttps://seekingalpha.com/news/4081655-tko-group-jumps-after-settling-lawsuit-for-335m?utm_source=businessinsider&utm_medium=referral&feed_item_type=news
An error occurred while fetching or parsing the article: HTTPSConnectionPool(host='markets.businessinsider.comhttps', port=443): Max retries exceeded with url: //seekingalpha.com/news/4081655-tko-group-jumps-after-settling-lawsuit-for-335m?utm_source=businessinsider&utm_medium=referral&feed_item_type=news (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x1272f8f50>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))
73
https://markets.businessinsider.com/news/stocks/netflix-to-rally-around-14-here-are-10-top-analyst-forecasts-for-wednesday-1033180284
74
https://markets.businessinsider.com/news/stocks/netflix-s

12
https://markets.businessinsider.com/news/stocks/boeing-buys-sustainable-fuel-to-support-2024-us-commercial-operations-1033254168
13
https://markets.businessinsider.comhttps://seekingalpha.com/news/4090144-boeing-makes-largest-purchase-of-blended-sustainable-aviation-fuel?utm_source=businessinsider&utm_medium=referral&feed_item_type=news
An error occurred while fetching or parsing the article: HTTPSConnectionPool(host='markets.businessinsider.comhttps', port=443): Max retries exceeded with url: //seekingalpha.com/news/4090144-boeing-makes-largest-purchase-of-blended-sustainable-aviation-fuel?utm_source=businessinsider&utm_medium=referral&feed_item_type=news (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x126575010>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))
14
https://markets.businessinsider.com/news/stocks/3-airline-stocks-to-buy-now-q2-edition-1033254251
15
https://markets.businessinsider.com/news/

50
https://markets.businessinsider.comhttps://seekingalpha.com/news/4087273-goldman-sachs-spotlights-q1-winners-and-losers-naming-bitcoin-nvda-ba-tsla-and-more?utm_source=businessinsider&utm_medium=referral&feed_item_type=news
An error occurred while fetching or parsing the article: HTTPSConnectionPool(host='markets.businessinsider.comhttps', port=443): Max retries exceeded with url: //seekingalpha.com/news/4087273-goldman-sachs-spotlights-q1-winners-and-losers-naming-bitcoin-nvda-ba-tsla-and-more?utm_source=businessinsider&utm_medium=referral&feed_item_type=news (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x12716d990>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))
51
https://markets.businessinsider.com/news/stocks/is-it-time-to-buy-the-dows-worst-performing-stocks-of-2024-3-to-consider-1033216144
52
https://markets.businessinsider.com/news/stocks/analysts-offer-insights-on-industrial-goods-companies-fed

85
https://markets.businessinsider.com/news/stocks/maintaining-buy-rating-on-boeing-long-term-success-over-short-term-setbacks-1033191626
86
https://markets.businessinsider.comhttps://seekingalpha.com/news/4083074-airbus-may-buy-some-spirit-aerosystems-assets-cfo-says?utm_source=businessinsider&utm_medium=referral&feed_item_type=news
An error occurred while fetching or parsing the article: HTTPSConnectionPool(host='markets.businessinsider.comhttps', port=443): Max retries exceeded with url: //seekingalpha.com/news/4083074-airbus-may-buy-some-spirit-aerosystems-assets-cfo-says?utm_source=businessinsider&utm_medium=referral&feed_item_type=news (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x127695210>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))
87
https://markets.businessinsider.com/news/stocks/time-to-load-up-3-stocks-sinking-to-new-52-week-lows-1033183883
88
https://markets.businessinsider.com/news/stoc

26
https://markets.businessinsider.com/news/stocks/walt-disney-s-strategic-financial-mastery-and-market-outperformance-justify-buy-rating-1033217515
27
https://markets.businessinsider.com/news/etf/s-p-500-s-q1-standouts-energy-communication-sectors-in-the-lead-—-10-top-performing-stocks-1033215807
28
https://markets.businessinsider.com/news/stocks/disney-stock-set-for-20-upside-as-bob-iger-s-turnaround-strategy-gains-momentum-says-bofa-appears-to-be-in-command-and-control-and-on-a-growth-offensive-1033214713
29
https://markets.businessinsider.com/news/stocks/3-dow-stocks-to-sell-in-april-before-they-crash-burn-1033212180
30
https://markets.businessinsider.com/news/stocks/buy-rating-affirmed-for-disney-following-favorable-settlement-and-positive-growth-prospects-1033211165
31
https://markets.businessinsider.com/news/stocks/ttwo-lyv-dis-which-entertainment-stock-is-the-strongest-buy-1033208995
32
https://markets.businessinsider.comhttps://seekingalpha.com/news/4084488-meta-nflx-top-chart

56
https://markets.businessinsider.com/news/stocks/7-smart-stocks-to-buy-for-your-childrens-portfolio-1033154395
57
https://markets.businessinsider.com/news/etf/oppenheimer-scoops-the-96th-academy-awards-with-7-oscars-emma-stone-a-second-actress-award-for-poor-things-1033152624
58
https://markets.businessinsider.com/news/stocks/analysts-offer-insights-on-communication-services-companies-walt-disney-dis-and-bilibili-bili-1033149919
59
https://markets.businessinsider.com/news/stocks/the-top-3-sp-500-stocks-to-buy-in-march-2024-1033147748
60
https://markets.businessinsider.com/news/stocks/mattel-s-entertainment-push-analyst-eyes-booming-toy-sales-with-disney-s-moana-2-and-frozen-3-in-the-pipeline-1033147419
61
https://markets.businessinsider.com/news/stocks/big-money-bets-3-stocks-the-biggest-hedge-funds-are-buying-1033146507
62
https://markets.businessinsider.com/news/stocks/why-is-grom-social-grom-stock-up-40-today-1033143886
63
https://markets.businessinsider.com/news/stocks/the-top-3-

96
https://markets.businessinsider.com/news/stocks/the-next-disney-3-entertainment-stocks-that-investors-shouldnt-ignore-1033062737
97
https://markets.businessinsider.com/news/stocks/disney-to-roll-out-ai-powered-ad-tool-for-tailored-commercials-1033052455
98
https://markets.businessinsider.comhttps://seekingalpha.com/news/4065215-stock-market-news-today-nasdaq-sp500-dow-jones?utm_source=businessinsider&utm_medium=referral&feed_item_type=news
An error occurred while fetching or parsing the article: HTTPSConnectionPool(host='markets.businessinsider.comhttps', port=443): Max retries exceeded with url: //seekingalpha.com/news/4065215-stock-market-news-today-nasdaq-sp500-dow-jones?utm_source=businessinsider&utm_medium=referral&feed_item_type=news (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x12724a710>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))
99
https://markets.businessinsider.com/news/stocks/top-5-tec

In [4]:
print(df['articleInfo'])

0      In a report released yesterday, Joshua Tilton ...
1      The commercial real estate industry continues ...
2      InvestorPlace - Stock Market News, Stock Advic...
3      Keith Weiss, an analyst from Morgan Stanley, m...
4      There’s a lot to be optimistic about in the Te...
                             ...                        
338    Investing in your 40s can be a daunting task, ...
339    InvestorPlace - Stock Market News, Stock Advic...
340    (RTTNews) - The Walt Disney Co. (DIS) is beta ...
341    As of Feb. 9, 2024, five stocks in the communi...
342    InvestorPlace - Stock Market News, Stock Advic...
Name: articleInfo, Length: 343, dtype: object


In [5]:
df.to_csv('stockNews-1.csv')
p_df.to_csv('paragraph.csv')