In [38]:
import csv
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

def adjust_url(url):
    """Return ``url`` as an absolute URL, resolving relative links against nasdaq.com.

    Args:
        url: An ``href`` value scraped from a nasdaq.com page. May be absolute
            (``https://...`` / ``http://...``), protocol-relative (``//host/...``)
            or relative (``/articles/...`` or ``articles/...``).

    Returns:
        ``url`` unchanged when it already starts with an HTTP(S) scheme,
        otherwise ``url`` resolved against ``https://www.nasdaq.com``.
    """
    # Test the *prefix*, not mere containment: a relative link may carry an
    # absolute URL inside its query string (e.g. '/x?next=https://...').
    if url.startswith(('https://', 'http://')):
        return url

    # urljoin inserts the missing '/' for paths such as 'articles/x' and
    # handles protocol-relative '//host/path' links correctly.
    return urljoin('https://www.nasdaq.com', url)

def scrape_news_text(news_url):
    """Download a news article and return the text of its ``<p>`` elements.

    Args:
        news_url: Absolute URL of the article page.

    Returns:
        The text of every ``<p>`` element on the page, joined with newlines,
        or an empty string when the server answers with an error status.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Without a timeout, requests can wait indefinitely on a stalled server.
    resp = requests.get(news_url, timeout=30)

    # An error page's paragraphs are not article text; report "nothing scraped".
    if not resp.ok:
        return ''

    # Convert the html to a BeautifulSoup object.
    news_soup = BeautifulSoup(resp.content, 'lxml')

    return '\n'.join(par.text for par in news_soup.find_all('p'))

def get_news_urls(links_site):
    """Collect the article links found on a listing page.

    Args:
        links_site: URL of the page whose anchors should be inspected.

    Returns:
        A list of the ``href`` values (exactly as they appear in the page, so
        possibly relative) that contain ``'/articles/'``, or ``None`` when the
        server answers with an error status.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Without a timeout, requests can wait indefinitely on a stalled server.
    resp = requests.get(links_site, timeout=30)

    if not resp.ok:
        return None

    # Convert the html to a BeautifulSoup object.
    soup = BeautifulSoup(resp.content, 'lxml')

    # Anchors without an href yield None and are skipped; of the rest, keep
    # only the links that point at news articles.
    hrefs = (link.get('href') for link in soup.find_all('a'))
    return [href for href in hrefs if href is not None and '/articles/' in href]

def scrape_all_articles(ticker, upper_page_limit=5):
    """Scrape the text of the news articles listed for ``ticker`` on nasdaq.com.

    Requests the ticker's news-headlines landing page, then ``?page=2`` up to
    ``?page=<upper_page_limit>``, stopping early at the first page that fails
    to load or lists no article links.

    Args:
        ticker: Stock symbol as used in the nasdaq.com URL, e.g. ``'aapl'``.
        upper_page_limit: Highest page number to request (inclusive).

    Returns:
        A list with one text string per unique article URL, in the order the
        URLs were first seen. Empty when the landing page yields no links.
    """
    landing_site = 'https://www.nasdaq.com/market-activity/stocks/' + ticker + '/news-headlines'

    # get_news_urls returns None for a failed request; treat that the same as
    # "no links" so a bad page ends the crawl instead of raising TypeError.
    current_urls = get_news_urls(landing_site) or []
    all_news_urls = list(current_urls)

    # Loop through each sequential page, scraping the links from each.
    index = 2
    while current_urls and index <= upper_page_limit:
        current_urls = get_news_urls(landing_site + '?page=' + str(index)) or []
        all_news_urls.extend(current_urls)
        index += 1

    # Normalise *before* de-duplicating so the relative and absolute forms of
    # the same article collapse into one entry; dict.fromkeys keeps page order.
    unique_urls = list(dict.fromkeys(adjust_url(url) for url in all_news_urls))

    # Now that we have the list of urls, scrape the text of each article.
    return [scrape_news_text(news_url) for news_url in unique_urls]

# Demo run: scrape the article texts from up to five pages of AAPL headlines
# and print the resulting list.
myText = scrape_all_articles(ticker='aapl', upper_page_limit=5)
print(myText)

['\n\n<!--/*--><![CDATA[/* ><!--*/\n\n<!--/*--><![CDATA[/* ><!--*/\n#block-survival-2 {\n  display: none;\n}\n.symbol-ticker--large-symbol.symbol-ticker--down .symbol-ticker__symbol-data .symbol-ticker__arrow::before {\n  color: #fd6e70;\n  -webkit-transform: scaleY(-1) rotate(180deg);\n  transform: scaleY(-1) rotate(180deg);\n}\n\n\n.path-frontpage.page-node-type-homepage .header-group {\n  margin-bottom: -10rem;\n  padding-top: 5rem;\n}\n\n.primary-nav__links[aria-hidden="true"] {\n  pointer-events: none;\n}\n\n.primary-nav__mega[aria-hidden="true"] {\n  pointer-events: none;\n}\n\n/*--><!]]]]><![CDATA[>*/\n\n/*--><!]]>*/\n\n\nAnother one bites the dust. The streaming wars continue to heat up and Roku (NASDAQ:) is the latest company to fall victim. Last Friday, the company’s shares fell below $100 — which is quite a large drop from its 52-week high of $176.55.\xa0\n\n\n \n last week alone, making it the company’s worst week since it went public two years earlier. The stock fell after