In [11]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import pymongo

In [12]:
# Open a PyMongo client against the MongoDB server on localhost
# (27017 is the port given in the connection string below)
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(host=conn)

In [13]:
# Select the database, then the collection that holds the scraped articles
# (subscript access is equivalent to attribute access in PyMongo)
db = client['nhl_db']
collection = db['articles']

In [14]:
# URL of page to be scraped
url = 'https://www.nhl.com/news'

# Retrieve page with the requests module; the timeout (in seconds) keeps the
# cell from hanging indefinitely if the server never answers
response = requests.get(url, timeout=10)
# Raise on a 4xx/5xx status so an error page is never parsed as if it were news
response.raise_for_status()
# Create BeautifulSoup object; parse with 'html.parser'
soup = BeautifulSoup(response.text, 'html.parser')

In [18]:
# Retrieve the parent divs for all articles
results = soup.find_all('div', class_='article-item__top')

# loop over results to get article data
for result in results:
    # Locate the elements holding the header, subheader and datetime
    header_tag = result.find('h1', class_='article-item__headline')
    subheader_tag = result.find('h2', class_='article-item__subheader')
    date_tag = result.find('span', class_='article-item__date')

    # find() returns None when an element is absent, and the span may lack
    # the data-date attribute; skip such entries instead of aborting the loop
    timestamp = date_tag.get('data-date') if date_tag is not None else None
    if header_tag is None or subheader_tag is None or timestamp is None:
        print('Skipping article item with missing header, subheader or date')
        continue

    # scrape the article header and subheader text
    header = header_tag.text
    subheader = subheader_tag.text

    # get only the date from the datetime (the part before the 'T')
    date = timestamp.split('T')[0]

    # print article data
    print('-----------------')
    print(header)
    print(subheader)
    print(date)

    # Dictionary to be inserted into MongoDB
    post = {
        'header': header,
        'subheader': subheader,
        'date': date,
    }

    # Upsert keyed on the article's own field values: the filter must carry
    # the scraped date value (not a placeholder string) so that re-running
    # this cell matches the existing document instead of inserting a duplicate
    collection.update_one(
        post,
        {'$set': post},
        upsert=True
    )

-----------------
NHL season would start Jan. 13, play 56 games in agreement with NHLPA
Tentative deal includes Canada division, needs approval
2020-12-18
-----------------
MacKinnon says Avalanche may be favorites to win Stanley Cup
Hart Trophy finalist helped Colorado reach Western Conference Second Round last season
2020-12-18
-----------------
Dach named Canada captain for World Juniors
Blackhawks center learns he was chosen for role in McDavid video announcement
2020-12-18
-----------------
Lundqvist won't play for Capitals this season because of heart condition
Goalie signed one-year contract Oct. 9 to share time with Samsonov after bought out by Rangers
2020-12-17
-----------------
Finland all-time WJC starting lineup includes Teravainen, Laine
Tikkanen, Numminen, Lehtonen also on team picked by NHL.com
2020-12-19
-----------------
Reverse Retro alternate jerseys for all 31 teams unveiled by NHL, adidas
Now available for purchase, will be worn multiple times this season
2020-12-

In [6]:
# Fetch every document in the articles collection and print them one by one
articles = db['articles'].find()
for article in articles:
    print(article)

{'_id': ObjectId('5ee7e54aadf691a8f760f84f'), 'header': '5 questions for Canadiens in Stanley Cup Qualifiers', 'subheader': 'Play of Price, Domi status among unknowns heading into best-of-5 series vs. Penguins', 'date': '2020-06-15'}
{'_id': ObjectId('5ee7e54aadf691a8f760f850'), 'header': 'Phase 2 Buzz: Holtby, Kuznetsov, Carlson on ice for Capitals', 'subheader': 'Farabee among Flyers who skated Monday; Krug, Bergeron, Marchand participate for Bruins', 'date': '2020-06-15'}
{'_id': ObjectId('5ee7e54aadf691a8f760f851'), 'header': "Draisaitl of Oilers should win Hart Trophy, Bruins' Pastrnak says", 'subheader': 'Edmonton forward led NHL in points; Boston right wing tied for most goals', 'date': '2020-06-15'}
{'_id': ObjectId('5ee7e54aadf691a8f760f852'), 'header': '5 questions for Blackhawks in Stanley Cup Qualifiers', 'subheader': 'Play of DeBrincat, Crawford among unknowns heading into best-of-5 series vs. Oilers', 'date': '2020-06-15'}
{'_id': ObjectId('5ee7e54aadf691a8f760f853'), 'he