In [1]:
import requests
from bs4 import BeautifulSoup
import yfinance as yf
from datetime import datetime
from datetime import timedelta
import pandas as pd
import re
import json

def changes_from_press(stock_data, press_date, period):
    next_day = timedelta(days=1)
    time_after_release = timedelta(days=period)
    

    next_trading_day = press_date
    day_1_price = stock_data[stock_data['Date'] == press_date]['Open'].values

    while not day_1_price:
        next_trading_day = next_trading_day + next_day

        day_1_price = stock_data[stock_data['Date'] == (next_trading_day)]['Open'].values

    next_trading_day = next_trading_day + time_after_release
    day_2_price = stock_data[stock_data['Date'] == (next_trading_day)]['Close'].values

    while not day_2_price:
        next_trading_day = next_trading_day + next_day

        day_2_price = stock_data[stock_data['Date'] == (next_trading_day + next_day)]['Close'].values


    pct_change = ((day_2_price - day_1_price) / day_2_price)*100
    pct_change = pct_change[0]
    return pct_change

def get_df(ticker):
    
    stock_data = yf.Ticker(ticker)

    # get historical market data
    stock_hist = stock_data.history(period="max")
    
    # make date a column not index
    stock_hist.reset_index(inplace=True) 

    stock_hist['Date'] = pd.to_datetime(stock_hist['Date']).dt.date

    stock_hist['Pct_Close'] = stock_hist['Close'].pct_change()*100

    return stock_hist

In [12]:
data = []
page_num = 0

ticker = 'msft'

stock_hist = get_df(ticker)


while True:
    
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"}
    
    s = requests.Session()
    html = requests.get(f'https://news.microsoft.com/category/press-releases/page/{page_num}/', headers=headers)
    

    soup = BeautifulSoup(html.content) 
    content = soup.find('section', attrs={'content-area'})
    if content == None:
        break

    articles = content.find_all('article')
    
    for article in articles:
        date = article.find('div', attrs={'c-paragraph-3 c-meta-text'}).text.lstrip()
        title = article.find('a').text
        
        
        date = date.lstrip().rstrip()    # remove starting and trailing whitespaces
        title = title.lstrip().rstrip() 
        
        date = date.replace(',', '')
        
        
        date = datetime.strptime(date, '%B %d %Y').date()
        pct_change = changes_from_press(stock_hist, date, 1)
        
        data.append([date, title, pct_change])
        
    page_num = page_num + 1
    
data = pd.DataFrame(data, columns=['date', 'press title', '1d change'])

dates_str = [date.strftime("%m/%d/%Y") for date in data['date'].tolist()]

dict_data = {
    'ticker': ticker,
    
    'data': {
        'dates':dates_str,
        'title': data['press title'].tolist(),
        'priceChange': data['1d change'].tolist(),
    },
}

with open(f'./data/{ticker}.json', 'w', encoding='utf-8') as f:
    json.dump(dict_data, f, ensure_ascii=False, indent=4)



  while not day_1_price:
  while not day_2_price:

KeyboardInterrupt

