In [3]:
import requests
from bs4 import BeautifulSoup
import yfinance as yf
from datetime import datetime
from datetime import timedelta
import pandas as pd
import json
import numpy as np

def changes_from_press(stock_data, press_date, period):
    next_day = timedelta(days=1)
    time_after_release = timedelta(days=period)
    

    next_trading_day = press_date
    
    # try to obtain share price at open on press release date
    day_1_price = stock_data[stock_data['Date'] == press_date]['Open'].values


    # increase the date till the first available open price is found
    num_days = 1
    while not day_1_price:
        if num_days > 30:
            return None
        
        next_trading_day = next_trading_day + next_day

        day_1_price = stock_data[stock_data['Date'] == (next_trading_day)]['Open'].values
        num_days = num_days + 1

    # get next day if available
    next_trading_day = next_trading_day + time_after_release
    day_2_price = stock_data[stock_data['Date'] == (next_trading_day)]['Close'].values

    # increase the date till the second available close price is found
    num_days = 1
    while not day_2_price:
        if num_days > 30: 
            return None
        
        next_trading_day = next_trading_day + next_day

        day_2_price = stock_data[stock_data['Date'] == (next_trading_day + next_day)]['Close'].values
        num_days = num_days + 1
    
    # calcualte percent difference between share prices
    pct_change = ((day_2_price - day_1_price) / day_2_price)*100
    pct_change = pct_change[0]
    return pct_change

def get_df(ticker):
    
    stock_data = yf.Ticker(ticker)

    # get historical market data
    stock_hist = stock_data.history(period="max")

    stock_hist.reset_index(inplace=True)

    stock_hist['Date'] = pd.to_datetime(stock_hist['Date']).dt.date

    stock_hist['Pct_Close'] = stock_hist['Close'].pct_change()*100

    return stock_hist

In [36]:
data = []
offset = 100

ticker = 'fslr'

stock_hist = get_df(ticker)

year = datetime.today().year

years = np.arange(year, 2003, -1)

# create break
for year in years:
    
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"}
    
    s = requests.Session()
    html = requests.get(f'https://investor.firstsolar.com/news/{year}/default.aspx', headers=headers)
    

    soup = BeautifulSoup(html.content) 

    articles = soup.find_all('div',attrs={'class':'module_item'})
    
    if not articles:
        break
        
    for article in articles:
        date = article.find('span', attrs={'class':'module_date-text'}).text.lstrip()
        title = article.find('a', attrs={'class':'module_headline-link'}).text
        
        
        date = date.lstrip().rstrip()    # remove starting and trailing whitespaces
        title = title.lstrip().rstrip() 
        
        date = date.replace(',', '')
        
        
        date = datetime.strptime(date, '%B %d %Y').date()
        pct_change = changes_from_press(stock_hist, date, 1)
        
        data.append([date, title, pct_change])
    
    offset = offset + 100
    
data = pd.DataFrame(data, columns=['date', 'press title', '1d change'])

data = data.dropna()

dates_str = [date.strftime("%m/%d/%Y") for date in data['date'].tolist()]
result = data.to_json(orient="values", index=True)


dict_data = {
    'ticker': ticker,
    'data': json.loads(result),
    },


with open(f'./data/{ticker}.json', 'w', encoding='utf-8') as f:
    json.dump(dict_data, f, ensure_ascii=False, indent=4)

  while not day_2_price:
  while not day_1_price:


In [34]:
dict_data

{'ticker': 'fslr',
 'data': {'[[1681084800000, "First Solar, Inc. to Announce First Quarter 2023 Financial Results on April 27, 2023", 3.3518989321], [1680134400000, "EDP Renewables Places Multi-Year Order for 1.8 GW of First Solar Modules", -0.5195424749], [1680048000000, "Indian Government Awards Production Linked Incentives to First Solar\\u2019s India Manufacturing Facility", 0.5437856845], [1677542400000, "First Solar, Inc. Announces Fourth Quarter and Full Year 2022 Financial Results and 2023 Guidance", 15.0449653948], [1677542400000, "Lightsource bp Places Multi-Year Order for 4 GW of First Solar Modules", 15.0449653948], [1676332800000, "First Solar, Inc. to Announce Fourth Quarter and Full Year 2022 Financial Results and 2023 Financial Guidance on February 28, 2023", 6.0222817626], [1673481600000, "First Solar Completes Sale of 141 MW Luz del Norte Power Plant to Toesca", 3.9347334712], [1671062400000, "National Grid Renewables Orders 1.6 GW of First Solar Modules", -2.2053121

In [31]:
result = data.to_json(orient="values", index=True)
json.dumps(json.loads(result))

[[1681084800000,
  'First Solar, Inc. to Announce First Quarter 2023 Financial Results on April 27, 2023',
  3.3518989321],
 [1680134400000,
  'EDP Renewables Places Multi-Year Order for 1.8 GW of First Solar Modules',
  -0.5195424749],
 [1680048000000,
  'Indian Government Awards Production Linked Incentives to First Solar’s India Manufacturing Facility',
  0.5437856845],
 [1677542400000,
  'First Solar, Inc. Announces Fourth Quarter and Full Year 2022 Financial Results and 2023 Guidance',
  15.0449653948],
 [1677542400000,
  'Lightsource bp Places Multi-Year Order for 4 GW of First Solar Modules',
  15.0449653948],
 [1676332800000,
  'First Solar, Inc. to Announce Fourth Quarter and Full Year 2022 Financial Results and 2023 Financial Guidance on February 28, 2023',
  6.0222817626],
 [1673481600000,
  'First Solar Completes Sale of 141 MW Luz del Norte Power Plant to Toesca',
  3.9347334712],
 [1671062400000,
  'National Grid Renewables Orders 1.6 GW of First Solar Modules',
  -2.2053

In [10]:
list_of_jsons = data.to_json(orient='index', lines=True).splitlines()
list_of_jsons


ValueError: 'lines' keyword only valid when 'orient' is records