### Libraries

Following [these](https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/) instructions

In [37]:
# import sys
# !{sys.executable} -m pip install quandl
# !{sys.executable} -m pip install yfinance
# !{sys.executable} -m pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.0


In [39]:
import os
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt

from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import quandl 
import yfinance as yf

# NLTK VADER for sentiment analysis
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
finwiz_url = 'https://finviz.com/quote.ashx?t='

# API key
from dotenv import load_dotenv
load_dotenv('Cindy_Quandl_API.env')

True

### Helper functions

In [31]:
#Quandl Python Tables API Function
def grab_quandl_table(
    table_path,
    avoid_download=False,
    replace_existing=False,
    date_override=None,
    allow_old_file=False,
    **kwargs,
):
    root_data_dir = os.path.join(os.environ["HOME"], "Documents", "DS4A","quandl_data_table_downloads")
    data_symlink = os.path.join(root_data_dir, f"{table_path}_latest.zip")
    if avoid_download and os.path.exists(data_symlink):
        print(f"Skipping any possible download of {table_path}")
        return data_symlink

    table_dir = os.path.dirname(data_symlink)
    if not os.path.isdir(table_dir):
        print(f'Creating new data dir {table_dir}')
        os.makedirs(table_dir)

    if date_override is None:
        my_date = dt.datetime.now().strftime("%Y%m%d")
    else:
        my_date = date_override
    data_file = os.path.join(root_data_dir, f"{table_path}_{my_date}.zip")

    if os.path.exists(data_file):
        file_size = os.stat(data_file).st_size
        if replace_existing or not file_size > 0:
            print(f"Removing old file {data_file} size {file_size}")
        else:
            print(
                f"Data file {data_file} size {file_size} exists already, no need to download"
            )
            return data_file

    api_key = os.environ.get('Cindy_Quandl_API')
    dl = quandl.export_table(
        table_path, filename=data_file, api_key=api_key, **kwargs
    )
    file_size = os.stat(data_file).st_size
    if os.path.exists(data_file) and file_size > 0:
        print(f"Download finished: {file_size} bytes")
        if not date_override:
            if os.path.exists(data_symlink):
                print(f"Removing old symlink")
                os.unlink(data_symlink)
            print(f"Creating symlink: {data_file} -> {data_symlink}")
            os.symlink(
                data_file, data_symlink,
            )
    else:
        print(f"Data file {data_file} failed download")
        return
    return data_symlink if (date_override is None or allow_old_file) else "NoFileAvailable"

In [18]:
def fetch_quandl_table(table_path, avoid_download=True, **kwargs):
    return pd.read_csv(
        grab_quandl_table(table_path, avoid_download=avoid_download, **kwargs)
    )

In [19]:
def Get_News(tickers):
    """
    Retrives news headline of what time frame?
    """
    news_tables = {}
    parsed_news = []

    for ticker in tickers:
        url = finwiz_url + ticker
        req = Request(url=url,headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0'}) 
        response = urlopen(req)    
        # Read the contents of the file into 'html'
        html = BeautifulSoup(response)
        # Find 'news-table' in the Soup and load it into 'news_table'
        news_table = html.find(id='news-table')
        # Add the table to our dictionary
        news_tables[ticker] = news_table

    # Iterate through the news to splite the data
    for file_name, news_table in news_tables.items():
        # Iterate through all tr tags in 'news_table'
        for x in news_table.findAll('tr'):
            # read the text from each tr tag into text
            # get text from a only
            try:
                text = x.a.get_text() 
                # splite text in the td tag into a list 
                date_scrape = x.td.text.split()
                # if the length of 'date_scrape' is 1, load 'time' as the only element
            except:
                continue

            if len(date_scrape) == 1:
                time = date_scrape[0]

            # else load 'date' as the 1st element and 'time' as the second    
            else:
                date = date_scrape[0]
                time = date_scrape[1]
            # Extract the ticker from the file name, get the string up to the 1st '_'  
            ticker = file_name.split('_')[0]

            # Append ticker, date, time and headline as a list to the 'parsed_news' list
            parsed_news.append([ticker, date, time, text])

    return pd.DataFrame(parsed_news, columns = ['ticker','date','time','headline'])

### News

- Get_News function with specified tickers name list as input will return a dataframe with 4 columns: 'ticker','date','time','headline';

- BeautifulSoup package will only gather 100 news for each ticker specified;

Let's only use two tickers, 'AMZN' and 'AAPL'.

In [20]:
tickers = ['AMZN','AAPL']
news = Get_News(tickers)
news['date'] = pd.to_datetime(news['date'])
#news['time'] = pd.to_datetime(news['time'], format='%I:%M%p').dt.time
news

Unnamed: 0,ticker,date,time,headline
0,AMZN,2023-06-17,12:00PM,5 Dollar Tree Items That Make Great Gifts
1,AMZN,2023-06-17,10:31AM,10 Stocks That Could Be the Next Apple or Amazon
2,AMZN,2023-06-17,10:05AM,Amazon Stock: Bear vs. Bull
3,AMZN,2023-06-17,09:30AM,"Shopify Up 85% This Year, Time to Sell the Stock?"
4,AMZN,2023-06-17,09:24AM,4 Reasons to Buy This Tesla Rival Hand Over Fist
...,...,...,...,...
195,AAPL,2023-06-11,05:51AM,Here's the Next AI Stock Most Likely to Join t...
196,AAPL,2023-06-11,05:17AM,This week in tech: A tepid response to Apple's...
197,AAPL,2023-06-10,08:45AM,Is AI Taking Tech to the Top?
198,AAPL,2023-06-10,08:00AM,"Entertainment, Guilds, Streaming, and AI"


In [53]:
news['date'].nunique()

8

### Stock prices data

In [40]:
full_price = fetch_quandl_table('QUOTEMEDIA/PRICES', avoid_download=False)

Download finished: 1421193738 bytes
Creating symlink: /Users/amberlee/Documents/DS4A/quandl_data_table_downloads/QUOTEMEDIA/PRICES_20230617.zip -> /Users/amberlee/Documents/DS4A/quandl_data_table_downloads/QUOTEMEDIA/PRICES_latest.zip


In [59]:
full_price.head()

Unnamed: 0,ticker,date,open,high,low,close,volume,dividend,split,adj_open,adj_high,adj_low,adj_close,adj_volume
0,JTKWY,2022-03-11,6.17,7.32,5.79,6.72,9440097.0,0.0,1.0,6.17,7.32,5.79,6.72,9440097.0
1,JTKWY,2022-03-10,6.16,6.175,5.935,6.07,2261623.0,0.0,1.0,6.16,6.175,5.935,6.07,2261623.0
2,FG_1,2020-06-01,8.1,8.39,8.1,8.39,3086317.0,0.0,1.0,8.1,8.39,8.1,8.39,3086317.0
3,FLWS,2022-03-09,14.57,14.9588,14.41,14.45,662492.0,0.0,1.0,14.57,14.9588,14.41,14.45,662492.0
4,RENW_,2020-01-29,21.9768,21.99,21.97,21.99,319.0,0.0,1.0,21.9768,21.99,21.97,21.99,319.0


**Full dataset exploration**

In [45]:
full_price['ticker'].nunique() # number of unique tickers

21117

In [49]:
print(max(full_price['date']))
print(min(full_price['date']))

2023-06-16
1962-01-02


**Filter to relevant tickers**

In [50]:
tickers_price = full_price[(full_price['ticker'].isin(tickers)) & 
                   ((full_price['date'].isin(news.date.unique()))) ].sort_values(by = 'ticker')
tickers_price

Unnamed: 0,ticker,date,open,high,low,close,volume,dividend,split,adj_open,adj_high,adj_low,adj_close,adj_volume


In [56]:
full_price['ticker'].isin(tickers)

0           False
1           False
2           False
3           False
4           False
            ...  
45951495    False
45951496    False
45951497    False
45951498    False
45951499    False
Name: ticker, Length: 45951500, dtype: bool

In [58]:
sum(full_price['date'].isin(news.date.unique()))

0