In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import json
import os
from decouple import config
from langdetect import detect
from lib.datetime_util import timestamp2datetime
from lib.finnhub_api import FinnHub_init, Finnhub
from lib.project import url2text, polarity_score, text_tonkenize

In [702]:
# Execute lib/project.py through IPython's %run magic.
# NOTE(review): url2text, polarity_score and text_tonkenize are already imported
# from lib.project in the import cell, so this presumably only exists to pick up
# edits made to the module during development — confirm whether it is still needed.
%run ./lib/project.py

In [7]:
# Load the Finnhub API key from a local pickle file.
# The `with` block closes the file on exit, so no explicit close() is needed.
# NOTE(review): pickle.load can execute arbitrary code, so only load a file you
# created yourself. A plain-text secret read from the environment would be safer.
with open('./finnhub/finnhub_key.pkl', 'rb') as fin:
    finnhub_key = pickle.load(fin)

In [9]:
# Build the Finnhub client wrapper used for every API call in this notebook.
# NOTE(review): the positional arguments look like (API key, start date,
# end date, ticker symbol) — confirm against lib.finnhub_api.Finnhub.
apple = Finnhub(finnhub_key, "2021-05-03", "2021-05-07", "AAPL")

In [10]:
# Fetch the company news through the client wrapper.
# presumably one record per article for the configured ticker and date range — verify in lib.finnhub_api.
apple_news = apple.company_news()

In [14]:
# Tabulate the news records returned by the API call.
apple_finnhub_df = pd.DataFrame(data=apple_news)

In [15]:
# Preview the first five news rows.
apple_finnhub_df.head(n=5)

Unnamed: 0,category,datetime,headline,id,image,related,source,summary,url
0,company,1620430108,Epic v. Apple Will Have Ramifications for Big ...,67685543,https://s.yimg.com/hd/cp-video-transcode/prod/...,AAPL,Yahoo,May.07 -- Brookings Institution Visiting Fello...,https://finnhub.io/api/news?id=a05b585f9aa8113...
1,company,1620430088,The Epic Battle Over Apple's App Store,67685545,https://s.yimg.com/hd/cp-video-transcode/prod/...,AAPL,Yahoo,May.07 -- Epic Games claims that Apple is abus...,https://finnhub.io/api/news?id=8c81f40d85036c5...
2,company,1620426540,Apple v. Epic: Time is running out for Epic to...,67685548,,AAPL,Yahoo,As Epic Games Inc. methodically made its case ...,https://finnhub.io/api/news?id=994d1806d0596ef...
3,company,1620426370,3 Tech Stocks to Buy Now After Disappointing J...,67678468,https://s.yimg.com/uu/api/res/1.2/PHAQWn.zDysu...,AAPL,Yahoo,"Clearly, hiring could rebound in May, but let'...",https://finnhub.io/api/news?id=ff7e5602436ea5b...
4,company,1620417960,"With a $78 Billion Market Cap, Dogecoin Needs ...",67682478,https://images.barrons.com/im-335443/social,AAPL,MarketWatch,The cryptocurrency lacks the mainstream appeal...,https://finnhub.io/api/news?id=eded7686d0fa5c8...


In [16]:
# Persist the raw news table before any scraping or scoring is applied.
apple_finnhub_df.to_csv(path_or_buf="./data/apple_finnhub_raw_df.csv")

In [19]:
# Download the article text behind every news URL with the project scraper.
# NOTE(review): the original note here read "please enable Javascript and
# cookies in your browser" — presumably some sites answer the scraper with that
# page instead of the article; confirm and filter such results if so.
news_urls = apple_finnhub_df['url'].tolist()
articles = url2text(news_urls)

500/500 100% https://finnhub.io/api/news?id=7fd339eca187c4c2f3320ebf578fa775f63404ce424818a502a4b559e355e12a

In [20]:
# Cache the scraped articles on disk so the scrape does not have to be repeated.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("./data/apple_finnhub_articles.pkl", "wb") as f:
    pickle.dump(articles, f)

In [21]:
# Wrap the scraped texts in a Series named "articles" (a Series despite the
# `_df` suffix) and keep a CSV copy next to the pickle.
apple_finnhub_articles_df = pd.Series(data=articles, name="articles")
apple_finnhub_articles_df.to_csv(path_or_buf="./data/apple_finnhub_articles.csv")

In [22]:
# Preview the first five scraped articles.
apple_finnhub_articles_df.head(5)

0    May.07 -- Brookings Institution Visiting Fello...
1    May.07 -- Epic Games claims that Apple is abus...
2    As Epic Games Inc. methodically made its case ...
3    \n        Show Ad\n          The U.S. economy ...
4    Dogecoin soared above 69 cents on Wednesday—no...
Name: articles, dtype: object

In [23]:
# Clean the article text with the project helper, overwriting the raw Series.
# NOTE(review): judging by the `text` column shown after scoring, the helper
# lower-cases the text and strips punctuation and digits — confirm in lib/project.py.
apple_finnhub_articles_df = text_tonkenize(apple_finnhub_articles_df)

In [24]:
# Score every cleaned article with the project's polarity helper. The displayed
# result has text, negative, neutral, positive, compound and sentiment columns.
apple_finnhub_scores = polarity_score(apple_finnhub_articles_df)

In [25]:
# Preview the first five scored articles.
apple_finnhub_scores.head(n=5)

Unnamed: 0,text,negative,neutral,positive,compound,sentiment
0,may brookings institution visiting fellow bill...,0.079,0.819,0.101,0.9985,positive
1,may epic games claims that apple is abusing it...,0.08,0.819,0.102,0.9985,positive
2,as epic games inc methodically made its case i...,0.079,0.82,0.101,0.9985,positive
3,show ad the u s economy fell well short of job...,0.067,0.821,0.112,0.9998,positive
4,dogecoin soared above cents on wednesday not b...,0.054,0.834,0.111,0.991,positive


## Data Scrubbing

In [26]:
# Prepare the data for scrubbing by joining the news metadata with the sentiment
# scores column-wise; rows are aligned on the shared index.
# Score columns left over from an earlier run of this cell are dropped first, so
# re-running it does not append duplicate columns. The former trailing
# no-argument .reindex() changed nothing and has been removed.
score_columns = apple_finnhub_scores.columns
apple_finnhub_df = pd.concat(
    [
        apple_finnhub_df.drop(columns=score_columns, errors="ignore"),
        apple_finnhub_scores,
    ],
    axis=1,
    sort=False,
)

In [29]:
# Replace the Unix timestamps in `datetime` with the values produced by the
# project's timestamp2datetime helper (passed directly, no lambda wrapper needed).
apple_finnhub_df['datetime'] = apple_finnhub_df['datetime'].map(timestamp2datetime)

In [31]:
# Preview the first three rows of the merged table.
apple_finnhub_df.head(n=3)

Unnamed: 0,category,datetime,headline,id,image,related,source,summary,url,text,negative,neutral,positive,compound,sentiment
0,company,2021-05-07 19:28:28,Epic v. Apple Will Have Ramifications for Big ...,67685543,https://s.yimg.com/hd/cp-video-transcode/prod/...,AAPL,Yahoo,May.07 -- Brookings Institution Visiting Fello...,https://finnhub.io/api/news?id=a05b585f9aa8113...,may brookings institution visiting fellow bill...,0.079,0.819,0.101,0.9985,positive
1,company,2021-05-07 19:28:08,The Epic Battle Over Apple's App Store,67685545,https://s.yimg.com/hd/cp-video-transcode/prod/...,AAPL,Yahoo,May.07 -- Epic Games claims that Apple is abus...,https://finnhub.io/api/news?id=8c81f40d85036c5...,may epic games claims that apple is abusing it...,0.08,0.819,0.102,0.9985,positive
2,company,2021-05-07 18:29:00,Apple v. Epic: Time is running out for Epic to...,67685548,,AAPL,Yahoo,As Epic Games Inc. methodically made its case ...,https://finnhub.io/api/news?id=994d1806d0596ef...,as epic games inc methodically made its case i...,0.079,0.82,0.101,0.9985,positive


In [37]:
# Summarise column dtypes and non-null counts of the merged table.
# NOTE(review): the saved output lists a `word_count` column that is only created
# in a later cell, so this output comes from out-of-order execution — restart the
# kernel and run all cells to refresh it.
apple_finnhub_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   category    500 non-null    object 
 1   datetime    500 non-null    object 
 2   headline    500 non-null    object 
 3   id          500 non-null    int64  
 4   image       500 non-null    object 
 5   related     500 non-null    object 
 6   source      500 non-null    object 
 7   summary     500 non-null    object 
 8   url         500 non-null    object 
 9   text        474 non-null    object 
 10  negative    500 non-null    float64
 11  neutral     500 non-null    float64
 12  positive    500 non-null    float64
 13  compound    500 non-null    float64
 14  sentiment   474 non-null    object 
 15  word_count  500 non-null    float64
dtypes: float64(5), int64(1), object(10)
memory usage: 62.6+ KB


In [73]:
# Discard the rows whose article text could not be scraped (null `text`); only
# the `text` column is considered when deciding which rows to drop.
# NOTE(review): the original comment said 27 articles were missing, but the saved
# info() output shows 474 of 500 non-null `text` values, i.e. 26 — confirm on a
# fresh run.
apple_finnhub_df = apple_finnhub_df.dropna(subset=["text"])

In [35]:
# Count the whitespace-separated words of each article.
# The vectorised .str methods leave missing text as NaN, replacing the former
# `type(x) != float` check, which only recognised float NaN and raised
# AttributeError on None.
apple_finnhub_df['word_count'] = apple_finnhub_df['text'].str.split().str.len()

In [76]:
# Remove, in place, the rows whose word count is exactly zero.
empty_rows = apple_finnhub_df.index[apple_finnhub_df['word_count'] == 0]
apple_finnhub_df.drop(index=empty_rows, inplace=True)

In [81]:
# Persist the scrubbed news table with its sentiment scores.
apple_finnhub_df.to_csv(path_or_buf="./data/apple_finnhub_df.csv")

## Stock Market Time Series

In [776]:
# Fetch the price candles through the same client wrapper.
# NOTE(review): "60" is presumably the candle resolution in minutes (most
# timestamps in the displayed frame are 3600 s apart) — confirm in lib.finnhub_api.
apple_hist = apple.stock_candles("60")

In [21]:
# Readable column labels for the candle table, applied positionally.
hist_cols = [
    'close',
    'high',
    'low',
    'open',
    'status',
    'timestamp',
    'volume',
]

In [50]:
# Tabulate the candle data, relabel its columns with hist_cols and preview it.
# NOTE(review): the relabelling is positional, so it assumes the API returns its
# fields in exactly the order listed in hist_cols — confirm.
apple_hist_df = pd.DataFrame(data=apple_hist)
apple_hist_df.columns = hist_cols
apple_hist_df.head(n=5)

Unnamed: 0,close,high,low,open,status,timestamp,volume
0,131.86,132.3,131.75,131.9,ok,1620028800,65074
1,131.67,132.13,131.67,131.68,ok,1620036000,33445
2,132.04,132.23,131.66,131.74,ok,1620039600,237596
3,132.15,132.19,131.98,132.039,ok,1620043200,471064
4,133.21,133.355,131.83,132.13,ok,1620046800,11410327
