In [1]:
run __init__.py

In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import json
import os
from decouple import config
from langdetect import detect
from datetime_util import timestamp2datetime
from finnhub_api import FinnHub_init, Finnhub
from project import polarity_score, text_tonkenize
from scrapy import Scrapy

In [23]:
# Load the Finnhub API key from a local pickle file.
# NOTE(security): pickle.load can execute arbitrary code, so only use it on a trusted file;
# an environment variable or .env entry would be a safer home for the key.
with open('./finnhub/finnhub_key.pkl', 'rb') as fin:
    finnhub_key = pickle.load(fin)  # the `with` block closes the file on exit

In [36]:
# Finnhub client for AAPL; both dates are 2021-05-07, i.e. a single-day window.
# NOTE(review): the argument order (key, start date, end date, symbol) is inferred from the values — confirm against finnhub_api.Finnhub.
apple = Finnhub(finnhub_key, "2021-05-07", "2021-05-07", "AAPL")

In [37]:
# Fetch the company news through the client.
# NOTE(review): presumably returns a collection of per-article records; the frame built from it shows columns such as headline, summary and url — confirm.
apple_news = apple.company_news()

In [85]:
# Wrap the news payload in a DataFrame for inspection and for the later join with the sentiment scores.
apple_finnhub_df = pd.DataFrame(apple_news)

In [68]:
# Preview the first three rows of the news frame.
apple_finnhub_df.head(3)

Unnamed: 0,category,datetime,headline,id,image,related,source,summary,url
0,company,1620430108,Epic v. Apple Will Have Ramifications for Big ...,67685543,https://s.yimg.com/hd/cp-video-transcode/prod/...,AAPL,Yahoo,May.07 -- Brookings Institution Visiting Fello...,https://finnhub.io/api/news?id=a05b585f9aa8113...
1,company,1620430088,The Epic Battle Over Apple's App Store,67685545,https://s.yimg.com/hd/cp-video-transcode/prod/...,AAPL,Yahoo,May.07 -- Epic Games claims that Apple is abus...,https://finnhub.io/api/news?id=8c81f40d85036c5...
2,company,1620426540,Apple v. Epic: Time is running out for Epic to...,67685548,,AAPL,Yahoo,As Epic Games Inc. methodically made its case ...,https://finnhub.io/api/news?id=994d1806d0596ef...


In [40]:
# Instantiate the scraper helper.
# NOTE(review): `Scrapy` is imported from a module named `scrapy`, the same name as the PyPI Scrapy framework — confirm the import resolves to the project-local module.
scrap = Scrapy()

In [41]:
# Hand every article URL from the news frame to the scraper.
articles = scrap.scrap(apple_finnhub_df["url"].tolist())

91/91 99% https://finnhub.io/api/news?id=cc27456ee0e323ec46d428c16310dcd5003d757ae0244e71507c0e41c846c62d

In [42]:
# Hold the scraped articles in a Series named 'articles'.
# NOTE(review): despite the `_df` suffix this is a Series, not a DataFrame.
apple_finnhub_articles_df = pd.Series(articles, name='articles')

In [43]:
# Preview the first five scraped entries.
apple_finnhub_articles_df.head()

0    [May.07 -- Brookings Institution Visiting Fell...
1    [(8, https://finnhub.io/api/news?id=01f0222ef7...
Name: articles, dtype: object

In [23]:
# Run the project's tokenizer helper over the scraped articles.
# NOTE(review): the result is bound to the same name as the input, so re-running this cell
# feeds already-processed output back into the helper; a separate output name would keep the cell idempotent.
apple_finnhub_articles_df = text_tonkenize(apple_finnhub_articles_df)

In [24]:
# Score the tokenized articles with the project's polarity helper.
# NOTE(review): judging by the preview of the result, it yields the text plus negative/neutral/positive/compound
# scores and a sentiment label per article — confirm in project.polarity_score.
apple_finnhub_scores = polarity_score(apple_finnhub_articles_df)

In [25]:
# Preview the first five rows of the sentiment scores.
apple_finnhub_scores.head()

Unnamed: 0,text,negative,neutral,positive,compound,sentiment
0,may brookings institution visiting fellow bill...,0.079,0.819,0.101,0.9985,positive
1,may epic games claims that apple is abusing it...,0.08,0.819,0.102,0.9985,positive
2,as epic games inc methodically made its case i...,0.079,0.82,0.101,0.9985,positive
3,show ad the u s economy fell well short of job...,0.067,0.821,0.112,0.9998,positive
4,dogecoin soared above cents on wednesday not b...,0.054,0.834,0.111,0.991,positive


## Data Scrubbing

In [26]:
# Prepare the data for scrubbing: attach the sentiment scores to the news metadata
# column-wise; rows are matched on index label.
# NOTE(review): this rebinds `apple_finnhub_df`, so re-running the cell appends the score columns a second time.
apple_finnhub_df = pd.concat([apple_finnhub_df, apple_finnhub_scores], axis=1)

In [49]:
# Rename the `datetime` column to `date`, modifying the frame in place.
apple_finnhub_df.rename({'datetime': 'date'}, axis='columns', inplace=True)

In [51]:
# Turn each Unix epoch value in `date` into a datetime via the project helper.
# NOTE(review): the preview shows 1620430108 -> 19:28:28, which is not UTC (23:28:28), so the helper presumably uses local time — confirm.
apple_finnhub_df['date'] = apple_finnhub_df['date'].map(timestamp2datetime)

In [56]:
# Coerce `date` to pandas datetime64; the explicit format applies when the values are 'YYYY-MM-DD HH:MM:SS' strings.
# NOTE(review): whether timestamp2datetime returns strings or datetime objects is not visible here — confirm.
apple_finnhub_df['date'] = pd.to_datetime(apple_finnhub_df['date'], format='%Y-%m-%d %H:%M:%S')

In [57]:
# Preview the first three rows to check the converted `date` column.
apple_finnhub_df.head(3)

Unnamed: 0,category,date,headline,id,image,related,source,summary,url
0,company,2021-05-07 19:28:28,Epic v. Apple Will Have Ramifications for Big ...,67685543,https://s.yimg.com/hd/cp-video-transcode/prod/...,AAPL,Yahoo,May.07 -- Brookings Institution Visiting Fello...,https://finnhub.io/api/news?id=a05b585f9aa8113...
1,company,2021-05-07 19:28:08,The Epic Battle Over Apple's App Store,67685545,https://s.yimg.com/hd/cp-video-transcode/prod/...,AAPL,Yahoo,May.07 -- Epic Games claims that Apple is abus...,https://finnhub.io/api/news?id=8c81f40d85036c5...
2,company,2021-05-07 18:29:00,Apple v. Epic: Time is running out for Epic to...,67685548,,AAPL,Yahoo,As Epic Games Inc. methodically made its case ...,https://finnhub.io/api/news?id=994d1806d0596ef...


In [58]:
# Summarize column dtypes and non-null counts.
apple_finnhub_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91 entries, 0 to 90
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   category  91 non-null     object        
 1   date      91 non-null     datetime64[ns]
 2   headline  91 non-null     object        
 3   id        91 non-null     int64         
 4   image     91 non-null     object        
 5   related   91 non-null     object        
 6   source    91 non-null     object        
 7   summary   91 non-null     object        
 8   url       91 non-null     object        
dtypes: datetime64[ns](1), int64(1), object(7)
memory usage: 6.5+ KB


In [73]:
# Only 27 scraped articles are missing; those rows are discarded.
# Rows are removed by index label wherever the `text` column is null.
missing_text_idx = apple_finnhub_df.index[apple_finnhub_df["text"].isna()]
apple_finnhub_df.drop(index=missing_text_idx, inplace=True)

In [35]:
# Count whitespace-separated words per article. Non-string entries (e.g. NaN for a
# missing article) are passed through unchanged; isinstance is used instead of an exact
# `type(x) != float` comparison so that None or NumPy float values do not raise on `.split()`.
apple_finnhub_df['word_count'] = apple_finnhub_df['text'].apply(
    lambda text: len(text.split()) if isinstance(text, str) else text
)

In [76]:
# Discard only the rows whose `word_count` equals 0.
zero_word_idx = apple_finnhub_df.index[apple_finnhub_df['word_count'] == 0]
apple_finnhub_df.drop(index=zero_word_idx, inplace=True)