In [7]:
import requests
import os
import sys
import pandas as pd

NEWS_API_KEY = os.getenv('NEWS_API_KEY')
current_dir = os.path.dirname(os.path.abspath('__file__'))
plugins_dir = os.path.join(current_dir, '../plugins')
sys.path.append(plugins_dir)

In [2]:


def get_news(query: str, endpoint: str, from_date: str, to_date: str, sort_by='popularity', page_size=100, page_number=1, **kwargs) -> dict: 
    if endpoint not in ['top-headlines', 'everything']:
        raise ValueError('Endpoint must be either "top-headlines" or "everything"')
    
    if sort_by not in ['relevancy', 'popularity', 'publishedAt']:
        raise ValueError('Sort by must be either "relevancy", "popularity", or "publishedAt"')
    
    url = ('https://newsapi.org/v2/everything?'
        f'q={query}&'
        f'from={from_date}&'
        f'to={to_date}&'
        f'sortBy={sort_by}&'
        f'pageSize={page_size}&'
        f'page={page_number}&'
        f'language=en&'
        f'apiKey={NEWS_API_KEY}')
    
    response = requests.get(url)
    return response.json()

# get_news('bitcoin', 'everything', '2024-04-17', None, 'popularity', '5', '1')
get_news('bitcoin', 'everything', '2024-05-15', None, 'popularity', '100', '1')

{'status': 'ok',
 'totalResults': 1318,
 'articles': [{'source': {'id': 'business-insider',
    'name': 'Business Insider'},
   'author': 'fdemott@insider.com (Filip De Mott)',
   'title': 'Bitcoin could see a wave of forced selling as miners face the reality of lower rewards post-halving, research firm says',
   'description': '"If miners were forced to sell even a fraction of their holdings over the coming month this would have a negative impact on markets," Kaiki Research wrote.',
   'url': 'https://markets.businessinsider.com/news/currencies/bitcoin-price-supply-crypto-currency-miners-btc-sell-off-decline-2024-5',
   'urlToImage': 'https://i.insider.com/6643af41b4abc992e8c96de5?width=1200&format=jpeg',
   'publishedAt': '2024-05-15T12:17:10Z',
   'content': "Markets have long considered bitcoin's recent halving as a major price bolster, but it could bring a wave of selling from one corner of the sector,\xa0according to Kaiko Research.\r\nThe April halving is … [+1708 chars]"},
  {'

In [3]:
from operators.NewsAPIOperator import NewsAPIToDataframeOperator

operator = NewsAPIToDataframeOperator(
task_id="extract_api_data",
news_topic='bitcoin',
endpoint = "top-headlines",
from_date = '2024-04-18',
to_date = None,
sort_by = "popularity", 
page_size = 100, 
page_number = 1)

dataframe = operator.execute()




In [4]:
print (dataframe.iloc[0])
df = dataframe.copy()
print(df.iloc[0])

source               {'id': None, 'name': 'Yahoo Entertainment'}
author                                             Will Shanklin
title          Block reportedly greenlit transactions involvi...
description    Block appears to be squarely in the government...
url            https://consent.yahoo.com/v2/collectConsent?se...
urlToImage                                                  None
publishedAt                                 2024-05-01T18:12:23Z
content        If you click 'Accept all', we and our partners...
Name: 0, dtype: object
source               {'id': None, 'name': 'Yahoo Entertainment'}
author                                             Will Shanklin
title          Block reportedly greenlit transactions involvi...
description    Block appears to be squarely in the government...
url            https://consent.yahoo.com/v2/collectConsent?se...
urlToImage                                                  None
publishedAt                                 2024-05-01T18:12:23Z
co

In [15]:
from scripts.transform_news import transform_news_bronze
import pandas as pd
df = dataframe.copy()
bronze = transform_news_bronze(df, 'bitcoin', 100)
print(bronze.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   source        100 non-null    object             
 1   author        92 non-null     object             
 2   title         100 non-null    object             
 3   description   98 non-null     object             
 4   url           100 non-null    object             
 5   published_at  100 non-null    datetime64[ns, UTC]
 6   content       100 non-null    object             
 7   job_log_id    100 non-null    int64              
 8   topic         100 non-null    object             
dtypes: datetime64[ns, UTC](1), int64(1), object(7)
memory usage: 7.2+ KB
None


In [41]:
df = bronze.copy()
from transformers import pipeline

MIN_WORD_COUNT_CONTENT = 10
def transform_news_silver(df):
    # Basic Transformations
    df['published_at'] = pd.to_datetime(df['published_at'], errors='coerce')
    df['published_date'] = df['published_at'].dt.date.astype('datetime64[s]')
    df["year"] = df["published_date"].dt.year
    df["month"] = df["published_date"].dt.month
    df["day"] = df["published_date"].dt.day

    df = df[df['content'].apply(lambda x: len(x.split(' ')) >= MIN_WORD_COUNT_CONTENT)]
    
    # Sentiment Analysis
    sentiment_pipeline = pipeline('sentiment-analysis', model='nlptown/bert-base-multilingual-uncased-sentiment', tokenizer='mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis')
    def get_sentiment(text):
        result = sentiment_pipeline(text)
        return {'label': result[0]['label'], "score": result[0]['score']}
    
    df['title_sentiment'] = df['title'].apply(lambda x: get_sentiment(x).get('label'))
    df['title_score'] = df['title'].apply(lambda x: get_sentiment(x).get('score'))

    df['content_sentiment'] = df['content'].apply(lambda x: get_sentiment(x).get('label'))
    df['content_score'] = df['content'].apply(lambda x: get_sentiment(x).get('score'))

    # Drop duplicates
    df.drop_duplicates(inplace=True)
    return df 

silver = transform_news_silver(df)

In [43]:
silver.info()
silver[0:10]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   source             100 non-null    object             
 1   author             92 non-null     object             
 2   title              100 non-null    object             
 3   description        98 non-null     object             
 4   url                100 non-null    object             
 5   published_at       100 non-null    datetime64[ns, UTC]
 6   content            100 non-null    object             
 7   job_log_id         100 non-null    int64              
 8   topic              100 non-null    object             
 9   published_date     100 non-null    datetime64[s]      
 10  year               100 non-null    int32              
 11  month              100 non-null    int32              
 12  day                100 non-null    int32           

Unnamed: 0,source,author,title,description,url,published_at,content,job_log_id,topic,published_date,year,month,day,title_sentiment,title_score,content_sentiment,content_score
0,Yahoo Entertainment,Will Shanklin,Block reportedly greenlit transactions involvi...,Block appears to be squarely in the government...,https://consent.yahoo.com/v2/collectConsent?se...,2024-05-01 18:12:23+00:00,"If you click 'Accept all', we and our partners...",100,bitcoin,2024-05-01,2024,5,1,3 stars,0.293727,1 star,0.347548
1,Wired,Joel Khalili,FTX Creditors Say Payout Deal Is 'an Insult'—a...,FTX has a plan to repay its former crypto cust...,https://www.wired.com/story/ftx-creditors-cryp...,2024-05-08 17:00:02+00:00,Some creditors of the bankrupt crypto exchange...,100,bitcoin,2024-05-08,2024,5,8,1 star,0.559867,2 stars,0.287953
2,Wired,Paul Ford,Generative AI Is Totally Shameless. I Want to ...,The best thing about brain-melting software li...,https://www.wired.com/story/generative-ai-tota...,2024-05-14 12:00:00+00:00,AI has a lot of problems. It helps itself to t...,100,bitcoin,2024-05-14,2024,5,14,3 stars,0.387485,1 star,0.548095
3,Wired,Andy Greenberg,A Vast New Dataset Could Supercharge the AI Hu...,"Blockchain analysis firm Elliptic, MIT, and IB...",https://www.wired.com/story/ai-crypto-tracing-...,2024-05-01 13:00:00+00:00,"As a test of their resulting AI tool, the rese...",100,bitcoin,2024-05-01,2024,5,1,3 stars,0.311581,1 star,0.423237
4,Business Insider,fdemott@insider.com (Filip De Mott),Bitcoin could see a wave of forced selling as ...,"""If miners were forced to sell even a fraction...",https://markets.businessinsider.com/news/curre...,2024-05-15 12:17:10+00:00,Markets have long considered bitcoin's recent ...,100,bitcoin,2024-05-15,2024,5,15,1 star,0.307611,1 star,0.444365
5,Business Insider,yzhan@insider.com (Yuheng Zhan),Bitcoin's bull run may be over and the next mo...,"""It has happened. It is real. You may not want...",https://markets.businessinsider.com/news/curre...,2024-04-30 14:47:48+00:00,Bitcoin may have peaked at its high around $73...,100,bitcoin,2024-04-30,2024,4,30,1 star,0.415429,1 star,0.478706
6,Business Insider,mfox@businessinsider.com (Matthew Fox),The state of Wisconsin purchased $163 million ...,The state of Wisconsin Investment Board purcha...,https://markets.businessinsider.com/news/curre...,2024-05-15 19:47:07+00:00,The state of Wisconsin purchased $163 million ...,100,bitcoin,2024-05-15,2024,5,15,1 star,0.303289,1 star,0.33492
7,Business Insider,yzhan@insider.com (Yuheng Zhan),Stock market today: Indexes mixed as jobless c...,Stock futures pared some losses heading into t...,https://markets.businessinsider.com/news/stock...,2024-05-09 13:52:08+00:00,Major stock indexes were mixed on Thursday as ...,100,bitcoin,2024-05-09,2024,5,9,1 star,0.716533,1 star,0.425159
8,Business Insider,yzhan@insider.com (Yuheng Zhan),Stock market today: Stocks slide for 5th sessi...,The S&P 500 hit its worst losing streak since ...,https://markets.businessinsider.com/news/stock...,2024-04-18 20:06:04+00:00,US stocks on Thursday extended their losing st...,100,bitcoin,2024-04-18,2024,4,18,1 star,0.665576,1 star,0.577832
9,Business Insider,tmohamed@insider.com (Theron Mohamed),Jamie Dimon warns the world order is being cha...,"JPMorgan's CEO warned of sticky inflation, ris...",https://www.businessinsider.com/jpmorgan-jamie...,2024-04-24 11:33:26+00:00,"Most people are financially healthy, but econo...",100,bitcoin,2024-04-24,2024,4,24,2 stars,0.48456,1 star,0.380538


In [8]:
from email_sender.email_sender import EmailSender

email_sender = EmailSender()
dq_check_parameters = {
    "asset_name": 'test',
    "file": f"Job Log Id: {5}",
    "error_exceptions": 'big problem',
    "data_docs_site": 'help me',
    "timestamp": pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S"),
}

email_sender.send_dq_notification_email(parameters=dq_check_parameters)

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
japg gbou cgug ufxu
Email sent!
