In [1]:
from keys import keys

import json
import pandas as pd
import argparse
import io
import os
import requests
from datetime import date
import time

# XE api
from xecd_rates_client import XecdClient

# Google News api
from newsapi import NewsApiClient

# Google Cloud client library api
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types


In [2]:
class Create_DF:
    def __init__(self, cur_original, cur_new, news_api_key, json_google_cloud, xecd_client, rapid_api_host, rapid_api_key):
        self.today = date.today().strftime("%Y/%m/%d").replace("/","-")
        self.original = cur_original
        self.new = cur_new
        self.newsapi = NewsApiClient(api_key=news_api_key)
        self.client = language.LanguageServiceClient.from_service_account_json(json_google_cloud)
        self.xecd = xecd_client
        self.rapid_api_host = rapid_api_host
        self.rapid_api_key = rapid_api_key
        self.approved_classifications = [
            'Finance',
            'Business',
            'Politics',
            'Jobs & Education',
            'Business News',
            'Business & Industrial',
        ]
        
        
    # classify text into categories   
    def __classify(self, text):
        client = self.client
        document = language.types.Document(content=text, type=language.enums.Document.Type.PLAIN_TEXT)
        response = client.classify_text(document)
        categories = response.categories

        result = []

        for category in categories:
            categories = category.name.split("/")
            categories.pop(0)
            for i in range(len(categories)):
                result.append(categories[i])
        return result
    
    def __getRate(self, date, fromCur, toCur):
        xecd = self.xecd
        rateResult = xecd.historic_rate(date, "12:00", fromCur, toCur, 1)
        rateObj={}
        rateObj['date'] = rateResult['timestamp'][:10]
        rateObj['rate'] = rateResult['to'][0]['mid']
        return rateObj['rate']
    
    
    # retrieve articles from google news api
    def __retrieve_articles(self):
        formatted_articles = []
        pageNum = 1
        
        print("===============Start Scraping===============")
        
        curr_dict = {
            'CAD' : 'Canada Canadian',
            'USD' : 'United States American',
            'GBP' : 'Great Britain Pound',
            'AUD' : 'Australia Australia',
            'JPY' : 'Japan Japanese Yen',
            'EUR' : 'Europe Euro European',
            'CNY' : 'China Chinese Yen',
            'INR' : 'India Indian Rupee'
        }
        
        while True:
            scraped = 0;
            url = "https://contextualwebsearch-websearch-v1.p.rapidapi.com/api/Search/NewsSearchAPI"
            querystring = {"fromPublishedDate":"2019-06-14 00:00:00","toPublishedDate":"2019-09-13 23:59:59","autoCorrect":"false","pageNumber":str(pageNum),"pageSize":"50","q":curr_dict[self.original],"safeSearch":"false"}
            headers = {
                'x-rapidapi-host': self.rapid_api_host,
                'x-rapidapi-key': self.rapid_api_key
            }

            all_articles = requests.request("GET", url, headers=headers, params=querystring).json()
            iterationsLeft = (all_articles['totalCount'] - pageNum*50) // 50
            articles = all_articles['value']

            # creating new article object
            for article in articles:
                try:
                    info = str(article['title']) + " " + str(article['description']) + " " + str(article['body'])
                    if len(info.split(" ")) > 21:
                        classifications = self.__classify(info)
                        if len(set(classifications).intersection(set(self.approved_classifications))) != 0:
                            individual_article = {}
                            individual_article['date'] = article['datePublished'][:10]

                            info = info.replace("'","") 
                            individual_article['text'] = info

                            formatted_articles.append(individual_article)

                            scraped = scraped + 1;
                except:
                    continue
    #         print(json.dumps(self.formatted_articles, indent=4, sort_keys=True))
            if iterationsLeft <= 0:
                print("Finished Scraping")
                break
            else:
                time.sleep(5)
                print("Scraped Page: ", pageNum, ' Items: ', scraped)
                pageNum += 1
            
        self.formatted_articles = formatted_articles
    
    
    # Sentiment analysis using google cloud
    def __sentiment_analysis(self):
        client = self.client
        tone_analyzed_articles = {'date':[],'score':[]}
        formatted_articles = self.formatted_articles.copy()

        # The text to analyze
        for i in range(len(formatted_articles)):
            analyzed_article = {}
            article = formatted_articles[i]
            text = article['text']

            document = types.Document(content = text, type = enums.Document.Type.PLAIN_TEXT)

            # Detects the sentiment of the text
            sentiment = client.analyze_sentiment(document=document).document_sentiment
            
            # Appends to dataframe
            tone_analyzed_articles['date'].append(article['date'])
            tone_analyzed_articles['score'].append(sentiment.score)
        
#         print(json.dumps(tone_analyzed_articles, indent=4, sort_keys=True))        
        
        return pd.DataFrame(tone_analyzed_articles)
        
    # Dump the dataframe
    def __dump_df(self, df):
        df.to_csv(r'data/articles/' + self.original + '.csv')
        
        
    # Create the dataframe and save it
    def df_create(self):
        self.__retrieve_articles()
        df = self.__sentiment_analysis()
        self.__dump_df(df)
        return df

## Input Parameter

In [3]:
news_api_key = keys['news']
json_google_cloud = keys['json_google_cloud']
xecd_client = keys['xe']
rapid_api_host = keys['rapid_api_host']
rapid_api_key = keys['rapid_api_key']

In [4]:
print("Enter the original currency: ")
# cur_original = input()
print("Enter the new currency: ")
# cur_return = input()

Enter the original currency: 
Enter the new currency: 


In [5]:
cur_original = 'USD'
cur_return = 'CAD'

## Create Dataframe

In [6]:
created_df = Create_DF(cur_original, cur_return, news_api_key, json_google_cloud, xecd_client,rapid_api_host,rapid_api_key)

In [7]:
df = created_df.df_create()

Scraped Page:  1  Items:  9
Scraped Page:  2  Items:  15
Scraped Page:  3  Items:  14
Scraped Page:  4  Items:  19
Scraped Page:  5  Items:  15
Scraped Page:  6  Items:  10
Scraped Page:  7  Items:  12
Scraped Page:  8  Items:  13
Scraped Page:  9  Items:  4
Scraped Page:  10  Items:  9
Scraped Page:  11  Items:  20
Scraped Page:  12  Items:  19
Scraped Page:  13  Items:  17
Scraped Page:  14  Items:  17
Scraped Page:  15  Items:  22
Scraped Page:  16  Items:  19
Scraped Page:  17  Items:  16
Scraped Page:  18  Items:  17
Scraped Page:  19  Items:  14
Scraped Page:  20  Items:  13
Scraped Page:  21  Items:  0
Scraped Page:  22  Items:  0
Scraped Page:  23  Items:  0
Scraped Page:  24  Items:  0
Scraped Page:  25  Items:  0
Scraped Page:  26  Items:  0
Scraped Page:  27  Items:  0
Scraped Page:  28  Items:  0
Finished Scraping


In [8]:
df.shape

(294, 2)