# Download Data from NY Times API and Yahoo Package

- This script will download the data from the NY Times API and Yahoo! Finance.
- We will use the article data to run sentiment analysis and compare the results with the stock price performance.

### 1. Import Packages

In [1]:
# Lets us talk to other servers on the web
import requests

# APIs spit out data in JSON
import json

# Handling dates and times
from datetime import datetime

# DataFrames!
import pandas as pd
import numpy as np

# Data visualization
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sb

# operating system commands
import os

# NYtimes Article API
from nytimesarticle import articleAPI

import time

# Yahoo package
from yahoo_historical import Fetcher

# Sentiment Analysis Package
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
from textblob import Blobber

### 2. Define functions

1. **parse_articles**: parses one page of the API response and returns it as a list of article dictionaries
2. **get_articles**: downloads articles from the NY Times API and uses the above function to parse each page of results

In [2]:
def parse_articles(articles):
    """Flatten one page of NY Times Article Search results into a list of dicts.

    Parameters
    ----------
    articles : dict
        Decoded JSON of one API response. The documents are read from
        ``articles['response']['docs']``.

    Returns
    -------
    list of dict
        One dictionary per document with the keys ``id``, ``abstract``,
        ``doctype``, ``headline``, ``paragraph``, ``desk``, ``date``,
        ``snippet``, ``source``, ``type``, ``url``, ``word_count``,
        ``organization``, ``locations`` and ``subjects``. The optional
        fields (``abstract``, ``snippet``, ``source``, ``type``) default to
        an empty string when the document does not carry them.

    Raises
    ------
    KeyError
        If the response has no ``response``/``docs`` entry or a document
        lacks one of the mandatory fields.
    """
    news = []

    for doc in articles['response']['docs']:
        dic = {}

        # General information. Insertion order is kept on purpose so the
        # resulting records always list their fields in the same sequence.
        dic['id'] = doc['_id']
        dic['abstract'] = doc.get('abstract', '')
        dic['doctype'] = doc['document_type']
        dic['headline'] = doc['headline']['main']
        dic['paragraph'] = doc['lead_paragraph']
        dic['desk'] = doc['news_desk']
        dic['date'] = doc['pub_date'][0:10]  # cutting time of day.
        dic['snippet'] = doc.get('snippet', '')
        dic['source'] = doc.get('source', '')
        dic['type'] = doc.get('type_of_material', '')
        dic['url'] = doc['web_url']
        dic['word_count'] = doc['word_count']

        # Sort the keywords into their three buckets in a single pass.
        # The checks are independent substring tests (not elif) so a keyword
        # name matching several markers lands in every matching bucket.
        organizations = []
        locations = []
        subjects = []
        for keyword in doc['keywords']:
            name = keyword['name']
            if 'organizations' in name:
                organizations.append(keyword['value'])
            if 'glocations' in name:
                locations.append(keyword['value'])
            if 'subject' in name:
                subjects.append(keyword['value'])
        dic['organization'] = organizations
        dic['locations'] = locations
        dic['subjects'] = subjects

        news.append(dic)

    return news

In [3]:
def get_articles(begindate, enddate, query, org, apikey):
    """Download and parse the result pages of one Article Search query.

    Parameters
    ----------
    begindate, enddate : str
        Bounds of the search period, inserted verbatim into the
        ``begin_date`` / ``end_date`` URL parameters (the notebook passes
        them as ``'YYYYMMDD'`` strings).
    query : str
        Search term, inserted verbatim into the ``q`` URL parameter.
    org : str
        Organization used in the ``fq=organizations:("...")`` filter. It is
        inserted verbatim, so it must already be URL-encoded
        (e.g. ``'Apple%20Inc'``).
    apikey : str
        NY Times API key.

    Returns
    -------
    list of dict
        The parsed articles of all downloaded pages, oldest first
        (the request asks for ``sort=oldest``).
    """
    query_url = 'https://api.nytimes.com/svc/search/v2/articlesearch.json?'
    all_articles = []

    # NYT limits the pager to the first 100 pages, so longer result sets
    # have to be fetched with several calls over shorter periods.
    for page in range(0, 100):
        query_url_full = (
            query_url
            + 'begin_date=' + begindate
            + '&end_date=' + enddate
            + '&fq=organizations%3A(%22' + org + '%22)'
            + '&page=' + str(page)
            + '&q=' + query
            + '&sort=oldest&api-key=' + apikey
        )
        articles = requests.get(url=query_url_full).json()
        articles = parse_articles(articles)

        if not articles:
            # An empty page means the result set is exhausted; requesting
            # (and waiting for) the remaining pages would add nothing.
            break

        all_articles.extend(articles)

        # Pause between requests -- presumably to stay within the API's
        # rate limit; confirm against the current NYT quota.
        time.sleep(7)

    return all_articles

### 3. Download articles

**Apple**

- We will download all the articles on Apple Inc from January 1, 2014 to April 30, 2019
- Set query 'Apple' and organization 'Apple Inc'
- Since the API limits how many articles a single query can return, we will run the code in several steps
- Make sure to define api key

In [5]:
# Replace the placeholder with a valid NY Times API key before running.
apikey = '<API-KEY>'

# First round: query the whole period.
apple_news = get_articles('20140101', '20190430', 'Apple', 'Apple%20Inc', apikey)

# Second round: restart from the publication date of the last article
# received, converted from 'YYYY-MM-DD' to the 'YYYYMMDD' form used above.
last_date = apple_news[-1]['date']
last_date2 = (
    datetime
    .strptime(last_date, '%Y-%m-%d')
    .strftime('%Y%m%d')
)
apple_news2 = get_articles(last_date2, '20190430', 'Apple', 'Apple%20Inc', apikey)

**If another round is needed, run the code below**
- Check whether the last date is 2019-04-30.

In [6]:
# Publication date of the last article returned by the second round.
last_date = apple_news2[-1]['date']
print(last_date)
# Template for a third round -- uncomment if the date printed above is
# earlier than the end of the period:
#last_date2 = datetime.strptime(last_date, '%Y-%m-%d').strftime('%Y%m%d')
#apple_news3 = get_articles(last_date2, '20190430', 'Apple', 'Apple%20Inc', apikey)

2019-04-30


## Combine News 
- Combine all the articles and convert the data set into a pandas DataFrame

In [7]:
# Combine all the news.
# The second round restarts on the last date of the first round, so any
# article whose id is already in apple_news is skipped. The ids are
# collected into a set once instead of rebuilding a list for every article.
seen_ids = {article['id'] for article in apple_news}
apple_news_total = apple_news + [
    article for article in apple_news2 if article['id'] not in seen_ids
]
# If apple_news3 exists, also run the line below
#apple_news_total = apple_news_total + [i for i in apple_news3 if i['id'] not in [j['id'] for j in apple_news2]]

# Convert it into a pandas DataFrame (one row per article)
news_df = pd.DataFrame(apple_news_total)
news_df.head()

Unnamed: 0,abstract,date,desk,doctype,headline,id,locations,organization,paragraph,snippet,source,subjects,type,url,word_count
0,Apple’s iCloud data service works best with Ap...,2014-01-07,Business,article,Taking Along iCloud Calendars,5c9e779049f0eacbf109e38f,[],"[Apple Inc, iCloud, Samsung Group, Google Inc]",Q. Our family life depends on Apple shared cal...,Apple’s iCloud data service works best with Ap...,The New York Times,"[Android (Operating System), Smartphones]",Question,https://www.nytimes.com/2014/01/09/technology/...,634
1,"In March, Apple and Samsung Electronics are sc...",2014-01-09,,article,Apple and Samsung Chiefs to Meet a Mediator Ah...,5ca0ac6b49f0eacbf1f508e3,[],"[Apple Inc, Samsung Electronics Co]","In March, Apple and Samsung Electronics are sc...","In March, Apple and Samsung Electronics are sc...",The New York Times,"[Inventions and Patents, Smartphones, Suits an...",News,https://bits.blogs.nytimes.com/2014/01/09/appl...,146
2,Last year was the first in which personal comp...,2014-01-10,,article,"For PC Makers, the Good News on 2013 Is That I...",5ca0b20949f0eacbf1f6da72,[],"[Apple Inc, Dell Inc, Gartner Inc, Hewlett-Pac...",The two leading analysis companies tracking th...,Last year was the first in which personal comp...,The New York Times,"[Computers and the Internet, Desktop Computers]",News,https://bits.blogs.nytimes.com/2014/01/09/for-...,556
3,A summary of differences in the latest Windows...,2014-01-10,Business,article,"Changes in Windows 8.1, With Skype Replacing M...",5c9e779049f0eacbf109e391,[],"[Apple Inc, Microsoft Corporation, Skype Techn...",Meeting the New Messenger in Windows 8.1,A summary of differences in the latest Windows...,The New York Times,"[Windows (Operating System), Mobile Applicatio...",Question,https://www.nytimes.com/2014/01/10/technology/...,445
4,To draw young buyers and increase its market s...,2014-01-13,Business,article,Cost of Cool in India? An iPhone,5c9e916049f0eacbf10fe422,[India],[Apple Inc],"BANGALORE, India — After deliberating for mont...",To draw young buyers and increase its market s...,The New York Times,"[iPhone, Prices (Fares, Fees and Rates), Smart...",News,https://www.nytimes.com/2014/01/13/business/in...,1248


## Run the Sentiment Analysis on the Text

### 1. Use Pattern Analyzer
- Use Textblob package to compute sentiment scores (polarity and subjectivity)
- Default setting is to use Pattern Analyzer

In [8]:
# Run the default (Pattern) analyzer once per text and keep the resulting
# sentiment tuple, so polarity and subjectivity are read from the same
# analysis instead of analysing every text twice.
abstract_sentiments = [TextBlob(text).sentiment for text in news_df['abstract']]
headline_sentiments = [TextBlob(text).sentiment for text in news_df['headline']]
snippet_sentiments = [TextBlob(text).sentiment for text in news_df['snippet']]
paragraph_sentiments = [TextBlob(text).sentiment for text in news_df['paragraph']]

# Compute Sentiment Polarity Score
news_abstract_polarity = [sentiment.polarity for sentiment in abstract_sentiments]
news_headline_polarity = [sentiment.polarity for sentiment in headline_sentiments]
news_snippet_polarity = [sentiment.polarity for sentiment in snippet_sentiments]
news_paragraph_polarity = [sentiment.polarity for sentiment in paragraph_sentiments]

# Compute Sentiment Subjectivity Score
news_abstract_subj = [sentiment.subjectivity for sentiment in abstract_sentiments]
news_headline_subj = [sentiment.subjectivity for sentiment in headline_sentiments]
news_snippet_subj = [sentiment.subjectivity for sentiment in snippet_sentiments]
news_paragraph_subj = [sentiment.subjectivity for sentiment in paragraph_sentiments]

### 2. Use Naive Bayes Analyzer
- Change the analyzer to Naive Bayes Analyzer
- This uses a Naive Bayes classifier trained on the movie review data in the NLTK package
- Use the Blobber class so that the analyzer is shared across texts and is not retrained for every text it processes

In [9]:
# One shared Blobber so every text is scored by the same analyzer instance.
blobber = Blobber(analyzer=NaiveBayesAnalyzer())


def _nba_positive_scores(texts):
    """Return the Naive Bayes positive-class probability of each text."""
    # The analyzer's sentiment tuple is (classification, p_pos, p_neg);
    # p_pos is the element at index 1.
    return [blobber(text).sentiment.p_pos for text in texts]


news_abstract_polarity_nba = _nba_positive_scores(news_df['abstract'])
news_headline_polarity_nba = _nba_positive_scores(news_df['headline'])
news_snippet_polarity_nba = _nba_positive_scores(news_df['snippet'])
news_paragraph_polarity_nba = _nba_positive_scores(news_df['paragraph'])

## Add Sentiment Scores to the Dataframe

In [10]:
# Score columns in the order they are appended to the right of the frame.
sentiment_columns = [
    ('abstract_polarity', news_abstract_polarity),
    ('headline_polarity', news_headline_polarity),
    ('snippet_polarity', news_snippet_polarity),
    ('paragraph_polarity', news_paragraph_polarity),
    ('abstract_subjectivity', news_abstract_subj),
    ('headline_subjectivity', news_headline_subj),
    ('snippet_subjectivity', news_snippet_subj),
    ('paragraph_subjectivity', news_paragraph_subj),
    ('abstract_polarity_nba', news_abstract_polarity_nba),
    ('headline_polarity_nba', news_headline_polarity_nba),
    ('snippet_polarity_nba', news_snippet_polarity_nba),
    ('paragraph_polarity_nba', news_paragraph_polarity_nba),
]

for column_name, scores in sentiment_columns:
    # insert() is kept (rather than plain assignment) so re-running the cell
    # on a frame that already has the column fails instead of overwriting it.
    news_df.insert(loc=len(news_df.columns), column=column_name, value=scores)

## Export the Data Frame to CSV file

In [25]:
# Save the articles and their scores as UTF-8 CSV with a header row and
# without the row index.
news_df.to_csv(
    'news_data.csv',
    index=False,
    header=True,
    encoding='utf-8',
)

## Download Stock Prices from Yahoo Finance

In [4]:
# Daily AAPL history for the same period as the articles;
# the dates are passed as [year, month, day].
data = Fetcher("AAPL", [2014, 1, 1], [2019, 4, 30])
shareprice_df = data.getHistorical()

# Use Adjusted Close Price: keep columns 0 and 5 by position, which are
# expected to be Date and Adj Close -- verify against the fetched frame.
close_price = shareprice_df.iloc[:, [0, 5]]

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2014-01-02,79.382858,79.575714,78.860001,79.018570,66.707436,58671200
1,2014-01-03,78.980003,79.099998,77.204285,77.282860,65.242165,98116900
2,2014-01-06,76.778572,78.114288,76.228569,77.704285,65.597923,103152700
3,2014-01-07,77.760002,77.994286,76.845711,77.148575,65.128777,79302300
4,2014-01-08,76.972855,77.937141,76.955711,77.637146,65.541245,64632400
5,2014-01-09,78.114288,78.122856,76.478569,76.645714,64.704292,69787200
6,2014-01-10,77.118568,77.257141,75.872856,76.134285,64.272545,76244000
7,2014-01-13,75.701431,77.500000,75.697144,76.532860,64.609001,94623200
8,2014-01-14,76.888573,78.104286,76.808571,78.055717,65.894608,83140400
9,2014-01-15,79.074287,80.028572,78.808571,79.622856,67.217575,97909700


In [8]:
# Save the price history with a header row; the row index is written too
# (pandas default), so it appears as the first, unnamed CSV column.
shareprice_df.to_csv(
    'share_price_data.csv',
    header=True,
)