# Classifying Web Articles using the NYTimes API

In this lecture we will retrieve 100 articles from the Arts and Sports news desks (50 documents in each category) and train a classifier to identify the lexicon specific to each category, so that new documents can be categorized automatically.

1. Generate Training Set

In [4]:
import time
import json
from nytimesarticle import articleAPI

# NYT Article Search API client; get_articles() issues its searches through it.
# NOTE(review): the API key is hard-coded in source -- consider loading it from
# an environment variable or a config file kept out of version control.
api = articleAPI('65e6563256a340f5842cf5b6af85c8d5')
# Presumably the destination folder for the generated training files
# (Windows-style path with a trailing separator) -- confirm against callers.
trainingFolder = 'C:\\tmp\\'
sampleSize = 5 #Number of pages included. Normally each page contains 10 articles.

#Functions are a modification of code made available by Rochelle Terman
#http://dlab.berkeley.edu/blog/scraping-new-york-times-articles-python-tutorial

def parse_articles(articles):
    '''
    Parse a NYT Article Search API response into a list of dictionaries.

    articles: decoded JSON response; the documents are read from
              articles['response']['docs'].

    Returns a list with one dictionary per document containing the keys
    'id', 'headline', 'desk', 'date' (first 10 characters of 'pub_date'),
    'section', 'source', 'type', 'url', 'word_count', 'locations' and
    'subjects'. 'abstract' and 'snippet' are only present when the document
    provides a non-null value for them.

    'abstract', 'headline' and 'snippet' are UTF-8 encoded (i.e. they are
    bytes objects when running on Python 3).

    Raises KeyError if a document lacks one of the mandatory fields.
    '''
    news = []
    for doc in articles['response']['docs']:
        dic = {}
        dic['id'] = doc['_id']
        # 'abstract' and 'snippet' are optional: tolerate both a missing key
        # and an explicit null instead of failing with KeyError.
        abstract = doc.get('abstract')
        if abstract is not None:
            dic['abstract'] = abstract.encode("utf8")
        dic['headline'] = doc['headline']['main'].encode("utf8")
        dic['desk'] = doc['news_desk']
        dic['date'] = doc['pub_date'][0:10] # cutting time of day.
        dic['section'] = doc['section_name']
        snippet = doc.get('snippet')
        if snippet is not None:
            dic['snippet'] = snippet.encode("utf8")
        dic['source'] = doc['source']
        dic['type'] = doc['type_of_material']
        dic['url'] = doc['web_url']
        dic['word_count'] = doc['word_count']
        # A missing or null keyword list is treated as "no keywords".
        keywords = doc.get('keywords') or []
        # locations
        dic['locations'] = [kw['value'] for kw in keywords
                            if 'glocations' in kw['name']]
        # subject
        dic['subjects'] = [kw['value'] for kw in keywords
                           if 'subject' in kw['name']]
        news.append(dic)
    return news

def get_articles(sampleSize,query,category,year):
    '''
    Download and parse NYT articles for one query, news desk and year.

    sampleSize: number of result pages to request (pages 0 .. sampleSize-1).
    query:      search text passed to the API (e.g. 'Amnesty International').
    category:   value used for the 'news_desk' filter.
    year:       year as a string (e.g. '1980'); the search covers
                January 1st to December 31st of that year, oldest first.

    Returns a single list holding the parsed article dictionaries of all
    requested pages, in page order.
    '''
    collected = []
    # NYT limits the pager to the first 100 pages, but results rarely
    # extend beyond that anyway.
    for page_number in range(sampleSize):
        print ('generating %s block ...' % page_number)
        response = api.search(
            q = query,
            fq = {'news_desk':category},
            begin_date = year + '0101',
            end_date = year + '1231',
            sort ='oldest',
            page = str(page_number),
        )
        collected.extend(parse_articles(response))
        # Pause for one second between requests to avoid the
        # "Exceeded Request Quota" error.
        time.sleep(1)
    return collected

def testAPI(query,category):
    '''
    Smoke-test the API: fetch a single result page for 2016 and print it.

    query:    search text passed on to get_articles.
    category: news desk passed on to get_articles.
    '''
    arts = get_articles(1,query,category,'2016')
    print (arts)

Testing the API

In [None]:
# Fetch one result page for the query "the" on the Arts desk (2016) and print it.
testAPI("the","Arts")

Utility Methods to generate training sets

In [1]:
def generateTrainingSet(size,category,year):
    '''
    Write the abstracts of NYT articles from one news desk to a training file.

    size:     number of result pages to fetch (passed to get_articles).
    category: news desk to query; its lower-cased name is also used as the
              suffix of the output file name.
    year:     year as a string (e.g. '2016').

    The articles are retrieved with the generic query "the". One abstract is
    written per line to <trainingFolder>training_<category in lower case>,
    encoded as UTF-8; articles without an abstract are skipped and tabs
    inside an abstract are replaced by spaces.
    '''
    # Local import: io.open accepts `encoding` on both Python 2 and Python 3.
    import io

    articles = get_articles(size,"the",category,year)
    # Use the shared output folder instead of repeating the literal path.
    fileName = trainingFolder + "training_" + category.lower()

    # The context manager closes the file even if a write fails.
    with io.open(fileName, 'w', encoding='utf8') as f:
        for doc in articles:
            if 'abstract' not in doc:
                continue
            abstract = doc['abstract']
            # parse_articles stores UTF-8 encoded abstracts; decode them so
            # that the text operations below work on Python 3 as well.
            if isinstance(abstract, bytes):
                abstract = abstract.decode('utf8')
            #Remove tabs
            abstract = abstract.replace(u'\t', u' ')
            f.write(u'%s\n' % abstract)

Generate Training sets for ARTS and SPORTS

In [None]:
# Build the two training files: 5 result pages each from the Arts and the
# Sports news desks for 2016.
generateTrainingSet(5,"Arts","2016")
generateTrainingSet(5,"Sports","2016")