In this section, the selected APIs are used to load data into our Elasticsearch (ES) database.

We'll use Docker to run Elasticsearch and fill in the configuration details in the .env file.

- NYT Times Newswire API
- NYT-Books API
- GoodReads

// end points
// list of book categories : https://api.nytimes.com/svc/books/v3/lists/names.json
// number of reviewed books : https://api.nytimes.com/svc/books/v3/lists/best-sellers/history.json
// Goodreads : https://www.goodreads.com/book/review_counts.json
//

// We need to establish the following:
// 1. the connection to the ES container
//
// 2. loading the API responses from the two APIs
// 2.1 for each API, define the loops and requirements


1. Connection to the container // Windows installation

1. Install Docker: Download and install Docker Desktop for Windows from the official Docker website. Follow the installation instructions specific to your Windows version.
2. Launch Docker: Once Docker is installed, launch Docker Desktop from the Start menu or the desktop shortcut. Make sure it is running before proceeding to the next steps.
3. Open a command prompt: Open the Command Prompt or PowerShell on your Windows machine.
4. Pull the Elasticsearch Docker image: Use the following command to pull the official Elasticsearch Docker image:

```bash
docker pull docker.elastic.co/elasticsearch/elasticsearch:8.8.1
```

5. Create and start a Docker container:
docker run -d --name elasticsearch_container -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch:8.8.1

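6. Verify the Elasticsearch installation: Open your web browser and navigate to http://localhost:9200.
   Note: Elasticsearch 8.x enables TLS and authentication by default, so the plain-HTTP address only responds when security is disabled (for local testing, add -e "xpack.security.enabled=false" to the docker run command); otherwise use https://localhost:9200 with the credentials of the elastic user.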


Once the connection is established, we want to analyse each API in detail and load the data one API at a time, without overloading the NYT or Goodreads APIs.

To interact with ES from Python, you first need to install the client: pip install elasticsearch


In [None]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from dotenv import load_dotenv


# Build the mapping for the NYT Newswire index, then create the index in ES.

# A full URL (scheme included) is accepted by both the 7.x and 8.x Python
# clients; the 8.x client rejects a host dict that has no 'scheme' entry.
# NOTE(review): Elasticsearch 8.x images enable TLS/authentication by default;
# this plain-HTTP URL assumes security is disabled on the local container --
# confirm against how the container was started.
es = Elasticsearch('http://localhost:9200')

# Elasticsearch index names must be lowercase; the previous 'NYT-newswire'
# is rejected with invalid_index_name_exception.
index_name = 'nyt-newswire'

# Field mapping for one Newswire article. Facets and identifiers are exact
# match (`keyword`), free text is analysed (`text`), and `multimedia` is a
# list of objects that must stay individually queryable (`nested`).
newswire_index_body = {
    'mappings': {
        'properties': {
            'abstract': {'type': 'text'},
            'byline': {'type': 'text'},
            'created_date': {'type': 'date'},
            'des_facet': {'type': 'keyword'},
            'first_published_date': {'type': 'date'},
            'geo_facet': {'type': 'keyword'},
            'item_type': {'type': 'keyword'},
            'kicker': {'type': 'text'},
            'material_type_facet': {'type': 'keyword'},
            'multimedia': {
                'type': 'nested',
                'properties': {
                    'caption': {'type': 'text'},
                    'copyright': {'type': 'text'},
                    'format': {'type': 'keyword'},
                    'height': {'type': 'integer'},
                    'subtype': {'type': 'keyword'},
                    'type': {'type': 'keyword'},
                    'url': {'type': 'text'},
                    'width': {'type': 'integer'}
                }
            },
            'org_facet': {'type': 'keyword'},
            'per_facet': {'type': 'keyword'},
            'published_date': {'type': 'date'},
            'section': {'type': 'keyword'},
            'slug_name': {'type': 'keyword'},
            'source': {'type': 'text'},
            'subheadline': {'type': 'text'},
            'subsection': {'type': 'keyword'},
            'thumbnail_standard': {'type': 'text'},
            'title': {'type': 'text'},
            'updated_date': {'type': 'date'},
            'uri': {'type': 'keyword'},
            'url': {'type': 'text'}
        }
    }
}

# Create the index only when it is missing, so that re-running this cell does
# not fail with resource_already_exists_exception.
if es.indices.exists(index=index_name):
    print('Index already exists.')
else:
    response = es.indices.create(index=index_name, body=newswire_index_body)

    # ES acknowledges the request once the index creation has been accepted.
    if response['acknowledged']:
        print('Index created successfully.')
    else:
        print('Failed to create index.')

In [None]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from dotenv import load_dotenv

##########################################################################
# Mapping and index for the NYT Books API (best-seller lists)
##########################################################################
# TODO: the mapping below was flagged by its author as "To be fixed".

# A full URL (scheme included) is accepted by both the 7.x and 8.x Python
# clients; the 8.x client rejects a host dict that has no 'scheme' entry.
# NOTE(review): Elasticsearch 8.x images enable TLS/authentication by default;
# this plain-HTTP URL assumes security is disabled on the local container --
# confirm against how the container was started.
es = Elasticsearch('http://localhost:9200')

# Elasticsearch index names must be lowercase; the previous 'NYT-booksList'
# is rejected with invalid_index_name_exception.
index_name = 'nyt-books-list'

# Field mapping for one entry of a best-seller list. `book_details`, `isbns`
# and `reviews` are lists of objects and are therefore mapped as `nested`.
books_list_index_body = {
    'mappings': {
        'properties': {
            'amazon_product_url': {'type': 'keyword'},
            'asterisk': {'type': 'integer'},
            'bestsellers_date': {'type': 'date'},
            'book_details': {
                'type': 'nested',
                'properties': {
                    'age_group': {'type': 'keyword'},
                    'author': {'type': 'keyword'},
                    'contributor': {'type': 'keyword'},
                    'contributor_note': {'type': 'text'},
                    'description': {'type': 'text'},
                    'price': {'type': 'float'},
                    'primary_isbn10': {'type': 'keyword'},
                    'primary_isbn13': {'type': 'keyword'},
                    'publisher': {'type': 'keyword'},
                    'title': {'type': 'text'}
                }
            },
            'dagger': {'type': 'integer'},
            'display_name': {'type': 'keyword'},
            'isbns': {
                'type': 'nested',
                'properties': {
                    'isbn10': {'type': 'keyword'},
                    'isbn13': {'type': 'keyword'}
                }
            },
            'list_name': {'type': 'keyword'},
            'published_date': {'type': 'date'},
            'rank': {'type': 'integer'},
            'rank_last_week': {'type': 'integer'},
            'reviews': {
                'type': 'nested',
                'properties': {
                    'article_chapter_link': {'type': 'keyword'},
                    'book_review_link': {'type': 'keyword'},
                    'first_chapter_link': {'type': 'keyword'},
                    'sunday_review_link': {'type': 'keyword'}
                }
            },
            'weeks_on_list': {'type': 'integer'}
        }
    }
}

# Create the index only when it is missing, so that re-running this cell does
# not fail with resource_already_exists_exception.
if es.indices.exists(index=index_name):
    print('Index already exists.')
else:
    response = es.indices.create(index=index_name, body=books_list_index_body)

    # ES acknowledges the request once the index creation has been accepted.
    if response['acknowledged']:
        print('Index created successfully.')
    else:
        print('Failed to create index.')

In [None]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from dotenv import load_dotenv

##########################################################################
# Mapping and index for the NYT Books API -- best sellers history
##########################################################################
# TODO: the mapping below was flagged by its author as "To be fixed".

# A full URL (scheme included) is accepted by both the 7.x and 8.x Python
# clients; the 8.x client rejects a host dict that has no 'scheme' entry.
# NOTE(review): Elasticsearch 8.x images enable TLS/authentication by default;
# this plain-HTTP URL assumes security is disabled on the local container --
# confirm against how the container was started.
es = Elasticsearch('http://localhost:9200')

# Elasticsearch index names must be lowercase; the previous
# 'NYT-books-best-sellers' is rejected with invalid_index_name_exception.
index_name = 'nyt-books-best-sellers'

# Field mapping for one best-seller book. `isbns`, `ranks_history` and
# `reviews` are lists of objects and are therefore mapped as `nested`.
BS_books_index_body = {
    'mappings': {
        'properties': {
            'title': {'type': 'text'},
            'description': {'type': 'text'},
            'contributor': {'type': 'text'},
            'author': {'type': 'text'},
            'contributor_note': {'type': 'text'},
            'price': {'type': 'float'},
            'age_group': {'type': 'keyword'},
            'publisher': {'type': 'text'},
            'isbns': {
                'type': 'nested',
                'properties': {
                    'isbn10': {'type': 'keyword'},
                    'isbn13': {'type': 'keyword'},
                }
            },
            'ranks_history': {
                'type': 'nested',
                'properties': {
                    'primary_isbn10': {'type': 'keyword'},
                    'primary_isbn13': {'type': 'keyword'},
                    'rank': {'type': 'integer'},
                    'list_name': {'type': 'text'},
                    'display_name': {'type': 'text'},
                    'published_date': {'type': 'date'},
                    'bestsellers_date': {'type': 'date'},
                    'weeks_on_list': {'type': 'integer'},
                    # NOTE(review): a null `null_value` appears to be the
                    # Elasticsearch default (no substitution for missing
                    # values), so this setting is likely a no-op -- confirm.
                    'ranks_last_week': {'type': 'integer', 'null_value': None},
                    'asterisk': {'type': 'integer'},
                    'dagger': {'type': 'integer'},
                }
            },
            'reviews': {
                'type': 'nested',
                'properties': {
                    'book_review_link': {'type': 'keyword'},
                    'first_chapter_link': {'type': 'keyword'},
                    'sunday_review_link': {'type': 'keyword'},
                    'article_chapter_link': {'type': 'keyword'},
                }
            },
        }
    }
}

# Create the index only when it is missing, so that re-running this cell does
# not fail with resource_already_exists_exception.
if es.indices.exists(index=index_name):
    print('Index already exists.')
else:
    response = es.indices.create(index=index_name, body=BS_books_index_body)

    # ES acknowledges the request once the index creation has been accepted.
    if response['acknowledged']:
        print('Index created successfully.')
    else:
        print('Failed to create index.')

The following cells send the API requests and load the results into the Elasticsearch database.


In [None]:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from dotenv import load_dotenv
import os
import pprint
import requests
import time
import json
pp = pprint.PrettyPrinter(indent=4)
load_dotenv()
api_key = os.getenv("API_KEY")
if not api_key:
    # Fail early with a clear message instead of an opaque KeyError on the
    # error payload the API would return for a missing key.
    raise RuntimeError("API_KEY is not set; add it to the .env file.")

########################################################
########################################################
# NEWSWIRE API
########################################################
########################################################

# Query parameters shared by every Newswire call. Passing them through
# `params` lets requests build and encode the query string.
auth_params = {'api-key': api_key}
# Seconds to wait for the server; without a timeout a call can block forever.
request_timeout = 10

# Get the list of sections needed to call the Newswire API.
reqSections = requests.get(
    'https://api.nytimes.com/svc/news/v3/content/section-list.json',
    params=auth_params,
    timeout=request_timeout,
)
# Surface HTTP errors (bad key, rate limit, ...) as an explicit HTTPError.
reqSections.raise_for_status()
sectionsList = [item['section'] for item in reqSections.json()['results']]
print(sectionsList)


########################################################
########################################################
# for tests ONLY ###
########################################################
# Only the first two sections are fetched while the notebook is being tested.
sectionsList = sectionsList[:2]
print(sectionsList)
########################################################
########################################################


# Fetch the latest articles of each section, one request per section.
for section in sectionsList:

    # Request the API
    content = requests.get(
        f'https://api.nytimes.com/svc/news/v3/content/all/{section}.json',
        params=auth_params,
        timeout=request_timeout,
    )
    content.raise_for_status()

    res = content.json()
    pp.pprint(res)
    # Articles that are meant to be indexed into ES (see the draft below).
    docs = res['results']

    # Draft of the ES bulk-indexing step, kept as an inert string until it is
    # fixed: it reads `index_name` before assigning it and checks `response`,
    # which the commented-out bulk call never sets.
    """

    # Prepare documents for indexing
    actions = [
        {
            '_index': index_name,
            '_source': doc
         }
        for doc in docs
    ]



    # Define an index and document type for ES
    index_name = 'NYT-newswire'

    # Bulk index the documents
    headers = {
        'Content-Type': 'application/json'
    }

    index_body = {
        'settings': {
            'number_of_shards': 1,
            'number_of_replicas': 0
        },
        'mappings': {
            'properties': {
                'field1': {'type': 'text'},
                'field2': {'type': 'keyword'}
            }
        }
    }




    #response = es.bulk(index=index_name, body=actions, headers=headers)



    # Check the response
    if response['result'] == 'created':
        print(f'{section} saved successfully')
    else:
        print('Failed to save content.')

    """

    # Throttle between calls so the NYT API is not overloaded. This pause used
    # to sit inside the draft string above and therefore never ran.
    ######################################################
    time.sleep(2) ##### TO MODIFY ACCORDING API ALLOWANCE
    ######################################################


In [7]:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from dotenv import load_dotenv
import os
import pprint
import requests
import time
import json
pp = pprint.PrettyPrinter(indent=4)
load_dotenv()
api_key = os.getenv("API_KEY")
if not api_key:
    # Fail early with a clear message instead of an opaque KeyError on the
    # error payload the API would return for a missing key.
    raise RuntimeError("API_KEY is not set; add it to the .env file.")

########################################################
########################################################
# BOOKS API
########################################################
########################################################

# Seconds to wait for the server; without a timeout a call can block forever.
request_timeout = 10

# Get the names of the best-seller lists needed to call the Books API.
reqBookList = requests.get(
    "https://api.nytimes.com/svc/books/v3/lists/names.json",
    params={'api-key': api_key},
    timeout=request_timeout,
)
# Surface HTTP errors (bad key, rate limit, ...) as an explicit HTTPError.
reqBookList.raise_for_status()
booksList = [item['list_name'] for item in reqBookList.json()['results']]


########################################################
########################################################
# for tests ONLY ###
########################################################
# Only the first two lists are fetched while the notebook is being tested.
booksList = booksList[:2]
print(booksList)
########################################################
########################################################


# Fetch the content of each best-seller list, one request per list.
for blist in booksList:

    # Request the API. The list name goes through `params` so that spaces and
    # other special characters in it are URL-encoded.
    content = requests.get(
        "https://api.nytimes.com/svc/books/v3/lists.json",
        params={'list': blist, 'api-key': api_key},
        timeout=request_timeout,
    )
    content.raise_for_status()

    res = content.json()
    pp.pprint(res)
    # Entries that are meant to be indexed into ES (see the draft below).
    docs = res['results']

    # Draft of the ES bulk-indexing step, kept as an inert string until it is
    # fixed: it was copied from the Newswire cell (index name, `section`),
    # reads `index_name` before assigning it, and checks `response`, which the
    # commented-out bulk call never sets.
    """    
    # Prepare documents for indexing
    actions = [
        {
            '_index': index_name,
            '_source': doc
         }
        for doc in docs
    ]



    # Define an index and document type for ES
    index_name = 'NYT-newswire'

    # Bulk index the documents
    headers = {
        'Content-Type': 'application/json'
    }

    index_body = {
        'settings': {
            'number_of_shards': 1,
            'number_of_replicas': 0
        },
        'mappings': {
            'properties': {
                'field1': {'type': 'text'},
                'field2': {'type': 'keyword'}
            }
        }
    }




    #response = es.bulk(index=index_name, body=actions, headers=headers)



    # Check the response
    if response['result'] == 'created':
        print(f'{section} saved successfully')
    else:
        print('Failed to save content.')

    """
    # Throttle between calls so the NYT API is not overloaded.
    ######################################################
    time.sleep(1) ##### TO MODIFY ACCORDING API ALLOWANCE
    ######################################################
    


['Combined Print and E-Book Fiction', 'Combined Print and E-Book Nonfiction']
{   'copyright': 'Copyright (c) 2023 The New York Times Company.  All Rights '
                 'Reserved.',
    'last_modified': '2023-06-14T22:14:41-04:00',
    'num_results': 15,
    'results': [   {   'amazon_product_url': 'https://www.amazon.com/dp/0316404594?tag=NYTBSREV-20',
                       'asterisk': 0,
                       'bestsellers_date': '2023-06-10',
                       'book_details': [   {   'age_group': '',
                                               'author': 'James Patterson and '
                                                         'Brendan DuBois',
                                               'contributor': 'by James '
                                                              'Patterson and '
                                                              'Brendan DuBois',
                                               'contributor_note': '',
                     