In [5]:
from pyspark.sql import SparkSession
import requests
import json

In [6]:
# Create (or reuse) the Spark session for this notebook. It connects to the
# standalone master at spark-master:7077, is capped at 2 cores in total and
# 512 MB per executor, and records event logs under /opt/workspace/events.
spark = (
    SparkSession.builder
    .appName("CrossrefService")
    .master("spark://spark-master:7077")
    .config("spark.cores.max", "2")
    .config("spark.executor.memory", "512m")
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "file:///opt/workspace/events")
    .getOrCreate()
)

# Load the papers as a DataFrame.
# NOTE(review): the path is relative and has no scheme, so it is resolved
# against whatever default filesystem the cluster uses - confirm that this
# really is HDFS.
df = spark.read.json("all_papers.json")

def formatterParams(title, authors, id):
    '''
    Function to reduce a paper's fields to the values used in the API query

    Only the author list is transformed: it is collapsed to the name of its
    first entry. The title and the id are passed through untouched.

    Parameters:
    title: the title of the paper (returned unchanged)
    authors: a sequence of author records, each indexable by 'name';
        may be empty or None
    id: the identifier of the paper (returned unchanged)

    Returns:
    tuple: (title, first author name, id); the author name is '' when
        authors is empty or None
    '''
    first_author = authors[0]['name'] if authors else ''
    return title, first_author, id

def find_publisher_location(json_data):
    '''
    Function to search a decoded JSON structure for "publisher-location"

    The structure is walked depth-first in iteration order. Inside a dict,
    the value stored under "publisher-location" is returned as soon as that
    key is reached, even if the value is empty. Nested dicts and lists are
    searched recursively, and only a non-empty (truthy) nested result ends
    the search.

    Parameters:
    json_data: the decoded JSON response (dict, list or scalar)

    Returns:
    the first publisher location found, otherwise None
    '''
    if isinstance(json_data, dict):
        for name, child in json_data.items():
            if name == "publisher-location":
                return child
            if not isinstance(child, (dict, list)):
                # Scalars under other keys cannot contain the location.
                continue
            found = find_publisher_location(child)
            if found:
                return found
        return None

    if isinstance(json_data, list):
        # Lazily search each element and stop at the first truthy hit.
        hits = (find_publisher_location(element) for element in json_data)
        return next((hit for hit in hits if hit), None)

    return None

def process_paper(paper, timeout=30):
    '''
    Function to look up the publisher location of one paper in Crossref

    The paper's title and first author are sent as a query to the Crossref
    /works endpoint and the response is searched for a publisher location.
    The call is synchronous and blocks until Crossref answers or the timeout
    expires. Failures are reported through the returned dict rather than
    raised, so one bad request does not abort the surrounding job.

    Parameters:
    paper (dict): paper information from JSON data; must provide the keys
        'title', 'authors' and 'id'
    timeout (float or tuple): seconds to wait for Crossref, passed straight
        to requests; defaults to 30

    Returns:
    dict: {'id': ..., 'publisher_location': ...} where publisher_location is
        the location found in the response, 'No location found' when the
        response contains none, or 'API call failed' when the request
        errored, timed out, returned a non-200 status or was not valid JSON
    '''
    title, author, paper_id = formatterParams(
        paper['title'], paper['authors'], paper['id']
    )

    # Let requests build the query string so that characters such as '&',
    # '#', '+' or spaces in a title are percent-encoded instead of silently
    # corrupting the URL. Empty values are left out because an empty filter
    # carries no information.
    query = {'query.author': author, 'query.title': title}
    params = {key: value for key, value in query.items() if value}

    try:
        with requests.Session() as session:
            response = session.get(
                "https://api.crossref.org/works",
                params=params,
                timeout=timeout,
            )
            if response.status_code != 200:
                return {'id': paper_id, 'publisher_location': 'API call failed'}
            data = response.json()
    except (requests.RequestException, ValueError):
        # Connection problems, timeouts and undecodable bodies are reported
        # with the same sentinel as a non-200 status.
        return {'id': paper_id, 'publisher_location': 'API call failed'}

    publisher_location = find_publisher_location(data)
    if publisher_location:
        # Pair the paper id with the publisher location that was found.
        return {'id': paper_id, 'publisher_location': publisher_location}
    return {'id': paper_id, 'publisher_location': 'No location found'}

# Turn every DataFrame row into a plain dict, then look each paper up with
# process_paper. Both steps are lazy RDD transformations.
rdd = df.rdd.map(lambda record: record.asDict()).map(process_paper)

# Build a two-column DataFrame from the lookup results.
# NOTE(review): no explicit types are given, so Spark presumably has to sample
# the RDD to infer them, which would already trigger some lookups here -
# confirm.
result_df = rdd.toDF(schema=["id", "publisher_location"])

                                                                                

In [None]:
# Save the results as CSV with a header row, replacing any earlier output.
# Spark writes a directory named "dataout/paper_location" containing part
# files rather than one single file.
(
    result_df.write
    .mode("overwrite")
    .option("header", True)
    .csv("dataout/paper_location")
)

[Stage 2:>                                                          (0 + 2) / 2]

In [None]:
# Alternative (disabled): coalesce to a single partition before writing, so the output is produced by one task
#result_df.coalesce(1).write.csv("paper_location", header=True, mode="overwrite")

[Stage 2:>                                                          (0 + 1) / 1]

In [None]:
# Shut down the Spark session once all output has been written.
spark.stop()