# How to execute a REST API call on Apache Spark

In [47]:
# Initialise findspark before pyspark is imported; keep this call ahead of the
# pyspark imports below (presumably it locates the local Spark installation and
# makes pyspark importable — confirm against the findspark documentation).
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
# Obtain the session used by every later cell (created here if none exists yet).
spark = SparkSession.builder.getOrCreate()
# SparkContext backing the session above.
sc = spark.sparkContext

In [1]:
import http
import json
import requests
from pyspark.sql.functions import udf, col, explode
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType
from pyspark.sql import Row

In [42]:
# HTTP headers sent with every request: the payload is declared as JSON.
headers = {"content-type": "application/json"}

# Request payload: an empty JSON object, serialised to its string form.
body = json.dumps({})

## Now declare a function that will execute our REST API call
- There is nothing special about this function, except that the REST service response will be passed back as a JSON object.

In [43]:
# response function - udf
def executeRestApi(verb, url, headers, body, timeout=30):
  """Execute a single REST call and return its decoded JSON payload.

  The function is written to be wrapped in a Spark UDF, so it never raises
  and never returns anything other than decoded JSON or None: any failure
  becomes None instead of aborting the whole job.

  Args:
    verb: HTTP method name, "get" or "post" (case-insensitive).
    url: Endpoint to call.
    headers: Mapping of HTTP header names to values.
    body: Request payload, passed to requests as ``data``.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout a stalled endpoint would block the calling task forever.

  Returns:
    The parsed JSON response when the call succeeds with HTTP 200.
    None when the verb is missing or unsupported, the request fails,
    the status is not 200, or the response body is not valid JSON.
  """
  method = verb.lower() if isinstance(verb, str) else None
  if method not in ("get", "post"):
    return None

  try:
    if method == "get":
      res = requests.get(url, data=body, headers=headers, timeout=timeout)
    else:
      res = requests.post(url, data=body, headers=headers, timeout=timeout)
  except Exception:
    # Deliberately best-effort: report the failure as "no result" rather than
    # returning the exception object, which is not a valid value for the
    # struct return type the UDF is declared with.
    return None

  if res is None or res.status_code != 200:
    return None

  try:
    return json.loads(res.text)
  except ValueError:
    # A 200 response whose body is not JSON (e.g. an HTML error page).
    return None

### Define the response schema and the UDF
- We can pick and choose which values we want from the JSON returned by the REST API call
- All we have to do, when declaring the schema, is identify the parts of the JSON that we want
- Any attribute that is not listed in the schema is simply left out of the result

In [44]:
# Shape of the JSON document we want back from the service. Only the fields
# listed here are declared; "Results" is an array of make records.
schema = (
  StructType()
  .add("Count", IntegerType(), True)
  .add("Message", StringType(), True)
  .add("SearchCriteria", StringType(), True)
  .add(
    "Results",
    ArrayType(
      StructType()
      .add("Make_ID", IntegerType())
      .add("Make_Name", StringType())
    ),
  )
)

- Next we declare the UDF, making sure to set the return type as the schema that we declared
- This will ensure that the new column, which is used to execute the UDF, will eventually contain data as a structured object rather than plain JSON formatted text

In [45]:
# Wrap the REST helper as a Spark UDF whose result column is typed by `schema`.
udf_executeRestApi = udf(f=executeRestApi, returnType=schema)

### Create the Request DataFrame and Execute
- Create a DataFrame where each row represents a single REST API call
- The number of columns in the DataFrame is up to us, but we will need at least one, which will host the URL and/or parameters required to execute the REST API call
- We are going to use four to reflect the number of individual parameters that the REST API call function needs.
- Using the US Government's free-to-access vehicle make REST service, we would create a DataFrame as follows:

In [58]:
# requests
# Row factory: one row describes one REST call (verb, url, headers, body).
RestApiRequest = Row("verb", "url", "headers", "body")

# Build the request DataFrame, then add an "execute" column holding the
# UDF result for each row.
request_df = (
    spark.createDataFrame([
        RestApiRequest(
            "get",
            "https://vpic.nhtsa.dot.gov/api/vehicles/getallmakes?format=json",
            headers,
            body,
        ),
    ])
    .withColumn(
        "execute",
        udf_executeRestApi(
            *[col(name) for name in ("verb", "url", "headers", "body")]
        ),
    )
)

In [60]:
# Display the request rows, including the "execute" column produced by the UDF.
request_df.show()

+----+--------------------+--------------------+----+--------------------+
|verb|                 url|             headers|body|             execute|
+----+--------------------+--------------------+----+--------------------+
| get|https://vpic.nhts...|{content-type -> ...|  {}|{10200, Response ...|
+----+--------------------+--------------------+----+--------------------+



In [56]:
# Turn each element of the "Results" array into its own row, then keep only
# the two make attributes as top-level columns.
my_df = (
    request_df
    .select(explode(col("execute.Results")).alias("results"))
    .select("results.Make_ID", "results.Make_Name")
)

+-------+------------+
|Make_ID|   Make_Name|
+-------+------------+
|    440|ASTON MARTIN|
|    441|       TESLA|
|    442|      JAGUAR|
|    443|    MASERATI|
|    444|  LAND ROVER|
+-------+------------+
only showing top 5 rows



- The Row class is used to define the columns of the DataFrame, and using the createDataFrame method of the spark object, an instance of RestApiRequest is declared for each individual API call that we want to make.
- All being well, the Dataframe will look like:
<table>
<thead>
<tr>
<th>verb</th>
<th>url</th>
<th>headers</th>
<th>body</th>
</tr>
</thead>
<tbody>
<tr>
<td>get</td>
<td><a href="https://vpic.nhtsa.dot.gov/api/vehicles/getallmakes?format=json" rel="nofollow">https://vpic.nhtsa.dot.gov/api/vehicles/getallmakes?format=json</a></td>
<td>{'content-type': "application/json"}</td>
<td>{}</td>
</tr>
</tbody>
</table>

- The REST service returns a number of attributes, and we're only interested in the one identified as Results (i.e. `execute.Results`)

In [57]:
# Show only the first five make rows.
my_df.show(5)

+-------+------------+
|Make_ID|   Make_Name|
+-------+------------+
|    440|ASTON MARTIN|
|    441|       TESLA|
|    442|      JAGUAR|
|    443|    MASERATI|
|    444|  LAND ROVER|
+-------+------------+
only showing top 5 rows

