In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark import SparkFiles
import json
import requests
import sys

In [None]:
# Build (or reuse) the Spark session for this notebook: local master,
# application name "Exercise2".
spark = (
    SparkSession.builder
    .master("local")
    .appName("Exercise2")
    .getOrCreate()
)

In [None]:
# Batch-read the sample CSV: the first row is treated as the header and
# column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("exampleData.csv")
)

In [None]:
# Keep the schema inferred by the batch read; it is handed explicitly to the
# streaming reader that is built from the same kind of CSV files.
staticSchema = df.schema

In [None]:
# Streaming CSV source over every *.csv under /home/jovyan, using the schema
# captured from the batch read and picking up one file per trigger.
streamingDF = (
    spark.readStream
    .format("csv")
    .schema(staticSchema)
    .option("header", "true")
    .option("maxFilesPerTrigger", 1)
    .load("/home/jovyan/*.csv")
)

In [None]:
# Sanity check: as the last expression of the cell, the notebook displays the
# value of isStreaming for the DataFrame created via spark.readStream.
streamingDF.isStreaming

In [None]:
# Keep only the rows whose air_temperature is below 5.
filteredDF = streamingDF.where(F.col("air_temperature") < 5)

In [None]:
# Start the streaming query: filtered rows are appended to an in-memory sink
# registered under the name "weatherQuery".
# Note: `df` is rebound here to the handle returned by start(); it no longer
# refers to the batch DataFrame read earlier.
df = (
    filteredDF.writeStream
    .format("memory")
    .outputMode("append")
    .queryName("weatherQuery")
    .start()
)

In [None]:
# Block this cell while the streaming query started above is running.
# To get control back, interrupt the kernel (the square "stop" button).
# NOTE(review): interrupting presumably ends only this wait, not the query
# itself - confirm, and call df.stop() if the query should be shut down.
df.awaitTermination()

In [None]:
# Preview the first five rows collected so far in the "weatherQuery" table.
spark.sql("""
SELECT *
FROM weatherQuery
""").show(5)

## Read from URL 

In [None]:
# Location of the remote CSV file to load; replace with your own URL to try
# a different data set.
url = "https://people.sc.fsu.edu/~jburkardt/data/csv/homes.csv"

In [None]:
# Register the remote file with the Spark context; its local copy is looked
# up afterwards through SparkFiles.get(<file name>).
spark.sparkContext.addFile(url)

In [None]:
# Read the file that addFile() fetched. The local file name is derived from
# the path component of `url` (percent-decoded, query string ignored) instead
# of being hard-coded, so changing `url` does not require editing this cell.
from posixpath import basename
from urllib.parse import unquote, urlparse

downloaded_name = basename(unquote(urlparse(url).path))  # e.g. "homes.csv"
df = spark.read.csv(
    "file://" + SparkFiles.get(downloaded_name),
    header=True,
    inferSchema=True,
)

In [None]:
# Preview the first three rows of the DataFrame loaded from the URL.
df.show(n=3)

## Read from API (open/public)

In [None]:
# Public JSON endpoint used for the API example; replace with your own URL.
API_URL = "https://jsonplaceholder.typicode.com/posts"

In [None]:
# Fetch the API payload. requests applies no timeout by default, so an
# unresponsive server would hang this cell forever; the explicit timeout
# (in seconds) makes the call fail with requests.exceptions.Timeout instead.
response = requests.get(API_URL, timeout=30)

In [None]:
# Turn a successful (HTTP 200) API response into a DataFrame and preview it;
# any other status code is only reported.
if response.status_code != 200:
    print("Failed to retrieve data from API")
else:
    # Decode the JSON body into plain Python objects
    data = json.loads(response.text)

    # Build a DataFrame from the decoded records and show the first three rows
    df = spark.createDataFrame(data)
    df.show(3)
