In [1]:
import os
import sys

# Tell Spark which JDK to launch (path is relative to the working directory)
# and pin both the driver and the workers to the interpreter running this kernel.
os.environ.update(
    {
        "JAVA_HOME": "../.JDK 8",
        "PYSPARK_PYTHON": sys.executable,
        "PYSPARK_DRIVER_PYTHON": sys.executable,
    }
)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import  *
# from pyspark.sql.types import IntegerType

# from pyspark.sql.functions import col

# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Saving and Project")
    .getOrCreate()
)


In [3]:
# Column names, in the same order as the fields of each row tuple below.
columns = ["ID", "book_name", "author", "publish_date"]

# One tuple per book: (ID, book_name, author, publish_date).
data = [
    (1, "Moby Dick", "Herman Melville", 1851),
    (2, "Treasure Island", "Robert Louis Stevenson", 1883),
    (3, "Robinson Crusoe", "Daniel Defoe", 1719),
    (4, "The Fellowship of the Ring", "J.R.R. Tolkien", 1954),
    (5, "The Hitchhiker's Guide to the Galaxy", "Douglas Adams", 1979),
]

In [4]:
# Turn the in-memory rows into a Spark DataFrame, taking column names from
# `columns`, then print it.
df_books = spark.createDataFrame(data, schema=columns)
df_books.show()


                                                                                

+---+--------------------+--------------------+------------+
| ID|           book_name|              author|publish_date|
+---+--------------------+--------------------+------------+
|  1|           Moby Dick|     Herman Melville|        1851|
|  2|     Treasure Island|Robert Louis Stev...|        1883|
|  3|     Robinson Crusoe|        Daniel Defoe|        1719|
|  4|The Fellowship of...|      J.R.R. Tolkien|        1954|
|  5|The Hitchhiker's ...|       Douglas Adams|        1979|
+---+--------------------+--------------------+------------+



Once you have downloaded the dataset, load it in your Python file as a PySpark DataFrame. First, check the file to ensure that the data is of the right type and that there are no null values. Then tell me which day had the most lightning strikes recorded. Order the DataFrame by descending number of strikes, then save it to a CSV file.

In [6]:
# Load the strikes CSV: the first line is the header row, and Spark infers
# each column's type from the data.
df_lightning = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("lightening strikes dataset.csv")
)

                                                                                

In [7]:
# Print the column names, inferred types and nullability to check that the
# load produced the expected types.
df_lightning.printSchema()

root
 |-- date: date (nullable = true)
 |-- number_of_strikes: integer (nullable = true)
 |-- center_point_geom: string (nullable = true)



In [8]:
# Preview the first five rows of the raw strikes data.
df_lightning.show(n=5)

+----------+-----------------+-----------------+
|      date|number_of_strikes|center_point_geom|
+----------+-----------------+-----------------+
|2018-01-03|              194|    POINT(-75 27)|
|2018-01-03|               41|  POINT(-78.4 29)|
|2018-01-03|               33|  POINT(-73.9 27)|
|2018-01-03|               38|  POINT(-73.8 27)|
|2018-01-03|               92|    POINT(-79 28)|
+----------+-----------------+-----------------+
only showing top 5 rows



In [16]:
# Total strikes per day, largest first, so the first row is the day with the
# most recorded strikes.
# F.sum / F.col are Spark's column functions, named explicitly so this cell
# does not depend on a wildcard import replacing Python's built-in sum.
df_bydays = (
    df_lightning
    .groupBy("date")
    .agg(F.sum("number_of_strikes").alias("total_count"))
    .orderBy(F.col("total_count").desc())
)

In [18]:
# Print the per-day totals; rows are already sorted by total_count descending,
# so the first row is the day with the most strikes.
df_bydays.show()



+----------+-----------+
|      date|total_count|
+----------+-----------+
|2018-08-29|    1070457|
|2018-08-17|     969774|
|2018-08-28|     917199|
|2018-08-27|     824589|
|2018-08-30|     802170|
|2018-08-19|     786225|
|2018-08-18|     741180|
|2018-08-16|     734475|
|2018-08-31|     723624|
|2018-08-15|     673455|
|2018-08-20|     660501|
|2018-08-24|     652101|
|2018-08-21|     575985|
|2018-08-22|     535986|
|2018-08-23|     527955|
|2018-08-25|     512100|
|2018-08-26|     507894|
|2018-06-24|     431587|
|2018-07-23|     423031|
|2018-06-26|     376955|
+----------+-----------+
only showing top 20 rows



                                                                                

In [19]:
# Collect the per-day totals to the driver as a pandas DataFrame and save them
# as a single CSV file. index=False keeps pandas' row index out of the file, so
# the CSV holds only the date and total_count columns.
df_bydays.toPandas().to_csv("lightning_count_by_day.csv", index=False)

                                                                                

In [20]:
# Average number of strikes per row for each center_point_geom value, rounded
# to two decimals.
# F.round / F.avg are Spark's column functions, named explicitly so this cell
# does not depend on a wildcard import replacing Python's built-in round.
(
    df_lightning
    .groupBy("center_point_geom")
    .agg(F.round(F.avg("number_of_strikes"), 2).alias("avg_strikes"))
    .show()
)



+-----------------+-----------+
|center_point_geom|avg_strikes|
+-----------------+-----------+
|POINT(-95.3 29.4)|      17.52|
|POINT(-92.6 24.9)|       6.65|
|POINT(-92.2 26.1)|       8.64|
|POINT(-91.3 28.1)|      11.44|
|POINT(-93.9 28.1)|       12.3|
|POINT(-87.4 25.2)|       8.94|
|POINT(-86.9 26.4)|      11.91|
|POINT(-76.3 26.7)|      14.53|
|  POINT(-85.5 26)|      10.03|
|POINT(-76.1 25.7)|       11.0|
|POINT(-99.9 33.6)|      26.79|
|  POINT(-77 26.5)|       9.83|
|POINT(-79.6 28.3)|      16.95|
|POINT(-78.7 31.9)|      16.14|
|POINT(-73.5 29.2)|       5.57|
|POINT(-96.5 27.7)|      19.76|
|POINT(-95.9 33.7)|      27.16|
|POINT(-95.1 34.3)|       34.9|
|POINT(-94.7 35.2)|      39.15|
|  POINT(-92.8 31)|      25.78|
+-----------------+-----------+
only showing top 20 rows



                                                                                

In [21]:
import pandas as pd
# Collect the books DataFrame to the driver as a pandas DataFrame and write it
# to CSV. index=False keeps pandas' row index out of the file; the data already
# has its own ID column.
pandas_df = df_books.toPandas()
pandas_df.to_csv("book.csv", index=False)