In [1]:
import pandas as pd
from pyspark.sql import SparkSession

In [2]:
pd.set_option('display.width', 78)
pd.set_option('display.max_columns',6)

In [3]:
# initiate a Spark session and import CSV data
spark = SparkSession.builder \
   .getOrCreate()

In [4]:
landtemps = spark.read.option("header",True) \
     .csv("data/landtemps.tar.gz")


In [5]:
type(landtemps)

pyspark.sql.dataframe.DataFrame

In [6]:
landtemps.count()

16904868

In [7]:
landtemps.printSchema()

root
 |-- landtemps.csv                                                                                       000664  001750  001750  11063722557 13671275234 013547  0                                                                                                    ustar 00mike                            mike                            000000  000000                                                                                                                                                                         locationid: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- temp: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- stnelev: string (nullable = true)
 |-- station: string (nullable = true)
 |-- countryid: string (nullable = true)
 |-- country: string (nullable = true)



In [8]:
landtemps.select("station",'country','month','year','temp') \
    .show(5, False)

+-------+-------------------+-----+----+-----+
|station|country            |month|year|temp |
+-------+-------------------+-----+----+-----+
|SAVE   |Antigua and Barbuda|1    |1961|-0.85|
|SAVE   |Antigua and Barbuda|1    |1962|1.17 |
|SAVE   |Antigua and Barbuda|1    |1963|-7.09|
|SAVE   |Antigua and Barbuda|1    |1964|0.66 |
|SAVE   |Antigua and Barbuda|1    |1965|0.48 |
+-------+-------------------+-----+----+-----+
only showing top 5 rows



In [9]:
# change temp data type from string to float
landtemps = landtemps \
  .withColumn("temp",landtemps.temp.cast('float'))

In [10]:
landtemps.select("temp").dtypes

[('temp', 'float')]

In [11]:
landtemps.describe('temp').show()

+-------+------------------+
|summary|              temp|
+-------+------------------+
|  count|          14461547|
|   mean|10.880725773138437|
| stddev|11.509636369381871|
|    min|             -75.0|
|    max|             42.29|
+-------+------------------+



In [12]:
# load JSON data
allcandidatenews = spark.read \
     .json("data/allcandidatenewssample.json")

In [13]:
allcandidatenews \
  .select("source","title","story_position") \
  .show(5)

+--------------------+--------------------+--------------+
|              source|               title|story_position|
+--------------------+--------------------+--------------+
|            NBC News|Bloomberg cuts ti...|             6|
|Town & Country Ma...|Democratic Candid...|             3|
|                NULL|                NULL|          NULL|
|             TheHill|Sanders responds ...|             7|
|            CNBC.com|From Andrew Yang'...|             2|
+--------------------+--------------------+--------------+
only showing top 5 rows

