In [1]:
import pandas as pd
from pyspark.sql import SparkSession

In [2]:
pd.set_option('display.width', 78)
pd.set_option('display.max_columns',6)

In [3]:
# initiate a Spark session and import CSV data
spark = SparkSession.builder \
   .getOrCreate()

In [4]:
landtemps = spark.read.option("header",True) \
     .csv("data/landtemps.tar.gz")


In [5]:
type(landtemps)

pyspark.sql.dataframe.DataFrame

In [6]:
landtemps.count()

16904868

In [7]:
landtemps.printSchema()

root
 |-- landtemps.csv                                                                                       000664  001750  001750  11063722557 13671275234 013547  0                                                                                                    ustar 00mike                            mike                            000000  000000                                                                                                                                                                         locationid: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- temp: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- stnelev: string (nullable = true)
 |-- station: string (nullable = true)
 |-- countryid: string (nullable = true)
 |-- country: string (nullable = true)



In [8]:
landtemps.select("station",'country','month','year','temp') \
    .show(5, False)

+-------+-------------------+-----+----+-----+
|station|country            |month|year|temp |
+-------+-------------------+-----+----+-----+
|SAVE   |Antigua and Barbuda|1    |1961|-0.85|
|SAVE   |Antigua and Barbuda|1    |1962|1.17 |
|SAVE   |Antigua and Barbuda|1    |1963|-7.09|
|SAVE   |Antigua and Barbuda|1    |1964|0.66 |
|SAVE   |Antigua and Barbuda|1    |1965|0.48 |
+-------+-------------------+-----+----+-----+
only showing top 5 rows



In [9]:
# change temp data type from string to float
landtemps = landtemps \
  .withColumn("temp",landtemps.temp.cast('float'))

In [10]:
landtemps.select("temp").dtypes

[('temp', 'float')]

In [11]:
landtemps.describe('temp').show()

+-------+------------------+
|summary|              temp|
+-------+------------------+
|  count|          14461547|
|   mean|10.880725773138437|
| stddev|11.509636369381871|
|    min|             -75.0|
|    max|             42.29|
+-------+------------------+



In [12]:
# load JSON data
allcandidatenews = spark.read \
     .json("data/allcandidatenewssample.json")

In [13]:
allcandidatenews \
  .select("source","title","story_position") \
  .show(5)

+--------------------+--------------------+--------------+
|              source|               title|story_position|
+--------------------+--------------------+--------------+
|            NBC News|Bloomberg cuts ti...|             6|
|Town & Country Ma...|Democratic Candid...|             3|
|                NULL|                NULL|          NULL|
|             TheHill|Sanders responds ...|             7|
|            CNBC.com|From Andrew Yang'...|             2|
+--------------------+--------------------+--------------+
only showing top 5 rows



In [17]:
# display structure of JSON data
allcandidatenews.count()

60000

In [15]:
allcandidatenews.printSchema()

root
 |-- category: string (nullable = true)
 |-- date: string (nullable = true)
 |-- domain: string (nullable = true)
 |-- panel_position: string (nullable = true)
 |-- query: string (nullable = true)
 |-- reason: string (nullable = true)
 |-- source: string (nullable = true)
 |-- story_position: long (nullable = true)
 |-- time: string (nullable = true)
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)



In [16]:
allcandidatenews \
   .describe('story_position') \
   .show()

+-------+-----------------+
|summary|   story_position|
+-------+-----------------+
|  count|            57618|
|   mean|5.249626852719636|
| stddev|2.889001922195635|
|    min|                1|
|    max|               10|
+-------+-----------------+



In [18]:
allcandidatenewsdf = allcandidatenews.toPandas()

In [19]:
allcandidatenewsdf.head()

Unnamed: 0,category,date,domain,...,time,title,url
0,,2019-12-25 10:00:00,www.nbcnews.com,...,18 hours ago,Bloomberg cuts ties with company using prison ...,https://www.nbcnews.com/politics/2020-election...
1,,2019-11-09 08:00:00,www.townandcountrymag.com,...,18 hours ago,Democratic Candidates React to Michael Bloombe...,https://www.townandcountrymag.com/society/poli...
2,,2019-09-11 18:00:00,,...,,,
3,,2019-08-08 06:00:00,thehill.com,...,15 hours ago,Sanders responds to de Blasio's invitation to ...,https://thehill.com/homenews/campaign/456579-s...
4,,2019-08-04 10:00:00,www.cnbc.com,...,1 day ago,From Andrew Yang's 'No Tie' look to Bernie Buc...,https://www.cnbc.com/2019/08/02/election-2020-...


In [20]:
allcandidatenewsdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   category        416 non-null    object 
 1   date            60000 non-null  object 
 2   domain          57618 non-null  object 
 3   panel_position  57618 non-null  object 
 4   query           57618 non-null  object 
 5   reason          2382 non-null   object 
 6   source          57618 non-null  object 
 7   story_position  57618 non-null  float64
 8   time            57618 non-null  object 
 9   title           57618 non-null  object 
 10  url             57618 non-null  object 
dtypes: float64(1), object(10)
memory usage: 5.0+ MB
