In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
import os
# Ask spark-submit to fetch the hadoop-aws package, which provides the S3
# filesystem classes used below. PySpark reads PYSPARK_SUBMIT_ARGS when it
# launches the JVM, so this must run before the SparkContext is created.
submit_args = "--packages=org.apache.hadoop:hadoop-aws:2.7.3 pyspark-shell"
os.environ['PYSPARK_SUBMIT_ARGS'] = submit_args

In [3]:
import configparser

# Load the AWS key pair from the [default] profile of the shared credentials file.
credentials_path = os.path.expanduser("~/.aws/credentials")
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that list so a missing file fails here with
# a clear message instead of a confusing NoSectionError further down.
if not config.read(credentials_path):
    raise FileNotFoundError(
        "AWS credentials file not found or unreadable: {}".format(credentials_path)
    )

# Sanity check: only section and option *names* are printed, never the values.
print(config.sections())
print(config.options('default'))

access_id = config.get('default', "aws_access_key_id")
access_key = config.get('default', "aws_secret_access_key")

['default']
['aws_access_key_id', 'aws_secret_access_key']


In [4]:
from pyspark import SparkConf, SparkContext

# Use getOrCreate() rather than the SparkContext constructor: the constructor
# raises "Cannot run multiple SparkContexts at once" when this cell is re-run
# in a live kernel, whereas getOrCreate() hands back the running context.
# NOTE: if a context already exists it is returned as-is and this conf is not
# applied to it.
spark_conf = SparkConf().setMaster("local").setAppName("s3 test")
sc = SparkContext.getOrCreate(spark_conf)

In [5]:
# Configure the s3n:// filesystem on the Hadoop configuration owned by the
# SparkContext (reached through the private `_jsc` handle), passing in the
# key pair loaded from the AWS credentials file.
s3n_settings = (
    ("fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem"),
    ("fs.s3n.awsAccessKeyId", access_id),
    ("fs.s3n.awsSecretAccessKey", access_key),
)
hadoop_conf = sc._jsc.hadoopConfiguration()
for setting_name, setting_value in s3n_settings:
    hadoop_conf.set(setting_name, setting_value)

In [6]:
from pyspark.sql import SQLContext

# SQL entry point bound to the SparkContext created above; used below for
# reading the CSV and for running SQL queries.
sqlc = SQLContext(sparkContext=sc)

In [7]:
# Read the census CSV from S3, treating the first row as column names.
# No schema is supplied or inferred, so every column comes back as a string
# (see the printed schema).
census_csv_uri = "s3n://celestopolis/census_data_1/Individual_Census_by_Borough__Community_District__and_Facility_Type.csv"
csv_reader = sqlc.read.options(header=True)
df = csv_reader.csv(census_csv_uri)
df.printSchema()


root
 |-- Report Date: string (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Community Districts: string (nullable = true)
 |-- Census Type: string (nullable = true)
 |-- Adult Family Commercial Hotel: string (nullable = true)
 |-- Adult Family Shelter: string (nullable = true)
 |-- Adult Shelter: string (nullable = true)
 |-- Adult Shelter Commercial Hotel: string (nullable = true)
 |-- Family Cluster: string (nullable = true)
 |-- Family with Children Commercial Hotel: string (nullable = true)
 |-- Family with Children Shelter: string (nullable = true)



In [36]:
# Preview the first 10 rows of the census DataFrame.
df.show(n=10)

+-----------+-------+-------------------+-----------+-----------------------------+--------------------+-------------+------------------------------+--------------+-------------------------------------+----------------------------+
|Report Date|Borough|Community Districts|Census Type|Adult Family Commercial Hotel|Adult Family Shelter|Adult Shelter|Adult Shelter Commercial Hotel|Family Cluster|Family with Children Commercial Hotel|Family with Children Shelter|
+-----------+-------+-------------------+-----------+-----------------------------+--------------------+-------------+------------------------------+--------------+-------------------------------------+----------------------------+
| 10/31/2020|  Bronx|                  1|Individuals|                         null|                 189|           73|                           140|            31|                                   64|                        1202|
| 10/31/2020|  Bronx|                  2|Individuals|                   

In [39]:
# Expose the DataFrame to Spark SQL under the name "example".
# createOrReplaceTempView supersedes registerTempTable (deprecated since
# Spark 2.0) and overwrites any existing view of the same name, so this cell
# can be re-run safely.
df.createOrReplaceTempView("example")

# Row count per borough; "group by 1" groups by the first selected column (Borough).
sqlc.sql("select Borough, count(*) as n_entries from example group by 1").show(10)

+-------------+---------+
|      Borough|n_entries|
+-------------+---------+
|       Queens|      420|
|     Brooklyn|      540|
|Staten Island|       90|
|    Manhattan|      360|
|        Bronx|      360|
+-------------+---------+

