In [1]:
from pyspark.sql import SparkSession
import os
import configparser

# Load AWS credentials as env vars

In [2]:
# Parse the shared AWS credentials file in the current user's home directory.
config = configparser.ConfigParser()
credentials_path = os.path.join(os.path.expanduser("~"), ".aws", "credentials")

# Read AWS config file. The context manager closes the file handle as soon as
# parsing is done instead of leaving it open for the lifetime of the kernel.
with open(credentials_path) as credentials_file:
    config.read_file(credentials_file)

# Export AWS environment variables, taken from the [default] profile.
# Raises KeyError if the profile or either key is missing from the file.
os.environ["AWS_ACCESS_KEY_ID"] = config['default']['AWS_ACCESS_KEY_ID']
os.environ["AWS_SECRET_ACCESS_KEY"] = config['default']['AWS_SECRET_ACCESS_KEY']

# Override AWS region to be able to connect to Udacity's S3 bucket
os.environ['AWS_REGION'] = 'us-west-2'

# Create a Spark session with the hadoop-aws package. Make sure the hadoop-aws version matches the Hadoop version your Spark installation was built against

In [3]:
# Configure the builder so that the hadoop-aws package is requested through
# spark.jars.packages, then create the session (or reuse an existing one).
session_builder = SparkSession.builder.config(
    "spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.1"
)
spark = session_builder.getOrCreate()



:: loading settings :: url = jar:file:/home/ubuntu/anaconda3/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ubuntu/.ivy2/cache
The jars for the packages stored in: /home/ubuntu/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-6c384033-eb3b-4987-96ae-cc37c4266165;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.1 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.901 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
downloading https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.1/hadoop-aws-3.3.1.jar ...
	[SUCCESSFUL ] org.apache.hadoop#hadoop-aws;3.3.1!hadoop-aws.jar (74ms)
downloading https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.901/aws-java-sdk-bundle-1.11.901.jar ...
	[SUCCESSFUL ] com.amazonaws#aws-java-sdk-bundle;1.11.901!aws-java-sdk-bundle.jar (1865ms)
:: resolution report :: resolve 1893ms :: artifacts dl 1956ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.901 from central in [

In [4]:
# First attempt: read the payment file without passing any reader options.
df = spark.read.csv(
    path="s3a://udacity-dend/pagila/payment/payment.csv",
)

23/07/19 06:17:04 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

In [5]:
# Inspect the result of the option-less read: the schema first, then the
# first five rows.
df.printSchema()
df.show(n=5)

root
 |-- _c0: string (nullable = true)



[Stage 1:>                                                          (0 + 1) / 1]

+--------------------+
|                 _c0|
+--------------------+
|payment_id;custom...|
|16050;269;2;7;1.9...|
|16051;269;1;98;0....|
|16052;269;2;678;6...|
|16053;269;2;703;0...|
+--------------------+
only showing top 5 rows



                                                                                

# Fix the schema by passing an explicit schema, the `;` separator, and the header flag to the reader

In [7]:
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, DateType

# Explicit schema for the payment data: one (column name, Spark type) pair
# per field, turned into StructFields below.
payment_columns = [
    ("payment_id", IntegerType()),
    ("customer_id", IntegerType()),
    ("staff_id", IntegerType()),
    ("rental_id", IntegerType()),
    ("amount", DoubleType()),
    ("payment_date", DateType()),
]

schema = StructType(
    [StructField(column_name, column_type) for column_name, column_type in payment_columns]
)

In [8]:
# Re-read the same file, this time with ';' as the separator, the first line
# treated as a header, and the explicit schema instead of Spark's defaults.
df_with_fixed_schema = spark.read.csv(
    "s3a://udacity-dend/pagila/payment/payment.csv",
    schema=schema,
    sep=";",
    header=True,
)

In [10]:
# Confirm the typed schema, then pull only the first five rows into pandas
# so the notebook renders them as a table.
df_with_fixed_schema.printSchema()
first_payments = df_with_fixed_schema.limit(5)
first_payments.toPandas()

root
 |-- payment_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- staff_id: integer (nullable = true)
 |-- rental_id: integer (nullable = true)
 |-- amount: double (nullable = true)
 |-- payment_date: date (nullable = true)



                                                                                

Unnamed: 0,payment_id,customer_id,staff_id,rental_id,amount,payment_date
0,16050,269,2,7,1.99,2017-01-24
1,16051,269,1,98,0.99,2017-01-25
2,16052,269,2,678,6.99,2017-01-28
3,16053,269,2,703,0.99,2017-01-29
4,16054,269,1,750,4.99,2017-01-29


# Run a SQL query against the DataFrame

In [11]:
# Register the typed DataFrame as the temp view "payment" so it can be
# queried with SQL.
df_with_fixed_schema.createOrReplaceTempView("payment")

# Total amount per month of payment_date, largest revenue first.
monthly_revenue_query = """
    SELECT month(payment_date) as m, sum(amount) as revenue
    FROM payment
    GROUP by m
    order by revenue desc
"""
monthly_revenue = spark.sql(monthly_revenue_query)
monthly_revenue.show()

[Stage 4:>                                                          (0 + 1) / 1]

+---+------------------+
|  m|           revenue|
+---+------------------+
|  4|28559.460000003943|
|  3|23886.560000002115|
|  2| 9631.879999999608|
|  1| 4824.429999999856|
|  5|  514.180000000001|
+---+------------------+



                                                                                