In [1]:
from pyspark.sql import SparkSession




In [3]:
spark = SparkSession.builder.appName('workingWithData').getOrCreate()

In [4]:
spark

# section 2 Working with data
# chapter 5 Partitioning Data

In [9]:
def debug(iterator):
    print("elements=", list(iterator))

In [10]:
# partition 
numbers = [1,2,3,4,5,6,7,8,9,10,11,12]
rdd = spark.sparkContext.parallelize(numbers)
num_partition = rdd.getNumPartitions()
num_partition

8

In [17]:
rdd.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

22/09/05 13:42:02 INFO TaskSetManager: Starting task 2.0 in stage 0.0 (TID 2, avinash, executor driver, partition 2, PROCESS_LOCAL, 7360 bytes)
22/09/05 13:42:02 INFO TaskSetManager: Starting task 3.0 in stage 0.0 (TID 3, avinash, executor driver, partition 3, PROCESS_LOCAL, 7387 bytes)
22/09/05 13:42:03 INFO TaskSetManager: Starting task 4.0 in stage 0.0 (TID 4, avinash, executor driver, partition 4, PROCESS_LOCAL, 7360 bytes)
22/09/05 13:42:03 INFO TaskSetManager: Starting task 5.0 in stage 0.0 (TID 5, avinash, executor driver, partition 5, PROCESS_LOCAL, 7387 bytes)
22/09/05 13:42:03 INFO TaskSetManager: Starting task 6.0 in stage 0.0 (TID 6, avinash, executor driver, partition 6, PROCESS_LOCAL, 7360 bytes)
22/09/05 13:42:03 INFO TaskSetManager: Starting task 7.0 in stage 0.0 (TID 7, avinash, executor driver, partition 7, PROCESS_LOCAL, 7387 bytes)
22/09/05 13:42:03 INFO Executor: Running task 5.0 in stage 0.0 (TID 5)
22/09/05 13:42:03 INFO Executor: Running task 0.0 in stage 0.0 (TID 0)      
22/09/05 13:42:03 INFO Executor: Running task 3.0 in stage 0.0 (TID 3)      
22/09/05 13:42:03 INFO Executor: Running task 2.0 in stage 0.0 (TID 2)      
22/09/05 13:42:03 INFO Executor: Running task 1.0 in stage 0.0 (TID 1)      
22/09/05 13:42:03 INFO Executor: Running task 6.0 in stage 0.0 (TID 6)      
22/09/05 13:42:03 INFO Executor: Running task 7.0 in stage 0.0 (TID 7)      
22/09/05 13:42:03 INFO Executor: Running task 4.0 in stage 0.0 (TID 4)
elements= [10]
22/09/05 13:42:04 INFO PythonRunner: Times: total = 737, boot = 722, init = 15, finish = 0
elements= [1]
22/09/05 13:42:04 INFO PythonRunner: Times: total = 1348, boot = 1348, init = 0, finish = 0
elements= [5, 6]
22/09/05 13:42:05 INFO PythonRunner: Times: total = 1992, boot = 1992, init = 0, finish = 0
elements= [11, 12]
22/09/05 13:42:06 INFO PythonRunner: Times: total = 2619, boot = 2619, init = 0, finish = 0
elements= [8, 9]
22/09/05 13:42:06 INFO PythonRunner: Times: total = 3248, boot = 3248, init = 0, finish = 0
elements= [2, 3]
22/09/05 13:42:07 INFO PythonRunner: Times: total = 3879, boot = 3879, init = 0, finish = 0
elements= [4]
22/09/05 13:42:07 INFO PythonRunner: Times: total = 4503, boot = 4503, init = 0, finish = 0
elements= [7]
22/09/05 13:42:08 INFO PythonRunner: Times: total = 5145, boot = 5145, init = 0, finish = 0

In [4]:
df  = spark.read.option("inferschema","true")\
    .csv('data/customers_with_date.txt').toDF('customer_id','date','trnx_Id','amount')

In [5]:
df.show()

+-----------+---------+-------+------+
|customer_id|     date|trnx_Id|amount|
+-----------+---------+-------+------+
|         c1| 2/9/2019|  T0011|    20|
|         c1| 2/9/2019|  T0012|    12|
|         c1| 3/9/2019|  T0013|    30|
|         c1| 3/9/2019|  T0014|    42|
|         c1|4/12/2019|  T0023|    48|
|         c1|4/12/2018|  T0051|    28|
|         c1|4/12/2019|  T0043|    42|
|         c1|4/12/2018|  T0091|    29|
|         c1| 1/3/2018|  T0002|    12|
|         c1| 4/3/2018|  T0003|    44|
|         c2|2/10/2019|  T0511|    20|
|         c2|2/10/2019|  T0612|    17|
|         c2| 2/9/2019|  T0061|    25|
|         c2| 2/9/2019|  T0062|    78|
|         c2|3/12/2019|  T0513|    67|
|         c2|3/12/2019|  T0014|    42|
|         c2|4/10/2019|  T0023|    48|
|         c2|4/10/2018|  T0051|    28|
|         c2|4/12/2019|  T0043|    42|
|         c2|4/12/2018|  T0091|    29|
+-----------+---------+-------+------+
only showing top 20 rows



In [6]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType,StringType

#-------------------------------------
# date_as_str: day/month/year
@udf(returnType=IntegerType())
def get_year(date_as_str):
    tokens = date_as_str.split("/")
    return int(tokens[2])
#end-def
#-------------------------------------
# date_as_str: day/month/year
@udf(returnType=IntegerType())
def get_month(date_as_str):
    tokens = date_as_str.split("/")
    return int(tokens[1])

In [7]:
# add column with year and month
df2 = df.withColumn("year",get_year(df.date))\
    .withColumn("month",get_month(df.date))

In [9]:
df2.show()

+-----------+---------+-------+------+----+-----+
|customer_id|     date|trnx_Id|amount|year|month|
+-----------+---------+-------+------+----+-----+
|         c1| 2/9/2019|  T0011|    20|2019|    9|
|         c1| 2/9/2019|  T0012|    12|2019|    9|
|         c1| 3/9/2019|  T0013|    30|2019|    9|
|         c1| 3/9/2019|  T0014|    42|2019|    9|
|         c1|4/12/2019|  T0023|    48|2019|   12|
|         c1|4/12/2018|  T0051|    28|2018|   12|
|         c1|4/12/2019|  T0043|    42|2019|   12|
|         c1|4/12/2018|  T0091|    29|2018|   12|
|         c1| 1/3/2018|  T0002|    12|2018|    3|
|         c1| 4/3/2018|  T0003|    44|2018|    3|
|         c2|2/10/2019|  T0511|    20|2019|   10|
|         c2|2/10/2019|  T0612|    17|2019|   10|
|         c2| 2/9/2019|  T0061|    25|2019|    9|
|         c2| 2/9/2019|  T0062|    78|2019|    9|
|         c2|3/12/2019|  T0513|    67|2019|   12|
|         c2|3/12/2019|  T0014|    42|2019|   12|
|         c2|4/10/2019|  T0023|    48|2019|   10|


In [12]:
# write to partition
df2.write.partitionBy("year",'month')\
    .parquet('output/parquetdata')

In [13]:
from pyspark.sql.types import IntegerType,StringType

#-------------------------------------
# date_as_str: day/month/year
@udf(returnType=StringType())
def get_year1(date_as_str):
    tokens = date_as_str.split("/")
    return tokens[2]
#end-def
#-------------------------------------
# date_as_str: day/month/year
@udf(returnType=StringType())
def get_month1(date_as_str):
    tokens = date_as_str.split("/")
    return tokens[1]

In [14]:
# add column with year and month
df3 = df.withColumn("year",get_year1(df.date))\
    .withColumn("month",get_month1(df.date))

In [15]:
df3.show()

+-----------+---------+-------+------+----+-----+
|customer_id|     date|trnx_Id|amount|year|month|
+-----------+---------+-------+------+----+-----+
|         c1| 2/9/2019|  T0011|    20|2019|    9|
|         c1| 2/9/2019|  T0012|    12|2019|    9|
|         c1| 3/9/2019|  T0013|    30|2019|    9|
|         c1| 3/9/2019|  T0014|    42|2019|    9|
|         c1|4/12/2019|  T0023|    48|2019|   12|
|         c1|4/12/2018|  T0051|    28|2018|   12|
|         c1|4/12/2019|  T0043|    42|2019|   12|
|         c1|4/12/2018|  T0091|    29|2018|   12|
|         c1| 1/3/2018|  T0002|    12|2018|    3|
|         c1| 4/3/2018|  T0003|    44|2018|    3|
|         c2|2/10/2019|  T0511|    20|2019|   10|
|         c2|2/10/2019|  T0612|    17|2019|   10|
|         c2| 2/9/2019|  T0061|    25|2019|    9|
|         c2| 2/9/2019|  T0062|    78|2019|    9|
|         c2|3/12/2019|  T0513|    67|2019|   12|
|         c2|3/12/2019|  T0014|    42|2019|   12|
|         c2|4/10/2019|  T0023|    48|2019|   10|


In [17]:
df3.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- trnx_Id: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)



In [16]:
# write to partition
df3.write.partitionBy("year",'month')\
    .text('output/textdata')

AnalysisException: Text data source does not support int data type.;

# chapter 7. interacting with external Data sources

## mysql 

In [7]:
# load(path=None,format=None,schema=None,**options)
# read rdbms table   spark.read.format('jdbc').options(url=url,driver=driver,dbtable=dbtable,user=username,password=password).load()
dataframe_mysql = spark.read\
    .format("jdbc")\
    .option('url',"jdbc:mysql://localhost")\
    .option("driver","com.mysql.jdbc.Driver")\
    .option('dbtable','metadb.dept')\
    .option('user','root')\
    .option('password','root')\
    .load()

In [8]:
dataframe_mysql.show()

+-----------+----------+-------------+-------+
|dept_number| dept_name|dept_location|manager|
+-----------+----------+-------------+-------+
|         10|ACCOUNTING| NEW YORK, NY|   alex|
|         20|  RESEARCH|   DALLAS, TX|   alex|
|         30|     SALES|  CHICAGO, IL|   jane|
|         40|OPERATIONS|   BOSTON, MA|   jane|
|         50| MARKETING|Sunnyvale, CA|   jane|
|         60|  SOFTWARE| Stanford, CA|   jane|
|         70|  HARDWARE|   BOSTON, MA| sophia|
+-----------+----------+-------------+-------+



In [9]:
dataframe_mysql.count()

7

In [10]:
dataframe_mysql.printSchema()

root
 |-- dept_number: integer (nullable = true)
 |-- dept_name: string (nullable = true)
 |-- dept_location: string (nullable = true)
 |-- manager: string (nullable = true)



In [11]:
# show dept_number and manageer 
dataframe_mysql.select("dept_number",'manager').show()

+-----------+-------+
|dept_number|manager|
+-----------+-------+
|         10|   alex|
|         20|   alex|
|         30|   jane|
|         40|   jane|
|         50|   jane|
|         60|   jane|
|         70| sophia|
+-----------+-------+



In [12]:
dataframe_mysql.select("dept_number","manager").groupBy("manager").count().show()

+-------+-----+
|manager|count|
+-------+-----+
|   jane|    4|
|   alex|    2|
| sophia|    1|
+-------+-----+



In [14]:
# to run fulley fledged quiery against dataframe first register dataframe as table

dataframe_mysql.registerTempTable('mydept')

spark.sql("select * from mydept").show()

+-----------+----------+-------------+-------+
|dept_number| dept_name|dept_location|manager|
+-----------+----------+-------------+-------+
|         10|ACCOUNTING| NEW YORK, NY|   alex|
|         20|  RESEARCH|   DALLAS, TX|   alex|
|         30|     SALES|  CHICAGO, IL|   jane|
|         40|OPERATIONS|   BOSTON, MA|   jane|
|         50| MARKETING|Sunnyvale, CA|   jane|
|         60|  SOFTWARE| Stanford, CA|   jane|
|         70|  HARDWARE|   BOSTON, MA| sophia|
+-----------+----------+-------------+-------+



In [17]:
spark.sql("select * from mydept where dept_number < 40 order by dept_name").show()

+-----------+----------+-------------+-------+
|dept_number| dept_name|dept_location|manager|
+-----------+----------+-------------+-------+
|         10|ACCOUNTING| NEW YORK, NY|   alex|
|         20|  RESEARCH|   DALLAS, TX|   alex|
|         30|     SALES|  CHICAGO, IL|   jane|
+-----------+----------+-------------+-------+



In [19]:
triplets = [ ("alex", 60, 18000),
... ("adel", 40, 45000),
... ("adel", 50, 77000),
... ("jane", 40, 52000),
... ("jane", 60, 81000),
... ("alex", 50, 62000),
... ("mary", 50, 92000),
... ("mary", 60, 63000),
... ("mary", 40, 55000),
... ("mary", 40, 55000)
... ]

triplet_df = spark.createDataFrame(triplets,['name','age','salary'])
triplet_df.show()

+----+---+------+
|name|age|salary|
+----+---+------+
|alex| 60| 18000|
|adel| 40| 45000|
|adel| 50| 77000|
|jane| 40| 52000|
|jane| 60| 81000|
|alex| 50| 62000|
|mary| 50| 92000|
|mary| 60| 63000|
|mary| 40| 55000|
|mary| 40| 55000|
+----+---+------+



In [23]:
# write to mysql
driver = "com.mysql.jdbc.Driver"
url = "jdbc:mysql://localhost"
username= 'root'
password = 'root'
dbtable='metadb.triplets'


triplet_df.write.format('jdbc').options(url=url,driver=driver,dbtable=dbtable,user=username,password=password)\
    .save()

In [None]:
# modes = (append,overwrite,ignore,error(default if data aleready existst))

## csv file

In [27]:
# creatting new dataframe from csv files
emp_df = spark.read.csv('data/emp.csv',inferSchema=True,header=True)

In [28]:
emp_df.show()

+-----+----+-----+
| dept|name|hours|
+-----+----+-----+
|Sales|Barb|   40|
|Sales| Dan|   20|
|   IT|Alex|   22|
|   IT|Jane|   24|
|   HR|Alex|   20|
|   HR|Mary|   30|
+-----+----+-----+



In [29]:
emp_df1 = spark.read\
    .format('csv')\
    .options(header=True,inferschema=True)\
    .load('data/emp.csv')

In [30]:
emp_df1.show()

+-----+----+-----+
| dept|name|hours|
+-----+----+-----+
|Sales|Barb|   40|
|Sales| Dan|   20|
|   IT|Alex|   22|
|   IT|Jane|   24|
|   HR|Alex|   20|
|   HR|Mary|   30|
+-----+----+-----+



In [31]:
emp_df1.printSchema()

root
 |-- dept: string (nullable = true)
 |-- name: string (nullable = true)
 |-- hours: integer (nullable = true)



In [32]:
emp_df.write.csv('employee.csv',header=True)

## Json files

In [34]:
# json files
data_path = 'data/emp.json'
json_df = spark.read.json(data_path)
json_df.show()

+-------+------+
|   name|salary|
+-------+------+
|Michael|  3000|
|   Andy|  4500|
| Justin|  3500|
|  Berta|  4000|
+-------+------+



In [35]:
json_df.printSchema()

root
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)



In [36]:
df2 = spark.read.format('json')\
    .load([data_path, data_path])

df2.show()

+-------+------+
|   name|salary|
+-------+------+
|Michael|  3000|
|   Andy|  4500|
| Justin|  3500|
|  Berta|  4000|
|Michael|  3000|
|   Andy|  4500|
| Justin|  3500|
|  Berta|  4000|
+-------+------+



In [37]:
# write json
data = [("name", "alex"), ("gender", "male"), ("state",
"CA")]

df =spark.createDataFrame(data,['key','value'])
df.show()

+------+-----+
|   key|value|
+------+-----+
|  name| alex|
|gender| male|
| state|   CA|
+------+-----+



In [39]:
df.write.json('tmp/data') # want only one file -> df.repartition(1).write.json('data)

## Reading and writing form Amazon S3

In [None]:
# writing to aws S3
## http://s3.<region>.amazonaws.com/<bucket>/<key>
## http://s3.us-east-1.amazonaws.com/project-dev/dna/sample123.vcf

# # spark: SparkSession
# sc = spark.sparkContext
# # set access key
# sc._jsc.hadoopConfiguration().set("fs.s3.awsAccessKeyId", "AKIAI74O5KPLUQGVOJWQ")
# # set secret key
# sc._jsc.hadoopConfiguration().set("fs.s3.awsSecretAccessKey", "LmuKE7afdasdfxK2vj1nfA0Bp")

In [None]:
# s3_object_path = "s3n://bucket-name/object-path"
# df = spark.read.text(s3_object_path)

In [40]:
# use boto3
import boto3
s3 = boto3.resource('s3')
bucket = 'avinashhivedata'
key = 'usdata1.csv'
obj = s3.Object(bucket,key)

In [42]:
re = obj.get()

In [None]:
re.keys()

In [12]:

s3_object_path='s3a://avinashhivedata/usdata1.csv'
df = spark.read.text(s3_object_path)

In [14]:
df.show()

+--------------------+
|               value|
+--------------------+
|first_name,last_n...|
|James,Butt,"Bento...|
|Josephine,Darakjy...|
|Art,Venere,"Cheme...|
|Lenna,Paprocki,Fe...|
|Donette,Foller,Pr...|
|Simona,Morasca,"C...|
|Mitsue,Tollner,Mo...|
|Leota,Dilliard,Co...|
|Sage,Wieser,Truhl...|
|Kris,Marrier,"Kin...|
|Minna,Amigon,"Dor...|
|Abel,Maclead,Rang...|
|Kiley,Caldarera,F...|
|Graciela,Ruta,Buc...|
|Cammy,Albares,"Ro...|
|Mattie,Poquette,C...|
|Meaghan,Garufi,"B...|
|Gladys,Rim,T M By...|
|Yuki,Whobrey,Farm...|
+--------------------+
only showing top 20 rows



In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local").appName("test").getOrCreate()

# Enable hadoop s3a settings
spark.sparkContext._jsc.hadoopConfiguration().set("com.amazonaws.services.s3.enableV4", "true")
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.aws.credentials.provider", \
                                     "com.amazonaws.auth.InstanceProfileCredentialsProvider,com.amazonaws.auth.DefaultAWSCredentialsProviderChain")
spark.sparkContext._jsc.hadoopConfiguration().set("fs.AbstractFileSystem.s3a.impl", "org.apache.hadoop.fs.s3a.S3A")





In [4]:
Access_key_ID="AKIA4UOXHMLTI6TODZO6"
Secret_access_key="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"


spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key",Access_key_ID)
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key",Secret_access_key)
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3.us-east-1.amazonaws.com")

In [5]:
s3_uri = "s3a://avinashhivedata/usdata1.csv"
data="s3a://s3databucket/input/us-500.csv"
df=spark.read.format('csv').option("header","true").option("inferSchema","true").load(s3_uri)
df.show()

+----------+---------+--------------------+--------------------+-------------+--------------+-----+-----+---+------------+------------+--------------------+--------------------+
|first_name|last_name|        company_name|             address|         city|       country|state|  zip|age|      phone1|      phone2|               email|                 web|
+----------+---------+--------------------+--------------------+-------------+--------------+-----+-----+---+------------+------------+--------------------+--------------------+
|     James|     Butt|   Benton, John B Jr|  6649 N Blue Gum St|  New Orleans|       Orleans|   LA|70116|  9|504-621-8927|504-845-1427|     jbutt@gmail.com|http://www.benton...|
| Josephine|  Darakjy|Chanay, Jeffrey A...| 4 B Blue Ridge Blvd|     Brighton|    Livingston|   MI|48116|  8|810-292-9388|810-374-9840|josephine_darakjy...|http://www.chanay...|
|       Art|   Venere| Chemel, James L Cpa|8 W Cerritos Ave #54|   Bridgeport|    Gloucester|   NJ| 8014|  7|8

In [6]:
spark

In [7]:
df_ = spark.read.parquet('output/parquetdata')

In [8]:
df_.show()

+-----------+---------+-------+------+----+-----+
|customer_id|     date|trnx_Id|amount|year|month|
+-----------+---------+-------+------+----+-----+
|         c1|4/12/2018|  T0051|    28|2018|   12|
|         c1|4/12/2018|  T0091|    29|2018|   12|
|         c2|4/12/2018|  T0091|    29|2018|   12|
|         c1|4/12/2019|  T0023|    48|2019|   12|
|         c1|4/12/2019|  T0043|    42|2019|   12|
|         c2|3/12/2019|  T0513|    67|2019|   12|
|         c2|3/12/2019|  T0014|    42|2019|   12|
|         c2|4/12/2019|  T0043|    42|2019|   12|
|         c1| 2/9/2019|  T0011|    20|2019|    9|
|         c1| 2/9/2019|  T0012|    12|2019|    9|
|         c1| 3/9/2019|  T0013|    30|2019|    9|
|         c1| 3/9/2019|  T0014|    42|2019|    9|
|         c2| 2/9/2019|  T0061|    25|2019|    9|
|         c2| 2/9/2019|  T0062|    78|2019|    9|
|         c2|2/10/2019|  T0511|    20|2019|   10|
|         c2|2/10/2019|  T0612|    17|2019|   10|
|         c2|4/10/2019|  T0023|    48|2019|   10|


In [10]:
s3_path = 's3a://avinashhivedata/parquet_part'
df_.write.partitionBy("year",'month')\
    .parquet(s3_path)

In [9]:
s3_path = 's3a://avinashhivedata/output'
df_.write.format("csv").mode("overwrite").save(s3_path)

## HDFS

In [None]:
# pending

In [15]:
spark.stop()

In [8]:
spark.stop()