In [2]:
from pyspark.sql import SparkSession

# Start (or attach to) the Spark session that backs this notebook.
spark = (
    SparkSession.builder
    .appName("SimpleApp")
    .getOrCreate()
)


In [3]:
from pyspark.sql import SparkSession
# Configure a builder step by step, then ask for the session.
# NOTE(review): getOrCreate() hands back an already-running session when one
# exists, so after the previous cell this presumably rebinds `spark` to the
# same session instead of starting a second one - confirm this cell is needed.
builder = SparkSession.builder.appName("Python Spark SQL basic example")
builder = builder.config("spark.some.config.option", "some-value")
spark = builder.getOrCreate()

In [4]:
from pyspark.sql.types import*


# Read the raw text file as an RDD of lines through the underlying SparkContext.
sc = spark.sparkContext
lines = sc.textFile('./data/people.txt')

# Every line is comma-separated: the first field becomes `name`,
# the second is parsed as an integer `age`.
parts = lines.map(lambda line: line.split(","))
people = parts.map(lambda fields: Row(name=fields[0], age=int(fields[1])))

# No schema is passed, so Spark infers it from the Row objects.
peopleDf = spark.createDataFrame(people)
peopleDf.show()

+------+---+
|  name|age|
+------+---+
| Luffy| 17|
|  Nami| 26|
| Robin| 38|
| Brook|108|
|shanks| 42|
| Buggy| 40|
+------+---+



In [5]:

# Build the DataFrame with a programmatically specified schema.
# `age` is parsed to a Python int below, so the schema declares it as
# IntegerType; declaring every field as StringType (as before) silently
# turned the integer ages into a string column.
people = parts.map(lambda p: Row(name=p[0], age=int(p[1].strip())))

schemaString = "name age"
# Spark SQL type for each field named in schemaString.
fieldTypes = {"name": StringType(), "age": IntegerType()}
fields = [
    StructField(field_name, fieldTypes[field_name], True)  # True -> nullable
    for field_name in schemaString.split()
]
schema = StructType(fields)
spark.createDataFrame(people, schema).show()

+------+---+
|  name|age|
+------+---+
| Luffy| 17|
|  Nami| 26|
| Robin| 38|
| Brook|108|
|shanks| 42|
| Buggy| 40|
+------+---+



In [None]:
# JSON through the dedicated reader method.
df = spark.read.json("customer.json")
df.show()

# JSON again, this time through the generic loader with the format named explicitly.
df2 = spark.read.format("json").load("people.json")
# No format given: the session's default data source is used
# (presumably parquet, matching the file extension - confirm the session config).
df3 = spark.read.load("users.parquet")
# Plain text file read through the text reader.
df4 = spark.read.text("people.txt")

In [9]:
# Display only the people whose age is greater than 24.
peopleDf.where(peopleDf.age > 24).show()
# Remove duplicate rows and keep the result under the same name.
peopleDf = peopleDf.dropDuplicates()

+------+---+
|  name|age|
+------+---+
|  Nami| 26|
| Robin| 38|
| Brook|108|
|shanks| 42|
| Buggy| 40|
+------+---+



In [28]:
# Column names for the DataFrame, in the same order as the tuple fields below.
columns = ['Name', 'Age', 'Courses_enrolled']

# Sample student records: (name, age, list of enrolled courses).
# NOTE: the ages are string literals, so `Age` is created as a string column.
data = [
    ('Jaya', '20', ['SQL', 'Data Science']),
    ('Milan', '21', ['ML', 'AI']),
    ('Rohit', '19', ['Programming', 'DSA']),
    ('Maria', '20', ['DBMS', 'Networking']),
    ('Jay', '22', ['Data Analytics', 'ML']),
]

# Build the DataFrame from the local records.
df = spark.createDataFrame(data, columns)

In [29]:
from pyspark.sql import functions as F


# Project a single column, then two columns.
df.select("Name").show()
df.select("Name", "Age").show()

# Flatten the Courses_enrolled array: one output row per (student, course).
# NOTE: this rebinds `df`, so every later cell sees one row per course.
# (The previous version ended this statement with a dangling line
# continuation that ran into commented-out code; both are removed.)
df = df.select("Name", "Age", F.explode("Courses_enrolled").alias("Courses"))

# Arithmetic on a column. Age holds strings, which is why the recorded
# output shows the sum as a floating-point value (e.g. 21.0).
df.select(df["Name"], df["Age"] + 1).show()

# A comparison inside select() is projected as a true/false column with one
# value per row; it does NOT filter rows (use filter()/where() for that).
df.select(df['Age'] > 24).show()

+-----+
| Name|
+-----+
| Jaya|
|Milan|
|Rohit|
|Maria|
|  Jay|
+-----+

+-----+---+
| Name|Age|
+-----+---+
| Jaya| 20|
|Milan| 21|
|Rohit| 19|
|Maria| 20|
|  Jay| 22|
+-----+---+

+-----+---------+
| Name|(Age + 1)|
+-----+---------+
| Jaya|     21.0|
| Jaya|     21.0|
|Milan|     22.0|
|Milan|     22.0|
|Rohit|     20.0|
|Rohit|     20.0|
|Maria|     21.0|
|Maria|     21.0|
|  Jay|     23.0|
|  Jay|     23.0|
+-----+---------+

+----------+
|(Age > 24)|
+----------+
|     false|
|     false|
|     false|
|     false|
|     false|
|     false|
|     false|
|     false|
|     false|
|     false|
+----------+



In [31]:

# Flag column: 1 when Age is greater than 21, otherwise 0.
adult_flag = F.when(df.Age > 21, 1).otherwise(0).alias("Adult")
df.select("Name", adult_flag).dropDuplicates().show()

# Collect the rows whose Name is one of the listed values into a local list.
lis = df.filter(df.Name.isin("Jaya", "Boris")).collect()
print(lis)

+-----+-----+
| Name|Adult|
+-----+-----+
| Jaya|    0|
|Milan|    0|
|Rohit|    0|
|Maria|    0|
|  Jay|    1|
+-----+-----+

[Row(Name='Jaya', Age='20', Courses='SQL'), Row(Name='Jaya', Age='20', Courses='Data Science')]


In [32]:
# Print the current contents of df as a text table.
df.show()

+-----+---+--------------+
| Name|Age|       Courses|
+-----+---+--------------+
| Jaya| 20|           SQL|
| Jaya| 20|  Data Science|
|Milan| 21|            ML|
|Milan| 21|            AI|
|Rohit| 19|   Programming|
|Rohit| 19|           DSA|
|Maria| 20|          DBMS|
|Maria| 20|    Networking|
|  Jay| 22|Data Analytics|
|  Jay| 22|            ML|
+-----+---+--------------+



In [35]:
# For every row, report whether the course name contains "Data".
contains_data = df.Courses.like("%Data%")
df.select("Name", contains_data).show()

+-----+-------------------+
| Name|Courses LIKE %Data%|
+-----+-------------------+
| Jaya|              false|
| Jaya|               true|
|Milan|              false|
|Milan|              false|
|Rohit|              false|
|Rohit|              false|
|Maria|              false|
|Maria|              false|
|  Jay|               true|
|  Jay|              false|
+-----+-------------------+



In [36]:
# Keep only the rows whose course name contains "Data".
df.where(df["Courses"].like("%Data%")).show()

+----+---+--------------+
|Name|Age|       Courses|
+----+---+--------------+
|Jaya| 20|  Data Science|
| Jay| 22|Data Analytics|
+----+---+--------------+



In [39]:
# Keep only the rows whose course name begins with "Da".
df.where(df["Courses"].startswith("Da")).show()

+----+---+--------------+
|Name|Age|       Courses|
+----+---+--------------+
|Jaya| 20|  Data Science|
| Jay| 22|Data Analytics|
+----+---+--------------+



In [41]:
# Keep only the rows whose course name ends with "ing".
df.where(df["Courses"].endswith("ing")).show()

+-----+---+-----------+
| Name|Age|    Courses|
+-----+---+-----------+
|Rohit| 19|Programming|
|Maria| 20| Networking|
+-----+---+-----------+



In [43]:
# Take the first three characters of Name (substr positions start at 1)
# and expose them as the "Name Code" column.
name_code = df["Name"].substr(1, 3).alias("Name Code")
df.select(name_code).show()

+---------+
|Name Code|
+---------+
|      Jay|
|      Jay|
|      Mil|
|      Mil|
|      Roh|
|      Roh|
|      Mar|
|      Mar|
|      Jay|
|      Jay|
+---------+



In [45]:
# Keep the rows whose Age lies between 22 and 24.
df.where(df["Age"].between(22, 24)).show()

+----+---+--------------+
|Name|Age|       Courses|
+----+---+--------------+
| Jay| 22|Data Analytics|
| Jay| 22|            ML|
+----+---+--------------+



In [54]:
from pyspark.sql.functions import lit

# Constant values to attach to every row, keyed by the new column name.
address = {
    'city': 'Chennai',
    'postalCode': 800007,
    'state': 'GOA',
    'streetAddress': 'Dublin street Ireland colony',
}

# Add one literal column per entry, in the dictionary's insertion order.
for column_name, value in address.items():
    df = df.withColumn(column_name, lit(value))

In [55]:
# Print df again; it now also carries the constant address columns added above.
df.show()

+-----+---+--------------+-------+----------+-----+--------------------+
| Name|Age|       Courses|   city|postalCode|state|       streetAddress|
+-----+---+--------------+-------+----------+-----+--------------------+
| Jaya| 20|           SQL|Chennai|    800007|  GOA|Dublin street Ire...|
| Jaya| 20|  Data Science|Chennai|    800007|  GOA|Dublin street Ire...|
|Milan| 21|            ML|Chennai|    800007|  GOA|Dublin street Ire...|
|Milan| 21|            AI|Chennai|    800007|  GOA|Dublin street Ire...|
|Rohit| 19|   Programming|Chennai|    800007|  GOA|Dublin street Ire...|
|Rohit| 19|           DSA|Chennai|    800007|  GOA|Dublin street Ire...|
|Maria| 20|          DBMS|Chennai|    800007|  GOA|Dublin street Ire...|
|Maria| 20|    Networking|Chennai|    800007|  GOA|Dublin street Ire...|
|  Jay| 22|Data Analytics|Chennai|    800007|  GOA|Dublin street Ire...|
|  Jay| 22|            ML|Chennai|    800007|  GOA|Dublin street Ire...|
+-----+---+--------------+-------+----------+-----+--------------------+

In [56]:
# Rename streetAddress to Address, then drop that column straight away
# (demonstrates both operations; the column is gone afterwards).
df = df.withColumnRenamed('streetAddress', 'Address').drop('Address')
# Preview only the first two rows.
df.show(2)

+----+---+------------+-------+----------+-----+
|Name|Age|     Courses|   city|postalCode|state|
+----+---+------------+-------+----------+-----+
|Jaya| 20|         SQL|Chennai|    800007|  GOA|
|Jaya| 20|Data Science|Chennai|    800007|  GOA|
+----+---+------------+-------+----------+-----+
only showing top 2 rows



In [57]:
# Missing-value helpers, called through the DataFrame-level aliases of df.na.*:
# replace nulls with 50 ...
df.fillna(50).show()
# ... drop the rows that contain nulls ...
df.dropna().show()
# ... and replace the value 10 with 20.
df.replace(10, 20).show()

+-----+---+--------------+-------+----------+-----+
| Name|Age|       Courses|   city|postalCode|state|
+-----+---+--------------+-------+----------+-----+
| Jaya| 20|           SQL|Chennai|    800007|  GOA|
| Jaya| 20|  Data Science|Chennai|    800007|  GOA|
|Milan| 21|            ML|Chennai|    800007|  GOA|
|Milan| 21|            AI|Chennai|    800007|  GOA|
|Rohit| 19|   Programming|Chennai|    800007|  GOA|
|Rohit| 19|           DSA|Chennai|    800007|  GOA|
|Maria| 20|          DBMS|Chennai|    800007|  GOA|
|Maria| 20|    Networking|Chennai|    800007|  GOA|
|  Jay| 22|Data Analytics|Chennai|    800007|  GOA|
|  Jay| 22|            ML|Chennai|    800007|  GOA|
+-----+---+--------------+-------+----------+-----+

+-----+---+--------------+-------+----------+-----+
| Name|Age|       Courses|   city|postalCode|state|
+-----+---+--------------+-------+----------+-----+
| Jaya| 20|           SQL|Chennai|    800007|  GOA|
| Jaya| 20|  Data Science|Chennai|    800007|  GOA|
|Milan| 21|

In [58]:
# Count how many rows share each age value.
rows_per_age = df.groupBy("age").count()
rows_per_age.show()

+---+-----+
|age|count|
+---+-----+
| 20|    4|
| 21|    2|
| 19|    2|
| 22|    2|
+---+-----+



In [63]:
# Order the rows by Age, highest first, and bring them back as a list of Row objects.
df.orderBy(df["Age"].desc()).collect()


[Row(Name='Jay', Age='22', Courses='Data Analytics', city='Chennai', postalCode=800007, state='GOA'),
 Row(Name='Jay', Age='22', Courses='ML', city='Chennai', postalCode=800007, state='GOA'),
 Row(Name='Milan', Age='21', Courses='ML', city='Chennai', postalCode=800007, state='GOA'),
 Row(Name='Milan', Age='21', Courses='AI', city='Chennai', postalCode=800007, state='GOA'),
 Row(Name='Jaya', Age='20', Courses='SQL', city='Chennai', postalCode=800007, state='GOA'),
 Row(Name='Jaya', Age='20', Courses='Data Science', city='Chennai', postalCode=800007, state='GOA'),
 Row(Name='Maria', Age='20', Courses='DBMS', city='Chennai', postalCode=800007, state='GOA'),
 Row(Name='Maria', Age='20', Courses='Networking', city='Chennai', postalCode=800007, state='GOA'),
 Row(Name='Rohit', Age='19', Courses='Programming', city='Chennai', postalCode=800007, state='GOA'),
 Row(Name='Rohit', Age='19', Courses='DSA', city='Chennai', postalCode=800007, state='GOA')]

In [64]:
# Descending by age.
df.orderBy("age", ascending=False).show()
# Descending by name, then ascending by age (0 = descending, 1 = ascending).
df.sort(['name', 'age'], ascending=[0, 1]).show()

+-----+---+--------------+-------+----------+-----+
| Name|Age|       Courses|   city|postalCode|state|
+-----+---+--------------+-------+----------+-----+
|  Jay| 22|Data Analytics|Chennai|    800007|  GOA|
|  Jay| 22|            ML|Chennai|    800007|  GOA|
|Milan| 21|            ML|Chennai|    800007|  GOA|
|Milan| 21|            AI|Chennai|    800007|  GOA|
|Maria| 20|          DBMS|Chennai|    800007|  GOA|
| Jaya| 20|           SQL|Chennai|    800007|  GOA|
|Maria| 20|    Networking|Chennai|    800007|  GOA|
| Jaya| 20|  Data Science|Chennai|    800007|  GOA|
|Rohit| 19|   Programming|Chennai|    800007|  GOA|
|Rohit| 19|           DSA|Chennai|    800007|  GOA|
+-----+---+--------------+-------+----------+-----+

+-----+---+--------------+-------+----------+-----+
| Name|Age|       Courses|   city|postalCode|state|
+-----+---+--------------+-------+----------+-----+
|Rohit| 19|   Programming|Chennai|    800007|  GOA|
|Rohit| 19|           DSA|Chennai|    800007|  GOA|
|Milan| 21|

In [65]:
# Redistribute df into 10 partitions and report the partition count of the result.
df.repartition(10).rdd.getNumPartitions()

10

In [66]:
# Collapse df into a single partition and report the partition count of the result.
df.coalesce(1).rdd.getNumPartitions()

1

In [67]:

# Register the DataFrames as SQL views so they can be queried with spark.sql().
# The createOrReplace* variants are used so the cell can be re-run safely:
# createGlobalTempView()/createTempView() raise when the view already exists,
# and the extra createTempView("customer") call was redundant anyway.
peopleDf.createOrReplaceGlobalTempView("people")  # queried as global_temp.people
df.createOrReplaceTempView("customer")

In [68]:
# Query the registered views with SQL.
# show() only prints and returns None, so the query result is bound to the
# variable first and displayed in a separate step; previously df5 and
# peopleDf2 ended up holding None instead of a DataFrame.
df5 = spark.sql("SELECT * FROM customer")
df5.show()

peopleDf2 = spark.sql("SELECT * FROM global_temp.people")
peopleDf2.show()

+-----+---+--------------+-------+----------+-----+
| Name|Age|       Courses|   city|postalCode|state|
+-----+---+--------------+-------+----------+-----+
| Jaya| 20|           SQL|Chennai|    800007|  GOA|
| Jaya| 20|  Data Science|Chennai|    800007|  GOA|
|Milan| 21|            ML|Chennai|    800007|  GOA|
|Milan| 21|            AI|Chennai|    800007|  GOA|
|Rohit| 19|   Programming|Chennai|    800007|  GOA|
|Rohit| 19|           DSA|Chennai|    800007|  GOA|
|Maria| 20|          DBMS|Chennai|    800007|  GOA|
|Maria| 20|    Networking|Chennai|    800007|  GOA|
|  Jay| 22|Data Analytics|Chennai|    800007|  GOA|
|  Jay| 22|            ML|Chennai|    800007|  GOA|
+-----+---+--------------+-------+----------+-----+

+------+---+
|  name|age|
+------+---+
| Brook|108|
| Robin| 38|
| Luffy| 17|
|  Nami| 26|
| Buggy| 40|
|shanks| 42|
+------+---+



In [69]:

# Tour of the DataFrame inspection API.
# NOTE(review): the bare expressions below are evaluated and discarded because
# none of them is the last line of the cell; only the calls that print
# (show, printSchema, explain) produce output here.
df.dtypes #Return df column names and data types
df.show() #Display the content of df
df.head() #With no argument: return the first row only (pass n for the first n rows)
df.first() #Return first row
df.take(2) #Return the first 2 rows
df.schema #Return the schema of df
df.describe().show() #Compute and print summary statistics
df.columns #Return the columns of df
df.count() #Count the number of rows in df
df.distinct().count() #Count the number of distinct rows in df
df.printSchema() #Print the schema of df
df.explain() #Print the execution plan for df

+-----+---+--------------+-------+----------+-----+
| Name|Age|       Courses|   city|postalCode|state|
+-----+---+--------------+-------+----------+-----+
| Jaya| 20|           SQL|Chennai|    800007|  GOA|
| Jaya| 20|  Data Science|Chennai|    800007|  GOA|
|Milan| 21|            ML|Chennai|    800007|  GOA|
|Milan| 21|            AI|Chennai|    800007|  GOA|
|Rohit| 19|   Programming|Chennai|    800007|  GOA|
|Rohit| 19|           DSA|Chennai|    800007|  GOA|
|Maria| 20|          DBMS|Chennai|    800007|  GOA|
|Maria| 20|    Networking|Chennai|    800007|  GOA|
|  Jay| 22|Data Analytics|Chennai|    800007|  GOA|
|  Jay| 22|            ML|Chennai|    800007|  GOA|
+-----+---+--------------+-------+----------+-----+

+-------+-----+----------------+-------+-------+----------+-----+
|summary| Name|             Age|Courses|   city|postalCode|state|
+-------+-----+----------------+-------+-------+----------+-----+
|  count|   10|              10|     10|     10|        10|   10|
|   mea

In [70]:

# Conversions out of the DataFrame API.
# Only the last line's value is echoed by the notebook (the pandas table below).
rdd1 = df.rdd #Convert df into an RDD
df.toJSON().first() #Convert df into an RDD of JSON strings and take the first one (result is discarded)
df.toPandas() #Return the contents of df as Pandas DataFrame

Unnamed: 0,Name,Age,Courses,city,postalCode,state
0,Jaya,20,SQL,Chennai,800007,GOA
1,Jaya,20,Data Science,Chennai,800007,GOA
2,Milan,21,ML,Chennai,800007,GOA
3,Milan,21,AI,Chennai,800007,GOA
4,Rohit,19,Programming,Chennai,800007,GOA
5,Rohit,19,DSA,Chennai,800007,GOA
6,Maria,20,DBMS,Chennai,800007,GOA
7,Maria,20,Networking,Chennai,800007,GOA
8,Jay,22,Data Analytics,Chennai,800007,GOA
9,Jay,22,ML,Chennai,800007,GOA


In [None]:
# Write the Name and city columns as Parquet.
# - The format is named explicitly instead of relying on the session's default source.
# - mode("overwrite") lets the cell be re-run; the default save mode raises
#   when the output path already exists.
# NOTE(review): the failure recorded below is a java.lang.UnsatisfiedLinkError in
# NativeIO$Windows.access0, which usually points at missing Hadoop native
# Windows binaries (winutils.exe / hadoop.dll) rather than at this code -
# verify the HADOOP_HOME setup on the machine that runs this.
(
    df.select("Name", "city")
    .write
    .mode("overwrite")
    .parquet("./output/nameAndCity.parquet")
)


Py4JJavaError: An error occurred while calling o719.save.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 11 in stage 202.0 failed 1 times, most recent failure: Lost task 11.0 in stage 202.0 (TID 909) (Balamurugan213 executor driver): org.apache.spark.SparkException: [TASK_WRITE_FAILED] Task failed while writing rows to file:/y:/DEVELOPER.GIT/pyspark/output/nameAndCity.parquet.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.taskFailedWhileWritingRowsError(QueryExecutionErrors.scala:775)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:420)
	at org.apache.spark.sql.execution.datasources.WriteFilesExec.$anonfun$doExecuteWrite$1(WriteFiles.scala:100)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:842)
Caused by: java.lang.UnsatisfiedLinkError: 'boolean org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(java.lang.String, int)'
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Native Method)
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access(NativeIO.java:793)
	at org.apache.hadoop.fs.FileUtil.canRead(FileUtil.java:1249)
	at org.apache.hadoop.fs.FileUtil.list(FileUtil.java:1454)
	at org.apache.hadoop.fs.RawLocalFileSystem.listStatus(RawLocalFileSystem.java:601)
	at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:407)
	at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:391)
	at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:341)
	at org.apache.hadoop.fs.RawLocalFileSystem.rename(RawLocalFileSystem.java:505)
	at org.apache.hadoop.fs.ChecksumFileSystem.rename(ChecksumFileSystem.java:694)
	at org.apache.hadoop.hive.ql.io.ProxyLocalFileSystem.rename(ProxyLocalFileSystem.java:34)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitTask(FileOutputCommitter.java:600)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitTask(FileOutputCommitter.java:571)
	at org.apache.spark.mapred.SparkHadoopMapRedUtil$.$anonfun$commitTask$1(SparkHadoopMapRedUtil.scala:51)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.Utils$.timeTakenMs(Utils.scala:552)
	at org.apache.spark.mapred.SparkHadoopMapRedUtil$.performCommit$1(SparkHadoopMapRedUtil.scala:51)
	at org.apache.spark.mapred.SparkHadoopMapRedUtil$.commitTask(SparkHadoopMapRedUtil.scala:78)
	at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.commitTask(HadoopMapReduceCommitProtocol.scala:279)
	at org.apache.spark.sql.execution.datasources.FileFormatDataWriter.$anonfun$commit$1(FileFormatDataWriter.scala:107)
	at org.apache.spark.util.Utils$.timeTakenMs(Utils.scala:552)
	at org.apache.spark.sql.execution.datasources.FileFormatDataWriter.commit(FileFormatDataWriter.scala:107)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeTask$1(FileFormatWriter.scala:404)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1397)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:410)
	... 17 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeWrite$4(FileFormatWriter.scala:307)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.writeAndCommit(FileFormatWriter.scala:271)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeWrite(FileFormatWriter.scala:304)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:190)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:190)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:869)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:391)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:364)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:243)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:842)
Caused by: org.apache.spark.SparkException: [TASK_WRITE_FAILED] Task failed while writing rows to file:/y:/DEVELOPER.GIT/pyspark/output/nameAndCity.parquet.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.taskFailedWhileWritingRowsError(QueryExecutionErrors.scala:775)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:420)
	at org.apache.spark.sql.execution.datasources.WriteFilesExec.$anonfun$doExecuteWrite$1(WriteFiles.scala:100)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	... 1 more
Caused by: java.lang.UnsatisfiedLinkError: 'boolean org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(java.lang.String, int)'
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Native Method)
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access(NativeIO.java:793)
	at org.apache.hadoop.fs.FileUtil.canRead(FileUtil.java:1249)
	at org.apache.hadoop.fs.FileUtil.list(FileUtil.java:1454)
	at org.apache.hadoop.fs.RawLocalFileSystem.listStatus(RawLocalFileSystem.java:601)
	at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:407)
	at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:391)
	at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:341)
	at org.apache.hadoop.fs.RawLocalFileSystem.rename(RawLocalFileSystem.java:505)
	at org.apache.hadoop.fs.ChecksumFileSystem.rename(ChecksumFileSystem.java:694)
	at org.apache.hadoop.hive.ql.io.ProxyLocalFileSystem.rename(ProxyLocalFileSystem.java:34)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitTask(FileOutputCommitter.java:600)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitTask(FileOutputCommitter.java:571)
	at org.apache.spark.mapred.SparkHadoopMapRedUtil$.$anonfun$commitTask$1(SparkHadoopMapRedUtil.scala:51)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.Utils$.timeTakenMs(Utils.scala:552)
	at org.apache.spark.mapred.SparkHadoopMapRedUtil$.performCommit$1(SparkHadoopMapRedUtil.scala:51)
	at org.apache.spark.mapred.SparkHadoopMapRedUtil$.commitTask(SparkHadoopMapRedUtil.scala:78)
	at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.commitTask(HadoopMapReduceCommitProtocol.scala:279)
	at org.apache.spark.sql.execution.datasources.FileFormatDataWriter.$anonfun$commit$1(FileFormatDataWriter.scala:107)
	at org.apache.spark.util.Utils$.timeTakenMs(Utils.scala:552)
	at org.apache.spark.sql.execution.datasources.FileFormatDataWriter.commit(FileFormatDataWriter.scala:107)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeTask$1(FileFormatWriter.scala:404)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1397)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:410)
	... 17 more


In [77]:
# Write the Name and Age columns as JSON.
# mode("overwrite") lets the cell be re-run; the default save mode raises
# when the output path already exists.
# NOTE(review): the failure recorded below is a java.lang.UnsatisfiedLinkError in
# NativeIO$Windows.access0, which usually points at missing Hadoop native
# Windows binaries (winutils.exe / hadoop.dll) rather than at this code -
# verify the HADOOP_HOME setup on the machine that runs this.
(
    df.select("Name", "Age")
    .write
    .mode("overwrite")
    .json("./output/namesAndAges.json")
)

Py4JJavaError: An error occurred while calling o729.save.
: java.lang.UnsatisfiedLinkError: 'boolean org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(java.lang.String, int)'
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Native Method)
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access(NativeIO.java:793)
	at org.apache.hadoop.fs.FileUtil.canRead(FileUtil.java:1249)
	at org.apache.hadoop.fs.FileUtil.list(FileUtil.java:1454)
	at org.apache.hadoop.fs.RawLocalFileSystem.listStatus(RawLocalFileSystem.java:601)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1972)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:2014)
	at org.apache.hadoop.fs.ChecksumFileSystem.listStatus(ChecksumFileSystem.java:761)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1972)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:2014)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.getAllCommittedTaskPaths(FileOutputCommitter.java:334)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJobInternal(FileOutputCommitter.java:404)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJob(FileOutputCommitter.java:377)
	at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.commitJob(HadoopMapReduceCommitProtocol.scala:192)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$writeAndCommit$3(FileFormatWriter.scala:275)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.Utils$.timeTakenMs(Utils.scala:552)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.writeAndCommit(FileFormatWriter.scala:275)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeWrite(FileFormatWriter.scala:304)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:190)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:190)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:869)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:391)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:364)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:243)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:842)


In [None]:
# Shut down the Spark session now that the notebook is finished.
spark.stop()