In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split,col,avg

In [2]:
spark = SparkSession.builder.appName('PySparkTitanikJob').getOrCreate()

In [3]:
spark

In [4]:
df = spark.read.option('header','true').csv('train.csv')

In [5]:
df.show(2)

+-----------+--------+------+--------------------+------+---+-----+-----+---------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|   Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+---+-----+-----+---------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male| 22|    1|    0|A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0| PC 17599|71.2833|  C85|       C|
+-----------+--------+------+--------------------+------+---+-----+-----+---------+-------+-----+--------+
only showing top 2 rows



In [6]:
df.dtypes

[('PassengerId', 'string'),
 ('Survived', 'string'),
 ('Pclass', 'string'),
 ('Name', 'string'),
 ('Sex', 'string'),
 ('Age', 'string'),
 ('SibSp', 'string'),
 ('Parch', 'string'),
 ('Ticket', 'string'),
 ('Fare', 'string'),
 ('Cabin', 'string'),
 ('Embarked', 'string')]

In [7]:
df.describe(['Sex', 'Age']).show()

+-------+------+------------------+
|summary|   Sex|               Age|
+-------+------+------------------+
|  count|   891|               714|
|   mean|  null| 29.69911764705882|
| stddev|  null|14.526497332334035|
|    min|female|              0.42|
|    max|  male|                 9|
+-------+------+------------------+



In [8]:
df.count()

891

In [9]:
df = df.drop('Ticket', 'Name', 'Fare','Cabin')

In [10]:
df.show()

+-----------+--------+------+------+----+-----+-----+--------+
|PassengerId|Survived|Pclass|   Sex| Age|SibSp|Parch|Embarked|
+-----------+--------+------+------+----+-----+-----+--------+
|          1|       0|     3|  male|  22|    1|    0|       S|
|          2|       1|     1|female|  38|    1|    0|       C|
|          3|       1|     3|female|  26|    0|    0|       S|
|          4|       1|     1|female|  35|    1|    0|       S|
|          5|       0|     3|  male|  35|    0|    0|       S|
|          6|       0|     3|  male|null|    0|    0|       Q|
|          7|       0|     1|  male|  54|    0|    0|       S|
|          8|       0|     3|  male|   2|    3|    1|       S|
|          9|       1|     3|female|  27|    0|    2|       S|
|         10|       1|     2|female|  14|    1|    0|       C|
|         11|       1|     3|female|   4|    1|    1|       S|
|         12|       1|     1|female|  58|    0|    0|       S|
|         13|       0|     3|  male|  20|    0|    0|  

In [11]:
df = df.withColumn('FamilySize', col('SibSp') + col('Parch') + 1)

In [12]:
df.show(4)

+-----------+--------+------+------+---+-----+-----+--------+----------+
|PassengerId|Survived|Pclass|   Sex|Age|SibSp|Parch|Embarked|FamilySize|
+-----------+--------+------+------+---+-----+-----+--------+----------+
|          1|       0|     3|  male| 22|    1|    0|       S|       2.0|
|          2|       1|     1|female| 38|    1|    0|       C|       2.0|
|          3|       1|     3|female| 26|    0|    0|       S|       1.0|
|          4|       1|     1|female| 35|    1|    0|       S|       2.0|
+-----------+--------+------+------+---+-----+-----+--------+----------+
only showing top 4 rows



In [13]:
df.where(col('Age').isNull()).count()

177

In [14]:
avg_age = df.select(avg(col('Age'))).collect()[0][0]

In [15]:
ndf = df.fillna({'Age': avg_age})

In [16]:
ndf.show(5)

+-----------+--------+------+------+---+-----+-----+--------+----------+
|PassengerId|Survived|Pclass|   Sex|Age|SibSp|Parch|Embarked|FamilySize|
+-----------+--------+------+------+---+-----+-----+--------+----------+
|          1|       0|     3|  male| 22|    1|    0|       S|       2.0|
|          2|       1|     1|female| 38|    1|    0|       C|       2.0|
|          3|       1|     3|female| 26|    0|    0|       S|       1.0|
|          4|       1|     1|female| 35|    1|    0|       S|       2.0|
|          5|       0|     3|  male| 35|    0|    0|       S|       1.0|
+-----------+--------+------+------+---+-----+-----+--------+----------+
only showing top 5 rows



In [17]:
ndf[['Sex']].distinct().show()

+------+
|   Sex|
+------+
|female|
|  male|
+------+



In [18]:
ndf = ndf.withColumn('M', col('Sex') == 'male')

In [19]:
ndf = ndf.withColumn('W', col('Sex') == 'female')

In [20]:
ndf.show(2)

+-----------+--------+------+------+---+-----+-----+--------+----------+-----+-----+
|PassengerId|Survived|Pclass|   Sex|Age|SibSp|Parch|Embarked|FamilySize|    M|    W|
+-----------+--------+------+------+---+-----+-----+--------+----------+-----+-----+
|          1|       0|     3|  male| 22|    1|    0|       S|       2.0| true|false|
|          2|       1|     1|female| 38|    1|    0|       C|       2.0|false| true|
+-----------+--------+------+------+---+-----+-----+--------+----------+-----+-----+
only showing top 2 rows



In [21]:
ndf = ndf.drop('Sex')

In [22]:
ndf.show(5)

+-----------+--------+------+---+-----+-----+--------+----------+-----+-----+
|PassengerId|Survived|Pclass|Age|SibSp|Parch|Embarked|FamilySize|    M|    W|
+-----------+--------+------+---+-----+-----+--------+----------+-----+-----+
|          1|       0|     3| 22|    1|    0|       S|       2.0| true|false|
|          2|       1|     1| 38|    1|    0|       C|       2.0|false| true|
|          3|       1|     3| 26|    0|    0|       S|       1.0|false| true|
|          4|       1|     1| 35|    1|    0|       S|       2.0|false| true|
|          5|       0|     3| 35|    0|    0|       S|       1.0| true|false|
+-----------+--------+------+---+-----+-----+--------+----------+-----+-----+
only showing top 5 rows



In [26]:
ndf.coalesce(1).write.option('header','true').csv('clear_data')

Py4JJavaError: An error occurred while calling o91.csv.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:231)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:188)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:108)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:106)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:131)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:132)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:131)
	at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:989)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:989)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:438)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:415)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:293)
	at org.apache.spark.sql.DataFrameWriter.csv(DataFrameWriter.scala:979)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.lang.UnsatisfiedLinkError: 'boolean org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(java.lang.String, int)'
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Native Method)
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access(NativeIO.java:645)
	at org.apache.hadoop.fs.FileUtil.canRead(FileUtil.java:1230)
	at org.apache.hadoop.fs.FileUtil.list(FileUtil.java:1435)
	at org.apache.hadoop.fs.RawLocalFileSystem.listStatus(RawLocalFileSystem.java:493)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1868)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1910)
	at org.apache.hadoop.fs.ChecksumFileSystem.listStatus(ChecksumFileSystem.java:678)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1868)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1910)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.getAllCommittedTaskPaths(FileOutputCommitter.java:332)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJobInternal(FileOutputCommitter.java:402)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJob(FileOutputCommitter.java:375)
	at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.commitJob(HadoopMapReduceCommitProtocol.scala:182)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:220)
	... 33 more
