In [1]:
from pyspark.sql import SparkSession
# Start (or reuse) a Spark session running in local mode; "local[*]" asks
# for as many worker threads as the machine has logical cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Learning_Spark")
    .getOrCreate()
)

In [2]:
# Load the insurance CSV: the first line supplies the column names and the
# column types are inferred from the values.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('insurance.csv')
)

In [3]:
# Preview the leading rows of the loaded dataset as a text table.
data.show()

+---+------+------+--------+------+---------+-----------+
|age|   sex|   bmi|children|smoker|   region|    charges|
+---+------+------+--------+------+---------+-----------+
| 19|female|  27.9|       0|   yes|southwest|  16884.924|
| 18|  male| 33.77|       1|    no|southeast|  1725.5523|
| 28|  male|  33.0|       3|    no|southeast|   4449.462|
| 33|  male|22.705|       0|    no|northwest|21984.47061|
| 32|  male| 28.88|       0|    no|northwest|  3866.8552|
| 31|female| 25.74|       0|    no|southeast|  3756.6216|
| 46|female| 33.44|       1|    no|southeast|  8240.5896|
| 37|female| 27.74|       3|    no|northwest|  7281.5056|
| 37|  male| 29.83|       2|    no|northeast|  6406.4107|
| 60|female| 25.84|       0|    no|northwest|28923.13692|
| 25|  male| 26.22|       0|    no|northeast|  2721.3208|
| 62|female| 26.29|       0|   yes|southeast| 27808.7251|
| 23|  male|  34.4|       0|    no|southwest|   1826.843|
| 56|female| 39.82|       0|    no|southeast| 11090.7178|
| 27|  male| 4

In [4]:
# Dataset shape as a (number of rows, number of columns) pair.
data.count(), len(data.columns)

(1338, 7)

In [5]:
# Print every column's name, inferred type and nullability.
data.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- bmi: double (nullable = true)
 |-- children: integer (nullable = true)
 |-- smoker: string (nullable = true)
 |-- region: string (nullable = true)
 |-- charges: double (nullable = true)



In [6]:
# Preview only the age, bmi and children columns.
(
    data
    .select(["age", "bmi", "children"])
    .show()
)

+---+------+--------+
|age|   bmi|children|
+---+------+--------+
| 19|  27.9|       0|
| 18| 33.77|       1|
| 28|  33.0|       3|
| 33|22.705|       0|
| 32| 28.88|       0|
| 31| 25.74|       0|
| 46| 33.44|       1|
| 37| 27.74|       3|
| 37| 29.83|       2|
| 60| 25.84|       0|
| 25| 26.22|       0|
| 62| 26.29|       0|
| 23|  34.4|       0|
| 56| 39.82|       0|
| 27| 42.13|       0|
| 19|  24.6|       1|
| 52| 30.78|       1|
| 23|23.845|       0|
| 56|  40.3|       0|
| 30|  35.3|       0|
+---+------+--------+
only showing top 20 rows



In [7]:
# Summary statistics (count, mean, stddev, min, max) for the numeric columns.
(
    data
    .describe("age", "bmi", "children", "charges")
    .show()
)

+-------+------------------+------------------+-----------------+------------------+
|summary|               age|               bmi|         children|           charges|
+-------+------------------+------------------+-----------------+------------------+
|  count|              1338|              1338|             1338|              1338|
|   mean| 39.20702541106129|30.663396860986538|  1.0949177877429|13270.422265141257|
| stddev|14.049960379216147| 6.098186911679012|1.205492739781914|12110.011236693992|
|    min|                18|             15.96|                0|         1121.8739|
|    max|                64|             53.13|                5|       63770.42801|
+-------+------------------+------------------+-----------------+------------------+



In [8]:
# Number of rows per region, largest group first.
(
    data
    .groupBy("region")
    .count()
    .orderBy("count", ascending=False)
    .show()
)

+---------+-----+
|   region|count|
+---------+-----+
|southeast|  364|
|northwest|  325|
|southwest|  325|
|northeast|  324|
+---------+-----+



In [9]:
# Spark DataFrames are immutable: na.drop() returns a NEW DataFrame without
# the rows that contain nulls and leaves `data` untouched. The result must
# therefore be bound to a name, otherwise the filtering is silently thrown
# away and later cells keep operating on the unfiltered rows.
# Re-running this cell is harmless (dropping nulls twice changes nothing).
data = data.na.drop()
# Keep the DataFrame as the cell's last expression so its repr is displayed.
data

DataFrame[age: int, sex: string, bmi: double, children: int, smoker: string, region: string, charges: double]

In [10]:
# Row count of `data` after the na.drop() cell above.
data.count()

1338

In [12]:
import pyspark
# Build a session configured for Delta Lake: request the delta-core package
# and register Delta's SQL extension and catalog implementation.
# NOTE(review): getOrCreate() hands back an already-running session when one
# exists (an earlier cell in this notebook creates one) — confirm that these
# settings actually take effect; restarting the kernel before running this
# cell may be required.
spark = (
    pyspark.sql.SparkSession.builder
    .appName("MyApp")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:0.7.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

In [13]:
from delta.tables import *

In [17]:
# Load the insurance CSV through the current session: header row supplies
# the column names, column types are inferred from the values.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('insurance.csv')
)

In [18]:
# Preview the leading rows of the freshly loaded DataFrame.
df.show()

+---+------+------+--------+------+---------+-----------+
|age|   sex|   bmi|children|smoker|   region|    charges|
+---+------+------+--------+------+---------+-----------+
| 19|female|  27.9|       0|   yes|southwest|  16884.924|
| 18|  male| 33.77|       1|    no|southeast|  1725.5523|
| 28|  male|  33.0|       3|    no|southeast|   4449.462|
| 33|  male|22.705|       0|    no|northwest|21984.47061|
| 32|  male| 28.88|       0|    no|northwest|  3866.8552|
| 31|female| 25.74|       0|    no|southeast|  3756.6216|
| 46|female| 33.44|       1|    no|southeast|  8240.5896|
| 37|female| 27.74|       3|    no|northwest|  7281.5056|
| 37|  male| 29.83|       2|    no|northeast|  6406.4107|
| 60|female| 25.84|       0|    no|northwest|28923.13692|
| 25|  male| 26.22|       0|    no|northeast|  2721.3208|
| 62|female| 26.29|       0|   yes|southeast| 27808.7251|
| 23|  male|  34.4|       0|    no|southwest|   1826.843|
| 56|female| 39.82|       0|    no|southeast| 11090.7178|
| 27|  male| 4

In [19]:
# Persist the DataFrame in Delta format at the relative path "delta-table".
# The writer's default save mode raises once that path already exists, which
# makes every re-run of this cell (e.g. Restart & Run All) fail. Using
# "overwrite" replaces the table contents instead, so the cell can be
# executed repeatedly; the first run behaves exactly as before.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save("delta-table")
)

In [20]:
# Read the Delta table at "delta-table" back into a new DataFrame.
read_df = (
    spark.read
    .format("delta")
    .load("delta-table")
)

In [21]:
# Preview the rows that were read back from the Delta table.
read_df.show()

+---+------+------+--------+------+---------+-----------+
|age|   sex|   bmi|children|smoker|   region|    charges|
+---+------+------+--------+------+---------+-----------+
| 19|female|  27.9|       0|   yes|southwest|  16884.924|
| 18|  male| 33.77|       1|    no|southeast|  1725.5523|
| 28|  male|  33.0|       3|    no|southeast|   4449.462|
| 33|  male|22.705|       0|    no|northwest|21984.47061|
| 32|  male| 28.88|       0|    no|northwest|  3866.8552|
| 31|female| 25.74|       0|    no|southeast|  3756.6216|
| 46|female| 33.44|       1|    no|southeast|  8240.5896|
| 37|female| 27.74|       3|    no|northwest|  7281.5056|
| 37|  male| 29.83|       2|    no|northeast|  6406.4107|
| 60|female| 25.84|       0|    no|northwest|28923.13692|
| 25|  male| 26.22|       0|    no|northeast|  2721.3208|
| 62|female| 26.29|       0|   yes|southeast| 27808.7251|
| 23|  male|  34.4|       0|    no|southwest|   1826.843|
| 56|female| 39.82|       0|    no|southeast| 11090.7178|
| 27|  male| 4

In [23]:
from pyspark.sql.functions import *

# Obtain a DeltaTable handle for the table stored at "delta-table"
# (the same path an earlier cell writes the DataFrame to).
delta_table = DeltaTable.forPath(spark, "delta-table")