In [1]:
import pyspark
from pyspark.sql import functions as F

# Obtain the SparkSession used by every cell below; getOrCreate() hands back
# an already-running session when there is one instead of starting another.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [2]:
# Display the session object as a quick check that it exists.
spark

In [3]:
import pandas as pd
import numpy as np

# Seed NumPy's global random state so the group labels come out the same on
# every run.
np.random.seed(456)

# Small example frame: `n` counts 0..19 and `group` is a label drawn at
# random from "a", "b", "c" for each row.
pandas_dataframe = pd.DataFrame(
    {
        "n": np.arange(20),
        "group": np.random.choice(list("abc"), size=20),
    }
)
print(pandas_dataframe.shape)
pandas_dataframe.head()

(20, 2)


Unnamed: 0,n,group
0,0,b
1,1,b
2,2,c
3,3,a
4,4,c


In [4]:
# Build a Spark DataFrame from the pandas DataFrame created above.
sp_df = spark.createDataFrame(pandas_dataframe)

In [5]:
# The repr of a Spark DataFrame lists column names and types only, no rows.
sp_df

DataFrame[n: bigint, group: string]

In [6]:
# For comparison: the pandas frame renders its first rows directly.
pandas_dataframe.head()

Unnamed: 0,n,group
0,0,b
1,1,b
2,2,c
3,3,a
4,4,c


In [7]:
# show(n) prints the first n rows of the Spark DataFrame as a text table.
sp_df.show(2)

+---+-----+
|  n|group|
+---+-----+
|  0|    b|
|  1|    b|
+---+-----+
only showing top 2 rows



In [8]:
from pydataset import data

# Load the "mpg" example dataset from pydataset and preview it.
mpg_pd = data("mpg")
mpg_pd.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


In [9]:
# Convert the mpg data to a Spark DataFrame and print its first five rows.
mpg_sp = spark.createDataFrame(mpg_pd)
mpg_sp.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [10]:
# In pandas, attribute access on a column returns the values themselves.
mpg_pd.year

1      1999
2      1999
3      2008
4      2008
5      1999
       ... 
230    2008
231    2008
232    1999
233    1999
234    2008
Name: year, Length: 234, dtype: int64

In [11]:
# In Spark, the same attribute access returns a Column object, not the values.
mpg_sp.year

Column<b'year'>

In [12]:
# select() returns a new DataFrame holding only the chosen columns; its repr
# shows the resulting schema rather than any rows.
mpg_sp.select(mpg_sp.hwy, mpg_sp.cty, mpg_sp.model)

DataFrame[hwy: bigint, cty: bigint, model: string]

In [13]:
# Calling show() with no argument prints the first 20 rows of the selection.
mpg_sp.select(mpg_sp.hwy, mpg_sp.cty, mpg_sp.model).show()

+---+---+------------------+
|hwy|cty|             model|
+---+---+------------------+
| 29| 18|                a4|
| 29| 21|                a4|
| 31| 20|                a4|
| 30| 21|                a4|
| 26| 16|                a4|
| 26| 18|                a4|
| 27| 18|                a4|
| 26| 18|        a4 quattro|
| 25| 16|        a4 quattro|
| 28| 20|        a4 quattro|
| 27| 19|        a4 quattro|
| 25| 15|        a4 quattro|
| 25| 17|        a4 quattro|
| 25| 17|        a4 quattro|
| 25| 15|        a4 quattro|
| 24| 15|        a6 quattro|
| 25| 17|        a6 quattro|
| 23| 16|        a6 quattro|
| 20| 14|c1500 suburban 2wd|
| 15| 11|c1500 suburban 2wd|
+---+---+------------------+
only showing top 20 rows



In [14]:
# Shorter name for the Spark DataFrame, used by the cells that follow.
mpg = mpg_sp
# Arithmetic on a Column produces another Column; without an alias Spark
# names the result after the expression, here "(hwy + 1)".
mpg.select(mpg.hwy, mpg.hwy + 1).show(2)

+---+---------+
|hwy|(hwy + 1)|
+---+---------+
| 29|       30|
| 29|       30|
+---+---------+
only showing top 2 rows



In [15]:
# alias() gives each output column an explicit name in place of the
# expression-derived default.
mpg.select(
    mpg.hwy.alias("highway_mileage"),
    (mpg.hwy + 1).alias("highway_mileage_plus1"),
).show(2)

+---------------+---------------------+
|highway_mileage|highway_mileage_plus1|
+---------------+---------------------+
|             29|                   30|
|             29|                   30|
+---------------+---------------------+
only showing top 2 rows



In [16]:
# Column expressions can be built and named first, then handed to select().
col1 = mpg.hwy.alias("highway_mileage")
# Dividing the integer hwy column by 2 gives fractional results (e.g. 14.5).
col2 = (mpg.hwy / 2).alias("highway_mileage_halved")
mpg.select(col1, col2).show(5)

+---------------+----------------------+
|highway_mileage|highway_mileage_halved|
+---------------+----------------------+
|             29|                  14.5|
|             29|                  14.5|
|             31|                  15.5|
|             30|                  15.0|
|             26|                  13.0|
+---------------+----------------------+
only showing top 5 rows



In [17]:
from pyspark.sql.functions import col, expr
# col() builds a Column from its name alone, with no DataFrame attached.
col("hwy")
# Compare with the attribute form used earlier: mpg.hwy

Column<b'hwy'>

In [18]:
# `class` is a reserved word in Python, so `mpg.class` would be a syntax
# error; referring to the column by name with col() avoids the problem.
col("class")

Column<b'class'>

In [19]:
# Column expression for the mean of highway and city mileage. The repr below
# shows the expression itself, not any values.
avg_column = (col("hwy") + col("cty")) / 2
avg_column

Column<b'((hwy + cty) / 2)'>

In [20]:
# The different ways of referring to a column can be mixed in one select():
# col() by name, attribute access on the DataFrame, and the prebuilt
# avg_column expression.
mpg.select(
    col("hwy").alias("highway_mileage"),
    mpg.cty.alias("city_mileage"),
    avg_column.alias("avg_milage")
).show(5)

+---------------+------------+----------+
|highway_mileage|city_mileage|avg_milage|
+---------------+------------+----------+
|             29|          18|      23.5|
|             29|          21|      25.0|
|             31|          20|      25.5|
|             30|          21|      25.5|
|             26|          16|      21.0|
+---------------+------------+----------+
only showing top 5 rows



In [21]:
# expr() turns a SQL expression string into a Column, so arithmetic and
# aliasing can be written inline as SQL text.
mpg.select(
    expr("hwy"),  # a bare column name: same result as col("hwy")
    expr("hwy + 1"),  # no alias, so the column is named "(hwy + 1)"
    expr("hwy AS highway_mileage"),
    expr("hwy + 1 AS highway_incremented")
).show(5)

+---+---------+---------------+-------------------+
|hwy|(hwy + 1)|highway_mileage|highway_incremented|
+---+---------+---------------+-------------------+
| 29|       30|             29|                 30|
| 29|       30|             29|                 30|
| 31|       32|             31|                 32|
| 30|       31|             30|                 31|
| 26|       27|             26|                 27|
+---+---------+---------------+-------------------+
only showing top 5 rows



In [22]:
# Register the DataFrame under the name "mpg" so SQL queries can refer to it.
mpg.createOrReplaceTempView("mpg")

In [23]:
# Query the "mpg" view with plain SQL and print the first 20 rows of the
# result.
spark.sql("""
    SELECT hwy, cty, (hwy + cty) / 2 as avg
    FROM mpg
""").show()

+---+---+----+
|hwy|cty| avg|
+---+---+----+
| 29| 18|23.5|
| 29| 21|25.0|
| 31| 20|25.5|
| 30| 21|25.5|
| 26| 16|21.0|
| 26| 18|22.0|
| 27| 18|22.5|
| 26| 18|22.0|
| 25| 16|20.5|
| 28| 20|24.0|
| 27| 19|23.0|
| 25| 15|20.0|
| 25| 17|21.0|
| 25| 17|21.0|
| 25| 15|20.0|
| 24| 15|19.5|
| 25| 17|21.0|
| 23| 16|19.5|
| 20| 14|17.0|
| 15| 11|13.0|
+---+---+----+
only showing top 20 rows



In [24]:
# dtypes lists each column as a (name, type) pair.
mpg.dtypes

[('manufacturer', 'string'),
 ('model', 'string'),
 ('displ', 'double'),
 ('year', 'bigint'),
 ('cyl', 'bigint'),
 ('trans', 'string'),
 ('drv', 'string'),
 ('cty', 'bigint'),
 ('hwy', 'bigint'),
 ('fl', 'string'),
 ('class', 'string')]

In [25]:
# printSchema() prints the same information as a tree, including whether
# each column is nullable.
mpg.printSchema()

root
 |-- manufacturer: string (nullable = true)
 |-- model: string (nullable = true)
 |-- displ: double (nullable = true)
 |-- year: long (nullable = true)
 |-- cyl: long (nullable = true)
 |-- trans: string (nullable = true)
 |-- drv: string (nullable = true)
 |-- cty: long (nullable = true)
 |-- hwy: long (nullable = true)
 |-- fl: string (nullable = true)
 |-- class: string (nullable = true)



In [26]:
# cast() converts a column to another type; here the integer hwy column
# becomes a string column.
mpg.select(mpg.hwy.cast("string")).printSchema()

root
 |-- hwy: string (nullable = true)



In [27]:
# Casting values that cannot be parsed as integers (e.g. "a4") yields null
# instead of raising an error.
mpg.select(mpg.model, mpg.model.cast("int")).show(4)

+-----+-----+
|model|model|
+-----+-----+
|   a4| null|
|   a4| null|
|   a4| null|
|   a4| null|
+-----+-----+
only showing top 4 rows



In [28]:
from pyspark.sql.functions import concat, sum, avg, min, max, count, mean
# NOTE: this import shadows Python's built-in sum, min, and max for the rest of the notebook.

In [29]:
# Compute the mean highway mileage two ways to show that they agree:
# by hand as sum / count, and with the avg aggregate.
# The aggregates are reached through the `F` namespace so this cell does not
# depend on Python's built-in `sum` having been replaced by Spark's `sum`.
mpg.select(
    (F.sum(mpg.hwy) / F.count(mpg.hwy)).alias("average_1"),
    F.avg(mpg.hwy).alias("average_2"),
).show(2)

+-----------------+-----------------+
|        average_1|        average_2|
+-----------------+-----------------+
|23.44017094017094|23.44017094017094|
+-----------------+-----------------+



In [30]:
# textdf = spark.createDataFrame(
#     pd.DataFrame(
#     {"address": ["2206 Copper Hill Drive",
#                 "Anot"]
#     })
# )

In [31]:
import this

The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!


In [32]:
# import antigravity