In [1]:
import pyspark
from pyspark.sql.functions import *
from pyspark.sql import Row

spark = pyspark.sql.SparkSession.builder.appName("dataframe").getOrCreate()

spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/10 08:58:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
path = "datasets/"
students = spark.read.csv(path+"students.csv", inferSchema=True, header=True)

# students.limit(5).toPandas() #need to import pandas
students.show(5)

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|female|       group B|          bachelor's degree|    standard|                   none|        72|           72|           74|
|female|       group C|               some college|    standard|              completed|        69|           90|           88|
|female|       group B|            master's degree|    standard|                   none|        90|           95|           93|
|  male|       group A|         associate's degree|free/reduced|                   none|        47|           57|           44|
|  male|       group C|               some college|    standard|                   none|        76|     

#### Get columns

In [3]:
students.lunch
#students["lunch"]
#from pyspark.sql.functions import col
#col("lunch")
#students.select("lunch") # return - DataFrame

Column<'lunch'>

#### agg()

In [4]:
students.agg({"math score": "max"}).collect()

[Row(max(math score)=100)]

In [5]:
students.agg(min(students["`math score`"])).collect()

[Row(min(math score)=0)]

#### alias()

In [6]:
students.filter(students["reading score"] > 80).groupBy("gender").agg(avg(col("math score")).alias("average")).show()

+------+-----------------+
|gender|          average|
+------+-----------------+
|female|79.16981132075472|
|  male| 87.6842105263158|
+------+-----------------+



In [7]:
tour = spark.read.csv(path+"pga_tour_historical.csv", inferSchema=True, header=True)
tour.show(5)

[Stage 13:>                                                       (0 + 12) / 12]

+---------------+------+----------------+--------------------+-----+
|    Player Name|Season|       Statistic|            Variable|Value|
+---------------+------+----------------+--------------------+-----+
|Robert Garrigus|  2010|Driving Distance|Driving Distance ...|   71|
|   Bubba Watson|  2010|Driving Distance|Driving Distance ...|   77|
| Dustin Johnson|  2010|Driving Distance|Driving Distance ...|   83|
|Brett Wetterich|  2010|Driving Distance|Driving Distance ...|   54|
|    J.B. Holmes|  2010|Driving Distance|Driving Distance ...|  100|
+---------------+------+----------------+--------------------+-----+
only showing top 5 rows



                                                                                

In [8]:
df = tour.limit(10)
df_as1 = df.alias("df_as1")
df_as2 = df.alias("df_as2")
joined_df = df_as1.join(df_as2, col("df_as1.Player Name") == col("df_as2.Player Name"), 'inner')
joined_df.select("df_as1.Player Name", "df_as2.Player Name", "df_as2.Value").sort(desc("df_as1.Player Name")).collect()

[Row(Player Name='Robert Garrigus', Player Name='Robert Garrigus', Value='71'),
 Row(Player Name='John Daly', Player Name='John Daly', Value='63'),
 Row(Player Name='J.B. Holmes', Player Name='J.B. Holmes', Value='100'),
 Row(Player Name='Graham DeLaet', Player Name='Graham DeLaet', Value='88'),
 Row(Player Name='Dustin Johnson', Player Name='Dustin Johnson', Value='83'),
 Row(Player Name='D.J. Trahan', Player Name='D.J. Trahan', Value='92'),
 Row(Player Name='Charles Warren', Player Name='Charles Warren', Value='64'),
 Row(Player Name='Bubba Watson', Player Name='Bubba Watson', Value='77'),
 Row(Player Name='Brett Wetterich', Player Name='Brett Wetterich', Value='54'),
 Row(Player Name='Angel Cabrera', Player Name='Angel Cabrera', Value='64')]

#### approxQuantile()

In [9]:
quantiles = students.approxQuantile(col=["math score", "reading score", "writing score"], probabilities=[0.25, 0.5, 0.75], relativeError=1)
# students.columns - list of columns' name
print("Approximate Quantiles:")
print("25th Percentile (Q1):", quantiles[0])
print("50th Percentile (Median, Q2):", quantiles[1])
print("75th Percentile (Q3):", quantiles[2])

Approximate Quantiles:
25th Percentile (Q1): [0.0, 0.0, 0.0]
50th Percentile (Median, Q2): [17.0, 17.0, 17.0]
75th Percentile (Q3): [10.0, 10.0, 10.0]


#### caсhe()
##### треба глянути

In [10]:
import time
df = students.limit(50)

df.cache()

start_time = time.time()

result1 = df.filter(df["reading score"] > 80).groupBy("gender").agg({"math score": "avg"})
result2 = df.filter(df["reading score"] < 80).groupBy("gender").agg({"math score": "avg"})

result1.show()
result2.show()

print(f"execution_time - {time.time() - start_time}")

+------+---------------+
|gender|avg(math score)|
+------+---------------+
|female|           76.0|
|  male|           87.0|
+------+---------------+

+------+---------------+
|gender|avg(math score)|
+------+---------------+
|female|          57.05|
|  male|           59.0|
+------+---------------+

execution_time - 0.45588207244873047


In [11]:
start_time = time.time()

result3 = df.filter(df["reading score"] > 80).groupBy("gender").agg({"math score": "avg"})
result4 = df.filter(df["reading score"] < 80).groupBy("gender").agg({"math score": "avg"})

result3.show()
result4.show()

print(f"execution_time - {time.time() - start_time}")

+------+---------------+
|gender|avg(math score)|
+------+---------------+
|female|           76.0|
|  male|           87.0|
+------+---------------+

+------+---------------+
|gender|avg(math score)|
+------+---------------+
|female|          57.05|
|  male|           59.0|
+------+---------------+

execution_time - 0.21222281455993652


In [12]:
df.cache()
start_time = time.time()
result1 = df.groupBy("gender").count()
result2 = df.groupBy("parental level of education").agg({"math score": "avg", "writing score": "avg", "reading score": "avg"})

result1.show()
result2.show()

print(f"execution_time - {time.time() - start_time}")
df.unpersist()
start_time = time.time()

result3 = df.groupBy("gender").count()
result4 = df.groupBy("parental level of education").agg({"math score": "avg", "writing score": "avg", "reading score": "avg"})

result3.show()
result4.show()

print(f"execution_time - {time.time() - start_time}")

23/10/10 08:58:10 WARN CacheManager: Asked to cache already cached data.


+------+-----+
|gender|count|
+------+-----+
|female|   27|
|  male|   23|
+------+-----+

+---------------------------+------------------+------------------+------------------+
|parental level of education|avg(reading score)|avg(writing score)|   avg(math score)|
+---------------------------+------------------+------------------+------------------+
|          bachelor's degree| 70.66666666666667| 76.33333333333333|              71.0|
|               some college| 68.76923076923077| 66.38461538461539| 65.92307692307692|
|            master's degree| 67.66666666666667| 68.16666666666667|62.833333333333336|
|         associate's degree|           65.8125|           63.8125|            59.375|
|                high school|              73.5|             70.25|            67.375|
|           some high school|              61.0|              59.5|              51.5|
+---------------------------+------------------+------------------+------------------+

execution_time - 0.31133580207824707
+

#### checkpoint()
##### треба глянути

#### Returns a checkpointed version of this Dataset. Checkpointing can be used to truncate the logical plan of this DataFrame, which is especially useful in iterative algorithms where the plan may grow exponentially. It will be saved to files inside the checkpoint directory set with SparkContext.setCheckpointDir().

Parameters
eager – Whether to checkpoint this DataFrame immediately

##### Note: Experimental

#### coalesce()

In [13]:
print(f"{students.rdd.getNumPartitions()}\n"
      f"{df.rdd.getNumPartitions()}\n"
      f"{students.repartition(2).rdd.getNumPartitions()}\n"
      f"{students.rdd.getNumPartitions()}\n"
      f"{df.coalesce(1).rdd.getNumPartitions()}")

1
1
2
1
1


#### colRegex()

In [14]:
df.select(df.colRegex("`.+score.?`")).show(5)

+----------+-------------+-------------+
|math score|reading score|writing score|
+----------+-------------+-------------+
|        72|           72|           74|
|        69|           90|           88|
|        90|           95|           93|
|        47|           57|           44|
|        76|           78|           75|
+----------+-------------+-------------+
only showing top 5 rows



#### collect()

In [15]:
df.select("math score", "reading score", "writing score").sort(desc("math score")).limit(5).collect()

[Row(math score=97, reading score=87, writing score=82),
 Row(math score=90, reading score=95, writing score=93),
 Row(math score=88, reading score=95, writing score=92),
 Row(math score=88, reading score=89, writing score=86),
 Row(math score=82, reading score=84, writing score=82)]

#### corr()

In [16]:
students.corr("reading score", "writing score")

0.9545980771462476

In [17]:
students.corr("math score", "writing score")

0.8026420459498075

In [18]:
students.corr("math score", "reading score")

0.8175796636720539

#### count()

In [19]:
df.count()

50

In [20]:
students.count()

1000

In [21]:
tour.count()

2740403

#### cov()

In [22]:
df.cov("reading score", "writing score")

200.49632653061224

In [23]:
students.cov("reading score", "writing score")

211.7866606606608

#### createGlobalTempView()

In [24]:
#spark.catalog.dropGlobalTempView("students")
#students.createGlobalTempView("students")
students.createOrReplaceGlobalTempView("students")

In [25]:
result = spark.sql("SELECT * FROM global_temp.students WHERE `math score` > 80")
result.show()

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|female|       group B|            master's degree|    standard|                   none|        90|           95|           93|
|female|       group B|               some college|    standard|              completed|        88|           95|           92|
|  male|       group C|                high school|    standard|                   none|        88|           89|           86|
|  male|       group E|               some college|    standard|                   none|        97|           87|           82|
|  male|       group E|         associate's degree|    standard|              completed|        81|     

#### crossJoin()

In [26]:
df = tour.limit(5).select("Value", "Player name")
df2 = tour.limit(5).select("Player name", "Season")
df.crossJoin(df2.select("Season")).select("Value", "Player name", "Season").show()

+-----+---------------+------+
|Value|    Player name|Season|
+-----+---------------+------+
|   71|Robert Garrigus|  2010|
|   71|Robert Garrigus|  2010|
|   71|Robert Garrigus|  2010|
|   71|Robert Garrigus|  2010|
|   71|Robert Garrigus|  2010|
|   77|   Bubba Watson|  2010|
|   77|   Bubba Watson|  2010|
|   77|   Bubba Watson|  2010|
|   77|   Bubba Watson|  2010|
|   77|   Bubba Watson|  2010|
|   83| Dustin Johnson|  2010|
|   83| Dustin Johnson|  2010|
|   83| Dustin Johnson|  2010|
|   83| Dustin Johnson|  2010|
|   83| Dustin Johnson|  2010|
|   54|Brett Wetterich|  2010|
|   54|Brett Wetterich|  2010|
|   54|Brett Wetterich|  2010|
|   54|Brett Wetterich|  2010|
|   54|Brett Wetterich|  2010|
+-----+---------------+------+
only showing top 20 rows



23/10/10 08:58:13 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


#### crosstab()

In [27]:
data = [("Alice", "Female", "A"),
        ("Bob", "Male", "B"),
        ("Charlie", "Male", "A"),
        ("David", "Male", "C"),
        ("Eve", "Female", "B")]

columns = ["Name", "Gender", "Grade"]
df = spark.createDataFrame(data, columns)

cross_tab = df.crosstab("Gender", "Grade")
cross_tab.show()

                                                                                

+------------+---+---+---+
|Gender_Grade|  A|  B|  C|
+------------+---+---+---+
|      Female|  1|  1|  0|
|        Male|  1|  1|  1|
+------------+---+---+---+



#### cube()

##### Create a multi-dimensional cube for the current DataFrame using the specified columns, so we can run aggregations on them.

In [28]:
tour.cube("Player name", tour.Value).count().orderBy("Player name", "Value").show()

[Stage 86:>                                                       (0 + 12) / 12]

+-----------+----------+-------+
|Player name|     Value|  count|
+-----------+----------+-------+
|       NULL|      NULL|  43498|
|       NULL|      NULL|2740403|
|       NULL|$1,001,580|      1|
|       NULL|$1,001,581|      2|
|       NULL|$1,002,036|      4|
|       NULL|$1,003,359|      1|
|       NULL|$1,003,362|      2|
|       NULL|$1,003,363|      2|
|       NULL|$1,004,033|      1|
|       NULL|$1,004,034|      2|
|       NULL|$1,004,035|      2|
|       NULL|$1,004,159|      1|
|       NULL|$1,004,160|      2|
|       NULL|$1,004,161|      2|
|       NULL|$1,004,693|      1|
|       NULL|$1,004,696|      4|
|       NULL|$1,005,321|      1|
|       NULL|$1,005,322|      2|
|       NULL|$1,007,218|      3|
|       NULL|$1,007,219|      2|
+-----------+----------+-------+
only showing top 20 rows



                                                                                

#### describe()

In [29]:
students.describe("math score", "reading score", "writing score").show()

+-------+------------------+------------------+-----------------+
|summary|        math score|     reading score|    writing score|
+-------+------------------+------------------+-----------------+
|  count|              1000|              1000|             1000|
|   mean|            66.089|            69.169|           68.054|
| stddev|15.163080096009454|14.600191937252223|15.19565701086966|
|    min|                 0|                17|               10|
|    max|               100|               100|              100|
+-------+------------------+------------------+-----------------+



#### distinct()

In [30]:
tour.count()

2740403

In [31]:
tour.distinct().count()

23/10/10 08:58:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/10/10 08:58:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/10/10 08:58:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/10/10 08:58:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/10/10 08:58:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/10/10 08:58:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/10/10 08:58:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/10/10 08:58:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/10/10 08:58:21 WARN RowBasedKeyValueBatch: Calling spill() on

2740403

#### drop()

In [32]:
tour.limit(5).drop("Value").show(5, False)

+---------------+------+----------------+---------------------------+
|Player Name    |Season|Statistic       |Variable                   |
+---------------+------+----------------+---------------------------+
|Robert Garrigus|2010  |Driving Distance|Driving Distance - (ROUNDS)|
|Bubba Watson   |2010  |Driving Distance|Driving Distance - (ROUNDS)|
|Dustin Johnson |2010  |Driving Distance|Driving Distance - (ROUNDS)|
|Brett Wetterich|2010  |Driving Distance|Driving Distance - (ROUNDS)|
|J.B. Holmes    |2010  |Driving Distance|Driving Distance - (ROUNDS)|
+---------------+------+----------------+---------------------------+



In [33]:
students.drop("math score", "reading score", "writing score").show(5, False)

+------+--------------+---------------------------+------------+-----------------------+
|gender|race/ethnicity|parental level of education|lunch       |test preparation course|
+------+--------------+---------------------------+------------+-----------------------+
|female|group B       |bachelor's degree          |standard    |none                   |
|female|group C       |some college               |standard    |completed              |
|female|group B       |master's degree            |standard    |none                   |
|male  |group A       |associate's degree         |free/reduced|none                   |
|male  |group C       |some college               |standard    |none                   |
+------+--------------+---------------------------+------------+-----------------------+
only showing top 5 rows



#### dropDuplicates() / drop_duplicates()

In [34]:
df = spark.createDataFrame([ \
    Row(name='Alice', age=5, height=80), \
    Row(name='Alice', age=5, height=80), \
    Row(name='Alice', age=10, height=80)])
df.dropDuplicates().show()

+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  5|    80|
|Alice| 10|    80|
+-----+---+------+



In [35]:
df.dropDuplicates(['name', 'height']).show()

+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  5|    80|
+-----+---+------+



In [36]:
tour.dropDuplicates(["Player name"]).count()

                                                                                

3053

#### dropna()

In [37]:
tour.dropna().count()

                                                                                

2696905

In [38]:
students.dropna().count()

1000

#### dtypes (property)

In [39]:
tour.dtypes

[('Player Name', 'string'),
 ('Season', 'int'),
 ('Statistic', 'string'),
 ('Variable', 'string'),
 ('Value', 'string')]

In [40]:
students.dtypes

[('gender', 'string'),
 ('race/ethnicity', 'string'),
 ('parental level of education', 'string'),
 ('lunch', 'string'),
 ('test preparation course', 'string'),
 ('math score', 'int'),
 ('reading score', 'int'),
 ('writing score', 'int')]

#### exceptAll()
##### Return a new DataFrame containing rows in this DataFrame but not in another DataFrame while preserving duplicates.

This is equivalent to EXCEPT ALL in SQL.

In [41]:
df1 = spark.createDataFrame([("a", 1), ("a", 1), ("a", 1), ("a", 2), ("b",  3), ("c", 4)], ["C1", "C2"])
df2 = spark.createDataFrame([("a", 1), ("b", 3)], ["C1", "C2"])
df1.exceptAll(df2).show()
# Also as standard in SQL, this function resolves columns by position (not by name).

+---+---+
| C1| C2|
+---+---+
|  a|  1|
|  a|  1|
|  a|  2|
|  c|  4|
+---+---+



#### explain()

In [42]:
tour.explain()

== Physical Plan ==
FileScan csv [Player Name#156,Season#157,Statistic#158,Variable#159,Value#160] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/Users/zsavchenko/spark_env/training/datasets/pga_tour_historical..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<Player Name:string,Season:int,Statistic:string,Variable:string,Value:string>




In [43]:
tour.explain(mode="formatted")

== Physical Plan ==
Scan csv  (1)


(1) Scan csv 
Output [5]: [Player Name#156, Season#157, Statistic#158, Variable#159, Value#160]
Batched: false
Location: InMemoryFileIndex [file:/Users/zsavchenko/spark_env/training/datasets/pga_tour_historical.csv]
ReadSchema: struct<Player Name:string,Season:int,Statistic:string,Variable:string,Value:string>




In [44]:
tour.explain(True)

== Parsed Logical Plan ==
Relation [Player Name#156,Season#157,Statistic#158,Variable#159,Value#160] csv

== Analyzed Logical Plan ==
Player Name: string, Season: int, Statistic: string, Variable: string, Value: string
Relation [Player Name#156,Season#157,Statistic#158,Variable#159,Value#160] csv

== Optimized Logical Plan ==
Relation [Player Name#156,Season#157,Statistic#158,Variable#159,Value#160] csv

== Physical Plan ==
FileScan csv [Player Name#156,Season#157,Statistic#158,Variable#159,Value#160] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/Users/zsavchenko/spark_env/training/datasets/pga_tour_historical..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<Player Name:string,Season:int,Statistic:string,Variable:string,Value:string>



#### fillna()
Replace null values, alias for na.fill().

In [45]:
tour.na.fill(50).dropna().count()

                                                                                

2696905

In [46]:
df = tour.na.fill({'Value': 50, 'Player name': 'unknown'})
df.count()

2740403

In [47]:
df.dropna().count()

                                                                                

2740403

#### filter()

In [48]:
df.filter(df.Value > 3).show(5)
df.where(df.Value == 2).show(5)

+---------------+------+----------------+--------------------+-----+
|    Player Name|Season|       Statistic|            Variable|Value|
+---------------+------+----------------+--------------------+-----+
|Robert Garrigus|  2010|Driving Distance|Driving Distance ...|   71|
|   Bubba Watson|  2010|Driving Distance|Driving Distance ...|   77|
| Dustin Johnson|  2010|Driving Distance|Driving Distance ...|   83|
|Brett Wetterich|  2010|Driving Distance|Driving Distance ...|   54|
|    J.B. Holmes|  2010|Driving Distance|Driving Distance ...|  100|
+---------------+------+----------------+--------------------+-----+
only showing top 5 rows

+----------------+------+---------------+--------------------+-----+
|     Player Name|Season|      Statistic|            Variable|Value|
+----------------+------+---------------+--------------------+-----+
|Kevin Sutherland|  2010|Putting Average|Putting Average -...|    2|
| Cameron Beckman|  2010|   Total Eagles|Total Eagles - (T...|    2|
|     Kri

In [49]:
df.filter("Value > 3").show(5)
df.where("Value = 2").show(5)

+---------------+------+----------------+--------------------+-----+
|    Player Name|Season|       Statistic|            Variable|Value|
+---------------+------+----------------+--------------------+-----+
|Robert Garrigus|  2010|Driving Distance|Driving Distance ...|   71|
|   Bubba Watson|  2010|Driving Distance|Driving Distance ...|   77|
| Dustin Johnson|  2010|Driving Distance|Driving Distance ...|   83|
|Brett Wetterich|  2010|Driving Distance|Driving Distance ...|   54|
|    J.B. Holmes|  2010|Driving Distance|Driving Distance ...|  100|
+---------------+------+----------------+--------------------+-----+
only showing top 5 rows

+----------------+------+---------------+--------------------+-----+
|     Player Name|Season|      Statistic|            Variable|Value|
+----------------+------+---------------+--------------------+-----+
|Kevin Sutherland|  2010|Putting Average|Putting Average -...|    2|
| Cameron Beckman|  2010|   Total Eagles|Total Eagles - (T...|    2|
|     Kri

#### first()

In [50]:
df.first()

Row(Player Name='Robert Garrigus', Season=2010, Statistic='Driving Distance', Variable='Driving Distance - (ROUNDS)', Value='71')

#### foreach()

In [51]:
def f(person):
    # if person.Value == 100:
    print(person.Value)
df.limit(10).foreach(f)

71
77
83
54
100
63
88
64
64
92


#### foreachPosition()

In [52]:
def f(people):
    for person in people:
        print(person.Value)
df.limit(5).foreachPartition(f)

71
77
83
54
100


#### freqItems()
##### DataFrame.freqItems() and DataFrameStatFunctions.freqItems() are aliases.

In [53]:
tour.freqItems(["Value"]).show()

[Stage 142:>                                                      (0 + 12) / 12]

+--------------------+
|     Value_freqItems|
+--------------------+
|[8, 6.7, 67, 27, ...|
+--------------------+



                                                                                

#### groupBy() / groupby()

In [54]:
students.groupBy().avg().collect()

[Row(avg(math score)=66.089, avg(reading score)=69.169, avg(writing score)=68.054)]

In [55]:
sorted(students.groupBy('gender').agg({'math score': 'mean'}).collect())

[Row(gender='female', avg(math score)=63.633204633204635),
 Row(gender='male', avg(math score)=68.72821576763485)]

In [56]:
sorted(students.groupBy(['gender', students.gender]).count().collect())

[Row(gender='female', gender='female', count=518),
 Row(gender='male', gender='male', count=482)]

In [57]:
sorted(students.groupBy(students.gender).avg().collect())

[Row(gender='female', avg(math score)=63.633204633204635, avg(reading score)=72.60810810810811, avg(writing score)=72.46718146718146),
 Row(gender='male', avg(math score)=68.72821576763485, avg(reading score)=65.47302904564316, avg(writing score)=63.31120331950208)]

#### head()

In [58]:
students.head(5)
# n – int, default 1. Number of rows to return.

[Row(gender='female', race/ethnicity='group B', parental level of education="bachelor's degree", lunch='standard', test preparation course='none', math score=72, reading score=72, writing score=74),
 Row(gender='female', race/ethnicity='group C', parental level of education='some college', lunch='standard', test preparation course='completed', math score=69, reading score=90, writing score=88),
 Row(gender='female', race/ethnicity='group B', parental level of education="master's degree", lunch='standard', test preparation course='none', math score=90, reading score=95, writing score=93),
 Row(gender='male', race/ethnicity='group A', parental level of education="associate's degree", lunch='free/reduced', test preparation course='none', math score=47, reading score=57, writing score=44),
 Row(gender='male', race/ethnicity='group C', parental level of education='some college', lunch='standard', test preparation course='none', math score=76, reading score=78, writing score=75)]

#### hint()
Specifies some hint on the current DataFrame.

Parameters:
name – A name of the hint.
parameters – Optional parameters.

Returns - DataFrame

In [59]:
students.limit(5).join(students.limit(10).hint("broadcast"), "gender").toPandas()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,race/ethnicity.1,parental level of education.1,lunch.1,test preparation course.1,math score.1,reading score.1,writing score.1
0,female,group B,bachelor's degree,standard,none,72,72,74,group B,high school,free/reduced,none,38,60,50
1,female,group B,bachelor's degree,standard,none,72,72,74,group B,some college,standard,completed,88,95,92
2,female,group B,bachelor's degree,standard,none,72,72,74,group B,associate's degree,standard,none,71,83,78
3,female,group B,bachelor's degree,standard,none,72,72,74,group B,master's degree,standard,none,90,95,93
4,female,group B,bachelor's degree,standard,none,72,72,74,group C,some college,standard,completed,69,90,88
5,female,group B,bachelor's degree,standard,none,72,72,74,group B,bachelor's degree,standard,none,72,72,74
6,female,group C,some college,standard,completed,69,90,88,group B,high school,free/reduced,none,38,60,50
7,female,group C,some college,standard,completed,69,90,88,group B,some college,standard,completed,88,95,92
8,female,group C,some college,standard,completed,69,90,88,group B,associate's degree,standard,none,71,83,78
9,female,group C,some college,standard,completed,69,90,88,group B,master's degree,standard,none,90,95,93


#### intersect()

In [60]:
data1 = [("Alice", 25), ("Bob", 30), ("Charlie", 28)]
data2 = [("Alice", 25), ("David", 32), ("Eve", 26)]
columns = ["name", "age"]

df1 = spark.createDataFrame(data1, columns)
df2 = spark.createDataFrame(data2, columns)

In [61]:
intersect_df = df1.intersect(df2)
intersect_df.show()

+-----+---+
| name|age|
+-----+---+
|Alice| 25|
+-----+---+



#### intersectAll()

In [62]:
data3 = [("Alice", 25), ("Charlie", 28), ("Frank", 22)]
df3 = spark.createDataFrame(data3, columns)

In [63]:
df1.intersectAll(df2).show()
intersect_all_df = df1.intersectAll(df2).intersectAll(df3).show()

+-----+---+
| name|age|
+-----+---+
|Alice| 25|
+-----+---+

+-----+---+
| name|age|
+-----+---+
|Alice| 25|
+-----+---+



#### isLocal()

In [64]:
if tour.isLocal:
    print("Local")
else:
    print("Cluster")

Local


#### isStreaming (property)

In [65]:
tour.isStreaming

False

### join

how – str, default inner. Must be one of: inner, cross, outer, full, fullouter, full_outer, left, leftouter, left_outer, right, rightouter, right_outer, semi, leftsemi, left_semi, anti, leftanti and left_anti.

In [215]:
data1 = [("Alice", 25), ("Bob", 30), ("Charlie", 28)]
data2 = [("Bob", "Engineer"), ("Eve", "Doctor"), ("Alice", "Teacher")]
columns1 = ["name", "age"]
columns2 = ["name", "occupation"]

df1 = spark.createDataFrame(data1, columns1)
df2 = spark.createDataFrame(data2, columns2)

In [216]:
# Inner Join
inner_join = df1.join(df2, "name", "inner")
inner_join.show()

[Stage 386:>                                                      (0 + 12) / 12]

+-----+---+----------+
| name|age|occupation|
+-----+---+----------+
|Alice| 25|   Teacher|
|  Bob| 30|  Engineer|
+-----+---+----------+



                                                                                

In [217]:
# Cross Join
cross_join = df1.join(df2, "name", "cross")
cross_join.show()

+-----+---+----------+
| name|age|occupation|
+-----+---+----------+
|Alice| 25|   Teacher|
|  Bob| 30|  Engineer|
+-----+---+----------+



In [218]:
# Outer Join
outer_join = df1.join(df2, "name", "outer")
outer_join.show()

+-------+----+----------+
|   name| age|occupation|
+-------+----+----------+
|  Alice|  25|   Teacher|
|    Bob|  30|  Engineer|
|Charlie|  28|      NULL|
|    Eve|NULL|    Doctor|
+-------+----+----------+



In [219]:
# Left Join
left_join = df1.join(df2, "name", "left")
left_join.show()

+-------+---+----------+
|   name|age|occupation|
+-------+---+----------+
|  Alice| 25|   Teacher|
|    Bob| 30|  Engineer|
|Charlie| 28|      NULL|
+-------+---+----------+



In [220]:
# Right Join
right_join = df1.join(df2, "name", "right")
right_join.show()

+-----+----+----------+
| name| age|occupation|
+-----+----+----------+
|  Bob|  30|  Engineer|
|  Eve|NULL|    Doctor|
|Alice|  25|   Teacher|
+-----+----+----------+



In [221]:
# Semi Join
semi_join = df1.join(df2, "name", "semi")
semi_join.show()

+-----+---+
| name|age|
+-----+---+
|Alice| 25|
|  Bob| 30|
+-----+---+



In [222]:
# Anti Join
anti_join = df1.join(df2, "name", "anti")
anti_join.show()

+-------+---+
|   name|age|
+-------+---+
|Charlie| 28|
+-------+---+



In [223]:
# full, fullouter, full_outer
full_join = df1.join(df2, "name", "full")
full_join.show()

+-------+----+----------+
|   name| age|occupation|
+-------+----+----------+
|  Alice|  25|   Teacher|
|    Bob|  30|  Engineer|
|Charlie|  28|      NULL|
|    Eve|NULL|    Doctor|
+-------+----+----------+



                                                                                

In [224]:
# leftouter, left_outer
left_outer_join = df1.join(df2, "name", "leftouter")
left_outer_join.show()

+-------+---+----------+
|   name|age|occupation|
+-------+---+----------+
|  Alice| 25|   Teacher|
|    Bob| 30|  Engineer|
|Charlie| 28|      NULL|
+-------+---+----------+



                                                                                

In [225]:
# rightouter, right_outer
right_outer_join = df1.join(df2, "name", "rightouter")
right_outer_join.show()

+-----+----+----------+
| name| age|occupation|
+-----+----+----------+
|  Bob|  30|  Engineer|
|  Eve|NULL|    Doctor|
|Alice|  25|   Teacher|
+-----+----+----------+



In [226]:
# leftsemi, left_semi
left_semi_join = df1.join(df2, "name", "leftsemi")
left_semi_join.show()

+-----+---+
| name|age|
+-----+---+
|Alice| 25|
|  Bob| 30|
+-----+---+



                                                                                

In [227]:
# leftanti, left_anti
left_anti_join = df1.join(df2, "name", "leftanti")
left_anti_join.show()

+-------+---+
|   name|age|
+-------+---+
|Charlie| 28|
+-------+---+



                                                                                

#### limit()

In [66]:
tour.limit(5)

DataFrame[Player Name: string, Season: int, Statistic: string, Variable: string, Value: string]

#### localCheckpoint()

#### mapInPandas()

In [67]:
from pyspark.sql.functions import pandas_udf
df = spark.createDataFrame([(1, 21), (2, 30)], ("id", "age"))
def filter_func(iterator):
    for pdf in iterator:
        yield pdf[pdf.id == 1]
df.mapInPandas(filter_func, df.schema)#.show() 

DataFrame[id: bigint, age: bigint]

In [68]:
def f_func(iterator):
    for pdf in iterator:
        yield pdf.Value
df = tour.mapInPandas(f_func, tour.schema)
type(df)

pyspark.sql.dataframe.DataFrame

In [69]:
for i in df:
    print(i)

Column<'Player Name'>
Column<'Season'>
Column<'Statistic'>
Column<'Variable'>
Column<'Value'>


#### na (property)

In [70]:
tour.na.drop().count()

                                                                                

2696905

#### orderBy()

In [71]:
tour.sort(tour.Value.asc()).show(5)

[Stage 185:>                                                      (0 + 12) / 12]

+--------------+------+--------------------+--------------------+-----+
|   Player Name|Season|           Statistic|            Variable|Value|
+--------------+------+--------------------+--------------------+-----+
|    J.J. Henry|  2018|Sand Saves from <...|Sand Saves from <...| NULL|
|  Jeff Overton|  2010|       Money Leaders|Money Leaders - (...| NULL|
|Phil Mickelson|  2016|% of Potential Pt...|% of Potential Pt...| NULL|
|   Nick Watney|  2010|       Money Leaders|Money Leaders - (...| NULL|
|Steve Stricker|  2013|FedExCup Season P...|FedExCup Season P...| NULL|
+--------------+------+--------------------+--------------------+-----+
only showing top 5 rows



                                                                                

In [72]:
students.orderBy(desc("math score"), "reading score").show()

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|female|       group E|               some college|    standard|                   none|       100|           92|           97|
|  male|       group A|               some college|    standard|              completed|       100|           96|           86|
|  male|       group D|               some college|    standard|              completed|       100|           97|           99|
|  male|       group E|         associate's degree|free/reduced|              completed|       100|          100|           93|
|female|       group E|          bachelor's degree|    standard|                   none|       100|     

In [73]:
tour.orderBy(["Value", "Player name"], ascending=[0, 1]).show()

+------------------+------+--------------------+--------------------+--------------------+
|       Player Name|Season|           Statistic|            Variable|               Value|
+------------------+------+--------------------+--------------------+--------------------+
|       Andrew Dorn|  2018|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|       Harry Ellis|  2018|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|      Jason Dufner|  2017|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|         John Hahn|  2016|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|       Kenny Perry|  2018|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|      Ryan Ruffels|  2016|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|     Scott Gregory|  2017|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|    Soren Kjeldsen|  2016|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|

                                                                                

In [74]:
tour.orderBy(["Value", "Player name"], ascending=[0, 0]).show()

+----------------+------+--------------------+--------------------+--------------------+
|     Player Name|Season|           Statistic|            Variable|               Value|
+----------------+------+--------------------+--------------------+--------------------+
|     Vijay Singh|  2018|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|  Soren Kjeldsen|  2016|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|   Scott Gregory|  2017|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|    Ryan Ruffels|  2016|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|     Kenny Perry|  2018|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|       John Hahn|  2016|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|    Jason Dufner|  2017|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|     Harry Ellis|  2018|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|     Andrew Dorn|  2

                                                                                

In [75]:
tour.orderBy(["Value", "Player name"], ascending=[1, 0]).show()

+-----------+------+--------------------+--------------------+-----+
|Player Name|Season|           Statistic|            Variable|Value|
+-----------+------+--------------------+--------------------+-----+
| Zihao Chen|  2017|Official World Go...|Official World Go...| NULL|
|    Zeyu He|  2016|Official World Go...|Official World Go...| NULL|
|    Zeyu He|  2016|Presidents Cup (I...|Presidents Cup (I...| NULL|
|Zecheng Dou|  2016|Non-Member Off+WG...|Non-Member Off+WG...| NULL|
|Zecheng Dou|  2018|FedExCup Season P...|FedExCup Season P...| NULL|
|Zecheng Dou|  2017|     Victory Leaders|Victory Leaders -...| NULL|
|Zecheng Dou|  2016|Consecutive Sand ...|Consecutive Sand ...| NULL|
|Zecheng Dou|  2015|Non-Member Off+WG...|Non-Member Off+WG...| NULL|
|Zecheng Dou|  2016|     Victory Leaders|Victory Leaders -...| NULL|
|Zecheng Dou|  2017|Non-Member Off+WG...|Non-Member Off+WG...| NULL|
|Zecheng Dou|  2015|     Victory Leaders|Victory Leaders -...| NULL|
|Zecheng Dou|  2018|      Official

                                                                                

#### persist()

Sets the storage level to persist the contents of the DataFrame across operations after the first time it is computed. This can only be used to assign a new storage level if the DataFrame does not have a storage level set yet. If no storage level is specified defaults to (MEMORY_AND_DISK).



In [76]:
start_time = time.time()

data = [("Alice", 25), ("Bob", 30), ("Charlie", 28)]
columns = ["name", "age"]
df = spark.createDataFrame(data, columns)

df.persist()
print(f"execution_time - {time.time() - start_time}")

execution_time - 0.018904924392700195


In [77]:
from pyspark.storagelevel import StorageLevel
start_time = time.time()

data = [("Alice", 25), ("Bob", 30), ("Charlie", 28)]
columns = ["name", "age"]
df = spark.createDataFrame(data, columns)

df.persist(StorageLevel.MEMORY_ONLY)
# print(f"execution_time - {time.time() - start_time}")

DataFrame[name: string, age: bigint]

In [78]:
start_time = time.time()

data = [("Alice", 25), ("Bob", 30), ("Charlie", 28)]
columns = ["name", "age"]
df = spark.createDataFrame(data, columns)

df.persist(StorageLevel.MEMORY_ONLY)

print(f"execution_time - {time.time() - start_time}")

execution_time - 0.024343013763427734


#### printSchema()

In [79]:
tour.printSchema()

root
 |-- Player Name: string (nullable = true)
 |-- Season: integer (nullable = true)
 |-- Statistic: string (nullable = true)
 |-- Variable: string (nullable = true)
 |-- Value: string (nullable = true)



#### randomSplit()

In [80]:
tour.randomSplit([1.0, 2.0], 24)

[DataFrame[Player Name: string, Season: int, Statistic: string, Variable: string, Value: string],
 DataFrame[Player Name: string, Season: int, Statistic: string, Variable: string, Value: string]]

#### rdd (property)

In [81]:
tour.rdd

MapPartitionsRDD[582] at javaToPython at NativeMethodAccessorImpl.java:0

#### registerTempTable()

In [82]:
df = students.limit(5)
df.registerTempTable("students")
df2 = spark.sql("select * from students")
sorted(df.collect()) == sorted(df2.collect())



True

In [83]:
spark.catalog.dropTempView("students")

True

#### repartition()

In [84]:
tour.repartition(10).rdd.getNumPartitions()

[Stage 192:====>                                                  (1 + 11) / 12]

10

In [85]:
data = tour.union(tour).repartition("Value")
data.show(5)

data = data.repartition(7, "Value")
data.show(5)

                                                                                

+----------------+------+----------------+--------------------+-----+
|     Player Name|Season|       Statistic|            Variable|Value|
+----------------+------+----------------+--------------------+-----+
|  David Lutterus|  2010|Driving Distance|Driving Distance ...|291.9|
|     Boo Weekley|  2010|Driving Distance|Driving Distance ...|291.9|
|Brendon de Jonge|  2010|Driving Distance|Driving Distance ...|286.9|
|     David Duval|  2010|Driving Distance|Driving Distance ...|286.9|
|     Matt Kuchar|  2010|Driving Distance|Driving Distance ...|286.9|
+----------------+------+----------------+--------------------+-----+
only showing top 5 rows





+------------------+------+----------------+--------------------+-----+
|       Player Name|Season|       Statistic|            Variable|Value|
+------------------+------+----------------+--------------------+-----+
|    Phil Mickelson|  2010|Driving Distance|Driving Distance ...|   76|
|    Aaron Baddeley|  2010|Driving Distance|Driving Distance ...|   94|
|      Martin Laird|  2010|Driving Distance|Driving Distance ...|   91|
|       Nick Watney|  2010|Driving Distance|Driving Distance ...|   91|
|Charles Howell III|  2010|Driving Distance|Driving Distance ...|  103|
+------------------+------+----------------+--------------------+-----+
only showing top 5 rows



                                                                                

In [86]:
data.rdd.getNumPartitions()



7

In [87]:
data = data.repartition("Player name", "Value")
data.show(5)



+--------------+------+----------------+--------------------+------+
|   Player Name|Season|       Statistic|            Variable| Value|
+--------------+------+----------------+--------------------+------+
|     John Daly|  2010|Driving Distance|Driving Distance ...| 305.7|
|James Driscoll|  2010|Driving Distance|Driving Distance ...| 293.9|
|  Steve Marino|  2010|Driving Distance|Driving Distance ...|47,079|
|  Justin Bolli|  2010|Driving Distance|Driving Distance ...|27,618|
| Andres Romero|  2010|Driving Distance|Driving Distance ...|   146|
+--------------+------+----------------+--------------------+------+
only showing top 5 rows



                                                                                

#### repartitionByRange()

In [88]:
tour.repartitionByRange(2, "Value").rdd.getNumPartitions()

[Stage 204:====>                                                  (1 + 11) / 12]

2

In [89]:
tour.show(5)

+---------------+------+----------------+--------------------+-----+
|    Player Name|Season|       Statistic|            Variable|Value|
+---------------+------+----------------+--------------------+-----+
|Robert Garrigus|  2010|Driving Distance|Driving Distance ...|   71|
|   Bubba Watson|  2010|Driving Distance|Driving Distance ...|   77|
| Dustin Johnson|  2010|Driving Distance|Driving Distance ...|   83|
|Brett Wetterich|  2010|Driving Distance|Driving Distance ...|   54|
|    J.B. Holmes|  2010|Driving Distance|Driving Distance ...|  100|
+---------------+------+----------------+--------------------+-----+
only showing top 5 rows



                                                                                

In [90]:
tour.repartitionByRange(1, "Value").rdd.getNumPartitions()

[Stage 206:>                                                      (0 + 12) / 12]

1

In [91]:
data = tour.repartitionByRange("Value")
tour.show(5)

+---------------+------+----------------+--------------------+-----+
|    Player Name|Season|       Statistic|            Variable|Value|
+---------------+------+----------------+--------------------+-----+
|Robert Garrigus|  2010|Driving Distance|Driving Distance ...|   71|
|   Bubba Watson|  2010|Driving Distance|Driving Distance ...|   77|
| Dustin Johnson|  2010|Driving Distance|Driving Distance ...|   83|
|Brett Wetterich|  2010|Driving Distance|Driving Distance ...|   54|
|    J.B. Holmes|  2010|Driving Distance|Driving Distance ...|  100|
+---------------+------+----------------+--------------------+-----+
only showing top 5 rows



                                                                                

#### replace()

In [92]:
data.na.replace(10, 20).show(5)

[Stage 209:====>                                                  (1 + 11) / 12]

+------------+------+-------------+--------------------+-----+
| Player Name|Season|    Statistic|            Variable|Value|
+------------+------+-------------+--------------------+-----+
|Craig Bowden|  2010| Total Eagles|Total Eagles - (T...| NULL|
| Luke Donald|  2010|Money Leaders|Money Leaders - (...| NULL|
|  Paul Casey|  2010|Money Leaders|Money Leaders - (...| NULL|
|Jeff Overton|  2010|Money Leaders|Money Leaders - (...| NULL|
| Bo Van Pelt|  2010|Money Leaders|Money Leaders - (...| NULL|
+------------+------+-------------+--------------------+-----+
only showing top 5 rows



                                                                                

In [93]:
data.na.replace(2010, None).show(5)



+------------+------+-------------+--------------------+-----+
| Player Name|Season|    Statistic|            Variable|Value|
+------------+------+-------------+--------------------+-----+
|Craig Bowden|  NULL| Total Eagles|Total Eagles - (T...| NULL|
| Luke Donald|  NULL|Money Leaders|Money Leaders - (...| NULL|
|  Paul Casey|  NULL|Money Leaders|Money Leaders - (...| NULL|
|Jeff Overton|  NULL|Money Leaders|Money Leaders - (...| NULL|
| Bo Van Pelt|  NULL|Money Leaders|Money Leaders - (...| NULL|
+------------+------+-------------+--------------------+-----+
only showing top 5 rows



                                                                                

In [94]:
data.na.replace({'Money Leaders': None}).show(5)

[Stage 217:====>                                                  (1 + 11) / 12]

+------------+------+------------+--------------------+-----+
| Player Name|Season|   Statistic|            Variable|Value|
+------------+------+------------+--------------------+-----+
|Craig Bowden|  2010|Total Eagles|Total Eagles - (T...| NULL|
| Luke Donald|  2010|        NULL|Money Leaders - (...| NULL|
|  Paul Casey|  2010|        NULL|Money Leaders - (...| NULL|
|Jeff Overton|  2010|        NULL|Money Leaders - (...| NULL|
| Bo Van Pelt|  2010|        NULL|Money Leaders - (...| NULL|
+------------+------+------------+--------------------+-----+
only showing top 5 rows



                                                                                

In [95]:
data.na.replace(['Luke Donald', 'Bob'], ["Paul Casey", "Fish"], 'Player name').show(5)

[Stage 221:====>                                                  (1 + 11) / 12]

+------------+------+-------------+--------------------+-----+
| Player Name|Season|    Statistic|            Variable|Value|
+------------+------+-------------+--------------------+-----+
|Craig Bowden|  2010| Total Eagles|Total Eagles - (T...| NULL|
| Luke Donald|  2010|Money Leaders|Money Leaders - (...| NULL|
|  Paul Casey|  2010|Money Leaders|Money Leaders - (...| NULL|
|Jeff Overton|  2010|Money Leaders|Money Leaders - (...| NULL|
| Bo Van Pelt|  2010|Money Leaders|Money Leaders - (...| NULL|
+------------+------+-------------+--------------------+-----+
only showing top 5 rows



                                                                                

#### rollup()

In [96]:
tour.rollup("Player name", tour.Value).count().orderBy("Player name", "Value").show()



+--------------+-----------+-------+
|   Player name|      Value|  count|
+--------------+-----------+-------+
|          NULL|       NULL|2740403|
|A.J. McInerney|       NULL|      8|
|A.J. McInerney|       NULL|     88|
|A.J. McInerney|   $164,923|      6|
|A.J. McInerney|$25,700,000|      1|
|A.J. McInerney| $4,626,000|      1|
|A.J. McInerney|    $41,230|      1|
|A.J. McInerney|        .64|      1|
|A.J. McInerney|          1|      4|
|A.J. McInerney|         12|      5|
|A.J. McInerney|     14,190|      2|
|A.J. McInerney| 16,492,333|      1|
|A.J. McInerney|     164923|      1|
|A.J. McInerney|         17|      2|
|A.J. McInerney|          2|      9|
|A.J. McInerney|    2018060|      1|
|A.J. McInerney|    2018220|      1|
|A.J. McInerney|          3|      5|
|A.J. McInerney|       3.57|      1|
|A.J. McInerney|          4|     23|
+--------------+-----------+-------+
only showing top 20 rows



                                                                                

#### sample()

In [97]:
df = spark.range(10)
df.sample(0.5, 3).count()

9

In [98]:
df.sample(fraction=0.5, seed=3).count()

9

In [99]:
df.sample(withReplacement=True, fraction=0.5, seed=3).count()

6

In [100]:
df.sample(1.0).count()

10

In [101]:
df.sample(fraction=1.0).count()

10

In [102]:
df.sample(False, fraction=1.0).count()

10

#### sampleBy()

In [104]:
dataset = spark.range(0, 100).select((col("id") % 3).alias("key"))
sampled = dataset.sampleBy("key", fractions={0: 0.1, 1: 0.2}, seed=0)
sampled.groupBy("key").count().orderBy("key").show()

+---+-----+
|key|count|
+---+-----+
|  0|    2|
|  1|    6|
+---+-----+



In [105]:
dataset.sampleBy(col("key"), fractions={2: 1.0}, seed=0).count()

33

#### schema (property)

In [106]:
df.schema

StructType([StructField('id', LongType(), False)])

In [107]:
tour.schema

StructType([StructField('Player Name', StringType(), True), StructField('Season', IntegerType(), True), StructField('Statistic', StringType(), True), StructField('Variable', StringType(), True), StructField('Value', StringType(), True)])

In [108]:
students.schema

StructType([StructField('gender', StringType(), True), StructField('race/ethnicity', StringType(), True), StructField('parental level of education', StringType(), True), StructField('lunch', StringType(), True), StructField('test preparation course', StringType(), True), StructField('math score', IntegerType(), True), StructField('reading score', IntegerType(), True), StructField('writing score', IntegerType(), True)])

#### select()

In [115]:
df = tour.limit(5)
df.select('*').show()

+---------------+------+----------------+--------------------+-----+
|    Player Name|Season|       Statistic|            Variable|Value|
+---------------+------+----------------+--------------------+-----+
|Robert Garrigus|  2010|Driving Distance|Driving Distance ...|   71|
|   Bubba Watson|  2010|Driving Distance|Driving Distance ...|   77|
| Dustin Johnson|  2010|Driving Distance|Driving Distance ...|   83|
|Brett Wetterich|  2010|Driving Distance|Driving Distance ...|   54|
|    J.B. Holmes|  2010|Driving Distance|Driving Distance ...|  100|
+---------------+------+----------------+--------------------+-----+



In [116]:
df.select('Player name', 'Value').show()

+---------------+-----+
|    Player name|Value|
+---------------+-----+
|Robert Garrigus|   71|
|   Bubba Watson|   77|
| Dustin Johnson|   83|
|Brett Wetterich|   54|
|    J.B. Holmes|  100|
+---------------+-----+



In [118]:
df.select(df["Player name"], (df.Value + 10).alias('Value')).show()

+---------------+-----+
|    Player name|Value|
+---------------+-----+
|Robert Garrigus| 81.0|
|   Bubba Watson| 87.0|
| Dustin Johnson| 93.0|
|Brett Wetterich| 64.0|
|    J.B. Holmes|110.0|
+---------------+-----+



#### selectExpr()

In [119]:
df.selectExpr("Value * 2", "abs(Value)").show()

+-----------+----------+
|(Value * 2)|abs(Value)|
+-----------+----------+
|      142.0|      71.0|
|      154.0|      77.0|
|      166.0|      83.0|
|      108.0|      54.0|
|      200.0|     100.0|
+-----------+----------+



In [132]:
df.selectExpr("*", "concat(Season, ' ', Value) as info").show()

+---------------+------+----------------+--------------------+-----+--------+
|    Player Name|Season|       Statistic|            Variable|Value|    info|
+---------------+------+----------------+--------------------+-----+--------+
|Robert Garrigus|  2010|Driving Distance|Driving Distance ...|   71| 2010 71|
|   Bubba Watson|  2010|Driving Distance|Driving Distance ...|   77| 2010 77|
| Dustin Johnson|  2010|Driving Distance|Driving Distance ...|   83| 2010 83|
|Brett Wetterich|  2010|Driving Distance|Driving Distance ...|   54| 2010 54|
|    J.B. Holmes|  2010|Driving Distance|Driving Distance ...|  100|2010 100|
+---------------+------+----------------+--------------------+-----+--------+



#### show()

In [140]:
tour.limit(5).show()

+---------------+------+----------------+--------------------+-----+
|    Player Name|Season|       Statistic|            Variable|Value|
+---------------+------+----------------+--------------------+-----+
|Robert Garrigus|  2010|Driving Distance|Driving Distance ...|   71|
|   Bubba Watson|  2010|Driving Distance|Driving Distance ...|   77|
| Dustin Johnson|  2010|Driving Distance|Driving Distance ...|   83|
|Brett Wetterich|  2010|Driving Distance|Driving Distance ...|   54|
|    J.B. Holmes|  2010|Driving Distance|Driving Distance ...|  100|
+---------------+------+----------------+--------------------+-----+



In [141]:
tour.limit(5).show(truncate=3)

+-----------+------+---------+--------+-----+
|Player Name|Season|Statistic|Variable|Value|
+-----------+------+---------+--------+-----+
|        Rob|   201|      Dri|     Dri|   71|
|        Bub|   201|      Dri|     Dri|   77|
|        Dus|   201|      Dri|     Dri|   83|
|        Bre|   201|      Dri|     Dri|   54|
|        J.B|   201|      Dri|     Dri|  100|
+-----------+------+---------+--------+-----+



In [143]:
tour.limit(5).show(vertical=True)

-RECORD 0---------------------------
 Player Name | Robert Garrigus      
 Season      | 2010                 
 Statistic   | Driving Distance     
 Variable    | Driving Distance ... 
 Value       | 71                   
-RECORD 1---------------------------
 Player Name | Bubba Watson         
 Season      | 2010                 
 Statistic   | Driving Distance     
 Variable    | Driving Distance ... 
 Value       | 77                   
-RECORD 2---------------------------
 Player Name | Dustin Johnson       
 Season      | 2010                 
 Statistic   | Driving Distance     
 Variable    | Driving Distance ... 
 Value       | 83                   
-RECORD 3---------------------------
 Player Name | Brett Wetterich      
 Season      | 2010                 
 Statistic   | Driving Distance     
 Variable    | Driving Distance ... 
 Value       | 54                   
-RECORD 4---------------------------
 Player Name | J.B. Holmes          
 Season      | 2010                 
 

#### sort()

In [145]:
students.sort(students["math score"].desc()).show(5)

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|  male|       group E|         associate's degree|free/reduced|              completed|       100|          100|           93|
|female|       group E|               some college|    standard|                   none|       100|           92|           97|
|female|       group E|          bachelor's degree|    standard|                   none|       100|          100|          100|
|  male|       group A|               some college|    standard|              completed|       100|           96|           86|
|  male|       group D|               some college|    standard|              completed|       100|     

In [146]:
students.sort("math score", ascending=False).show(5)

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|  male|       group E|         associate's degree|free/reduced|              completed|       100|          100|           93|
|female|       group E|               some college|    standard|                   none|       100|           92|           97|
|female|       group E|          bachelor's degree|    standard|                   none|       100|          100|          100|
|  male|       group A|               some college|    standard|              completed|       100|           96|           86|
|  male|       group D|               some college|    standard|              completed|       100|     

In [148]:
students.orderBy(students["math score"].desc()).show(5)

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|  male|       group E|         associate's degree|free/reduced|              completed|       100|          100|           93|
|female|       group E|               some college|    standard|                   none|       100|           92|           97|
|female|       group E|          bachelor's degree|    standard|                   none|       100|          100|          100|
|  male|       group A|               some college|    standard|              completed|       100|           96|           86|
|  male|       group D|               some college|    standard|              completed|       100|     

In [149]:
students.sort(asc("math score")).show(5)

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|female|       group C|           some high school|free/reduced|                   none|         0|           17|           10|
|female|       group B|                high school|free/reduced|                   none|         8|           24|           23|
|female|       group B|           some high school|free/reduced|                   none|        18|           32|           28|
|female|       group B|               some college|    standard|                   none|        19|           38|           32|
|female|       group C|               some college|free/reduced|                   none|        22|     

#### sortWithinPartitions()

In [150]:
students.sortWithinPartitions("math score", ascending=False).show(5)

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|  male|       group E|         associate's degree|free/reduced|              completed|       100|          100|           93|
|female|       group E|               some college|    standard|                   none|       100|           92|           97|
|female|       group E|          bachelor's degree|    standard|                   none|       100|          100|          100|
|  male|       group A|               some college|    standard|              completed|       100|           96|           86|
|  male|       group D|               some college|    standard|              completed|       100|     

In [152]:
students.sortWithinPartitions("gender", ascending=True).show(5)

+------+--------------+---------------------------+--------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|   lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+--------+-----------------------+----------+-------------+-------------+
|female|       group B|          bachelor's degree|standard|                   none|        72|           72|           74|
|female|       group C|               some college|standard|              completed|        69|           90|           88|
|female|       group B|            master's degree|standard|                   none|        90|           95|           93|
|female|       group B|         associate's degree|standard|                   none|        71|           83|           78|
|female|       group B|               some college|standard|              completed|        88|           95|           92|
+------+

#### stat (property)

In [153]:
tour.stat

<pyspark.sql.dataframe.DataFrameStatFunctions at 0x11fc44a10>

In [154]:
students.stat

<pyspark.sql.dataframe.DataFrameStatFunctions at 0x106608c90>

#### storageLevel (property)

In [156]:
tour.storageLevel

StorageLevel(False, False, False, False, 1)

In [158]:
tour.cache().storageLevel

StorageLevel(True, True, False, True, 1)

In [160]:
students.persist(StorageLevel.DISK_ONLY_2).storageLevel

StorageLevel(True, False, False, False, 2)

#### subtract()
This is equivalent to EXCEPT DISTINCT in SQL.

In [162]:
data1 = [("Alice", 25), ("Bob", 30), ("Charlie", 28)]
data2 = [("Bob", 30), ("Eve", 26)]
columns = ["name", "age"]

df1 = spark.createDataFrame(data1, columns)
df2 = spark.createDataFrame(data2, columns)

df1.subtract(df2).show()

+-------+---+
|   name|age|
+-------+---+
|Charlie| 28|
|  Alice| 25|
+-------+---+



#### summary()

In [163]:
tour.summary().show()
tour.summary("count", "min", "25%", "75%", "max").show()
tour.select("Value", "Player name").summary("count").show()

23/10/10 10:00:56 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+-------+--------------+-----------------+--------------------+--------------------+------------------+
|summary|   Player Name|           Season|           Statistic|            Variable|             Value|
+-------+--------------+-----------------+--------------------+--------------------+------------------+
|  count|       2740403|          2740403|             2740403|             2740403|           2696905|
|   mean|          NULL|2013.973479083186|                NULL|                NULL|137053.96543856157|
| stddev|          NULL|2.607050115517948|                NULL|                NULL| 6046342.246458653|
|    min|A.J. McInerney|             2010|% of Potential Pt...|% of Potential Pt...|        $1,001,580|
|    25%|          NULL|             2012|                NULL|                NULL|               7.0|
|    50%|          NULL|             2014|                NULL|                NULL|              58.0|
|    75%|          NULL|             2016|                NULL| 

                                                                                

+-------+--------------+-------+--------------------+--------------------+---------------+
|summary|   Player Name| Season|           Statistic|            Variable|          Value|
+-------+--------------+-------+--------------------+--------------------+---------------+
|  count|       2740403|2740403|             2740403|             2740403|        2696905|
|    min|A.J. McInerney|   2010|% of Potential Pt...|% of Potential Pt...|     $1,001,580|
|    25%|          NULL|   2012|                NULL|                NULL|            7.0|
|    75%|          NULL|   2016|                NULL|                NULL|           93.0|
|    max|    Zihao Chen|   2018|    World Money List|World Money List ...|the Memorial/Mu|
+-------+--------------+-------+--------------------+--------------------+---------------+

+-------+-------+-----------+
|summary|  Value|Player name|
+-------+-------+-----------+
|  count|2696905|    2740403|
+-------+-------+-----------+



In [166]:
students.summary().toPandas()

Unnamed: 0,summary,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,count,1000,1000,1000,1000,1000,1000.0,1000.0,1000.0
1,mean,,,,,,66.089,69.169,68.054
2,stddev,,,,,,15.163080096009454,14.600191937252225,15.19565701086966
3,min,female,group A,associate's degree,free/reduced,completed,0.0,17.0,10.0
4,25%,,,,,,57.0,59.0,57.0
5,50%,,,,,,66.0,70.0,69.0
6,75%,,,,,,77.0,79.0,79.0
7,max,male,group E,some high school,standard,none,100.0,100.0,100.0


In [167]:
students.summary("count", "min", "25%", "75%", "max").toPandas()

Unnamed: 0,summary,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,count,1000,1000,1000,1000,1000,1000,1000,1000
1,min,female,group A,associate's degree,free/reduced,completed,0,17,10
2,25%,,,,,,57,59,57
3,75%,,,,,,77,79,79
4,max,male,group E,some high school,standard,none,100,100,100


In [168]:
students.select("gender", "math score").summary("count").show()

+-------+------+----------+
|summary|gender|math score|
+-------+------+----------+
|  count|  1000|      1000|
+-------+------+----------+



#### tail()

In [172]:
tour.tail(1)

[Row(Player Name='Martin Flores', Season=2018, Statistic='Fairway Bunker Tendency', Variable='Fairway Bunker Tendency - (RELATIVE TO PAR)', Value='+0.181')]

In [173]:
students.tail(2)

[Row(gender='female', race/ethnicity='group D', parental level of education='some college', lunch='standard', test preparation course='completed', math score=68, reading score=78, writing score=77),
 Row(gender='female', race/ethnicity='group D', parental level of education='some college', lunch='free/reduced', test preparation course='none', math score=77, reading score=86, writing score=86)]

#### take()

In [174]:
students.take(2)

[Row(gender='female', race/ethnicity='group B', parental level of education="bachelor's degree", lunch='standard', test preparation course='none', math score=72, reading score=72, writing score=74),
 Row(gender='female', race/ethnicity='group C', parental level of education='some college', lunch='standard', test preparation course='completed', math score=69, reading score=90, writing score=88)]

In [175]:
tour.take(1)

[Row(Player Name='Robert Garrigus', Season=2010, Statistic='Driving Distance', Variable='Driving Distance - (ROUNDS)', Value='71')]

#### toDF()

In [178]:
tour.limit(10).toDF('name', "year", "stat", "val", "age").show(5)

+---------------+----+----------------+--------------------+---+
|           name|year|            stat|                 val|age|
+---------------+----+----------------+--------------------+---+
|Robert Garrigus|2010|Driving Distance|Driving Distance ...| 71|
|   Bubba Watson|2010|Driving Distance|Driving Distance ...| 77|
| Dustin Johnson|2010|Driving Distance|Driving Distance ...| 83|
|Brett Wetterich|2010|Driving Distance|Driving Distance ...| 54|
|    J.B. Holmes|2010|Driving Distance|Driving Distance ...|100|
+---------------+----+----------------+--------------------+---+
only showing top 5 rows



#### toJSON()

In [181]:
json_ = students.limit(5).toJSON()
json_

MapPartitionsRDD[1096] at toJavaRDD at NativeMethodAccessorImpl.java:0

In [183]:
json_.first()

'{"gender":"female","race/ethnicity":"group B","parental level of education":"bachelor\'s degree","lunch":"standard","test preparation course":"none","math score":72,"reading score":72,"writing score":74}'

In [185]:
type(json_)

pyspark.rdd.RDD

#### toLocalIterator()

In [186]:
list(tour.limit(5).toLocalIterator())

[Row(Player Name='Robert Garrigus', Season=2010, Statistic='Driving Distance', Variable='Driving Distance - (ROUNDS)', Value='71'),
 Row(Player Name='Bubba Watson', Season=2010, Statistic='Driving Distance', Variable='Driving Distance - (ROUNDS)', Value='77'),
 Row(Player Name='Dustin Johnson', Season=2010, Statistic='Driving Distance', Variable='Driving Distance - (ROUNDS)', Value='83'),
 Row(Player Name='Brett Wetterich', Season=2010, Statistic='Driving Distance', Variable='Driving Distance - (ROUNDS)', Value='54'),
 Row(Player Name='J.B. Holmes', Season=2010, Statistic='Driving Distance', Variable='Driving Distance - (ROUNDS)', Value='100')]

In [187]:
type(tour.limit(5).toLocalIterator())

generator

#### toPandas()

In [188]:
tour.limit(5).toPandas()

Unnamed: 0,Player Name,Season,Statistic,Variable,Value
0,Robert Garrigus,2010,Driving Distance,Driving Distance - (ROUNDS),71
1,Bubba Watson,2010,Driving Distance,Driving Distance - (ROUNDS),77
2,Dustin Johnson,2010,Driving Distance,Driving Distance - (ROUNDS),83
3,Brett Wetterich,2010,Driving Distance,Driving Distance - (ROUNDS),54
4,J.B. Holmes,2010,Driving Distance,Driving Distance - (ROUNDS),100


#### transform()

In [190]:
def cast_all_to_int(input_df):
    return input_df.select([col(col_name).cast("int") for col_name in input_df.columns])
def sort_columns_asc(input_df):
    return input_df.select(*sorted(input_df.columns))

df = spark.createDataFrame([(1, 1.0), (2, 2.0)], ["int", "float"])
df.transform(cast_all_to_int).transform(sort_columns_asc).show()

+-----+---+
|float|int|
+-----+---+
|    1|  1|
|    2|  2|
+-----+---+



#### union()
This is equivalent to UNION ALL in SQL.

#### unionAll()
This is equivalent to UNION ALL in SQL.

In [194]:
data1 = [("Alice", 25), ("Bob", 30), ("Charlie", 28)]
data2 = [("David", 32), ("Eve", 26), ("Alice", 25)]
columns = ["name", "age"]

df1 = spark.createDataFrame(data1, columns)
df2 = spark.createDataFrame(data2, columns)

df1.union(df2).show()

df1.unionAll(df2).show()

+-------+---+
|   name|age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 28|
|  David| 32|
|    Eve| 26|
|  Alice| 25|
+-------+---+

+-------+---+
|   name|age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 28|
|  David| 32|
|    Eve| 26|
|  Alice| 25|
+-------+---+



#### unionByName()
This is different from both UNION ALL and UNION DISTINCT in SQL.

In [195]:
df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])
df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col0"])
df1.unionByName(df2).show()

+----+----+----+
|col0|col1|col2|
+----+----+----+
|   1|   2|   3|
|   6|   4|   5|
+----+----+----+



#### unpersist()

In [196]:
df.unpersist()

DataFrame[int: bigint, float: double]

#### where()

In [198]:
tour.where(tour.Value == 2).show(5)

+----------------+------+---------------+--------------------+-----+
|     Player Name|Season|      Statistic|            Variable|Value|
+----------------+------+---------------+--------------------+-----+
|Kevin Sutherland|  2010|Putting Average|Putting Average -...|    2|
| Cameron Beckman|  2010|   Total Eagles|Total Eagles - (T...|    2|
|     Kris Blanks|  2010|   Total Eagles|Total Eagles - (T...|    2|
|    Justin Bolli|  2010|   Total Eagles|Total Eagles - (T...|    2|
| Michael Bradley|  2010|   Total Eagles|Total Eagles - (T...|    2|
+----------------+------+---------------+--------------------+-----+
only showing top 5 rows



#### withColumn()

In [202]:
df = students.limit(5).withColumn('rating', (students["math score"] * 4 + students["reading score"] * 3 + students["writing score"] * 3) / 10 )
df.toPandas()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,rating
0,female,group B,bachelor's degree,standard,none,72,72,74,72.6
1,female,group C,some college,standard,completed,69,90,88,81.0
2,female,group B,master's degree,standard,none,90,95,93,92.4
3,male,group A,associate's degree,free/reduced,none,47,57,44,49.1
4,male,group C,some college,standard,none,76,78,75,76.3


#### withColumnRenamed()

In [204]:
df.withColumnRenamed('rating', 'mark').toPandas()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,mark
0,female,group B,bachelor's degree,standard,none,72,72,74,72.6
1,female,group C,some college,standard,completed,69,90,88,81.0
2,female,group B,master's degree,standard,none,90,95,93,92.4
3,male,group A,associate's degree,free/reduced,none,47,57,44,49.1
4,male,group C,some college,standard,none,76,78,75,76.3


#### withWatermark()

In [205]:
from pyspark.sql.types import TimestampType

data = [("Alice", "2023-09-28 10:00:00"),
        ("Bob", "2023-09-28 10:05:00"),
        ("Charlie", "2023-09-28 10:10:00")]
columns = ["name", "timestamp"]

In [206]:
df = spark.createDataFrame(data, columns)
df = df.withColumn("timestamp", col("timestamp").cast(TimestampType()))

In [211]:
df_with_watermark = df.withWatermark("timestamp", "5 minutes")
windowed_df = df_with_watermark.groupBy(window("timestamp", "10 minutes")).count()

In [213]:
type(df_with_watermark), type(windowed_df)

(pyspark.sql.dataframe.DataFrame, pyspark.sql.dataframe.DataFrame)

#### write (property)
Interface for saving the content of the non-streaming DataFrame out into external storage.

#### writeStream (property)
Interface for saving the content of the streaming DataFrame out into external storage.

In [None]:
"""
windowed_df.writeStream \
    .outputMode("update") \
    .format("console") \
    .start() \
    .awaitTermination()
"""
