In [1]:
import pyspark
from pyspark.sql.functions import *

spark = pyspark.sql.SparkSession.builder.appName("dataframe").getOrCreate()

spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/04 10:34:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
path = "datasets/"
students = spark.read.csv(path+"students.csv", inferSchema=True, header=True)

# students.limit(5).toPandas() #need to import pandas
students.show(5)

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|female|       group B|          bachelor's degree|    standard|                   none|        72|           72|           74|
|female|       group C|               some college|    standard|              completed|        69|           90|           88|
|female|       group B|            master's degree|    standard|                   none|        90|           95|           93|
|  male|       group A|         associate's degree|free/reduced|                   none|        47|           57|           44|
|  male|       group C|               some college|    standard|                   none|        76|     

#### Get columns

In [3]:
students.lunch
#students["lunch"]
#from pyspark.sql.functions import col
#col("lunch")
#students.select("lunch") # return - DataFrame

Column<'lunch'>

#### agg()

In [4]:
students.agg({"math score": "max"}).collect()

[Row(max(math score)=100)]

In [5]:
students.agg(min(students["`math score`"])).collect()

[Row(min(math score)=0)]

#### alias()

In [6]:
students.filter(students["reading score"] > 80).groupBy("gender").agg(avg(col("math score")).alias("average")).show()

+------+-----------------+
|gender|          average|
+------+-----------------+
|female|79.16981132075472|
|  male| 87.6842105263158|
+------+-----------------+



In [7]:
tour = spark.read.csv(path+"pga_tour_historical.csv", inferSchema=True, header=True)
tour.show(5)

[Stage 13:>                                                       (0 + 12) / 12]

+---------------+------+----------------+--------------------+-----+
|    Player Name|Season|       Statistic|            Variable|Value|
+---------------+------+----------------+--------------------+-----+
|Robert Garrigus|  2010|Driving Distance|Driving Distance ...|   71|
|   Bubba Watson|  2010|Driving Distance|Driving Distance ...|   77|
| Dustin Johnson|  2010|Driving Distance|Driving Distance ...|   83|
|Brett Wetterich|  2010|Driving Distance|Driving Distance ...|   54|
|    J.B. Holmes|  2010|Driving Distance|Driving Distance ...|  100|
+---------------+------+----------------+--------------------+-----+
only showing top 5 rows



                                                                                

In [8]:
df = tour.limit(10)
df_as1 = df.alias("df_as1")
df_as2 = df.alias("df_as2")
joined_df = df_as1.join(df_as2, col("df_as1.Player Name") == col("df_as2.Player Name"), 'inner')
joined_df.select("df_as1.Player Name", "df_as2.Player Name", "df_as2.Value").sort(desc("df_as1.Player Name")).collect()

[Row(Player Name='Robert Garrigus', Player Name='Robert Garrigus', Value='71'),
 Row(Player Name='John Daly', Player Name='John Daly', Value='63'),
 Row(Player Name='J.B. Holmes', Player Name='J.B. Holmes', Value='100'),
 Row(Player Name='Graham DeLaet', Player Name='Graham DeLaet', Value='88'),
 Row(Player Name='Dustin Johnson', Player Name='Dustin Johnson', Value='83'),
 Row(Player Name='D.J. Trahan', Player Name='D.J. Trahan', Value='92'),
 Row(Player Name='Charles Warren', Player Name='Charles Warren', Value='64'),
 Row(Player Name='Bubba Watson', Player Name='Bubba Watson', Value='77'),
 Row(Player Name='Brett Wetterich', Player Name='Brett Wetterich', Value='54'),
 Row(Player Name='Angel Cabrera', Player Name='Angel Cabrera', Value='64')]

#### approxQuantile()

In [9]:
quantiles = students.approxQuantile(col=["math score", "reading score", "writing score"], probabilities=[0.25, 0.5, 0.75], relativeError=1)
# students.columns - list of columns' name
print("Approximate Quantiles:")
print("25th Percentile (Q1):", quantiles[0])
print("50th Percentile (Median, Q2):", quantiles[1])
print("75th Percentile (Q3):", quantiles[2])

Approximate Quantiles:
25th Percentile (Q1): [0.0, 0.0, 0.0]
50th Percentile (Median, Q2): [17.0, 17.0, 17.0]
75th Percentile (Q3): [10.0, 10.0, 10.0]


#### cashe()
##### треба глянути

In [10]:
import time
df = students.limit(50)

df.cache()

start_time = time.time()

result1 = df.filter(df["reading score"] > 80).groupBy("gender").agg({"math score": "avg"})
result2 = df.filter(df["reading score"] < 80).groupBy("gender").agg({"math score": "avg"})

result1.show()
result2.show()

print(f"execution_time - {time.time() - start_time}")

+------+---------------+
|gender|avg(math score)|
+------+---------------+
|female|           76.0|
|  male|           87.0|
+------+---------------+

+------+---------------+
|gender|avg(math score)|
+------+---------------+
|female|          57.05|
|  male|           59.0|
+------+---------------+

execution_time - 0.42634105682373047


In [11]:
start_time = time.time()

result3 = df.filter(df["reading score"] > 80).groupBy("gender").agg({"math score": "avg"})
result4 = df.filter(df["reading score"] < 80).groupBy("gender").agg({"math score": "avg"})

result3.show()
result4.show()

print(f"execution_time - {time.time() - start_time}")

+------+---------------+
|gender|avg(math score)|
+------+---------------+
|female|           76.0|
|  male|           87.0|
+------+---------------+

+------+---------------+
|gender|avg(math score)|
+------+---------------+
|female|          57.05|
|  male|           59.0|
+------+---------------+

execution_time - 0.19073891639709473


In [12]:
df.cache()
start_time = time.time()
result1 = df.groupBy("gender").count()
result2 = df.groupBy("parental level of education").agg({"math score": "avg", "writing score": "avg", "reading score": "avg"})

result1.show()
result2.show()

print(f"execution_time - {time.time() - start_time}")
df.unpersist()
start_time = time.time()

result3 = df.groupBy("gender").count()
result4 = df.groupBy("parental level of education").agg({"math score": "avg", "writing score": "avg", "reading score": "avg"})

result3.show()
result4.show()

print(f"execution_time - {time.time() - start_time}")

23/10/04 10:34:25 WARN CacheManager: Asked to cache already cached data.


+------+-----+
|gender|count|
+------+-----+
|female|   27|
|  male|   23|
+------+-----+

+---------------------------+------------------+------------------+------------------+
|parental level of education|avg(reading score)|avg(writing score)|   avg(math score)|
+---------------------------+------------------+------------------+------------------+
|          bachelor's degree| 70.66666666666667| 76.33333333333333|              71.0|
|               some college| 68.76923076923077| 66.38461538461539| 65.92307692307692|
|            master's degree| 67.66666666666667| 68.16666666666667|62.833333333333336|
|         associate's degree|           65.8125|           63.8125|            59.375|
|                high school|              73.5|             70.25|            67.375|
|           some high school|              61.0|              59.5|              51.5|
+---------------------------+------------------+------------------+------------------+

execution_time - 0.2996370792388916
+-

#### checkpoint()
##### треба глянути

#### Returns a checkpointed version of this Dataset. Checkpointing can be used to truncate the logical plan of this DataFrame, which is especially useful in iterative algorithms where the plan may grow exponentially. It will be saved to files inside the checkpoint directory set with SparkContext.setCheckpointDir().

Parameters
eager – Whether to checkpoint this DataFrame immediately

#### Note: Experimental

#### coalesce()

In [13]:
print(f"{students.rdd.getNumPartitions()}\n"
      f"{df.rdd.getNumPartitions()}\n"
      f"{students.repartition(2).rdd.getNumPartitions()}\n"
      f"{students.rdd.getNumPartitions()}\n"
      f"{df.coalesce(1).rdd.getNumPartitions()}")

1
1
2
1
1


#### colRegex()

In [14]:
df.select(df.colRegex("`.+score.?`")).show(5)

+----------+-------------+-------------+
|math score|reading score|writing score|
+----------+-------------+-------------+
|        72|           72|           74|
|        69|           90|           88|
|        90|           95|           93|
|        47|           57|           44|
|        76|           78|           75|
+----------+-------------+-------------+
only showing top 5 rows



#### collect()

In [15]:
df.select("math score", "reading score", "writing score").sort(desc("math score")).limit(5).collect()

[Row(math score=97, reading score=87, writing score=82),
 Row(math score=90, reading score=95, writing score=93),
 Row(math score=88, reading score=95, writing score=92),
 Row(math score=88, reading score=89, writing score=86),
 Row(math score=82, reading score=84, writing score=82)]

#### corr()

In [16]:
students.corr("reading score", "writing score")

0.9545980771462476

In [17]:
students.corr("math score", "writing score")

0.8026420459498075

In [18]:
students.corr("math score", "reading score")

0.8175796636720539

#### count()

In [19]:
df.count()

50

In [20]:
students.count()

1000

In [21]:
tour.count()

2740403

#### cov()

In [22]:
df.cov("reading score", "writing score")

200.49632653061224

In [23]:
students.cov("reading score", "writing score")

211.7866606606608

#### createGlobalTempView()

In [24]:
#spark.catalog.dropGlobalTempView("students")
#students.createGlobalTempView("students")
students.createOrReplaceGlobalTempView("students")

In [25]:
result = spark.sql("SELECT * FROM global_temp.students WHERE `math score` > 80")
result.show()

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|female|       group B|            master's degree|    standard|                   none|        90|           95|           93|
|female|       group B|               some college|    standard|              completed|        88|           95|           92|
|  male|       group C|                high school|    standard|                   none|        88|           89|           86|
|  male|       group E|               some college|    standard|                   none|        97|           87|           82|
|  male|       group E|         associate's degree|    standard|              completed|        81|     

#### crossJoin()

In [28]:
df = tour.limit(5).select("Value", "Player name")
df2 = tour.limit(5).select("Player name", "Season")
df.crossJoin(df2.select("Season")).select("Value", "Player name", "Season").show()

+-----+---------------+------+
|Value|    Player name|Season|
+-----+---------------+------+
|   71|Robert Garrigus|  2010|
|   71|Robert Garrigus|  2010|
|   71|Robert Garrigus|  2010|
|   71|Robert Garrigus|  2010|
|   71|Robert Garrigus|  2010|
|   77|   Bubba Watson|  2010|
|   77|   Bubba Watson|  2010|
|   77|   Bubba Watson|  2010|
|   77|   Bubba Watson|  2010|
|   77|   Bubba Watson|  2010|
|   83| Dustin Johnson|  2010|
|   83| Dustin Johnson|  2010|
|   83| Dustin Johnson|  2010|
|   83| Dustin Johnson|  2010|
|   83| Dustin Johnson|  2010|
|   54|Brett Wetterich|  2010|
|   54|Brett Wetterich|  2010|
|   54|Brett Wetterich|  2010|
|   54|Brett Wetterich|  2010|
|   54|Brett Wetterich|  2010|
+-----+---------------+------+
only showing top 20 rows



#### crosstab()

In [29]:
data = [("Alice", "Female", "A"),
        ("Bob", "Male", "B"),
        ("Charlie", "Male", "A"),
        ("David", "Male", "C"),
        ("Eve", "Female", "B")]

columns = ["Name", "Gender", "Grade"]
df = spark.createDataFrame(data, columns)

cross_tab = df.crosstab("Gender", "Grade")
cross_tab.show()

                                                                                

+------------+---+---+---+
|Gender_Grade|  A|  B|  C|
+------------+---+---+---+
|      Female|  1|  1|  0|
|        Male|  1|  1|  1|
+------------+---+---+---+



#### cube()

##### Create a multi-dimensional cube for the current DataFrame using the specified columns, so we can run aggregations on them.

In [30]:
tour.cube("Player name", tour.Value).count().orderBy("Player name", "Value").show()

[Stage 89:>                                                       (0 + 12) / 13]

+-----------+----------+-------+
|Player name|     Value|  count|
+-----------+----------+-------+
|       NULL|      NULL|  43498|
|       NULL|      NULL|2740403|
|       NULL|$1,001,580|      1|
|       NULL|$1,001,581|      2|
|       NULL|$1,002,036|      4|
|       NULL|$1,003,359|      1|
|       NULL|$1,003,362|      2|
|       NULL|$1,003,363|      2|
|       NULL|$1,004,033|      1|
|       NULL|$1,004,034|      2|
|       NULL|$1,004,035|      2|
|       NULL|$1,004,159|      1|
|       NULL|$1,004,160|      2|
|       NULL|$1,004,161|      2|
|       NULL|$1,004,693|      1|
|       NULL|$1,004,696|      4|
|       NULL|$1,005,321|      1|
|       NULL|$1,005,322|      2|
|       NULL|$1,007,218|      3|
|       NULL|$1,007,219|      2|
+-----------+----------+-------+
only showing top 20 rows



                                                                                

#### describe()

In [31]:
students.describe("math score", "reading score", "writing score").show()

+-------+------------------+------------------+-----------------+
|summary|        math score|     reading score|    writing score|
+-------+------------------+------------------+-----------------+
|  count|              1000|              1000|             1000|
|   mean|            66.089|            69.169|           68.054|
| stddev|15.163080096009454|14.600191937252223|15.19565701086966|
|    min|                 0|                17|               10|
|    max|               100|               100|              100|
+-------+------------------+------------------+-----------------+



#### distinct()

In [33]:
tour.count()

2740403

In [36]:
tour.distinct().count()

23/10/04 10:42:55 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/10/04 10:42:55 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/10/04 10:42:55 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/10/04 10:42:55 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/10/04 10:42:55 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/10/04 10:42:55 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/10/04 10:42:55 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/10/04 10:42:55 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/10/04 10:42:55 WARN RowBasedKeyValueBatch: Calling spill() on

2740403

#### drop()

In [39]:
tour.limit(5).drop("Value").show(5, False)

+---------------+------+----------------+---------------------------+
|Player Name    |Season|Statistic       |Variable                   |
+---------------+------+----------------+---------------------------+
|Robert Garrigus|2010  |Driving Distance|Driving Distance - (ROUNDS)|
|Bubba Watson   |2010  |Driving Distance|Driving Distance - (ROUNDS)|
|Dustin Johnson |2010  |Driving Distance|Driving Distance - (ROUNDS)|
|Brett Wetterich|2010  |Driving Distance|Driving Distance - (ROUNDS)|
|J.B. Holmes    |2010  |Driving Distance|Driving Distance - (ROUNDS)|
+---------------+------+----------------+---------------------------+



In [48]:
students.drop("math score", "reading score", "writing score").show(5, False)

+------+--------------+---------------------------+------------+-----------------------+
|gender|race/ethnicity|parental level of education|lunch       |test preparation course|
+------+--------------+---------------------------+------------+-----------------------+
|female|group B       |bachelor's degree          |standard    |none                   |
|female|group C       |some college               |standard    |completed              |
|female|group B       |master's degree            |standard    |none                   |
|male  |group A       |associate's degree         |free/reduced|none                   |
|male  |group C       |some college               |standard    |none                   |
+------+--------------+---------------------------+------------+-----------------------+
only showing top 5 rows



#### dropDuplicates() / drop_duplicates()

In [51]:
df = spark.createDataFrame([ \
    Row(name='Alice', age=5, height=80), \
    Row(name='Alice', age=5, height=80), \
    Row(name='Alice', age=10, height=80)])
df.dropDuplicates().show()

+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  5|    80|
|Alice| 10|    80|
+-----+---+------+



In [52]:
df.dropDuplicates(['name', 'height']).show()

+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  5|    80|
+-----+---+------+



In [54]:
tour.dropDuplicates(["Player name"]).count()

                                                                                

3053

#### dropna()

In [55]:
tour.dropna().count()

                                                                                

2696905

In [56]:
students.dropna().count()

1000

#### dtypes (property)

In [57]:
tour.dtypes

[('Player Name', 'string'),
 ('Season', 'int'),
 ('Statistic', 'string'),
 ('Variable', 'string'),
 ('Value', 'string')]

In [58]:
students.dtypes

[('gender', 'string'),
 ('race/ethnicity', 'string'),
 ('parental level of education', 'string'),
 ('lunch', 'string'),
 ('test preparation course', 'string'),
 ('math score', 'int'),
 ('reading score', 'int'),
 ('writing score', 'int')]

#### exceptAll()
##### Return a new DataFrame containing rows in this DataFrame but not in another DataFrame while preserving duplicates.

This is equivalent to EXCEPT ALL in SQL.

In [60]:
df1 = spark.createDataFrame([("a", 1), ("a", 1), ("a", 1), ("a", 2), ("b",  3), ("c", 4)], ["C1", "C2"])
df2 = spark.createDataFrame([("a", 1), ("b", 3)], ["C1", "C2"])
df1.exceptAll(df2).show()
# Also as standard in SQL, this function resolves columns by position (not by name).

+---+---+
| C1| C2|
+---+---+
|  a|  1|
|  a|  1|
|  a|  2|
|  c|  4|
+---+---+



#### explain()

In [61]:
tour.explain()

== Physical Plan ==
FileScan csv [Player Name#156,Season#157,Statistic#158,Variable#159,Value#160] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/Users/zsavchenko/spark_env/training/datasets/pga_tour_historical..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<Player Name:string,Season:int,Statistic:string,Variable:string,Value:string>




In [64]:
tour.explain(mode="formatted")

== Physical Plan ==
Scan csv  (1)


(1) Scan csv 
Output [5]: [Player Name#156, Season#157, Statistic#158, Variable#159, Value#160]
Batched: false
Location: InMemoryFileIndex [file:/Users/zsavchenko/spark_env/training/datasets/pga_tour_historical.csv]
ReadSchema: struct<Player Name:string,Season:int,Statistic:string,Variable:string,Value:string>




In [65]:
tour.explain(True)

== Parsed Logical Plan ==
Relation [Player Name#156,Season#157,Statistic#158,Variable#159,Value#160] csv

== Analyzed Logical Plan ==
Player Name: string, Season: int, Statistic: string, Variable: string, Value: string
Relation [Player Name#156,Season#157,Statistic#158,Variable#159,Value#160] csv

== Optimized Logical Plan ==
Relation [Player Name#156,Season#157,Statistic#158,Variable#159,Value#160] csv

== Physical Plan ==
FileScan csv [Player Name#156,Season#157,Statistic#158,Variable#159,Value#160] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/Users/zsavchenko/spark_env/training/datasets/pga_tour_historical..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<Player Name:string,Season:int,Statistic:string,Variable:string,Value:string>



#### fillna()
Replace null values, alias for na.fill().

In [67]:
tour.na.fill(50).dropna().count()

2696905

In [75]:
df = tour.na.fill({'Value': 50, 'Player name': 'unknown'})
df.count()

2740403

In [76]:
df.dropna().count()

                                                                                

2740403

#### filter()

In [80]:
df.filter(df.Value > 3).show(5)
df.where(df.Value == 2).show(5)

+---------------+------+----------------+--------------------+-----+
|    Player Name|Season|       Statistic|            Variable|Value|
+---------------+------+----------------+--------------------+-----+
|Robert Garrigus|  2010|Driving Distance|Driving Distance ...|   71|
|   Bubba Watson|  2010|Driving Distance|Driving Distance ...|   77|
| Dustin Johnson|  2010|Driving Distance|Driving Distance ...|   83|
|Brett Wetterich|  2010|Driving Distance|Driving Distance ...|   54|
|    J.B. Holmes|  2010|Driving Distance|Driving Distance ...|  100|
+---------------+------+----------------+--------------------+-----+
only showing top 5 rows

+----------------+------+---------------+--------------------+-----+
|     Player Name|Season|      Statistic|            Variable|Value|
+----------------+------+---------------+--------------------+-----+
|Kevin Sutherland|  2010|Putting Average|Putting Average -...|    2|
| Cameron Beckman|  2010|   Total Eagles|Total Eagles - (T...|    2|
|     Kri

In [82]:
df.filter("Value > 3").show(5)
df.where("Value = 2").show(5)

+---------------+------+----------------+--------------------+-----+
|    Player Name|Season|       Statistic|            Variable|Value|
+---------------+------+----------------+--------------------+-----+
|Robert Garrigus|  2010|Driving Distance|Driving Distance ...|   71|
|   Bubba Watson|  2010|Driving Distance|Driving Distance ...|   77|
| Dustin Johnson|  2010|Driving Distance|Driving Distance ...|   83|
|Brett Wetterich|  2010|Driving Distance|Driving Distance ...|   54|
|    J.B. Holmes|  2010|Driving Distance|Driving Distance ...|  100|
+---------------+------+----------------+--------------------+-----+
only showing top 5 rows

+----------------+------+---------------+--------------------+-----+
|     Player Name|Season|      Statistic|            Variable|Value|
+----------------+------+---------------+--------------------+-----+
|Kevin Sutherland|  2010|Putting Average|Putting Average -...|    2|
| Cameron Beckman|  2010|   Total Eagles|Total Eagles - (T...|    2|
|     Kri

#### first()

In [83]:
df.first()

Row(Player Name='Robert Garrigus', Season=2010, Statistic='Driving Distance', Variable='Driving Distance - (ROUNDS)', Value='71')

#### foreach()

In [98]:
def f(person):
    # if person.Value == 100:
    print(person.Value)
df.limit(10).foreach(f)

71
77
83
54
100
63
88
64
64
92


#### foreachPosition()

In [99]:
def f(people):
    for person in people:
        print(person.Value)
df.limit(5).foreachPartition(f)

71
77
83
54
100


#### freqItems()
##### DataFrame.freqItems() and DataFrameStatFunctions.freqItems() are aliases.

In [106]:
tour.freqItems(["Value"]).show()

+--------------------+
|     Value_freqItems|
+--------------------+
|[8, 6.7, 67, 27, ...|
+--------------------+



                                                                                

#### groupBy() / groupby()

In [107]:
students.groupBy().avg().collect()

[Row(avg(math score)=66.089, avg(reading score)=69.169, avg(writing score)=68.054)]

In [108]:
sorted(students.groupBy('gender').agg({'math score': 'mean'}).collect())

[Row(gender='female', avg(math score)=63.633204633204635),
 Row(gender='male', avg(math score)=68.72821576763485)]

In [109]:
sorted(students.groupBy(['gender', students.gender]).count().collect())

[Row(gender='female', gender='female', count=518),
 Row(gender='male', gender='male', count=482)]

In [111]:
sorted(students.groupBy(students.gender).avg().collect())

[Row(gender='female', avg(math score)=63.633204633204635, avg(reading score)=72.60810810810811, avg(writing score)=72.46718146718146),
 Row(gender='male', avg(math score)=68.72821576763485, avg(reading score)=65.47302904564316, avg(writing score)=63.31120331950208)]

#### head()

In [116]:
students.head(5)
# n – int, default 1. Number of rows to return.

[Row(gender='female', race/ethnicity='group B', parental level of education="bachelor's degree", lunch='standard', test preparation course='none', math score=72, reading score=72, writing score=74),
 Row(gender='female', race/ethnicity='group C', parental level of education='some college', lunch='standard', test preparation course='completed', math score=69, reading score=90, writing score=88),
 Row(gender='female', race/ethnicity='group B', parental level of education="master's degree", lunch='standard', test preparation course='none', math score=90, reading score=95, writing score=93),
 Row(gender='male', race/ethnicity='group A', parental level of education="associate's degree", lunch='free/reduced', test preparation course='none', math score=47, reading score=57, writing score=44),
 Row(gender='male', race/ethnicity='group C', parental level of education='some college', lunch='standard', test preparation course='none', math score=76, reading score=78, writing score=75)]

#### hint()
Specifies some hint on the current DataFrame.

Parameters:
name – A name of the hint.
parameters – Optional parameters.

Returns - DataFrame

In [118]:
students.limit(5).join(students.limit(10).hint("broadcast"), "gender").toPandas()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,race/ethnicity.1,parental level of education.1,lunch.1,test preparation course.1,math score.1,reading score.1,writing score.1
0,female,group B,bachelor's degree,standard,none,72,72,74,group B,high school,free/reduced,none,38,60,50
1,female,group B,bachelor's degree,standard,none,72,72,74,group B,some college,standard,completed,88,95,92
2,female,group B,bachelor's degree,standard,none,72,72,74,group B,associate's degree,standard,none,71,83,78
3,female,group B,bachelor's degree,standard,none,72,72,74,group B,master's degree,standard,none,90,95,93
4,female,group B,bachelor's degree,standard,none,72,72,74,group C,some college,standard,completed,69,90,88
5,female,group B,bachelor's degree,standard,none,72,72,74,group B,bachelor's degree,standard,none,72,72,74
6,female,group C,some college,standard,completed,69,90,88,group B,high school,free/reduced,none,38,60,50
7,female,group C,some college,standard,completed,69,90,88,group B,some college,standard,completed,88,95,92
8,female,group C,some college,standard,completed,69,90,88,group B,associate's degree,standard,none,71,83,78
9,female,group C,some college,standard,completed,69,90,88,group B,master's degree,standard,none,90,95,93


#### intersect()

In [128]:
data1 = [("Alice", 25), ("Bob", 30), ("Charlie", 28)]
data2 = [("Alice", 25), ("David", 32), ("Eve", 26)]
columns = ["name", "age"]

df1 = spark.createDataFrame(data1, columns)
df2 = spark.createDataFrame(data2, columns)

In [129]:
intersect_df = df1.intersect(df2)
intersect_df.show()

+-----+---+
| name|age|
+-----+---+
|Alice| 25|
+-----+---+



#### intersectAll()

In [131]:
data3 = [("Alice", 25), ("Charlie", 28), ("Frank", 22)]
df3 = spark.createDataFrame(data3, columns)

In [132]:
df1.intersectAll(df2).show()
intersect_all_df = df1.intersectAll(df2).intersectAll(df3).show()

+-----+---+
| name|age|
+-----+---+
|Alice| 25|
+-----+---+

+-----+---+
| name|age|
+-----+---+
|Alice| 25|
+-----+---+



#### isLocal()

In [135]:
if tour.isLocal:
    print("Local")
else:
    print("Cluster")

Local


#### isStreaming (property)

In [138]:
tour.isStreaming

False

### join

#### limit()

In [119]:
tour.limit(5)

DataFrame[Player Name: string, Season: int, Statistic: string, Variable: string, Value: string]

#### localCheckpoint()

#### mapInPandas()

#### na (property)

In [122]:
tour.na

<pyspark.sql.dataframe.DataFrameNaFunctions at 0x11f666550>

#### orderBy()

#### persist()

#### printSchema()

In [123]:
tour.printSchema()

root
 |-- Player Name: string (nullable = true)
 |-- Season: integer (nullable = true)
 |-- Statistic: string (nullable = true)
 |-- Variable: string (nullable = true)
 |-- Value: string (nullable = true)



#### randomSplit()

In [126]:
tour.randomSplit([1.0, 2.0], 24)

[DataFrame[Player Name: string, Season: int, Statistic: string, Variable: string, Value: string],
 DataFrame[Player Name: string, Season: int, Statistic: string, Variable: string, Value: string]]

#### rdd (property)

In [127]:
tour.rdd

MapPartitionsRDD[681] at javaToPython at NativeMethodAccessorImpl.java:0

#### registerTempTable()

#### repartition()

#### repartitionByRange()

#### replace()

#### rollup()

#### sample()

#### sampleBy()

#### schema (property)

#### select()

#### selectExpr()

#### show()

#### sort()

#### sortWithinPartitions()

#### stat (property)

#### storageLevel (property)

#### subtract()

#### summary()

#### tail()

#### take()

#### toDF()

#### toJSON()

#### toLocalIterator()

#### toPandas()

#### transform()

#### union()

#### unionAll()

#### unionByName()

#### unpersist()

#### where()

#### withColumn()

#### withColumnRenamed()

#### withWatermark()

#### write (property)

#### writeStream (property)