In [1]:
import pyspark
# NOTE: the wildcard import puts every pyspark.sql.functions name (col, when,
# rank, desc, ...) into the notebook namespace and shadows builtins such as
# min; later cells rely on these bare names.
from pyspark.sql.functions import *

# Create (or reuse) the SparkSession used by the cells below.
spark = (
    pyspark.sql.SparkSession.builder
    .appName("column_rows")
    .getOrCreate()
)

# Show the session summary as the cell output.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/04 01:44:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Folder (relative to the notebook) that holds the CSV inputs.
path = "datasets/"

# Load the students CSV: first line is the header, column types are inferred.
students = spark.read.csv(
    path + "students.csv",
    header=True,
    inferSchema=True,
)

# Preview five rows as a pandas frame.
students.limit(5).toPandas()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [3]:
# Load the PGA Tour statistics CSV: header row present, column types inferred.
tour = spark.read.csv(
    path + "pga_tour_historical.csv",
    header=True,
    inferSchema=True,
)

# Plain-text preview of the first five rows.
tour.show(5)

# From the first 100 rows keep the complete, distinct ones and render five as pandas.
(
    tour.limit(100)
    .dropna()
    .dropDuplicates()
    .limit(5)
    .toPandas()
)

                                                                                

+---------------+------+----------------+--------------------+-----+
|    Player Name|Season|       Statistic|            Variable|Value|
+---------------+------+----------------+--------------------+-----+
|Robert Garrigus|  2010|Driving Distance|Driving Distance ...|   71|
|   Bubba Watson|  2010|Driving Distance|Driving Distance ...|   77|
| Dustin Johnson|  2010|Driving Distance|Driving Distance ...|   83|
|Brett Wetterich|  2010|Driving Distance|Driving Distance ...|   54|
|    J.B. Holmes|  2010|Driving Distance|Driving Distance ...|  100|
+---------------+------+----------------+--------------------+-----+
only showing top 5 rows



Unnamed: 0,Player Name,Season,Statistic,Variable,Value
0,Robert Garrigus,2010,Driving Distance,Driving Distance - (ROUNDS),71
1,Bubba Watson,2010,Driving Distance,Driving Distance - (ROUNDS),77
2,Dustin Johnson,2010,Driving Distance,Driving Distance - (ROUNDS),83
3,Brett Wetterich,2010,Driving Distance,Driving Distance - (ROUNDS),54
4,J.B. Holmes,2010,Driving Distance,Driving Distance - (ROUNDS),100


## Column

In [4]:
# Work on a 50-row sample; attribute access on a DataFrame returns a Column object.
df = tour.limit(50)
df.Season

Column<'Season'>

In [5]:
# Bracket indexing by name returns the same kind of Column as attribute access.
df["Season"]

Column<'Season'>

In [6]:
# Arithmetic on a Column yields a new Column expression rather than a value.
df.Season + 1

Column<'(Season + 1)'>

In [7]:
# A plain number on the left-hand side also produces a Column expression.
1 / df.Season

Column<'(1 / Season)'>

#### alias(*alias, **kwargs)

In [8]:
# alias() exposes Season under the name Year in the selected result.
(
    df.select("Player Name", df.Season.alias("Year"), "Value")
    .limit(5)
    .collect()
)

[Row(Player Name='Robert Garrigus', Year=2010, Value='71'),
 Row(Player Name='Bubba Watson', Year=2010, Value='77'),
 Row(Player Name='Dustin Johnson', Year=2010, Value='83'),
 Row(Player Name='Brett Wetterich', Year=2010, Value='54'),
 Row(Player Name='J.B. Holmes', Year=2010, Value='100')]

In [9]:
# Attach metadata while aliasing, then read it back from the resulting schema field.
(
    df.select(df.Value.alias("vl", metadata={'max': 99}))
    .schema['vl']
    .metadata['max']
)

99

In [10]:
# The DataFrame repr lists only column name and type; the metadata is not shown there.
df.select(df.Value.alias("vl", metadata={'max': 99}))

DataFrame[vl: string]

In [11]:
# Metadata may carry several keys; fetch the "author" entry from the schema field.
(
    df.select(df.Value.alias("vl", metadata={'max': 99, "author": "Znn"}))
    .schema['vl']
    .metadata['author']
)

'Znn'

In [12]:
# As before, the repr shows only name and type even with two metadata keys attached.
df.select(df.Value.alias("vl", metadata={'max': 99, "author": "Znn"}))

DataFrame[vl: string]

#### asc()

In [13]:
# Two-row frame for the sort demonstrations.
df = spark.createDataFrame(
    [('Tom', 80), ('Alice', None)],
    ["name", "height"],
)

# asc(): ascending order by name.
(
    df.select(df.name)
    .orderBy(df.name.asc())
    .collect()
)

                                                                                

[Row(name='Alice'), Row(name='Tom')]

In [14]:
# Row count of the full tour dataset.
tour.count()

                                                                                

2740403

In [15]:
# Value is a string column, so asc() orders it lexicographically, not numerically.
(
    tour.limit(100000)
    .select("Player Name", "Season", "Value")
    .dropna()
    .dropDuplicates()
    .orderBy(tour.Value.asc())
    .show(5)
)

23/10/04 01:44:29 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
[Stage 17:>                                                       (0 + 12) / 12]

+--------------+------+----------+
|   Player Name|Season|     Value|
+--------------+------+----------+
|   Chris Riley|  2010|$1,001,581|
|   Josh Teater|  2010|$1,005,322|
| Shaun Micheel|  2010|$1,025,500|
|Justin Leonard|  2010|$1,026,445|
|   Ryuji Imada|  2010|$1,028,868|
+--------------+------+----------+
only showing top 5 rows



                                                                                

In [16]:
# Same query, but collect() returns every row as a list before the first five are sliced off.
(
    tour.limit(100000)
    .select("Player Name", "Season", "Value")
    .dropna()
    .dropDuplicates()
    .orderBy(tour.Value.asc())
    .collect()[:5]
)

[Row(Player Name='Chris Riley', Season=2010, Value='$1,001,581'),
 Row(Player Name='Josh Teater', Season=2010, Value='$1,005,322'),
 Row(Player Name='Shaun Micheel', Season=2010, Value='$1,025,500'),
 Row(Player Name='Justin Leonard', Season=2010, Value='$1,026,445'),
 Row(Player Name='Ryuji Imada', Season=2010, Value='$1,028,868')]

In [17]:
# Without a final action, the chained calls still evaluate to a DataFrame.
type(
    tour.limit(100000)
    .select("Player Name", "Season", "Value")
    .dropna()
    .dropDuplicates()
    .orderBy(tour.Value.asc())
)

pyspark.sql.dataframe.DataFrame

In [18]:
# collect() turns the result into a plain Python list.
type(
    tour.limit(100000)
    .select("Player Name", "Season", "Value")
    .dropna()
    .dropDuplicates()
    .orderBy(tour.Value.asc())
    .collect()
)

list

#### asc_nulls_first()

In [19]:
# Three-row frame containing a null name, used to show where nulls land when sorting.
df = spark.createDataFrame(
    [('Tom', 80), (None, 60), ('Alice', None)],
    ["name", "height"],
)

In [20]:
# asc_nulls_first(): ascending order with the null name placed before the others.
df.select(df.name).orderBy(df.name.asc_nulls_first()).collect()

[Row(name=None), Row(name='Alice'), Row(name='Tom')]

In [21]:
# On the full dataset the rows with a null Value come first.
(
    tour.select("Player Name", "Season", "Value")
    .dropDuplicates()
    .orderBy(tour.Value.asc_nulls_first())
    .show(5)
)

[Stage 30:====>                                                   (1 + 11) / 12]

+-------------+------+-----+
|  Player Name|Season|Value|
+-------------+------+-----+
| Geoff Ogilvy|  2010| NULL|
|   Lee Janzen|  2010| NULL|
|Patrick Moore|  2010| NULL|
|    Jeff Peck|  2010| NULL|
|   Ben Martin|  2010| NULL|
+-------------+------+-----+
only showing top 5 rows



                                                                                

#### asc_nulls_last()

In [22]:
# Same three-row frame; this time the null name is sorted to the end.
df = spark.createDataFrame(
    [('Tom', 80), (None, 60), ('Alice', None)],
    ["name", "height"],
)
(
    df.select(df.name)
    .orderBy(df.name.asc_nulls_last())
    .collect()
)

[Row(name='Alice'), Row(name='Tom'), Row(name=None)]

In [23]:
# With nulls pushed to the end, the smallest strings (lexicographic order) appear first.
(
    tour.select("Player Name", "Season", "Value")
    .dropDuplicates()
    .orderBy(tour.Value.asc_nulls_last())
    .show(5)
)

[Stage 37:====>                                                   (1 + 11) / 12]

+--------------+------+----------+
|   Player Name|Season|     Value|
+--------------+------+----------+
|   Chris Riley|  2010|$1,001,580|
|   Chris Riley|  2010|$1,001,581|
|Steve Stricker|  2017|$1,002,036|
|  Robert Streb|  2016|$1,003,359|
|  Robert Streb|  2016|$1,003,362|
+--------------+------+----------+
only showing top 5 rows



                                                                                

#### astype(dataType)

In [24]:
# IntegerType is not used in this cell; the cast(dataType) cell further down relies on this import.
from pyspark.sql.types import IntegerType
# Replace the string Value column with its integer cast.
df = tour.withColumn("Value", tour["Value"].astype("int"))
# The schema now reports Value as integer.
df.printSchema()

root
 |-- Player Name: string (nullable = true)
 |-- Season: integer (nullable = true)
 |-- Statistic: string (nullable = true)
 |-- Variable: string (nullable = true)
 |-- Value: integer (nullable = true)



In [25]:
# astype("int") is represented as a CAST(Value AS INT) expression.
tour["Value"].astype("int")

Column<'CAST(Value AS INT)'>

#### between(lowerBound, upperBound)

In [26]:
# Cast Value to int so the range check compares numbers.
df = tour.withColumn("Value", tour["Value"].astype("int"))
# between() includes both bounds: it expands to (Value >= 0) AND (Value <= 75).
df.select("Player name", df.Value.between(0, 75)).show(5)

+---------------+--------------------------------+
|    Player name|((Value >= 0) AND (Value <= 75))|
+---------------+--------------------------------+
|Robert Garrigus|                            true|
|   Bubba Watson|                           false|
| Dustin Johnson|                           false|
|Brett Wetterich|                            true|
|    J.B. Holmes|                           false|
+---------------+--------------------------------+
only showing top 5 rows



#### bitwiseAND(other)

In [27]:
# 7 & 3 -> 3 (0b111 & 0b011).
df = spark.createDataFrame([pyspark.sql.Row(a=7, b=3)])
df.select(df.a.bitwiseAND(df.b)).collect()

[Row((a & b)=3)]

In [28]:
# Bitwise AND of two integer columns: Season and the int-cast Value.
df = tour.limit(100).withColumn("Value", tour["Value"].astype("int"))
df.select("Player name", df.Season.bitwiseAND(df.Value)).show(5)

+---------------+----------------+
|    Player name|(Season & Value)|
+---------------+----------------+
|Robert Garrigus|              66|
|   Bubba Watson|              72|
| Dustin Johnson|              82|
|Brett Wetterich|              18|
|    J.B. Holmes|              64|
+---------------+----------------+
only showing top 5 rows



#### bitwiseOR(other)

In [29]:
# 7 | 3 -> 7 (0b111 | 0b011).
df = spark.createDataFrame([pyspark.sql.Row(a=7, b=3)])
df.select(df.a.bitwiseOR(df.b)).collect()

[Row((a | b)=7)]

In [30]:
# AND, OR and XOR of Season with the int-cast Value, printed as one table each.
tour.select("Player name", tour.Season.bitwiseAND(tour.Value.astype("int"))).show(5)
tour.select("Player name", tour.Season.bitwiseOR(tour.Value.astype("int"))).show(5)
tour.select("Player name", tour.Season.bitwiseXOR(tour.Value.astype("int"))).show(5)

+---------------+-----------------------------+
|    Player name|(Season & CAST(Value AS INT))|
+---------------+-----------------------------+
|Robert Garrigus|                           66|
|   Bubba Watson|                           72|
| Dustin Johnson|                           82|
|Brett Wetterich|                           18|
|    J.B. Holmes|                           64|
+---------------+-----------------------------+
only showing top 5 rows

+---------------+-----------------------------+
|    Player name|(Season | CAST(Value AS INT))|
+---------------+-----------------------------+
|Robert Garrigus|                         2015|
|   Bubba Watson|                         2015|
| Dustin Johnson|                         2011|
|Brett Wetterich|                         2046|
|    J.B. Holmes|                         2046|
+---------------+-----------------------------+
only showing top 5 rows

+---------------+-----------------------------+
|    Player name|(Season ^ CAST(Value 

#### bitwiseXOR(other)

In [31]:
# 7 ^ 3 -> 4 (0b111 ^ 0b011).
df = spark.createDataFrame([pyspark.sql.Row(a=7, b=3)])
df.select(df.a.bitwiseXOR(df.b)).collect()

[Row((a ^ b)=4)]

#### cast(dataType)

In [32]:
# NOTE(review): the cast column keeps the name "Value", so the result carries two
# columns called Value (string and integer); add .alias(...) if this frame is ever reused.
(
    tour
    .select("*", tour.Value.cast(IntegerType()))
    .printSchema()
)

root
 |-- Player Name: string (nullable = true)
 |-- Season: integer (nullable = true)
 |-- Statistic: string (nullable = true)
 |-- Variable: string (nullable = true)
 |-- Value: string (nullable = true)
 |-- Value: integer (nullable = true)



In [33]:
# Inspect the inferred column types of the students frame before casting.
students.printSchema()

root
 |-- gender: string (nullable = true)
 |-- race/ethnicity: string (nullable = true)
 |-- parental level of education: string (nullable = true)
 |-- lunch: string (nullable = true)
 |-- test preparation course: string (nullable = true)
 |-- math score: integer (nullable = true)
 |-- reading score: integer (nullable = true)
 |-- writing score: integer (nullable = true)



In [34]:
from pyspark.sql.types import StringType

# Scores under 75 become the text "Not pass"; the remaining scores are cast to
# string so that both branches of the expression produce the same type.
students.withColumn(
    "math score",
    when(col("math score") < 75, "Not pass")
    .otherwise(col("math score").cast(StringType())),
).show()

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|female|       group B|          bachelor's degree|    standard|                   none|  Not pass|           72|           74|
|female|       group C|               some college|    standard|              completed|  Not pass|           90|           88|
|female|       group B|            master's degree|    standard|                   none|        90|           95|           93|
|  male|       group A|         associate's degree|free/reduced|                   none|  Not pass|           57|           44|
|  male|       group C|               some college|    standard|                   none|        76|     

In [35]:
from pyspark.sql.types import DoubleType

# Cast two of the integer score columns to double, one withColumn call per column.
(
    students
    .withColumn("math score", col("math score").cast(DoubleType()))
    .withColumn("reading score", col("reading score").cast(DoubleType()))
    .show()
)

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|female|       group B|          bachelor's degree|    standard|                   none|      72.0|         72.0|           74|
|female|       group C|               some college|    standard|              completed|      69.0|         90.0|           88|
|female|       group B|            master's degree|    standard|                   none|      90.0|         95.0|           93|
|  male|       group A|         associate's degree|free/reduced|                   none|      47.0|         57.0|           44|
|  male|       group C|               some college|    standard|                   none|      76.0|     

#### contains(other)

In [36]:
# Keep the rows whose player name contains the substring "enk".
tour.filter(tour["Player name"].contains("enk")).show()

+---------------+------+--------------------+--------------------+-------+
|    Player Name|Season|           Statistic|            Variable|  Value|
+---------------+------+--------------------+--------------------+-------+
|Tommy Biershenk|  2010|Total Money (Offi...|Total Money (Offi...|      1|
|Tommy Biershenk|  2010|Total Money (Offi...|Total Money (Offi...|  5,000|
|    Nolan Henke|  2010|Best YTD Streak w...|Best YTD Streak w...|      1|
|    Nolan Henke|  2010|Best YTD Streak w...|Best YTD Streak w...|     26|
|    Nolan Henke|  2010|Best YTD Streak w...|Best YTD Streak w...|      9|
|    Nolan Henke|  2010|Best YTD 1-Putt o...|Best YTD 1-Putt o...|      1|
|    Nolan Henke|  2010|Best YTD 1-Putt o...|Best YTD 1-Putt o...|      3|
|    Nolan Henke|  2010|Consecutive Sand ...|Consecutive Sand ...|      1|
|    Nolan Henke|  2010|Consecutive Sand ...|Consecutive Sand ...|      1|
|    Nolan Henke|  2010|Consecutive Sand ...|Consecutive Sand ...|   NULL|
|    Nolan Henke|  2010|C

#### desc()

In [37]:
# desc(): descending order by name.
df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"])
df.select(df.name).orderBy(df.name.desc()).show()

+-----+
| name|
+-----+
|  Tom|
|Alice|
+-----+



#### desc_nulls_first()

In [38]:
# desc_nulls_first(): descending order with the null name listed first.
df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
df.select(df.name).orderBy(df.name.desc_nulls_first()).show()

+-----+
| name|
+-----+
| NULL|
|  Tom|
|Alice|
+-----+



#### desc_nulls_last()

In [39]:
# desc_nulls_last(): descending order with the null name listed last.
df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
df.select(df.name).orderBy(df.name.desc_nulls_last()).show()


+-----+
| name|
+-----+
|  Tom|
|Alice|
| NULL|
+-----+



#### endswith(other)

In [40]:
# Keep the rows whose player name ends with "ko".
tour.filter(tour["Player name"].endswith("ko")).show()

+-----------+------+--------------------+--------------------+------+
|Player Name|Season|           Statistic|            Variable| Value|
+-----------+------+--------------------+--------------------+------+
|Roope Kakko|  2010|Official World Go...|Official World Go...|    40|
|Roope Kakko|  2010|Official World Go...|Official World Go...|   .18|
|Roope Kakko|  2010|Official World Go...|Official World Go...|  7.12|
|Roope Kakko|  2010|Official World Go...|Official World Go...|-17.21|
|Roope Kakko|  2010|Official World Go...|Official World Go...|  1.74|
|Roope Kakko|  2010|Official World Go...|Official World Go...|   FIN|
|Roope Kakko|  2012|Official World Go...|Official World Go...|    40|
|Roope Kakko|  2012|Official World Go...|Official World Go...|   .34|
|Roope Kakko|  2012|Official World Go...|Official World Go...| 13.61|
|Roope Kakko|  2012|Official World Go...|Official World Go...| -2.01|
|Roope Kakko|  2012|Official World Go...|Official World Go...| 14.29|
|Roope Kakko|  2012|

#### eqNullSafe(other)

In [41]:
from pyspark.sql import Row
# df1: one row with a string value and one row whose value is null.
df1 = spark.createDataFrame([
    Row(id=1, value='foo'),
    Row(id=2, value=None)
])

# df2: a different string value plus a null, used by the join cells that follow.
df2 = spark.createDataFrame([
    Row(value = 'bar'),
    Row(value = None)
])

# Plain == yields NULL for the null row, whereas eqNullSafe always yields true or false.
df1.select(
    df1['value'] == 'foo',
    df1['value'].eqNullSafe('foo'),
    df1['value'].eqNullSafe(None)
).show()

+-------------+---------------+----------------+
|(value = foo)|(value <=> foo)|(value <=> NULL)|
+-------------+---------------+----------------+
|         true|           true|           false|
|         NULL|          false|            true|
+-------------+---------------+----------------+



In [42]:
# With ==, the null values never compare equal, so no rows are joined.
df1.join(df2, df1["value"] == df2["value"]).count()

0

In [43]:
# eqNullSafe treats null as equal to null, so the two null rows are joined.
df1.join(df2, df1["value"].eqNullSafe(df2["value"])).count()

1

In [44]:
# Float column holding NaN, a regular number and a null.
df2 = spark.createDataFrame([
    Row(id=1, value=float('NaN')),
    Row(id=2, value=42.0),
    Row(id=3, value=None)
])
# eqNullSafe reports true for NaN compared with NaN and for null compared with null.
df2.select(
    df2['value'].eqNullSafe(None),
    df2['value'].eqNullSafe(float('NaN')),
    df2['value'].eqNullSafe(42.0)
).show()

+----------------+---------------+----------------+
|(value <=> NULL)|(value <=> NaN)|(value <=> 42.0)|
+----------------+---------------+----------------+
|           false|           true|           false|
|           false|          false|            true|
|            true|          false|           false|
+----------------+---------------+----------------+



#### getField(name)

In [45]:
# Wrap the first tour row in a single column r, then extract one of its fields by name.
df = spark.createDataFrame([Row(r=tour.first())])
df.select(df.r.getField("Player name")).show()

+---------------+
|  r.Player name|
+---------------+
|Robert Garrigus|
+---------------+



In [46]:
# getField() also works on col("r"); show(1, False) prints one row without truncating it.
df.withColumn("Value", col("r").getField("Value")).show(1, False)

+--------------------------------------------------------------------------+-----+
|r                                                                         |Value|
+--------------------------------------------------------------------------+-----+
|{Robert Garrigus, 2010, Driving Distance, Driving Distance - (ROUNDS), 71}|71   |
+--------------------------------------------------------------------------+-----+



#### getItem(key) 

In [47]:
# getItem() takes a position for the list column l and a key for the dict column d.
df = spark.createDataFrame([([1, 2], {"key": "value"})], ["l", "d"])
df.select(df.l.getItem(0), df.d.getItem("key")).show()

+----+------+
|l[0]|d[key]|
+----+------+
|   1| value|
+----+------+



#### isNotNull()

In [48]:
# Keep only the rows where height has a value.
df = spark.createDataFrame([Row(name='Tom', height=80), Row(name='Alice', height=None)])
df.filter(df.height.isNotNull()).collect()

[Row(name='Tom', height=80)]

In [49]:
# Baseline row count, for comparison with the filtered count in the next cell.
tour.count()

2740403

In [50]:
# Number of rows whose Value is present.
tour.filter(tour.Value.isNotNull()).count()

                                                                                

2696905

#### isNull()

In [51]:
# Keep only the rows where height is null.
df = spark.createDataFrame([Row(name='Tom', height=80), Row(name='Alice', height=None)])
df.filter(df.height.isNull()).collect()

[Row(name='Alice', height=None)]

In [52]:
# Baseline row count, for comparison with the null count in the next cell.
tour.count()

2740403

In [53]:
# Number of rows whose Value is null.
tour.filter(tour.Value.isNull()).count()

43498

#### isin(*cols)

In [54]:
# isin() compares whole values, so no player is named exactly "Bob" or "Mike";
# indexing the DataFrame with the boolean Column filters its rows.
tour[tour["Player name"].isin("Bob", "Mike")].collect()

[]

In [55]:
# isin() also accepts a single list; Value is a string column, yet rows holding '1', '2' or '3' match.
tour[tour.Value.isin([1, 2, 3])].collect()

[Row(Player Name='Joe Durant', Season=2010, Statistic='Putting Average', Variable='Putting Average - (GIR RANK)', Value='3'),
 Row(Player Name='Kevin Sutherland', Season=2010, Statistic='Putting Average', Variable='Putting Average - (GIR RANK)', Value='2'),
 Row(Player Name='John Senden', Season=2010, Statistic='Putting Average', Variable='Putting Average - (GIR RANK)', Value='1'),
 Row(Player Name='David Duval', Season=2010, Statistic='Total Eagles', Variable='Total Eagles - (TOTAL)', Value='3'),
 Row(Player Name='Ernie Els', Season=2010, Statistic='Total Eagles', Variable='Total Eagles - (TOTAL)', Value='3'),
 Row(Player Name='Bob Estes', Season=2010, Statistic='Total Eagles', Variable='Total Eagles - (TOTAL)', Value='3'),
 Row(Player Name='Matt Every', Season=2010, Statistic='Total Eagles', Variable='Total Eagles - (TOTAL)', Value='3'),
 Row(Player Name='Jim Furyk', Season=2010, Statistic='Total Eagles', Variable='Total Eagles - (TOTAL)', Value='3'),
 Row(Player Name='Jeff Gove', Se

#### like(other)

In [56]:
# SQL LIKE pattern: % stands for any run of characters, so this finds names containing "All".
tour.filter(tour["Player name"].like('%All%')).collect()

[Row(Player Name='Robert Allenby', Season=2010, Statistic='Driving Distance', Variable='Driving Distance - (ROUNDS)', Value='76'),
 Row(Player Name='Robert Allenby', Season=2010, Statistic='Driving Distance', Variable='Driving Distance - (AVG.)', Value='289.1'),
 Row(Player Name='Robert Allenby', Season=2010, Statistic='Driving Distance', Variable='Driving Distance - (TOTAL DISTANCE)', Value='42,505'),
 Row(Player Name='Robert Allenby', Season=2010, Statistic='Driving Distance', Variable='Driving Distance - (TOTAL DRIVES)', Value='147'),
 Row(Player Name='Robert Allenby', Season=2010, Statistic='Driving Accuracy Percentage', Variable='Driving Accuracy Percentage - (ROUNDS)', Value='76'),
 Row(Player Name='Robert Allenby', Season=2010, Statistic='Driving Accuracy Percentage', Variable='Driving Accuracy Percentage - (%)', Value='64.75'),
 Row(Player Name='Robert Allenby', Season=2010, Statistic='Driving Accuracy Percentage', Variable='Driving Accuracy Percentage - (FAIRWAYS HIT)', Value=

#### name(*alias, **kwargs)

In [57]:
# NOTE(review): name() is called without an alias here, which yields the
# multialias(Value) expression shown in the output rather than a renamed column;
# pass a name, e.g. tour["Value"].name("val"), to alias it. TODO confirm intent.
tour["Value"].name()

Column<'multialias(Value)'>

In [58]:
# Small frame for the name() demonstration.
data = [("John", "Doe", 25), ("Jane", "Smith", 22)]
columns = ["first_name", "last_name", "age"]
df = spark.createDataFrame(data, columns)

# NOTE(review): name() without an argument does not return the column's name as a
# string; it builds an alias expression with no names (see the output of this cell).
df["first_name"].name()

# Disabled variant that prints the same expression:
# name = df["first_name"].name()
# print(name)

Column<'multialias(first_name)'>

#### otherwise(value)

In [59]:
# Rows with gender "male" get lunch "standard"; every other row keeps its current value.
students.withColumn(
    "lunch",
    when(col("gender") == "male", "standard").otherwise(col("lunch")),
).show()

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|female|       group B|          bachelor's degree|    standard|                   none|        72|           72|           74|
|female|       group C|               some college|    standard|              completed|        69|           90|           88|
|female|       group B|            master's degree|    standard|                   none|        90|           95|           93|
|  male|       group A|         associate's degree|    standard|                   none|        47|           57|           44|
|  male|       group C|               some college|    standard|                   none|        76|     

#### over(window)

In [60]:
from pyspark.sql import Window

# Per-player frame ordered by Value, spanning from the partition's first row to the current row.
window = (
    Window.partitionBy("Player name")
    .orderBy("Value")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# rank, min and desc are the pyspark.sql.functions versions (wildcard import), not builtins.
# Value is a string column, so the ordering, the running min and the final sort are lexicographic.
(
    tour
    .withColumn("rank", rank().over(window))
    .withColumn("min", min('Value').over(window))
    .sort(desc("Value"))
    .show()
)



+---------------+------+--------------------+--------------------+--------------------+-----+------------+
|    Player Name|Season|           Statistic|            Variable|               Value| rank|         min|
+---------------+------+--------------------+--------------------+--------------------+-----+------------+
|    Kenny Perry|  2018|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu| 2147| $10,260,000|
|    Andrew Dorn|  2018|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|  117|  $1,602,000|
|    Harry Ellis|  2018|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|   86|           1|
|   Ryan Ruffels|  2016|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|  336|     $11,131|
|      John Hahn|  2016|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|  102|      -12.97|
|   Jason Dufner|  2017|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|13163|  $1,007,996|
| Soren Kjeldsen|  2016|        Lowes

                                                                                

#### rlike(other)

In [61]:
# rlike() takes a regular expression; this unanchored pattern matches "All" anywhere in the name.
tour.filter(tour["Player name"].rlike('.?All.?')).collect()

                                                                                

[Row(Player Name='Robert Allenby', Season=2010, Statistic='Driving Distance', Variable='Driving Distance - (ROUNDS)', Value='76'),
 Row(Player Name='Robert Allenby', Season=2010, Statistic='Driving Distance', Variable='Driving Distance - (AVG.)', Value='289.1'),
 Row(Player Name='Robert Allenby', Season=2010, Statistic='Driving Distance', Variable='Driving Distance - (TOTAL DISTANCE)', Value='42,505'),
 Row(Player Name='Robert Allenby', Season=2010, Statistic='Driving Distance', Variable='Driving Distance - (TOTAL DRIVES)', Value='147'),
 Row(Player Name='Robert Allenby', Season=2010, Statistic='Driving Accuracy Percentage', Variable='Driving Accuracy Percentage - (ROUNDS)', Value='76'),
 Row(Player Name='Robert Allenby', Season=2010, Statistic='Driving Accuracy Percentage', Variable='Driving Accuracy Percentage - (%)', Value='64.75'),
 Row(Player Name='Robert Allenby', Season=2010, Statistic='Driving Accuracy Percentage', Variable='Driving Accuracy Percentage - (FAIRWAYS HIT)', Value=

#### startswith(other)

In [62]:
# Keep the rows whose player name begins with "Al".
tour.filter(tour["Player name"].startswith('Al')).collect()

[Row(Player Name='Alex Prugh', Season=2010, Statistic='Driving Distance', Variable='Driving Distance - (ROUNDS)', Value='88'),
 Row(Player Name='Alex Cejka', Season=2010, Statistic='Driving Distance', Variable='Driving Distance - (ROUNDS)', Value='81'),
 Row(Player Name='Alex Prugh', Season=2010, Statistic='Driving Distance', Variable='Driving Distance - (AVG.)', Value='295.7'),
 Row(Player Name='Alex Cejka', Season=2010, Statistic='Driving Distance', Variable='Driving Distance - (AVG.)', Value='277.4'),
 Row(Player Name='Alex Prugh', Season=2010, Statistic='Driving Distance', Variable='Driving Distance - (TOTAL DISTANCE)', Value='50,856'),
 Row(Player Name='Alex Cejka', Season=2010, Statistic='Driving Distance', Variable='Driving Distance - (TOTAL DISTANCE)', Value='43,834'),
 Row(Player Name='Alex Prugh', Season=2010, Statistic='Driving Distance', Variable='Driving Distance - (TOTAL DRIVES)', Value='172'),
 Row(Player Name='Alex Cejka', Season=2010, Statistic='Driving Distance', Vari

In [63]:
# startswith() compares literally: "^" is not treated as a regex anchor, so nothing matches.
tour.filter(tour["Player name"].startswith('^Al')).collect()

[]

#### substr(startPos, length)

In [64]:
# substr(1, 5): start position 1 is the first character, so this takes the first five characters.
(
    tour.limit(1000)
    .select(tour["Player name"].substr(1, 5).alias("First 5 letter"))
    .collect()
)

[Row(First 5 letter='Rober'),
 Row(First 5 letter='Bubba'),
 Row(First 5 letter='Dusti'),
 Row(First 5 letter='Brett'),
 Row(First 5 letter='J.B. '),
 Row(First 5 letter='John '),
 Row(First 5 letter='Graha'),
 Row(First 5 letter='Angel'),
 Row(First 5 letter='Charl'),
 Row(First 5 letter='D.J. '),
 Row(First 5 letter='Marti'),
 Row(First 5 letter='Rory '),
 Row(First 5 letter='Phil '),
 Row(First 5 letter='Aaron'),
 Row(First 5 letter='Jason'),
 Row(First 5 letter='Charl'),
 Row(First 5 letter='Davis'),
 Row(First 5 letter='Jeff '),
 Row(First 5 letter='Scott'),
 Row(First 5 letter='Marti'),
 Row(First 5 letter='Nick '),
 Row(First 5 letter='Troy '),
 Row(First 5 letter='Jarro'),
 Row(First 5 letter='Andre'),
 Row(First 5 letter='Danie'),
 Row(First 5 letter='Ryan '),
 Row(First 5 letter='Lucas'),
 Row(First 5 letter='Alex '),
 Row(First 5 letter='Garth'),
 Row(First 5 letter='John '),
 Row(First 5 letter='Marc '),
 Row(First 5 letter='Camer'),
 Row(First 5 letter='Adam '),
 Row(First

#### when(condition, value)

In [65]:
# Chained when() calls plus otherwise() become one CASE WHEN ... ELSE ... END expression.
students.select(
    students["parental level of education"],
    when(students["math score"] > 90, 1)
    .when(students["math score"] < 50, -1)
    .otherwise(0),
).show()

+---------------------------+----------------------------------------------------------------------------+
|parental level of education|CASE WHEN (math score > 90) THEN 1 WHEN (math score < 50) THEN -1 ELSE 0 END|
+---------------------------+----------------------------------------------------------------------------+
|          bachelor's degree|                                                                           0|
|               some college|                                                                           0|
|            master's degree|                                                                           0|
|         associate's degree|                                                                          -1|
|               some college|                                                                           0|
|         associate's degree|                                                                           0|
|               some college|        

## Row

In [66]:
# A Row built with keyword arguments carries named fields.
row = Row(name="Alice", age=11)
row

Row(name='Alice', age=11)

In [67]:
# Fields can be read by key.
row['name'], row['age']

('Alice', 11)

In [68]:
# Fields can also be read as attributes.
row.name, row.age

('Alice', 11)

In [69]:
# "in" is checked against the Row's field names.
'name' in row

True

In [70]:
# A name that is not one of the fields gives False.
'wrong_key' in row

False

In [71]:
# Row called with bare strings defines a reusable row template with those field names.
Person = Row("name", "age")
Person

<Row('name', 'age')>

In [72]:
# Calling the template with values produces a Row with the named fields filled in.
Person("Alice", 11)

Row(name='Alice', age=11)

##### TODO: needs review

In [73]:
# Compare a positional Row with a keyword Row that holds the same values.
row1 = Row("Alice", 11)
row2 = Row(name="Alice", age=11)
row1 == row2
# The documentation reports False here, but this run returns True.
# NOTE(review): presumably equality compares only the values, not the field names — confirm.

True

In [74]:
# Different field names (a, b) with the same values still compare equal to row1.
row3 = Row(a="Alice", b=11)
row1 == row3

True

#### asDict(recursive=False)

In [75]:
# asDict() converts the first tour row into a dict keyed by column name.
tour.first().asDict()

{'Player Name': 'Robert Garrigus',
 'Season': 2010,
 'Statistic': 'Driving Distance',
 'Variable': 'Driving Distance - (ROUNDS)',
 'Value': '71'}

#### count(value)

In [76]:
# count(value) returns how many of the row's values equal 2010.
tour.first().count(2010)

1

#### index(value[, start[, stop]])
##### TODO: needs review

In [78]:
# Row.index(value) searches the row's *values* (Row behaves like a tuple), so
# passing a field name such as "name" raises ValueError unless that string also
# happens to be one of the values.
row = Row(name="John", age=30, city="New York")

# Position of each value inside the row.
name_index = row.index("John")
age_index = row.index(30)
city_index = row.index("New York")

# To locate a field by its *name*, search the keys of the dict view instead.
age_field_position = list(row.asDict()).index("age")

name_index, age_index, city_index, age_field_position

#### drop(how="any", thresh=None, subset=None)
##### TODO: needs review

In [79]:
# Baseline row count, for comparison with the dropna() count in the next cell.
tour.count()

2740403

In [80]:
# dropna() is called on the DataFrame; the count is what remains after rows with nulls are removed.
tour.dropna().count()

                                                                                

2696905

In [81]:
row = Row(name="Alice", age=25, city="New York")
# Left disabled: the drop(how, thresh, subset) signature in the heading is the one
# used on DataFrames (see tour.dropna() in the previous cell).
# NOTE(review): Row does not appear to offer a drop() method — confirm before enabling.
# row.drop("age")

#### fill(value, subset=None)

##### Replace null values, alias for na.fill(). DataFrame.fillna() and DataFrameNaFunctions.fill() are aliases of each other.

In [82]:
# fill() is reached through DataFrame.na (or DataFrame.fillna); the Row returned
# by first() has no fill(), which is why the earlier draft of this cell was disabled.
data = [("John", None), ("Jane", 25), ("Mike", None)]
columns = ["name", "age"]
df = spark.createDataFrame(data, columns)

# Replace nulls with 0 in the numeric age column only.
df_filled = df.na.fill(0, subset=["age"])
df_filled.show()

In [83]:
# Alternative input kept for reference: only the rows whose Value is null.
#df = tour.filter(col("Value").isNull())
# NOTE(review): Value is a string column compared with a number here, and rows with a
# null Value do not pass this filter, so the frame has no nulls left in Value to fill.
df = tour.filter(tour.Value < 2)

# Replace nulls with 0. NOTE(review): a numeric fill value presumably leaves string
# columns such as Value untouched — confirm against the na.fill documentation.
df.na.fill(value=0).show()

# Restricting the fill to chosen columns uses subset=[...]; "population" below is a
# leftover example name — tour has no such column.
#df.na.fill(value=0,subset=["population"]).show()

+----------------+------+--------------------+--------------------+-----+
|     Player Name|Season|           Statistic|            Variable|Value|
+----------------+------+--------------------+--------------------+-----+
|     John Senden|  2010|Greens in Regulat...|Greens in Regulat...|-0.23|
|Kevin Sutherland|  2010|Greens in Regulat...|Greens in Regulat...|-0.24|
|      Joe Durant|  2010|Greens in Regulat...|Greens in Regulat...|-0.25|
|  Charles Warren|  2010|Greens in Regulat...|Greens in Regulat...|-0.25|
|   Troy Matteson|  2010|Greens in Regulat...|Greens in Regulat...|-0.22|
|   Graham DeLaet|  2010|Greens in Regulat...|Greens in Regulat...|-0.25|
|     D.J. Trahan|  2010|Greens in Regulat...|Greens in Regulat...|-0.26|
|     Kris Blanks|  2010|Greens in Regulat...|Greens in Regulat...|-0.20|
| Paul Stankowski|  2010|Greens in Regulat...|Greens in Regulat...|-0.28|
|      Aron Price|  2010|Greens in Regulat...|Greens in Regulat...|-0.25|
|   Cameron Percy|  2010|Greens in Reg

In [84]:
# Fill nulls with an empty string; show(5, False) prints five rows without truncation.
df.na.fill("").show(5, False)

+----------------+------+-------------------------------+------------------------------------------------+-----+
|Player Name     |Season|Statistic                      |Variable                                        |Value|
+----------------+------+-------------------------------+------------------------------------------------+-----+
|John Senden     |2010  |Greens in Regulation Percentage|Greens in Regulation Percentage - (RELATIVE/PAR)|-0.23|
|Kevin Sutherland|2010  |Greens in Regulation Percentage|Greens in Regulation Percentage - (RELATIVE/PAR)|-0.24|
|Joe Durant      |2010  |Greens in Regulation Percentage|Greens in Regulation Percentage - (RELATIVE/PAR)|-0.25|
|Charles Warren  |2010  |Greens in Regulation Percentage|Greens in Regulation Percentage - (RELATIVE/PAR)|-0.25|
|Troy Matteson   |2010  |Greens in Regulation Percentage|Greens in Regulation Percentage - (RELATIVE/PAR)|-0.22|
+----------------+------+-------------------------------+---------------------------------------

#### replace(to_replace, value=<no value>, subset=None)
##### TODO: needs review — this method exists for DataFrame, but not for Row
Returns a new DataFrame replacing a value with another value. DataFrame.replace() and DataFrameNaFunctions.replace() are aliases of each other. Values to_replace and value must have the same type and can only be numerics, booleans, or strings. Value can have None. When replacing, the new value will be cast to the type of the existing column. For numeric replacements all values to be replaced should have unique floating point representation. In case of conflicts (for example with {42: -1, 42.0: 1}) an arbitrary replacement will be used.

Parameters
to_replace – bool, int, long, float, string, list or dict. Value to be replaced. If the value is a dict, then value is ignored or can be omitted, and to_replace must be a mapping between a value and a replacement.

value – bool, int, long, float, string, list or None. The replacement value must be a bool, int, long, float, string or None. If value is a list, value should be of the same length and type as to_replace. If value is a scalar and to_replace is a sequence, then value is used as a replacement for each item in to_replace.

subset – optional list of column names to consider. Columns specified in subset that do not have matching data type are ignored. For example, if value is a string, and subset contains a non-string column, then the non-string column is simply ignored.