## PySpark DataFrames

In [1]:
import pyspark
from pyspark.sql import SparkSession
import pandas

In [2]:
# Create (or reuse) the Spark session used by all DataFrame examples below.
spark = (
    SparkSession.builder
    .appName("PySpark Testing")
    .getOrCreate()
)

In [3]:
# Display the session object created in the previous cell.
spark

In [4]:
# Read the CSV with default reader options. The repr below shows the result:
# every column is typed as string and auto-named _c0.._c5.
df_pyspark = spark.read.csv(path="sample_data.csv")
df_pyspark

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string]

In [5]:
# Preview 10 rows. Because no header option was given, the CSV header line
# appears as the first data row.
df_pyspark.show(n=10)

+---+----------+---------+--------------------+-----------+----------+
|_c0|       _c1|      _c2|                 _c3|        _c4|       _c5|
+---+----------+---------+--------------------+-----------+----------+
| id|first_name|last_name|               email|     gender|     phone|
|  1|    Bidget| Mirfield|bmirfield0@scient...|     Female|5628618353|
|  2|   Gonzalo|    Vango|    gvango1@ning.com|       Male|9556535457|
|  3|      Rock| Pampling|rpampling2@guardi...|   Bigender|4472741337|
|  4|   Dorella|  Edelman|dedelman3@histats...|     Female|4303062344|
|  5|     Faber|  Thwaite|fthwaite4@google....|Genderqueer|1348658809|
|  6|     Debee| Philcott|dphilcott5@cafepr...|     Female|7906881842|
|  7| Guendolen|  Methuen|gmethuen6@yellowp...|     Female|7934481697|
|  8|    Fredra|    Dowty|    fdowty7@fema.gov|     Female|6813510934|
|  9|    Onfroi|    Landy|  olandy8@cdbaby.com|       Male|7723271304|
+---+----------+---------+--------------------+-----------+----------+
only showing top 10 rows

In [6]:
# Print the schema of the header-less read (all columns are strings).
df_pyspark.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)



In [7]:
# Re-read the same file, this time using the first line as column names and
# letting Spark infer the column types.
df_pyspark = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("sample_data.csv")
)

In [8]:
# Preview the first 5 rows of the re-read DataFrame.
df_pyspark.show(n=5)

+---+----------+---------+--------------------+-----------+----------+
| id|first_name|last_name|               email|     gender|     phone|
+---+----------+---------+--------------------+-----------+----------+
|  1|    Bidget| Mirfield|bmirfield0@scient...|     Female|5628618353|
|  2|   Gonzalo|    Vango|    gvango1@ning.com|       Male|9556535457|
|  3|      Rock| Pampling|rpampling2@guardi...|   Bigender|4472741337|
|  4|   Dorella|  Edelman|dedelman3@histats...|     Female|4303062344|
|  5|     Faber|  Thwaite|fthwaite4@google....|Genderqueer|1348658809|
+---+----------+---------+--------------------+-----------+----------+
only showing top 5 rows



In [9]:
# Print the schema after re-reading with header=True and inferSchema=True.
df_pyspark.printSchema()

root
 |-- id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- phone: long (nullable = true)



In [10]:
# Show the Python type of the object returned by spark.read.csv.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [11]:
# List the column names as a plain Python list.
df_pyspark.columns

['id', 'first_name', 'last_name', 'email', 'gender', 'phone']

In [12]:
# Fetch the first row as a single Row object.
df_pyspark.first()

Row(id=1, first_name='Bidget', last_name='Mirfield', email='bmirfield0@scientificamerican.com', gender='Female', phone=5628618353)

In [13]:
# Fetch the first 3 rows as a list of Row objects.
df_pyspark.take(3)

[Row(id=1, first_name='Bidget', last_name='Mirfield', email='bmirfield0@scientificamerican.com', gender='Female', phone=5628618353),
 Row(id=2, first_name='Gonzalo', last_name='Vango', email='gvango1@ning.com', gender='Male', phone=9556535457),
 Row(id=3, first_name='Rock', last_name='Pampling', email='rpampling2@guardian.co.uk', gender='Bigender', phone=4472741337)]

In [14]:
# Project a single column and preview 10 rows.
df_pyspark.select(["first_name"]).show(n=10)

+----------+
|first_name|
+----------+
|    Bidget|
|   Gonzalo|
|      Rock|
|   Dorella|
|     Faber|
|     Debee|
| Guendolen|
|    Fredra|
|    Onfroi|
|     Karyn|
+----------+
only showing top 10 rows



In [15]:
# Project two columns by name and preview 10 rows.
df_pyspark.select("first_name", "phone").show(n=10)

+----------+----------+
|first_name|     phone|
+----------+----------+
|    Bidget|5628618353|
|   Gonzalo|9556535457|
|      Rock|4472741337|
|   Dorella|4303062344|
|     Faber|1348658809|
|     Debee|7906881842|
| Guendolen|7934481697|
|    Fredra|6813510934|
|    Onfroi|7723271304|
|     Karyn|7855476206|
+----------+----------+
only showing top 10 rows



In [16]:
# Project columns by position: index 0 and 1 resolve to `id` and
# `first_name` in the output below.
df_pyspark.select([df_pyspark[0], df_pyspark[1]]).show(n=10)

+---+----------+
| id|first_name|
+---+----------+
|  1|    Bidget|
|  2|   Gonzalo|
|  3|      Rock|
|  4|   Dorella|
|  5|     Faber|
|  6|     Debee|
|  7| Guendolen|
|  8|    Fredra|
|  9|    Onfroi|
| 10|     Karyn|
+---+----------+
only showing top 10 rows



In [17]:
# Display the frame with a derived column `id_addition` = id * 10 + id.
# The result is only shown, not assigned, so df_pyspark is left unchanged.
df_pyspark.withColumn(
    "id_addition",
    df_pyspark[0] * 10 + df_pyspark[0],
).show(n=10)

+---+----------+---------+--------------------+-----------+----------+-----------+
| id|first_name|last_name|               email|     gender|     phone|id_addition|
+---+----------+---------+--------------------+-----------+----------+-----------+
|  1|    Bidget| Mirfield|bmirfield0@scient...|     Female|5628618353|         11|
|  2|   Gonzalo|    Vango|    gvango1@ning.com|       Male|9556535457|         22|
|  3|      Rock| Pampling|rpampling2@guardi...|   Bigender|4472741337|         33|
|  4|   Dorella|  Edelman|dedelman3@histats...|     Female|4303062344|         44|
|  5|     Faber|  Thwaite|fthwaite4@google....|Genderqueer|1348658809|         55|
|  6|     Debee| Philcott|dphilcott5@cafepr...|     Female|7906881842|         66|
|  7| Guendolen|  Methuen|gmethuen6@yellowp...|     Female|7934481697|         77|
|  8|    Fredra|    Dowty|    fdowty7@fema.gov|     Female|6813510934|         88|
|  9|    Onfroi|    Landy|  olandy8@cdbaby.com|       Male|7723271304|         99|
| 10

In [18]:
# Preview df_pyspark again; the output has no `id_addition` column.
df_pyspark.show(n=10)

+---+----------+---------+--------------------+-----------+----------+
| id|first_name|last_name|               email|     gender|     phone|
+---+----------+---------+--------------------+-----------+----------+
|  1|    Bidget| Mirfield|bmirfield0@scient...|     Female|5628618353|
|  2|   Gonzalo|    Vango|    gvango1@ning.com|       Male|9556535457|
|  3|      Rock| Pampling|rpampling2@guardi...|   Bigender|4472741337|
|  4|   Dorella|  Edelman|dedelman3@histats...|     Female|4303062344|
|  5|     Faber|  Thwaite|fthwaite4@google....|Genderqueer|1348658809|
|  6|     Debee| Philcott|dphilcott5@cafepr...|     Female|7906881842|
|  7| Guendolen|  Methuen|gmethuen6@yellowp...|     Female|7934481697|
|  8|    Fredra|    Dowty|    fdowty7@fema.gov|     Female|6813510934|
|  9|    Onfroi|    Landy|  olandy8@cdbaby.com|       Male|7723271304|
| 10|     Karyn|     Dash|  kdash9@smugmug.com|     Female|7855476206|
+---+----------+---------+--------------------+-----------+----------+
only showing top 10 rows

In [19]:
# Display the frame with `gender` renamed to `sex`. The renamed frame is not
# assigned, so df_pyspark keeps its original column names.
df_pyspark.withColumnRenamed(existing="gender", new="sex").show(n=10)

+---+----------+---------+--------------------+-----------+----------+
| id|first_name|last_name|               email|        sex|     phone|
+---+----------+---------+--------------------+-----------+----------+
|  1|    Bidget| Mirfield|bmirfield0@scient...|     Female|5628618353|
|  2|   Gonzalo|    Vango|    gvango1@ning.com|       Male|9556535457|
|  3|      Rock| Pampling|rpampling2@guardi...|   Bigender|4472741337|
|  4|   Dorella|  Edelman|dedelman3@histats...|     Female|4303062344|
|  5|     Faber|  Thwaite|fthwaite4@google....|Genderqueer|1348658809|
|  6|     Debee| Philcott|dphilcott5@cafepr...|     Female|7906881842|
|  7| Guendolen|  Methuen|gmethuen6@yellowp...|     Female|7934481697|
|  8|    Fredra|    Dowty|    fdowty7@fema.gov|     Female|6813510934|
|  9|    Onfroi|    Landy|  olandy8@cdbaby.com|       Male|7723271304|
| 10|     Karyn|     Dash|  kdash9@smugmug.com|     Female|7855476206|
+---+----------+---------+--------------------+-----------+----------+
only showing top 10 rows

In [20]:
# Display the frame with `phone` renamed to `ph`.
# The former leading withColumnRenamed("sex", "gender") call was removed:
# df_pyspark has no `sex` column (the rename in the previous cell was never
# assigned), and renaming a column that does not exist is a silent no-op.
df_pyspark.withColumnRenamed("phone", "ph").show(10)

+---+----------+---------+--------------------+-----------+----------+
| id|first_name|last_name|               email|     gender|        ph|
+---+----------+---------+--------------------+-----------+----------+
|  1|    Bidget| Mirfield|bmirfield0@scient...|     Female|5628618353|
|  2|   Gonzalo|    Vango|    gvango1@ning.com|       Male|9556535457|
|  3|      Rock| Pampling|rpampling2@guardi...|   Bigender|4472741337|
|  4|   Dorella|  Edelman|dedelman3@histats...|     Female|4303062344|
|  5|     Faber|  Thwaite|fthwaite4@google....|Genderqueer|1348658809|
|  6|     Debee| Philcott|dphilcott5@cafepr...|     Female|7906881842|
|  7| Guendolen|  Methuen|gmethuen6@yellowp...|     Female|7934481697|
|  8|    Fredra|    Dowty|    fdowty7@fema.gov|     Female|6813510934|
|  9|    Onfroi|    Landy|  olandy8@cdbaby.com|       Male|7723271304|
| 10|     Karyn|     Dash|  kdash9@smugmug.com|     Female|7855476206|
+---+----------+---------+--------------------+-----------+----------+
only showing top 10 rows

In [21]:
# Switch df_pyspark to the data set that contains missing values, reading the
# header line as column names and inferring column types.
df_pyspark = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("sample_data_null.csv")
)
df_pyspark.show(n=10)

+----+----------+---------+--------------------+------+------+
|  id|first_name|last_name|               email|gender|number|
+----+----------+---------+--------------------+------+------+
|   1|  Robinett|  Sitwell|rsitwell0@harvard...|Female|385.61|
|null|      null|     null|                null|  null|  null|
|   3|      Jobi| Hallgath|jhallgath2@geocit...|Female|521.61|
|   4|  Anallise| Lathwell|alathwell3@nifty.com|Female|715.49|
|   5|      null|Sacchetti|dsacchetti4@multi...|Female|395.41|
|   6|     Cindy|Danzelman|cdanzelman5@about.me|Female| 463.9|
|   7|      null|   Gobeau|ggobeau6@feedburn...|Female|298.26|
|   8|      null|   Ziehms|fziehms7@behance.net|  Male| 216.7|
|   9| Josephine|   Markie|jmarkie8@clickban...|Female|956.08|
|  10|  Madelina|  Almeida| malmeida9@fotki.com|Female|547.79|
+----+----------+---------+--------------------+------+------+
only showing top 10 rows



In [22]:
# Drop every row that contains at least one null (the default behaviour).
df_pyspark.dropna().show(n=10)

+---+----------+---------+--------------------+----------+------+
| id|first_name|last_name|               email|    gender|number|
+---+----------+---------+--------------------+----------+------+
|  1|  Robinett|  Sitwell|rsitwell0@harvard...|    Female|385.61|
|  3|      Jobi| Hallgath|jhallgath2@geocit...|    Female|521.61|
|  4|  Anallise| Lathwell|alathwell3@nifty.com|    Female|715.49|
|  6|     Cindy|Danzelman|cdanzelman5@about.me|    Female| 463.9|
|  9| Josephine|   Markie|jmarkie8@clickban...|    Female|956.08|
| 10|  Madelina|  Almeida| malmeida9@fotki.com|    Female|547.79|
| 11|   Garrett|   Burgon|gburgona@biblegat...|      Male| 48.41|
| 12|      Bili|     Gear|bgearb@tuttocitta.it|    Female|880.56|
| 14|     Xever|     Dils|xdilsd@sciencedir...|Polygender| 611.9|
| 15|  Kerianne|     Fant|  kfante@cornell.edu|  Bigender|953.36|
+---+----------+---------+--------------------+----------+------+
only showing top 10 rows



In [23]:
# Drop only the rows in which every column is null.
df_pyspark.dropna(how="all").show(n=10)

+---+----------+---------+--------------------+------+------+
| id|first_name|last_name|               email|gender|number|
+---+----------+---------+--------------------+------+------+
|  1|  Robinett|  Sitwell|rsitwell0@harvard...|Female|385.61|
|  3|      Jobi| Hallgath|jhallgath2@geocit...|Female|521.61|
|  4|  Anallise| Lathwell|alathwell3@nifty.com|Female|715.49|
|  5|      null|Sacchetti|dsacchetti4@multi...|Female|395.41|
|  6|     Cindy|Danzelman|cdanzelman5@about.me|Female| 463.9|
|  7|      null|   Gobeau|ggobeau6@feedburn...|Female|298.26|
|  8|      null|   Ziehms|fziehms7@behance.net|  Male| 216.7|
|  9| Josephine|   Markie|jmarkie8@clickban...|Female|956.08|
| 10|  Madelina|  Almeida| malmeida9@fotki.com|Female|547.79|
| 11|   Garrett|   Burgon|gburgona@biblegat...|  Male| 48.41|
+---+----------+---------+--------------------+------+------+
only showing top 10 rows



In [24]:
# Keep only rows that have at least 2 non-null values.
# `thresh` overrides `how`, so the previously passed how="any" had no effect
# and was removed to avoid suggesting that both conditions apply.
df_pyspark.na.drop(thresh=2).show(10)

+---+----------+---------+--------------------+------+------+
| id|first_name|last_name|               email|gender|number|
+---+----------+---------+--------------------+------+------+
|  1|  Robinett|  Sitwell|rsitwell0@harvard...|Female|385.61|
|  3|      Jobi| Hallgath|jhallgath2@geocit...|Female|521.61|
|  4|  Anallise| Lathwell|alathwell3@nifty.com|Female|715.49|
|  5|      null|Sacchetti|dsacchetti4@multi...|Female|395.41|
|  6|     Cindy|Danzelman|cdanzelman5@about.me|Female| 463.9|
|  7|      null|   Gobeau|ggobeau6@feedburn...|Female|298.26|
|  8|      null|   Ziehms|fziehms7@behance.net|  Male| 216.7|
|  9| Josephine|   Markie|jmarkie8@clickban...|Female|956.08|
| 10|  Madelina|  Almeida| malmeida9@fotki.com|Female|547.79|
| 11|   Garrett|   Burgon|gburgona@biblegat...|  Male| 48.41|
+---+----------+---------+--------------------+------+------+
only showing top 10 rows



In [25]:
# Drop rows whose `first_name` is null; nulls in other columns are kept
# (the output still contains a row with a null `gender`).
df_pyspark.dropna(how="any", subset=["first_name"]).show()

+---+----------+---------+--------------------+----------+------+
| id|first_name|last_name|               email|    gender|number|
+---+----------+---------+--------------------+----------+------+
|  1|  Robinett|  Sitwell|rsitwell0@harvard...|    Female|385.61|
|  3|      Jobi| Hallgath|jhallgath2@geocit...|    Female|521.61|
|  4|  Anallise| Lathwell|alathwell3@nifty.com|    Female|715.49|
|  6|     Cindy|Danzelman|cdanzelman5@about.me|    Female| 463.9|
|  9| Josephine|   Markie|jmarkie8@clickban...|    Female|956.08|
| 10|  Madelina|  Almeida| malmeida9@fotki.com|    Female|547.79|
| 11|   Garrett|   Burgon|gburgona@biblegat...|      Male| 48.41|
| 12|      Bili|     Gear|bgearb@tuttocitta.it|    Female|880.56|
| 13|   Creight|   Santos|    csantosc@com.com|      null| 746.5|
| 14|     Xever|     Dils|xdilsd@sciencedir...|Polygender| 611.9|
| 15|  Kerianne|     Fant|  kfante@cornell.edu|  Bigender|953.36|
| 17|  Britteny|     Caso|      bcasog@umn.edu|    Female|300.11|
| 18|     

In [26]:
# Replace nulls with the string "Missing". As the output shows, only the
# string columns are filled; `id` and `number` remain null.
df_pyspark.fillna("Missing").show()

+----+----------+---------+--------------------+----------+------+
|  id|first_name|last_name|               email|    gender|number|
+----+----------+---------+--------------------+----------+------+
|   1|  Robinett|  Sitwell|rsitwell0@harvard...|    Female|385.61|
|null|   Missing|  Missing|             Missing|   Missing|  null|
|   3|      Jobi| Hallgath|jhallgath2@geocit...|    Female|521.61|
|   4|  Anallise| Lathwell|alathwell3@nifty.com|    Female|715.49|
|   5|   Missing|Sacchetti|dsacchetti4@multi...|    Female|395.41|
|   6|     Cindy|Danzelman|cdanzelman5@about.me|    Female| 463.9|
|   7|   Missing|   Gobeau|ggobeau6@feedburn...|    Female|298.26|
|   8|   Missing|   Ziehms|fziehms7@behance.net|      Male| 216.7|
|   9| Josephine|   Markie|jmarkie8@clickban...|    Female|956.08|
|  10|  Madelina|  Almeida| malmeida9@fotki.com|    Female|547.79|
|  11|   Garrett|   Burgon|gburgona@biblegat...|      Male| 48.41|
|  12|      Bili|     Gear|bgearb@tuttocitta.it|    Female|880

In [27]:
# Replace nulls with 0. As the output shows, only the numeric columns
# (`id`, `number`) are filled; string columns remain null.
df_pyspark.fillna(0).show()

+---+----------+---------+--------------------+----------+------+
| id|first_name|last_name|               email|    gender|number|
+---+----------+---------+--------------------+----------+------+
|  1|  Robinett|  Sitwell|rsitwell0@harvard...|    Female|385.61|
|  0|      null|     null|                null|      null|   0.0|
|  3|      Jobi| Hallgath|jhallgath2@geocit...|    Female|521.61|
|  4|  Anallise| Lathwell|alathwell3@nifty.com|    Female|715.49|
|  5|      null|Sacchetti|dsacchetti4@multi...|    Female|395.41|
|  6|     Cindy|Danzelman|cdanzelman5@about.me|    Female| 463.9|
|  7|      null|   Gobeau|ggobeau6@feedburn...|    Female|298.26|
|  8|      null|   Ziehms|fziehms7@behance.net|      Male| 216.7|
|  9| Josephine|   Markie|jmarkie8@clickban...|    Female|956.08|
| 10|  Madelina|  Almeida| malmeida9@fotki.com|    Female|547.79|
| 11|   Garrett|   Burgon|gburgona@biblegat...|      Male| 48.41|
| 12|      Bili|     Gear|bgearb@tuttocitta.it|    Female|880.56|
| 13|   Cr

In [28]:
# Fill nulls with "Missing" in the `gender` and `first_name` columns only.
df_pyspark.fillna("Missing", subset=["gender", "first_name"]).show()

+----+----------+---------+--------------------+----------+------+
|  id|first_name|last_name|               email|    gender|number|
+----+----------+---------+--------------------+----------+------+
|   1|  Robinett|  Sitwell|rsitwell0@harvard...|    Female|385.61|
|null|   Missing|     null|                null|   Missing|  null|
|   3|      Jobi| Hallgath|jhallgath2@geocit...|    Female|521.61|
|   4|  Anallise| Lathwell|alathwell3@nifty.com|    Female|715.49|
|   5|   Missing|Sacchetti|dsacchetti4@multi...|    Female|395.41|
|   6|     Cindy|Danzelman|cdanzelman5@about.me|    Female| 463.9|
|   7|   Missing|   Gobeau|ggobeau6@feedburn...|    Female|298.26|
|   8|   Missing|   Ziehms|fziehms7@behance.net|      Male| 216.7|
|   9| Josephine|   Markie|jmarkie8@clickban...|    Female|956.08|
|  10|  Madelina|  Almeida| malmeida9@fotki.com|    Female|547.79|
|  11|   Garrett|   Burgon|gburgona@biblegat...|      Male| 48.41|
|  12|      Bili|     Gear|bgearb@tuttocitta.it|    Female|880

In [29]:
from pyspark.ml.feature import Imputer

In [30]:
# Configure a mean imputer for the numeric columns; each input column gets a
# companion output column named "<column>_imputed".
imputer = Imputer(
    strategy="mean",
    inputCols=["number", "id"],
    outputCols=["{}_imputed".format(name) for name in ["number", "id"]],
)

In [31]:
# Fit the imputer on df_pyspark and display the frame with the two
# `*_imputed` columns appended (20 rows is the default for show()).
imputer.fit(df_pyspark).transform(df_pyspark).show(n=20)

+----+----------+---------+--------------------+----------+------+------------------+----------+
|  id|first_name|last_name|               email|    gender|number|    number_imputed|id_imputed|
+----+----------+---------+--------------------+----------+------+------------------+----------+
|   1|  Robinett|  Sitwell|rsitwell0@harvard...|    Female|385.61|            385.61|         1|
|null|      null|     null|                null|      null|  null|510.22238238238236|       500|
|   3|      Jobi| Hallgath|jhallgath2@geocit...|    Female|521.61|            521.61|         3|
|   4|  Anallise| Lathwell|alathwell3@nifty.com|    Female|715.49|            715.49|         4|
|   5|      null|Sacchetti|dsacchetti4@multi...|    Female|395.41|            395.41|         5|
|   6|     Cindy|Danzelman|cdanzelman5@about.me|    Female| 463.9|             463.9|         6|
|   7|      null|   Gobeau|ggobeau6@feedburn...|    Female|298.26|            298.26|         7|
|   8|      null|   Ziehms|fzi

In [32]:
# Filter with a SQL expression string: rows whose `number` equals 715.49.
df_pyspark.where("number=715.49").show()

+---+----------+---------+--------------------+------+------+
| id|first_name|last_name|               email|gender|number|
+---+----------+---------+--------------------+------+------+
|  4|  Anallise| Lathwell|alathwell3@nifty.com|Female|715.49|
+---+----------+---------+--------------------+------+------+



In [33]:
# Rows with `number` above 100, projected to `number` and `gender`.
df_pyspark.where("number>100").select("number", "gender").show()

+------+----------+
|number|    gender|
+------+----------+
|385.61|    Female|
|521.61|    Female|
|715.49|    Female|
|395.41|    Female|
| 463.9|    Female|
|298.26|    Female|
| 216.7|      Male|
|956.08|    Female|
|547.79|    Female|
|880.56|    Female|
| 746.5|      null|
| 611.9|Polygender|
|953.36|  Bigender|
|351.43|    Female|
|300.11|    Female|
|429.17|      Male|
|899.69|      Male|
|769.84|      null|
|379.22|    Female|
|913.75|  Bigender|
+------+----------+
only showing top 20 rows



In [34]:
# Combine two column conditions with `&`: 100 < number < 300 (both bounds
# exclusive). Each comparison needs its own parentheses.
df_pyspark.where(
    (df_pyspark.number > 100) & (df_pyspark.number < 300)
).select("gender", "first_name", "number").show()

+------+----------+------+
|gender|first_name|number|
+------+----------+------+
|Female|      null|298.26|
|  Male|      null| 216.7|
|  Male|    Archie|115.08|
|Female|    Kaylil|269.24|
|  null|     Ailee| 161.2|
|Female|   Ardelia| 165.9|
|Female|     Mamie|175.94|
|  Male|   Mallory|142.34|
|Female|  Elsinore|239.74|
|  null| Stanleigh|271.15|
|  Male|    Irvine|245.14|
|  null|     Bride|299.96|
|  Male|  Oliviero| 270.4|
|Female|      null|134.28|
|  Male|     Olvan|173.98|
|  Male|   Olivier|200.81|
|  Male|     Berne| 157.5|
|  Male|     Rorke|149.21|
|  Male|        Eb|192.87|
|  null|     Tansy|236.36|
+------+----------+------+
only showing top 20 rows



In [35]:
# Negate a column condition with `~`: keep rows where NOT (number < 100).
df_pyspark.where(~(df_pyspark.number < 100)).select(["number"]).show()

+------+
|number|
+------+
|385.61|
|521.61|
|715.49|
|395.41|
| 463.9|
|298.26|
| 216.7|
|956.08|
|547.79|
|880.56|
| 746.5|
| 611.9|
|953.36|
|351.43|
|300.11|
|429.17|
|899.69|
|769.84|
|379.22|
|913.75|
+------+
only showing top 20 rows



In [36]:
# Sum every numeric column per `gender` group (output: sum(id), sum(number)).
df_pyspark.groupby("gender").sum().show()

+-----------+-------+------------------+
|     gender|sum(id)|       sum(number)|
+-----------+-------+------------------+
|Genderqueer|   4843|            4375.3|
|       null|  76350| 83842.23000000001|
|    Agender|   9425|12985.250000000002|
|     Female| 199638|192442.45000000013|
| Polygender|   4749|3816.9700000000003|
|   Bigender|   7548|           9635.94|
| Non-binary|   8521|           8834.28|
|       Male| 182607|188121.86999999997|
|Genderfluid|   6817| 5657.869999999999|
+-----------+-------+------------------+



In [37]:
# Count rows per `gender` group; null appears as a group of its own.
df_pyspark.groupby("gender").count().show()

+-----------+-----+
|     gender|count|
+-----------+-----+
|Genderqueer|    9|
|       null|  153|
|    Agender|   20|
|     Female|  390|
| Polygender|   10|
|   Bigender|   15|
| Non-binary|   17|
|       Male|  371|
|Genderfluid|   15|
+-----------+-----+



In [38]:
# Print up to 1000 rows of df_pyspark.
# NOTE(review): this prints a very large table; a smaller n is usually enough.
df_pyspark.show(n=1000)

+----+--------------+----------------+--------------------+-----------+------+
|  id|    first_name|       last_name|               email|     gender|number|
+----+--------------+----------------+--------------------+-----------+------+
|   1|      Robinett|         Sitwell|rsitwell0@harvard...|     Female|385.61|
|null|          null|            null|                null|       null|  null|
|   3|          Jobi|        Hallgath|jhallgath2@geocit...|     Female|521.61|
|   4|      Anallise|        Lathwell|alathwell3@nifty.com|     Female|715.49|
|   5|          null|       Sacchetti|dsacchetti4@multi...|     Female|395.41|
|   6|         Cindy|       Danzelman|cdanzelman5@about.me|     Female| 463.9|
|   7|          null|          Gobeau|ggobeau6@feedburn...|     Female|298.26|
|   8|          null|          Ziehms|fziehms7@behance.net|       Male| 216.7|
|   9|     Josephine|          Markie|jmarkie8@clickban...|     Female|956.08|
|  10|      Madelina|         Almeida| malmeida9@fot

In [39]:
# Maximum `number` per `gender` group.
# Passing the column name to max() aggregates only that column, instead of
# aggregating every numeric column and then selecting the generated
# "max(number)" column by its string name. The result has the same two
# columns: gender and max(number).
df_pyspark.groupBy("gender").max("number").show()

+-----------+-----------+
|     gender|max(number)|
+-----------+-----------+
|Genderqueer|      961.9|
|       null|     998.84|
|    Agender|      991.2|
|     Female|     998.28|
| Polygender|     805.98|
|   Bigender|     986.85|
| Non-binary|     923.82|
|       Male|     998.14|
|Genderfluid|     935.91|
+-----------+-----------+



In [40]:
# Maximum of every numeric column per `gender` group.
df_pyspark.groupby("gender").max().show()

+-----------+-------+-----------+
|     gender|max(id)|max(number)|
+-----------+-------+-----------+
|Genderqueer|    820|      961.9|
|       null|    998|     998.84|
|    Agender|    841|      991.2|
|     Female|   1000|     998.28|
| Polygender|    922|     805.98|
|   Bigender|    988|     986.85|
| Non-binary|    958|     923.82|
|       Male|    999|     998.14|
|Genderfluid|    965|     935.91|
+-----------+-------+-----------+



In [41]:
# Average of `number` over the whole DataFrame (a single, ungrouped
# aggregate).
df_pyspark.groupBy().agg({"number": "average"}).show()

+------------------+
|       avg(number)|
+------------------+
|510.22238238238236|
+------------------+



In [42]:
# Per `first_name`: average of `number` and maximum `id`, expressed as a
# {column: aggregate-function} mapping.
df_pyspark.groupby("first_name").agg(
    {"number": "average", "id": "max"}
).show()

+----------+------------------+-------+
|first_name|       avg(number)|max(id)|
+----------+------------------+-------+
|  Donielle|            565.73|    321|
|   Ethelda|            158.49|    666|
|    Marney|370.17499999999995|    757|
|  Angelina|             393.4|    267|
|       Rod|            717.74|    287|
|   Shannon|            369.24|    347|
| Silvester|            130.83|    992|
|      Dori|            717.22|    110|
|  Napoleon|            184.95|    514|
|      Ewan|            829.67|     66|
|    Elston|            557.87|    157|
|        Em|            440.16|    174|
|     Ruben|            760.61|    314|
|     Bride|            299.96|     68|
|    Thelma|            971.03|    170|
|     Sande|            480.47|    370|
|      Kata|            307.81|    939|
|    Elyssa|            247.28|    178|
|     Lyman|            142.54|    263|
|     Rocky|            166.35|    444|
+----------+------------------+-------+
only showing top 20 rows



In [43]:
# Shut down the Spark session used by the DataFrame examples above.
spark.stop()

## PySpark RDDs

In [44]:
from pyspark import SparkContext
import findspark

In [45]:
# findspark is imported in the previous cell; findspark.init() has to be
# called before the Spark session / context is created.
# NOTE(review): the previous cell already imports pyspark before this call - confirm the intended order.

# Note:
# On Windows there appear to be additional dependencies. The root cause of the original failure was never identified, but it is fixed now. TODO: document in detail how the findspark package resolves the problem.

findspark.init()

In [46]:
# Create a SparkContext with master "local" and application name "Test App"
# for the RDD examples.
sc = SparkContext(master="local", appName="Test App")

In [47]:
# Local input data: a plain Python range of the integers 0..999. Despite its
# name this is not an RDD yet; it is parallelized in a later cell.
rdd = range(0, 1000)

In [48]:
# Display the object; the repr confirms it is a plain Python range.
rdd

range(0, 1000)

In [8]:
# Turn the local range into an RDD.
data = sc.parallelize(rdd)
# Count the elements of the RDD (1000 in the output below).
data.count()

1000

In [9]:
# Return the first 4 elements of the RDD as a Python list.
data.take(num=4)

[0, 1, 2, 3]

In [10]:
# Load the CSV as an RDD of raw text lines (one string per line, unparsed).
rdd2 = sc.textFile(name="sample_data.csv")

In [15]:
# Return every line of the file as a Python list.
# NOTE(review): this returns the whole data set at once; take(n) gives a
# bounded preview instead.
rdd2.collect()

['id,first_name,last_name,email,gender,phone',
 '1,Bidget,Mirfield,bmirfield0@scientificamerican.com,Female,5628618353',
 '2,Gonzalo,Vango,gvango1@ning.com,Male,9556535457',
 '3,Rock,Pampling,rpampling2@guardian.co.uk,Bigender,4472741337',
 '4,Dorella,Edelman,dedelman3@histats.com,Female,4303062344',
 '5,Faber,Thwaite,fthwaite4@google.co.jp,Genderqueer,1348658809',
 '6,Debee,Philcott,dphilcott5@cafepress.com,Female,7906881842',
 '7,Guendolen,Methuen,gmethuen6@yellowpages.com,Female,7934481697',
 '8,Fredra,Dowty,fdowty7@fema.gov,Female,6813510934',
 '9,Onfroi,Landy,olandy8@cdbaby.com,Male,7723271304',
 '10,Karyn,Dash,kdash9@smugmug.com,Female,7855476206',
 '11,Klement,Borthwick,kborthwicka@yolasite.com,Male,4087774252',
 '12,Donnie,Calbaithe,dcalbaitheb@deviantart.com,Male,4816763974',
 '13,Hesther,Sowman,hsowmanc@com.com,Female,4331563480',
 '14,Woodman,Meddick,wmeddickd@gmpg.org,Male,3558800983',
 '15,Johannes,Bott,jbotte@privacy.gov.au,Male,2761167480',
 '16,Kristi,Franciottoi,kfranc

In [16]:
# Preview the first 5 lines; the header line is included as ordinary data.
rdd2.take(num=5)

['id,first_name,last_name,email,gender,phone',
 '1,Bidget,Mirfield,bmirfield0@scientificamerican.com,Female,5628618353',
 '2,Gonzalo,Vango,gvango1@ning.com,Male,9556535457',
 '3,Rock,Pampling,rpampling2@guardian.co.uk,Bigender,4472741337',
 '4,Dorella,Edelman,dedelman3@histats.com,Female,4303062344']

In [17]:
# Count the lines in the file; the result includes the header line.
rdd2.count()

1001

In [25]:
# map: one output element per input line, here the list of comma-separated
# fields of that line.
rdd2.map(lambda row: row.split(",")).take(num=3)

[['id', 'first_name', 'last_name', 'email', 'gender', 'phone'],
 ['1',
  'Bidget',
  'Mirfield',
  'bmirfield0@scientificamerican.com',
  'Female',
  '5628618353'],
 ['2', 'Gonzalo', 'Vango', 'gvango1@ning.com', 'Male', '9556535457']]

In [28]:
# flatMap: the per-line field lists are flattened into a single stream of
# field strings, so take(3) yields the first three fields of the first line.
rdd2.flatMap(lambda row: row.split(",")).take(num=3)

['id', 'first_name', 'last_name']

In [35]:
# Keep lines that contain "female" anywhere in the line, compared
# case-insensitively (the match is on the raw line, not on a parsed column).
rdd2.filter(lambda row: "female" in row.lower()).take(num=3)

['1,Bidget,Mirfield,bmirfield0@scientificamerican.com,Female,5628618353',
 '4,Dorella,Edelman,dedelman3@histats.com,Female,4303062344',
 '6,Debee,Philcott,dphilcott5@cafepress.com,Female,7906881842']

In [40]:
# Load the same CSV again as a separate RDD of text lines for sampling.
rdd3 = sc.textFile(name="sample_data.csv")

In [49]:
# Sample with replacement at fraction 0.1 using a fixed seed, then preview
# three of the sampled lines.
rdd3.sample(withReplacement=True, fraction=0.1, seed=1234).take(num=3)

['10,Karyn,Dash,kdash9@smugmug.com,Female,7855476206',
 '22,Brooke,Stepney,bstepneyl@scribd.com,Female,8923236379',
 '35,Corissa,De Dei,cdedeiy@google.com,Genderqueer,9001494155']

In [51]:
# union keeps all elements of both RDDs, duplicates included (1 appears
# twice in the output).
list1 = [1, 2, 3]
rdd_a = sc.parallelize(list1)

list2 = [1, 5, 6]
rdd_b = sc.parallelize(list2)

rdd_a.union(rdd_b).collect()

[1, 2, 3, 1, 5, 6]

In [54]:
# join matches (key, value) pairs on their key and yields
# (key, (left_value, right_value)) tuples.
list1 = [("a", 1), ("b", 2)]
rdd_a = sc.parallelize(list1)

list2 = [("a", 3), ("b", 4)]
rdd_b = sc.parallelize(list2)

rdd_a.join(rdd_b).collect()

[('b', (2, 4)), ('a', (1, 3))]