In [1]:
from pyspark.sql import SparkSession
spark = SparkSession\
  .builder\
  .appName('SGD_Chapter05')\
  .getOrCreate()

In [2]:
df = spark.read.format('json')\
  .load('/home/jagadeesh/git/Spark-The-Definitive-Guide/data/flight-data/json/2015-summary.json')

In [3]:
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [4]:
df.schema

StructType(List(StructField(DEST_COUNTRY_NAME,StringType,true),StructField(ORIGIN_COUNTRY_NAME,StringType,true),StructField(count,LongType,true)))

In [5]:
from pyspark.sql.types import StructField, StructType, StringType, LongType

In [6]:
myManualSchema = StructType([
  StructField("DEST_COUNTRY_NAME", StringType(), True),
  StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
  StructField("count", LongType(), False, metadata={"hello": "world"})
])

In [7]:
df = spark.read.format("json").schema(myManualSchema)\
  .load("/home/jagadeesh/git/Spark-The-Definitive-Guide/data/flight-data/json/2010-summary.json")

In [8]:
df.schema

StructType(List(StructField(DEST_COUNTRY_NAME,StringType,true),StructField(ORIGIN_COUNTRY_NAME,StringType,true),StructField(count,LongType,true)))

In [9]:
df.printSchema

<bound method DataFrame.printSchema of DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]>

In [10]:
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [11]:
from pyspark.sql.functions import col, column
col('DEST_COUNTRY_NAME')

Column<b'DEST_COUNTRY_NAME'>

In [12]:
column('ORIGIN_COUNTRY_NAME')

Column<b'ORIGIN_COUNTRY_NAME'>

In [16]:
df.columns

['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']

In [17]:
from pyspark.sql.functions import expr

In [18]:
expr('count') - 5

Column<b'(count - 5)'>

In [19]:
df.take(5)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=264),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='India', count=69),
 Row(DEST_COUNTRY_NAME='Egypt', ORIGIN_COUNTRY_NAME='United States', count=24),
 Row(DEST_COUNTRY_NAME='Equatorial Guinea', ORIGIN_COUNTRY_NAME='United States', count=1)]

In [20]:
expr('count + 5')

Column<b'(count + 5)'>

In [21]:
df.take(5)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=264),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='India', count=69),
 Row(DEST_COUNTRY_NAME='Egypt', ORIGIN_COUNTRY_NAME='United States', count=24),
 Row(DEST_COUNTRY_NAME='Equatorial Guinea', ORIGIN_COUNTRY_NAME='United States', count=1)]

In [26]:
df['count']

Column<b'count'>

In [30]:
df.select(expr('count - 1')).take(5)

[Row((count - 1)=0),
 Row((count - 1)=263),
 Row((count - 1)=68),
 Row((count - 1)=23),
 Row((count - 1)=0)]

In [28]:
df.select(col('count'))

DataFrame[count: bigint]

In [29]:
df.select('count').take(5)

[Row(count=1), Row(count=264), Row(count=69), Row(count=24), Row(count=1)]

In [31]:
df.first()

Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=1)

In [32]:
from pyspark.sql import Row
myRow = Row('hello', None, 1, False)

In [33]:
myRow[0]

'hello'

In [34]:
myRow[2]

1

In [35]:
df.createOrReplaceTempView('dfTable')

In [39]:
spark.sql("""
SELECT DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME 
FROM dfTable
LIMIT 2
""").show()

+-----------------+-------------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|
+-----------------+-------------------+
|    United States|            Romania|
|    United States|            Ireland|
+-----------------+-------------------+



In [40]:
from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType, LongType

In [41]:
myManualSchema = StructType([
  StructField('some', StringType(), True),
  StructField('col', StringType(), True),
  StructField('names', LongType(), False)
])

In [43]:
myRow = Row('Hello', '1', 1)
myDf = spark.createDataFrame([myRow], myManualSchema)
myDf.show()

+-----+---+-----+
| some|col|names|
+-----+---+-----+
|Hello|  1|    1|
+-----+---+-----+



In [50]:
df.select(
  expr('DEST_COUNTRY_NAME AS destination'),
  col('DEST_COUNTRY_NAME'),
  column('DEST_COUNTRY_NAME'),)\
  .show(2)

+-------------+-----------------+-----------------+
|  destination|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|
+-------------+-----------------+-----------------+
|United States|    United States|    United States|
|United States|    United States|    United States|
+-------------+-----------------+-----------------+
only showing top 2 rows



In [45]:
df.select('DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME').show(5)

+-----------------+-------------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|
+-----------------+-------------------+
|    United States|            Romania|
|    United States|            Ireland|
|    United States|              India|
|            Egypt|      United States|
|Equatorial Guinea|      United States|
+-----------------+-------------------+
only showing top 5 rows



In [52]:
df.select(expr('DEST_COUNTRY_NAME AS destination')\
  .alias('DEST_COUNTRY_NAME')).show(3)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
|    United States|
+-----------------+
only showing top 3 rows



In [53]:
df.selectExpr(
  '*',
  '(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) AS withinCountry')\
  .show(5)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|    1|        false|
|    United States|            Ireland|  264|        false|
|    United States|              India|   69|        false|
|            Egypt|      United States|   24|        false|
|Equatorial Guinea|      United States|    1|        false|
+-----------------+-------------------+-----+-------------+
only showing top 5 rows



In [56]:
df.selectExpr(
  'avg(count) as average', 'count(distinct(DEST_COUNTRY_NAME))').show()

+-----------------+---------------------------------+
|          average|count(DISTINCT DEST_COUNTRY_NAME)|
+-----------------+---------------------------------+
|1655.956862745098|                              125|
+-----------------+---------------------------------+



In [57]:
from pyspark.sql.functions import lit
df.select(expr('*')).show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
+-----------------+-------------------+-----+
only showing top 2 rows



In [58]:
df.select(expr('*'), lit(1).alias('one')).show(5)

+-----------------+-------------------+-----+---+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|one|
+-----------------+-------------------+-----+---+
|    United States|            Romania|    1|  1|
|    United States|            Ireland|  264|  1|
|    United States|              India|   69|  1|
|            Egypt|      United States|   24|  1|
|Equatorial Guinea|      United States|    1|  1|
+-----------------+-------------------+-----+---+
only showing top 5 rows



In [59]:
df.withColumn('numberOne', lit(1)).show(5)

+-----------------+-------------------+-----+---------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|numberOne|
+-----------------+-------------------+-----+---------+
|    United States|            Romania|    1|        1|
|    United States|            Ireland|  264|        1|
|    United States|              India|   69|        1|
|            Egypt|      United States|   24|        1|
|Equatorial Guinea|      United States|    1|        1|
+-----------------+-------------------+-----+---------+
only showing top 5 rows



In [60]:
df.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [64]:
df.withColumn('withinCountry', expr('ORIGIN_COUNTRY_NAME == DEST_COUNTRY_NAME')).show()

+--------------------+-------------------+-----+-------------+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+--------------------+-------------------+-----+-------------+
|       United States|            Romania|    1|        false|
|       United States|            Ireland|  264|        false|
|       United States|              India|   69|        false|
|               Egypt|      United States|   24|        false|
|   Equatorial Guinea|      United States|    1|        false|
|       United States|          Singapore|   25|        false|
|       United States|            Grenada|   54|        false|
|          Costa Rica|      United States|  477|        false|
|             Senegal|      United States|   29|        false|
|       United States|   Marshall Islands|   44|        false|
|              Guyana|      United States|   17|        false|
|       United States|       Sint Maarten|   53|        false|
|               Malta|      United States|    1|       

In [65]:
# Adding a column into a dataframe
df.withColumn('Destination', expr('DEST_COUNTRY_NAME')).columns

['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count', 'Destination']

In [66]:
df.columns

['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']

In [67]:
df.withColumnRenamed('DEST_COUNTRY_NAME', 'dest').columns

['dest', 'ORIGIN_COUNTRY_NAME', 'count']

In [70]:
print("hi jaggu")

hi jaggu


In [79]:
dfWithLongColName = df.withColumn(
  "This is Long Column Name" ,
  expr("ORIGIN_COUNTRY_NAME")
)

In [84]:
dfWithLongColName.show(4)

+-----------------+-------------------+-----+------------------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|This is Long Column Name|
+-----------------+-------------------+-----+------------------------+
|    United States|            Romania|    1|                 Romania|
|    United States|            Ireland|  264|                 Ireland|
|    United States|              India|   69|                   India|
|            Egypt|      United States|   24|           United States|
+-----------------+-------------------+-----+------------------------+
only showing top 4 rows



In [85]:
dfWithLongColName.selectExpr(
  "`This is Long Column Name`",
  "`This is Long Column Name` as `new col`"
).show(3)

+------------------------+-------+
|This is Long Column Name|new col|
+------------------------+-------+
|                 Romania|Romania|
|                 Ireland|Ireland|
|                   India|  India|
+------------------------+-------+
only showing top 3 rows



In [86]:
# Drop columns
df.drop('ORIGIN_COUNTRY_NAME').columns

['DEST_COUNTRY_NAME', 'count']

In [87]:
dfWithLongColName.drop('ORIGIN_COUNTRY_NAME', 'DEST_COUNTRY_NAME').columns

['count', 'This is Long Column Name']

In [88]:
dfWithLongColName.columns

['DEST_COUNTRY_NAME',
 'ORIGIN_COUNTRY_NAME',
 'count',
 'This is Long Column Name']

In [89]:
#Changing column type(cast)
df.withColumn('count2', col('count').cast('long'))

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint, count2: bigint]

In [90]:
df

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

In [91]:
df.filter(col('count') < 2).show(5)

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|    1|
|   Equatorial Guinea|      United States|    1|
|               Malta|      United States|    1|
|Saint Vincent and...|      United States|    1|
|            Slovakia|      United States|    1|
+--------------------+-------------------+-----+
only showing top 5 rows



In [92]:
df.where('count < 2').show(5)

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|    1|
|   Equatorial Guinea|      United States|    1|
|               Malta|      United States|    1|
|Saint Vincent and...|      United States|    1|
|            Slovakia|      United States|    1|
+--------------------+-------------------+-----+
only showing top 5 rows



In [95]:
df.where('count < 2').where(col('ORIGIN_COUNTRY_NAME') != 'Croatia')\
  .show(5)

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|    1|
|   Equatorial Guinea|      United States|    1|
|               Malta|      United States|    1|
|Saint Vincent and...|      United States|    1|
|            Slovakia|      United States|    1|
+--------------------+-------------------+-----+
only showing top 5 rows



In [96]:
df.select('ORIGIN_COUNTRY_NAME', 'DEST_COUNTRY_NAME').distinct().count()

255

In [99]:
df.select('ORIGIN_COUNTRY_NAME').distinct().count()

131

In [3]:
# Random sample
seed = 5
withReplacement = False
fraction = 0.5
df.sample(withReplacement, fraction, seed).count()

126

In [4]:
dataFrames = df.randomSplit([0.25, 0.75], seed)
dataFrames[0].count() > dataFrames[1].count()

False

In [5]:
dataFrames[0].count()

60

In [6]:
dataFrames[1].count()

196

In [10]:
dataFrames

[DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint],
 DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]]

In [12]:
# Concatenating and Appeding Rows(Union). Dataframe is immutable, Therefore appending is not possible.
from pyspark.sql import Row
schema = df.schema
newRows = [
  Row('New Country', 'Other Country', 5),
  Row('New Country 2', 'Other Country 2', 1)
]
parallelizedRows = spark.sparkContext.parallelize(newRows)
newDF = spark.createDataFrame(parallelizedRows, schema)

In [20]:
from pyspark.sql.functions import col
df.where('count = 1').where(col('ORIGIN_COUNTRY_NAME') != 'United States').show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
|    United States|          Gibraltar|    1|
|    United States|             Cyprus|    1|
|    United States|            Estonia|    1|
|    United States|          Lithuania|    1|
|    United States|           Bulgaria|    1|
|    United States|            Georgia|    1|
|    United States|            Bahrain|    1|
|    United States|   Papua New Guinea|    1|
|    United States|         Montenegro|    1|
|    United States|            Namibia|    1|
+-----------------+-------------------+-----+



In [21]:
df.union(newDF)\
  .where('count = 1')\
  .where(col('ORIGIN_COUNTRY_NAME') != 'United States')\
  .show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
|    United States|          Gibraltar|    1|
|    United States|             Cyprus|    1|
|    United States|            Estonia|    1|
|    United States|          Lithuania|    1|
|    United States|           Bulgaria|    1|
|    United States|            Georgia|    1|
|    United States|            Bahrain|    1|
|    United States|   Papua New Guinea|    1|
|    United States|         Montenegro|    1|
|    United States|            Namibia|    1|
|    New Country 2|    Other Country 2|    1|
+-----------------+-------------------+-----+



In [23]:
df.where('count = 5').show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    French Guiana|      United States|    5|
+-----------------+-------------------+-----+



In [34]:
df.where(col('ORIGIN_COUNTRY_NAME') == '`Other Country 2`').show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
+-----------------+-------------------+-----+



In [35]:
# Sorting rows: can be done using column expressions, string and multiple columns as well.
# Default sorting in ascending order
df.sort('count').show(3)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|          Moldova|      United States|    1|
|    United States|          Singapore|    1|
|    United States|            Croatia|    1|
+-----------------+-------------------+-----+
only showing top 3 rows



In [36]:
df.orderBy('count', 'DEST_COUNTRY_NAME').show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|     Burkina Faso|      United States|    1|
|    Cote d'Ivoire|      United States|    1|
|           Cyprus|      United States|    1|
|         Djibouti|      United States|    1|
|        Indonesia|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [37]:
df.orderBy(col('count'), col('DEST_COUNTRY_NAME')).show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|     Burkina Faso|      United States|    1|
|    Cote d'Ivoire|      United States|    1|
|           Cyprus|      United States|    1|
|         Djibouti|      United States|    1|
|        Indonesia|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [43]:
from pyspark.sql.functions import expr, asc, desc
df.orderBy(expr('count desc')).show(5)

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|               Malta|      United States|    1|
|Saint Vincent and...|      United States|    1|
|       United States|            Croatia|    1|
|       United States|          Gibraltar|    1|
|       United States|          Singapore|    1|
+--------------------+-------------------+-----+
only showing top 5 rows



In [44]:
df.orderBy(col('count').desc()).show(5)

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|370002|
|    United States|             Canada|  8483|
|           Canada|      United States|  8399|
|    United States|             Mexico|  7187|
|           Mexico|      United States|  7140|
+-----------------+-------------------+------+
only showing top 5 rows



In [49]:
df.orderBy(col('count').desc(), col('ORIGIN_COUNTRY_NAME').asc()).show(5)

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|370002|
|    United States|             Canada|  8483|
|           Canada|      United States|  8399|
|    United States|             Mexico|  7187|
|           Mexico|      United States|  7140|
+-----------------+-------------------+------+
only showing top 5 rows



In [48]:
df.orderBy(col('count').asc(), col('ORIGIN_COUNTRY_NAME').desc()).show(5)

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|            Suriname|      United States|    1|
|              Zambia|      United States|    1|
|Saint Vincent and...|      United States|    1|
|             Moldova|      United States|    1|
|            Djibouti|      United States|    1|
+--------------------+-------------------+-----+
only showing top 5 rows



In [78]:
from pyspark.sql.types import StructField, StructType, LongType, FloatType, StringType
myNewSchema1 = StructType([
  StructField('column1', LongType(), True),
  StructField('Column2', FloatType(), True) ])

In [79]:
myNewSchema2 = StructType([
  StructField('StructColumn1', myNewSchema1, True),
  StructField('column3', StringType(), True),
])

In [81]:
myNewSchema1

StructType(List(StructField(column1,LongType,true),StructField(Column2,FloatType,true)))

In [80]:
myNewSchema2

StructType(List(StructField(StructColumn1,StructType(List(StructField(column1,LongType,true),StructField(Column2,FloatType,true))),true),StructField(column3,StringType,true)))

In [59]:
myNewSchema2.fieldNames

<bound method StructType.fieldNames of StructType(List(StructField(StructColumn1,StructType(List(StructField(column1,LongType,true),StructField(Column2,FloatType,true))),true),StructField(column3,StringType,true)))>

In [60]:
myNewSchema2.fields

[StructField(StructColumn1,StructType(List(StructField(column1,LongType,true),StructField(Column2,FloatType,true))),true),
 StructField(column3,StringType,true)]

In [61]:
myNewSchema2.typeName

<bound method DataType.typeName of <class 'pyspark.sql.types.StructType'>>

In [95]:
myNewRows = [
  Row(1, 2.3),
]
#parallelizedRows = spark.sparkContext.parallelize(myNewRows)
myDF = spark.createDataFrame(myNewRows, myNewSchema1)

In [96]:
myDF.printSchema()

root
 |-- column1: long (nullable = true)
 |-- Column2: float (nullable = true)



In [102]:
myDF.show()

+-------+-------+
|column1|Column2|
+-------+-------+
|      1|    2.3|
+-------+-------+



In [103]:
myNewRows1 = [
  Row(myDF, 'Jagadeesh')
]
#parallelizedRows1 = spark.sparkContext.parallelize(myNewRows1)
myDF1 = spark.createDataFrame(myNewRows1, myNewSchema2)

In [104]:
myDF1.printSchema()

root
 |-- StructColumn1: struct (nullable = true)
 |    |-- column1: long (nullable = true)
 |    |-- Column2: float (nullable = true)
 |-- column3: string (nullable = true)



In [105]:
myDF1.show()

+-------------+---------+
|StructColumn1|  column3|
+-------------+---------+
|          [,]|Jagadeesh|
+-------------+---------+



In [106]:
myDF1.collect()

[Row(StructColumn1=Row(column1=None, Column2=None), column3='Jagadeesh')]

In [107]:
# Repartition and Coalesce
# Repartition will incur a full shuffle of the data, regardless of whether one is necessary
# Coalesce will not incur a full shuffle of the data and will try to combine partitions.
df.rdd.getNumPartitions()

1

In [111]:
df.repartition(5)

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

In [112]:
df.rdd.getNumPartitions()

1

In [114]:
df.repartition(col('DEST_COUNTRY_NAME'))

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

In [118]:
df.repartition(5, col('DEST_COUNTRY_NAME')).rdd.getNumPartitions()

5

In [119]:
data = df.repartition(5, col('DEST_COUNTRY_NAME'))

In [120]:
data.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|             Moldova|      United States|    1|
|             Bolivia|      United States|   30|
|             Algeria|      United States|    4|
|Turks and Caicos ...|      United States|  230|
|            Pakistan|      United States|   12|
|    Marshall Islands|      United States|   42|
|            Suriname|      United States|    1|
|              Panama|      United States|  510|
|         New Zealand|      United States|  111|
|             Liberia|      United States|    2|
|             Ireland|      United States|  335|
|              Zambia|      United States|    1|
|            Malaysia|      United States|    2|
|               Japan|      United States| 1548|
|    French Polynesia|      United States|   43|
|           Singapore|      United States|    3|
|             Denmark|      United States|  153|
|               Spai

In [121]:
data.rdd.getNumPartitions()

5

In [122]:
df.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|   15|
|       United States|            Croatia|    1|
|       United States|            Ireland|  344|
|               Egypt|      United States|   15|
|       United States|              India|   62|
|       United States|          Singapore|    1|
|       United States|            Grenada|   62|
|          Costa Rica|      United States|  588|
|             Senegal|      United States|   40|
|             Moldova|      United States|    1|
|       United States|       Sint Maarten|  325|
|       United States|   Marshall Islands|   39|
|              Guyana|      United States|   64|
|               Malta|      United States|    1|
|            Anguilla|      United States|   41|
|             Bolivia|      United States|   30|
|       United States|           Paraguay|    6|
|             Algeri

In [124]:
data.describe()

DataFrame[summary: string, DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: string]

In [125]:
df.repartition(5, col('DEST_COUNTRY_NAME')).coalesce(2)

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]