In [1]:
# Import PySpark
import pyspark
from pyspark.sql import SparkSession

# Build (or reuse, if one already exists) the SparkSession for this notebook.
# master("local[1]") runs Spark locally with a single worker thread.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()
)

In [2]:
# Load countries.csv into a DataFrame using an explicit schema
# (no schema inference; the first CSV row is treated as the header).
from pyspark.sql.types import IntegerType, StringType, DoubleType, StructField, StructType

countries_path = '../data/countries.csv'

# One (column name, Spark type, nullable) entry per column of the DataFrame.
countries_schema = StructType([
    StructField(column_name, spark_type, nullable)
    for column_name, spark_type, nullable in [
        ("COUNTRY_ID", IntegerType(), False),
        ("NAME", StringType(), False),
        ("NATIONALITY", StringType(), False),
        ("COUNTRY_CODE", StringType(), False),
        ("ISO_ALPHA2", StringType(), False),
        ("CAPITAL", StringType(), False),
        ("POPULATION", DoubleType(), False),
        ("AREA_KM2", IntegerType(), False),
        ("REGION_ID", IntegerType(), True),
        ("SUB_REGION_ID", IntegerType(), True),
        ("INTERMEDIATE_REGION_ID", IntegerType(), True),
        ("ORGANIZATION_REGION_ID", IntegerType(), True),
    ]
])

countries = spark.read.csv(countries_path, schema=countries_schema, header=True)

In [4]:
# Preview the first two rows to check that the schema was applied as expected
countries.show(n=2)

+----------+-----------+-----------+------------+----------+-------+-----------+--------+---------+-------------+----------------------+----------------------+
|COUNTRY_ID|       NAME|NATIONALITY|COUNTRY_CODE|ISO_ALPHA2|CAPITAL| POPULATION|AREA_KM2|REGION_ID|SUB_REGION_ID|INTERMEDIATE_REGION_ID|ORGANIZATION_REGION_ID|
+----------+-----------+-----------+------------+----------+-------+-----------+--------+---------+-------------+----------------------+----------------------+
|         1|Afghanistan|     Afghan|         AFG|        AF|  Kabul|3.8041754E7|  652230|       30|           30|                  null|                    30|
|         2|    Albania|   Albanian|         ALB|        AL| Tirana|  2880917.0|   28748|       20|           70|                  null|                    20|
+----------+-----------+-----------+------------+----------+-------+-----------+--------+---------+-------------+----------------------+----------------------+
only showing top 2 rows



In [None]:
# Sort countries by population, smallest first.
# asc() builds an ascending sort expression for the named column.
from pyspark.sql.functions import asc

# DataFrame.display() only exists in Databricks notebooks; on the local
# SparkSession used here it raises AttributeError, so print the result with
# show() instead (20 rows by default).
countries.sort(asc("POPULATION")).show()

In [None]:
# Sort countries by population, largest first.
# desc() builds a descending sort expression for the named column.
from pyspark.sql.functions import desc

# DataFrame.display() only exists in Databricks notebooks; on the local
# SparkSession used here it raises AttributeError, so print the result with
# show() instead (20 rows by default).
countries.sort(desc("POPULATION")).show()