In [1]:
# Import PySpark
import pyspark
from pyspark.sql import SparkSession

# Build the SparkSession used by every cell below. getOrCreate() returns the
# already-active session if one exists, so re-running this cell is harmless.
spark = (
    SparkSession.builder
    .master("local[1]")              # run Spark locally with a single worker thread
    .appName("SparkByExamples.com")
    .getOrCreate()
)

In [2]:
# Load countries.csv into a DataFrame using an explicit schema
# (no schema inference pass over the file).
from pyspark.sql.types import IntegerType, StringType, DoubleType, StructField, StructType

countries_path = '../data/countries.csv'

# Each tuple is (column name, Spark data type, nullable flag) and is turned
# into one StructField, in the order listed here.
countries_schema = StructType([
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in (
        ("COUNTRY_ID", IntegerType(), False),
        ("NAME", StringType(), False),
        ("NATIONALITY", StringType(), False),
        ("COUNTRY_CODE", StringType(), False),
        ("ISO_ALPHA2", StringType(), False),
        ("CAPITAL", StringType(), False),
        ("POPULATION", DoubleType(), False),
        ("AREA_KM2", IntegerType(), False),
        ("REGION_ID", IntegerType(), True),
        ("SUB_REGION_ID", IntegerType(), True),
        ("INTERMEDIATE_REGION_ID", IntegerType(), True),
        ("ORGANIZATION_REGION_ID", IntegerType(), True),
    )
])

# header=True: the first line of the file is a header row, not data.
countries = (
    spark.read
    .schema(countries_schema)
    .option("header", True)
    .csv(countries_path)
)

In [5]:
# Peek at the first two rows; the result is a list of Row objects.
countries.take(2)

[Row(COUNTRY_ID=1, NAME='Afghanistan', NATIONALITY='Afghan', COUNTRY_CODE='AFG', ISO_ALPHA2='AF', CAPITAL='Kabul', POPULATION=38041754.0, AREA_KM2=652230, REGION_ID=30, SUB_REGION_ID=30, INTERMEDIATE_REGION_ID=None, ORGANIZATION_REGION_ID=30),
 Row(COUNTRY_ID=2, NAME='Albania', NATIONALITY='Albanian', COUNTRY_CODE='ALB', ISO_ALPHA2='AL', CAPITAL='Tirana', POPULATION=2880917.0, AREA_KM2=28748, REGION_ID=20, SUB_REGION_ID=70, INTERMEDIATE_REGION_ID=None, ORGANIZATION_REGION_ID=20)]

In [None]:
# Sort the countries by population, smallest first.
# asc() builds an ascending sort expression for the named column.
from pyspark.sql.functions import asc

# .show() is the standard PySpark way to print a DataFrame (first 20 rows by
# default). DataFrame.display() is only provided by Databricks notebooks and
# raises AttributeError on the local SparkSession created in this notebook.
countries.sort(asc('population')).show()

In [None]:
# Sort the countries by population, largest first.
# desc() builds a descending sort expression for the named column.
from pyspark.sql.functions import desc

# .show() is the standard PySpark way to print a DataFrame (first 20 rows by
# default). DataFrame.display() is only provided by Databricks notebooks and
# raises AttributeError on the local SparkSession created in this notebook.
countries.sort(desc('population')).show()