In [1]:
# PySpark entry points
import pyspark
from pyspark.sql import SparkSession

# Build a SparkSession on a local master with one worker thread,
# reusing an already-running session if there is one.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()
)

In [3]:
# Load countries.csv with an explicit schema rather than letting Spark infer one
from pyspark.sql.types import IntegerType, StringType, DoubleType, StructField, StructType

countries_path = '../data/countries.csv'

# One StructField per CSV column, in file order: (name, type, nullable)
countries_schema = (
    StructType()
    .add(StructField("COUNTRY_ID", IntegerType(), False))
    .add(StructField("NAME", StringType(), False))
    .add(StructField("NATIONALITY", StringType(), False))
    .add(StructField("COUNTRY_CODE", StringType(), False))
    .add(StructField("ISO_ALPHA2", StringType(), False))
    .add(StructField("CAPITAL", StringType(), False))
    .add(StructField("POPULATION", DoubleType(), False))
    .add(StructField("AREA_KM2", IntegerType(), False))
    .add(StructField("REGION_ID", IntegerType(), True))
    .add(StructField("SUB_REGION_ID", IntegerType(), True))
    .add(StructField("INTERMEDIATE_REGION_ID", IntegerType(), True))
    .add(StructField("ORGANIZATION_REGION_ID", IntegerType(), True))
)

# header=True: the first line of the file is a header row, not data
countries = spark.read.csv(countries_path, header=True, schema=countries_schema)

In [5]:
# Using the filter method with a Column expression:
# keep records where population is greater than 1,000,000,000 and show the first 2
(
    countries
    .filter(countries['population'] > 1_000_000_000)
    .show(2)
)

+----------+-----+-----------+------------+----------+---------+-------------+--------+---------+-------------+----------------------+----------------------+
|COUNTRY_ID| NAME|NATIONALITY|COUNTRY_CODE|ISO_ALPHA2|  CAPITAL|   POPULATION|AREA_KM2|REGION_ID|SUB_REGION_ID|INTERMEDIATE_REGION_ID|ORGANIZATION_REGION_ID|
+----------+-----+-----------+------------+----------+---------+-------------+--------+---------+-------------+----------------------+----------------------+
|        45|China|    Chinese|         CHN|        CN|  Beijing|1.433783686E9| 9706961|       30|           60|                  null|                    30|
|       103|India|     Indian|         IND|        IN|New Delhi|1.366417754E9| 3287590|       30|           30|                  null|                    30|
+----------+-----+-----------+------------+----------+---------+-------------+--------+---------+-------------+----------------------+----------------------+



In [6]:
# Using the locate function inside the filter condition.
# locate gives the 1-based position of the first occurrence of the substring
# (0 when it is absent), so a result of 1 keeps countries whose capital starts with "B".
from pyspark.sql.functions import locate

(
    countries
    .filter(locate("B", countries['capital']) == 1)
    .show(1)
)

+----------+---------+-----------+------------+----------+------------+-----------+--------+---------+-------------+----------------------+----------------------+
|COUNTRY_ID|     NAME|NATIONALITY|COUNTRY_CODE|ISO_ALPHA2|     CAPITAL| POPULATION|AREA_KM2|REGION_ID|SUB_REGION_ID|INTERMEDIATE_REGION_ID|ORGANIZATION_REGION_ID|
+----------+---------+-----------+------------+----------+------------+-----------+--------+---------+-------------+----------------------+----------------------+
|        10|Argentina|  Argentine|         ARG|        AR|Buenos Aires|4.4780677E7| 2780400|       10|           10|                    40|                    40|
+----------+---------+-----------+------------+----------+------------+-----------+--------+---------+-------------+----------------------+----------------------+
only showing top 1 row



In [None]:
# Using 'AND' condition via & operator.
# Each comparison is wrapped in parentheses because & binds more tightly
# than > and != in Python.
# `length` is imported here because no earlier cell brings it into scope.
from pyspark.sql.functions import length

# .show() is used instead of .display(): display() is a Databricks notebook
# helper and is not a method of a plain PySpark DataFrame.
countries.filter(
    (length(countries['name']) > 15) & (countries['region_id'] != 10)
).show()

In [None]:
# Using SQL syntax in the filter method: the condition is passed as a
# SQL expression string instead of Column objects.
# .show() is used instead of .display(): display() is a Databricks notebook
# helper and is not a method of a plain PySpark DataFrame.
countries.filter("length(name)>15 and region_id != 10").show()