In [5]:
# Import PySpark
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType, IntegerType, StringType, StructField, StructType

# Build (or reuse, if one is already active) the SparkSession used by every
# later cell. The master is "local[1]", i.e. Spark runs on this machine.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()
)

In [8]:
# Read countries.csv with an explicit schema instead of relying on Spark's
# default typing. The type classes used here are imported in the notebook's
# import cell.
countries_path = 'data/countries.csv'

# One StructField per CSV column: (column name, Spark type, nullable flag).
# NOTE(review): Spark's file-based readers are understood to relax
# nullable=False to nullable=True when loading, so the flags below document
# intent rather than enforce it -- confirm with countries.printSchema().
countries_schema = StructType([
    StructField("COUNTRY_ID", IntegerType(), False),
    StructField("NAME", StringType(), False),
    StructField("NATIONALITY", StringType(), False),
    StructField("COUNTRY_CODE", StringType(), False),
    StructField("ISO_ALPHA2", StringType(), False),
    StructField("CAPITAL", StringType(), False),
    StructField("POPULATION", DoubleType(), False),
    StructField("AREA_KM2", IntegerType(), False),
    StructField("REGION_ID", IntegerType(), True),
    StructField("SUB_REGION_ID", IntegerType(), True),
    StructField("INTERMEDIATE_REGION_ID", IntegerType(), True),
    StructField("ORGANIZATION_REGION_ID", IntegerType(), True),
])

# header=True treats the first line of the file as column names, not data.
countries = spark.read.csv(path=countries_path, header=True, schema=countries_schema)

In [9]:
# Number of data rows loaded from the CSV (the header line is not counted
# because the file was read with header=True)
countries.count()

249

In [10]:
# (column name, type name) pairs for the schema-typed DataFrame; these should
# mirror the types declared in countries_schema
countries.dtypes

[('COUNTRY_ID', 'int'),
 ('NAME', 'string'),
 ('NATIONALITY', 'string'),
 ('COUNTRY_CODE', 'string'),
 ('ISO_ALPHA2', 'string'),
 ('CAPITAL', 'string'),
 ('POPULATION', 'double'),
 ('AREA_KM2', 'int'),
 ('REGION_ID', 'int'),
 ('SUB_REGION_ID', 'int'),
 ('INTERMEDIATE_REGION_ID', 'int'),
 ('ORGANIZATION_REGION_ID', 'int')]

In [11]:
# Load the same file a second time with no schema supplied, keeping it in a
# separate variable so its column types can be compared with the typed read
countries_dt = spark.read.csv(countries_path, header=True)

In [12]:
# With no schema (and no schema inference requested) every column is reported
# as 'string'
countries_dt.dtypes

[('COUNTRY_ID', 'string'),
 ('NAME', 'string'),
 ('NATIONALITY', 'string'),
 ('COUNTRY_CODE', 'string'),
 ('ISO_ALPHA2', 'string'),
 ('CAPITAL', 'string'),
 ('POPULATION', 'string'),
 ('AREA_KM2', 'string'),
 ('REGION_ID', 'string'),
 ('SUB_REGION_ID', 'string'),
 ('INTERMEDIATE_REGION_ID', 'string'),
 ('ORGANIZATION_REGION_ID', 'string')]

In [13]:
# Cast the string-typed population column to an integer column and check the
# resulting type. IntegerType is imported earlier in the notebook. select()
# returns a new DataFrame, so countries_dt itself is left unchanged.
population_int = countries_dt['population'].cast(IntegerType())
countries_dt.select(population_int).dtypes

[('population', 'int')]