# Changing Data Types

In [None]:
'''
Resources:
data types in spark: https://spark.apache.org/docs/latest/sql-ref-datatypes.html
cast: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.Column.cast.html?highlight=cast#pyspark.sql.Column.cast
'''

In [None]:
# Load countries.csv with an explicit schema rather than letting Spark pick the column types
from pyspark.sql.types import IntegerType, StringType, DoubleType, StructField, StructType

countries_path = '/FileStore/tables/countries.csv'

# Each tuple is (column name, Spark data type, nullable flag) and becomes one StructField
countries_schema = StructType([
    StructField(column_name, data_type, nullable)
    for column_name, data_type, nullable in [
        ("COUNTRY_ID", IntegerType(), False),
        ("NAME", StringType(), False),
        ("NATIONALITY", StringType(), False),
        ("COUNTRY_CODE", StringType(), False),
        ("ISO_ALPHA2", StringType(), False),
        ("CAPITAL", StringType(), False),
        ("POPULATION", DoubleType(), False),
        ("AREA_KM2", IntegerType(), False),
        ("REGION_ID", IntegerType(), True),
        ("SUB_REGION_ID", IntegerType(), True),
        ("INTERMEDIATE_REGION_ID", IntegerType(), True),
        ("ORGANIZATION_REGION_ID", IntegerType(), True),
    ]
])

# header=True: the first line of the file supplies the column names, not data
countries = spark.read.csv(path=countries_path, header=True, schema=countries_schema)

In [None]:
# Inspect the (column name, type) pairs of the DataFrame that was read with the explicit schema
countries.dtypes

''''''
Out[2]: [('COUNTRY_ID', 'int'),
 ('NAME', 'string'),
 ('NATIONALITY', 'string'),
 ('COUNTRY_CODE', 'string'),
 ('ISO_ALPHA2', 'string'),
 ('CAPITAL', 'string'),
 ('POPULATION', 'double'),
 ('AREA_KM2', 'int'),
 ('REGION_ID', 'int'),
 ('SUB_REGION_ID', 'int'),
 ('INTERMEDIATE_REGION_ID', 'int'),
 ('ORGANIZATION_REGION_ID', 'int')]
''''''

In [None]:
# Load the same CSV again, this time with no schema supplied, and keep it in a separate variable
countries_dt = spark.read.option("header", True).csv(countries_path)

In [None]:
# No schema was supplied for this read, so every column is reported as a string
countries_dt.dtypes


''''''
Out[4]: [('COUNTRY_ID', 'string'),
 ('NAME', 'string'),
 ('NATIONALITY', 'string'),
 ('COUNTRY_CODE', 'string'),
 ('ISO_ALPHA2', 'string'),
 ('CAPITAL', 'string'),
 ('POPULATION', 'string'),
 ('AREA_KM2', 'string'),
 ('REGION_ID', 'string'),
 ('SUB_REGION_ID', 'string'),
 ('INTERMEDIATE_REGION_ID', 'string'),
 ('ORGANIZATION_REGION_ID', 'string')]
''''''

In [None]:
# Cast the string-typed population column to an integer with Column.cast.
# IntegerType comes from the pyspark.sql.types import in the cell that defines countries_schema.
(
    countries_dt
    .select(countries_dt['population'].cast(IntegerType()))
    .dtypes
)

''''''
Out[6]: [('population', 'int')]
''''''

In [None]:
# Cast the population column of the schema-typed DataFrame (a double there) to a string.
# StringType comes from the pyspark.sql.types import in the cell that defines countries_schema.
(
    countries
    .select(countries['population'].cast(StringType()))
    .dtypes
)

''''''
Out[7]: [('population', 'string')]
''''''