# Changing Data Types of Columns 

In [1]:
#importing libraries
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.types import DoubleType, DateType, StringType, BooleanType, IntegerType
from pyspark.sql.functions import when, to_date

In [2]:
#loading data
# Get the active SparkSession, or create one with the application name 'abc'.
spark = SparkSession.builder.appName('abc').getOrCreate()
# NOTE(review): the next line is an unfinished placeholder, not runnable code.
# Replace it with the actual reader call for the input data before running.
df = spark.read ..#path here

In [3]:
# Desired Spark type name for each column, keyed by column name.
# Insertion order is the order in which the columns are processed later on.
dtype_dict = dict(
    a='double',
    b='double',
    c='string',
    d='string',
    e='boolean',
    order_ts='timestamp',
    g='date',
)

In [4]:
def change_dtype(df, col, dtype):
    """Return ``df`` with column ``col`` converted to the type named ``dtype``.

    Supported ``dtype`` names and how each is applied:

    * 'string', 'integer', 'boolean', 'double': plain ``Column.cast(dtype)``.
    * 'timestamp': the column is cast to double, divided by 1000 and then
      cast to timestamp (presumably the values are epoch milliseconds --
      TODO confirm against the data).
    * 'date': the column is parsed with ``to_date`` using the pattern
      'MM/dd/yyyy'.

    Args:
        df: PySpark DataFrame holding the column.
        col: Name of the column to convert.
        dtype: One of the type names listed above.

    Returns:
        A new DataFrame with ``col`` replaced by the converted column. The
        conversion is best-effort: if ``dtype`` is not supported or the
        conversion raises, a message is printed and the input ``df`` is
        returned unchanged, so callers can safely reassign or chain the
        result.
    """

    def conv_to_timestamp(frame, name, _target):
        seconds = frame[name].cast('double') / 1000
        return frame.withColumn(name, seconds.cast('timestamp'))

    def conv_to_date(frame, name, _target):
        # 'MM' is month-of-year; lowercase 'mm' is minute-of-hour in Spark's
        # datetime patterns and would not parse the month at all.
        return frame.withColumn(name, to_date(frame[name], 'MM/dd/yyyy'))

    def cast_dtype(frame, name, target):
        return frame.withColumn(name, frame[name].cast(target))

    converters = {
        'string': cast_dtype,
        'integer': cast_dtype,
        'boolean': cast_dtype,
        'double': cast_dtype,
        'timestamp': conv_to_timestamp,
        'date': conv_to_date,
    }

    try:
        return converters[dtype](df, col, dtype)
    except Exception:
        # Catch Exception rather than everything so Ctrl-C / SystemExit still
        # propagate; hand back the untouched DataFrame instead of None.
        print(f'There is an error converting {col} to {dtype}')
        return df

In [5]:
# Try the conversion on 'order_ts' only. The result is not assigned back to
# df; it is just narrowed to that one column and displayed.
order_ts_preview = change_dtype(df, 'order_ts', 'timestamp').select('order_ts')
order_ts_preview.show()

+--------------------+
|            order_ts|
+--------------------+
| 2018-05-09 01:29:53|
|2018-05-09 06:29:...|
|2018-05-09 06:29:...|
| 2018-05-09 05:26:32|
| 2018-05-09 05:26:29|
| 2018-05-09 05:26:53|
| 2018-05-09 01:30:03|
| 2018-05-09 05:26:39|
|2018-05-09 05:29:...|
|2018-05-09 06:30:...|
|2018-05-09 06:30:...|
| 2018-05-09 05:29:46|
|2018-05-09 06:30:...|
|2018-05-09 05:30:...|
|2018-05-09 06:30:...|
| 2018-05-09 05:27:24|
|2018-05-09 05:30:...|
|2018-05-09 06:31:...|
|2018-05-09 06:31:...|
|2018-05-09 06:31:...|
+--------------------+
only showing top 20 rows



In [6]:
# Convert every column listed in dtype_dict whose current type differs from
# the desired one; columns that already have the right type are skipped.
# Each column is visited once, so the current types can be read up front.
current_dtypes = dict(df.dtypes)
for col_name, target_dtype in dtype_dict.items():
    if current_dtypes.get(col_name) == target_dtype:
        continue
    converted = change_dtype(df, col_name, target_dtype)
    # Defensive: if change_dtype signals a failed conversion by returning
    # None, keep the last good DataFrame so the remaining columns are still
    # processed instead of crashing on the next iteration.
    if converted is not None:
        df = converted
        

In [None]:
# Check that the columns have been cast to the right dtypes: printSchema()
# prints every column of df together with its current data type, which can
# be compared by eye against dtype_dict.
df.printSchema()