### Path to CSV

In [15]:
import os

# Location of the input CSV. Defaults to the original local path; set the
# SIMPLE_TEST_DS_CSV environment variable to point at the file on another
# machine without editing the notebook.
path = os.environ.get(
    "SIMPLE_TEST_DS_CSV",
    r"C:\Users\ankur\Downloads\Datasets\simpleTestDs.csv",
)

### Initialize Findspark
#### Helps locate PySpark in local environment

In [16]:
# Call findspark.init() before importing pyspark so the local PySpark
# installation can be located.
import findspark
findspark.init()

### Import Pyspark and Necessary Libraries

In [18]:
import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.functions import upper

### Initialize SparkSession

In [19]:
# Build (or reuse) a Spark session running on the 'local' master under the
# application name 'appOne'.
spark_builder = SparkSession.builder
spark_builder = spark_builder.master('local')
spark_builder = spark_builder.appName('appOne')
ss = spark_builder.getOrCreate()

### Read the CSV

In [20]:
# Load the CSV, treating the first row as the header and letting Spark infer
# each column's type from the data.
csv_reader = (
    ss.read
    .option("header", True)
    .option("inferSchema", True)
)
df = csv_reader.csv(path)
df.show()

### Define UDFs

In [22]:
def fixAge(age):
    """Correct an age that was recorded 100 years too high.

    Args:
        age: The recorded age, or None when the value is missing.

    Returns:
        ``age - 100`` when ``age`` is greater than 110, otherwise ``age``
        unchanged. None is passed through as None.
    """
    # Null cells reach a Python UDF as None; comparing None with an int
    # raises TypeError, so let missing values pass through untouched.
    if age is None:
        return None
    if age > 110:
        return age - 100
    return age

# Expose fixAge as a Spark UDF that yields an integer column.
fixAgeUDF = udf(fixAge, returnType=IntegerType())

### Transformations
#### 1. fix name errors
#### 2. interchange age and sal for id=3
#### 3. fix ages over 110 by subtracting 100 (age = age - 100)
#### 4. fix sal errors
#### 5. set new schema

In [23]:
# Lower-case both name columns; the surname fix below matches a lower-case
# pattern, so it must run after this step.
df = df.withColumn('fname', lower(df.fname))
df = df.withColumn('lname', lower(df.lname))

# Correct the misspelled surname.
df = df.withColumn('lname', regexp_replace('lname', 'halperts', 'halpert'))

# Strip thousands separators and the dollar sign from `sal` in one pass.
# The pattern is a raw string: writing '\$' in a normal literal is an invalid
# escape sequence that newer Python versions warn about.
df = df.withColumn('sal', regexp_replace('sal', r'[,$]', ''))

df.show()

+---+-------+--------+-----+-----+
| id|  fname|   lname|  age|  sal|
+---+-------+--------+-----+-----+
|  1|    jim| halpert|  135|40000|
|  2|    pam|  beesly|   34|36000|
|  3|michael|   scott|60000|   41|
|  4|   toby|flencher|   44|66000|
|  5|  oscar|martinez|   38|42000|
+---+-------+--------+-----+-----+



In [24]:
# For id=3 the age and sal values were entered in each other's column.
# Swap them using the row's own values instead of hard-coded literals.
needs_swap = col("id") == 3

df = (
    df
    # Keep the original age: the next step overwrites `age`, and the `sal`
    # step still needs the old value.
    .withColumn("_age_before_swap", col("age"))
    .withColumn("age", when(needs_swap, col("sal").cast("int")).otherwise(col("age")))
    .withColumn("sal", when(needs_swap, col("_age_before_swap").cast("string")).otherwise(col("sal")))
    .drop("_age_before_swap")
)

df.show()

+---+-------+--------+---+-----+
| id|  fname|   lname|age|  sal|
+---+-------+--------+---+-----+
|  1|    jim| halpert|135|40000|
|  2|    pam|  beesly| 34|36000|
|  3|michael|   scott| 41|60000|
|  4|   toby|flencher| 44|66000|
|  5|  oscar|martinez| 38|42000|
+---+-------+--------+---+-----+



In [25]:
# Apply the age correction only after the id=3 age/sal interchange, so the
# UDF sees the corrected age for that row.
df = df.withColumn('age', fixAgeUDF(df['age']))

### Check Schema

In [27]:
# Print the column types after the transformations above; `sal` is still a
# string at this point because it was cleaned with regexp_replace.
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- fname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- sal: string (nullable = true)



### Change Datatype

In [29]:
# Convert the cleaned `sal` strings into a float column.
sal_as_float = col('sal').cast(FloatType())
df = df.withColumn('sal', sal_as_float)
df.show()

+---+-------+--------+---+-------+
| id|  fname|   lname|age|    sal|
+---+-------+--------+---+-------+
|  1|    jim| halpert| 35|40000.0|
|  2|    pam|  beesly| 34|36000.0|
|  3|michael|   scott| 41|60000.0|
|  4|   toby|flencher| 44|66000.0|
|  5|  oscar|martinez| 38|42000.0|
+---+-------+--------+---+-------+

