In [0]:
%python
# Load the raw CSV with a header row and let Spark infer the column types.
# mode="PERMISSIVE" is requested so that malformed rows do not abort the read.
df1 = spark.read.csv(
    path="/Volumes/myowndatasets/malformed_data/malformed_csv/malformed_data_4.txt",
    header=True,
    inferSchema=True,
    mode="PERMISSIVE",
)
display(df1)

# Passive Munging



## Observations from manual inspection
- duplicate ids
- null ids
- fully null rows
- null values in a few other columns
- invalid date formats
- invalid integer formats
- invalid values in the salary column

## Programmatic way of passive munging

### Rejection Strategy

In [0]:
%python
from pyspark.sql.types import StructType,StructField,IntegerType,StringType,DataType,DecimalType

# Expected layout of the file, plus one extra string column ("corrupt_rows")
# that receives the raw text of any row that does not fit this layout.
schema12 = StructType([
    StructField("id", IntegerType(), False),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    # Kept as text here; the date formats are standardized later in the notebook.
    StructField("join_date", StringType(), True),
    # DecimalType() with no arguments means decimal(10,0).
    StructField("salary", DecimalType(), True),
    StructField("email", StringType(), True),
    StructField("corrupt_rows", StringType(), True),
])

# An explicit schema is supplied, so no schema inference is requested:
# the inferSchema option has no effect once `schema=` is given.
df2 = spark.read.csv(
    path="/Volumes/myowndatasets/malformed_data/malformed_csv/malformed_data_4.txt",
    schema=schema12,
    header=True,
    mode="PERMISSIVE",
    columnNameOfCorruptRecord="corrupt_rows",
)

# Show only the rows Spark flagged as malformed.
df2.where(df2.corrupt_rows.isNotNull()).display()
# Rejection strategy: send these rows back to the source system to be corrected,
# and continue processing with the remaining rows.

###Structural Understanding

In [0]:
%python

# Inspect the structure Spark inferred for the raw frame.
# Observation: age and salary should be numeric and join_date should be a date,
# but the malformed values get in the way.
# NOTE(review): presumably these columns are inferred as strings here and only
# turn into nulls when an explicit typed schema is applied - confirm against
# the printSchema() output below.
df1.printSchema()

# Only the last bare expression of a cell is echoed, so each piece of metadata
# is printed explicitly to make all of them visible.
print(df1.schema)
print(df1.columns)
print(df1.dtypes)

###Duplicates Understanding

In [0]:
%python
# Row-level duplicate check: distinct() compares entire rows.
# Observation from the run: one row is excluded, i.e. there is one fully duplicated row.
rows_without_exact_duplicates = df1.distinct()
display(rows_without_exact_duplicates)

# Alternative (not executed): display(df1.dropDuplicates(["id"]))
# -> keeps one row per id; it dropped the duplicated id (3).

###Statistical Understanding

In [0]:
%python
# Render both built-in statistical overviews of the raw frame, one after the other.
for stats_frame in (df1.describe(), df1.summary()):
    display(stats_frame)

#Active Munging

##Handling Nulls

###Drop Nulls

In [0]:
%python
# Remove every row whose id is null. Only "id" is inspected, so rows with
# nulls in other columns are kept for the fill step that follows.
null_dropped = df1.dropna(how="all", subset=["id"])
display(null_dropped)

###Fill nulls with default values

In [0]:
%python
# Per-column defaults used wherever a value is missing.
null_defaults = {
    "name": "Not Applicable",
    "email": "Not Applicable",
    "salary": "0",
}
null_handled = null_dropped.na.fill(null_defaults)
display(null_handled)

# Equivalent chained form, kept for reference:
# null_dropped.fillna("0","salary").fillna("Not Applicable",subset=["email","name"])

##Find and replace values

In [0]:
%python
# regexp_replace and col are also used by the date-standardization cell below,
# so they stay in this import.
from pyspark.sql.functions import regexp_replace,col,when

# Step 1 - na.replace: map spelled-out ages to their digit form.
values_to_replace = {"thirty": "30", "twenty five": "25"}
age_fixed_df = null_handled.na.replace(values_to_replace, subset=["age"])
display(age_fixed_df)

# Step 2 - salary clean-up.
# The earlier regexp_replace(col("salary"), '[^0-9]', '0') swapped every
# non-digit character for a "0", which silently changes magnitudes
# (e.g. "50000.5" -> "5000005", "50k" -> "500"). Instead, a value is kept
# untouched when it is a plain non-negative number and replaced as a whole
# by "0" otherwise.
# NOTE(review): values with a sign, thousands separators or surrounding
# whitespace are treated as invalid here - widen the pattern if the data
# contains such values.
salary_text = col("salary").cast("string")
salary_is_numeric = salary_text.rlike(r"^[0-9]+(\.[0-9]+)?$")
replaced_df = age_fixed_df.withColumn(
    "salary",
    when(salary_is_numeric, salary_text).otherwise("0"),
)
display(replaced_df)


##Standardization

###date Standardization

In [0]:
%python
# Use a single separator in join_date: every "/" becomes "-", so the parsing
# step only has to deal with dash-separated patterns.
dash_separated_join_date = regexp_replace(col("join_date"), '/', '-')
replaced_df = replaced_df.withColumn("join_date", dash_separated_join_date)
display(replaced_df)

In [0]:
%python
# The star import is kept as-is: later cells call `when` without importing it
# anywhere else.
from pyspark.sql.functions import *

# Candidate date patterns, tried in this order.
# NOTE(review): an ambiguous value such as 01-02-2023 is resolved by whichever
# pattern comes first in this list - confirm the order matches the source data.
formats = ["yyyy-MM-dd", "yyyy-dd-MM", "dd-MM-yyyy", "MM-dd-yyyy"]

# One parse attempt per pattern; coalesce picks the first attempt that is not null.
parse_attempts = [try_to_date(col("join_date"), pattern) for pattern in formats]
date_std = replaced_df.withColumn("join_date_std", coalesce(*parse_attempts))
display(date_std)

In [0]:
%python
# Replace the raw join_date text column with its parsed counterpart.
# The frame is rebound to its own name, so the swap is guarded: on a second
# run "join_date_std" no longer exists, and an unguarded drop("join_date")
# would throw away the already-standardized column.
if "join_date_std" in date_std.columns:
    date_std = date_std.drop("join_date").withColumnRenamed("join_date_std", "join_date")
display(date_std)

In [0]:
%python
from pyspark.sql.types import *

# Print the schema that is about to be persisted; a bare expression in the
# middle of a cell would not be shown.
print(date_std.schema)

# Hand-written copy of that schema. It is not passed to the write below.
schema1 = StructType([
    StructField('id', IntegerType(), False),
    StructField('name', StringType(), True),
    StructField('age', StringType(), True),
    StructField('salary', StringType(), True),
    StructField('email', StringType(), True),
    StructField('join_date', DateType(), True),
])

# Without an explicit save mode the write fails once the table exists, which
# breaks every re-run of the notebook. Overwrite makes this cell repeatable.
date_std.write.mode("overwrite").saveAsTable("myowndatasets.default.date_std")

In [0]:
%python
# Read the persisted table back. The table's schema comes from the catalog,
# so none is supplied here; the previous .schema("schema1") call passed the
# literal string "schema1" rather than a schema object.
date_std1 = spark.read.table("myowndatasets.default.date_std")
display(date_std1)

date_std1.printSchema()

###Type casting

In [0]:
%python
# Target type for each column that is cast in this step.
target_types = {
    "join_date": "date",
    "salary": "decimal(10,0)",
    "id": "integer",
    "age": "integer",
}

# Build one cast expression per column and apply them all in a single call.
date_std1 = date_std1.withColumns(
    {
        column_name: col(column_name).cast(type_name)
        for column_name, type_name in target_types.items()
    }
)
display(date_std1)
date_std1.printSchema()
       


###drop duplicate id

In [0]:
%python
# Keep a single row per id.
# NOTE(review): which of the duplicated rows survives is not controlled here -
# confirm that any one of them is acceptable.
date_std2 = date_std1.drop_duplicates(subset=["id"])
display(date_std2)
       


#Data Enrichment

##Adding Columns

In [0]:
%python
# Salary bands, checked top to bottom:
#   salary <= 50000           -> "Grade C"
#   50000 < salary <= 75000   -> "Grade B"  (the lower bound is implied by the
#                                            previous branch not matching)
#   salary > 75000            -> "Grade A"
# There is no otherwise(), so a null salary yields a null grade.
salary_grade_expr = (
    when(col("salary") <= 50000, "Grade C")
    .when(col("salary") <= 75000, "Grade B")
    .when(col("salary") > 75000, "Grade A")
)
column_add1 = date_std2.withColumn("salary_grade", salary_grade_expr)
display(column_add1)

In [0]:
%python
# Two mutually exclusive age buckets; an age of exactly 30 is labelled "Below 30".
# There is no otherwise(), so a null age yields a null classification.
age_bucket_expr = (
    when(col("age") > 30, "Above 30")
    .when(col("age") <= 30, "Below 30")
)
column_add2 = column_add1.withColumn("age_classification", age_bucket_expr)
display(column_add2)

##Rename Column

In [0]:
%python
# Rename the derived age column, then print the schema to confirm the new name.
column_rename = column_add2.withColumnRenamed(
    existing="age_classification",
    new="age_classifier",
)
column_rename.printSchema()

##Column Order change

In [0]:
%python
column_rename.columns

# Final layout: each derived column sits directly after the column it was
# computed from.
final_column_order = [
    'id',
    'name',
    'age',
    'age_classifier',
    'salary',
    'salary_grade',
    'email',
    'join_date',
]
column_order = column_rename.select(*final_column_order)
column_order.printSchema()
