In [None]:
#Task 1 Data Ingestion
# Reads the employee attendance CSV and stores it unchanged as the "raw" Delta table.
import logging
import os
from pyspark.sql import SparkSession

# NOTE(review): saving with format("delta") presumably needs a Spark session that has
# Delta Lake available; that configuration is not visible in this cell -- confirm.
spark = SparkSession.builder.appName("EmployeeAttendance").getOrCreate()

log_file_path = '/content/sample_data/logs/attendance_log.log'
# basicConfig(filename=...) opens the log file right away, which raises
# FileNotFoundError when the parent directory is missing, so create it first.
os.makedirs(os.path.dirname(log_file_path), exist_ok=True)
logging.basicConfig(filename=log_file_path, level=logging.INFO)
csv_file_path = "/content/sample_data/employee_attendance.csv"

try:
    # header=true: the first CSV line supplies the column names; no schema inference
    # is requested, so every column is read as a string.
    attendance_df = spark.read.option("header", "true").csv(csv_file_path)
    attendance_df.write.format("delta").mode("overwrite").save("/content/sample_data/delta/employee_attendance_raw")

    logging.info("Employee attendance data ingested successfully.")

except Exception as e:
    # Failures are logged and printed rather than re-raised, so the notebook keeps
    # running; later cells will fail if the raw table was never written.
    logging.error(f"Error ingesting data: {str(e)}")
    print(f"Error: {str(e)}")


In [None]:
#Task 2 Data Cleaning
# Drops rows with a missing check-in or check-out time, derives HoursWorked,
# and stores the result as the "cleaned" Delta table.
# Note: this import rebinds the builtin `round` to Spark's column function.
from pyspark.sql.functions import col, unix_timestamp, round

# Start from the raw table produced by the ingestion cell.
attendance_df = spark.read.format("delta").load("/content/sample_data/delta/employee_attendance_raw")

# A row is usable only when both clock times are present.
has_check_in = col("CheckInTime").isNotNull()
has_check_out = col("CheckOutTime").isNotNull()
cleaned_df = attendance_df.filter(has_check_in & has_check_out)

# Parse both "HH:mm" strings to seconds, then convert the difference to hours
# rounded to two decimals.
check_out_seconds = unix_timestamp(col("CheckOutTime"), "HH:mm")
check_in_seconds = unix_timestamp(col("CheckInTime"), "HH:mm")
hours_worked = round((check_out_seconds - check_in_seconds) / 3600, 2)
cleaned_df = cleaned_df.withColumn("HoursWorked", hours_worked)

cleaned_writer = cleaned_df.write.format("delta").mode("overwrite")
cleaned_writer.save("/content/sample_data/delta/employee_attendance_cleaned")

print("Employee attendance cleaning completed successfully.")


In [None]:
#Task 3 Attendance Summary
# Builds two Delta tables from the cleaned data: total hours per employee, and the
# individual rows whose HoursWorked exceeds 8.
import logging
# `col` is imported here explicitly so the cell does not depend on an earlier cell's
# import; Spark's `sum` is aliased so the Python builtin `sum` is not shadowed.
from pyspark.sql.functions import col, sum as spark_sum

cleaned_df = spark.read.format("delta").load("/content/sample_data/delta/employee_attendance_cleaned")

# One row per EmployeeID with the sum of HoursWorked over all of that employee's rows.
attendance_summary = cleaned_df.groupBy("EmployeeID").agg(spark_sum("HoursWorked").alias("TotalHoursWorked"))
# Rows with more than 8 hours worked; only the identifying columns are kept.
overtime_df = cleaned_df.filter(col("HoursWorked") > 8).select("EmployeeID", "Date", "HoursWorked")

attendance_summary.write.format("delta").mode("overwrite").save("/content/sample_data/delta/employee_attendance_summary")
overtime_df.write.format("delta").mode("overwrite").save("/content/sample_data/delta/employee_overtime_summary")

logging.info("Employee attendance summary and overtime analysis completed.")


In [None]:
#Task 4 Create an Attendance Pipeline
# Imports and Spark session used by the pipeline function defined below.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, unix_timestamp, sum

# getOrCreate() returns the already-running session when one exists in this kernel,
# otherwise it starts a new one from this builder.
session_builder = SparkSession.builder.appName("EmployeeAttendancePipeline")
spark = session_builder.getOrCreate()

def attendance_pipeline(
    csv_path="/content/sample_data/employee_attendance.csv",
    delta_root="/content/sample_data/delta",
):
    """Run the attendance flow end to end: ingest, clean, summarise, flag overtime.

    Steps, each written as an overwritten Delta table under ``delta_root``:
      1. ``attendance``          - the CSV as read (header row used for column names).
      2. ``cleaned_attendance``  - rows with both CheckInTime and CheckOutTime present,
                                   plus HoursWorked (hours, rounded to 2 decimals).
      3. ``attendance_summary``  - total HoursWorked per EmployeeID over all rows
                                   (no per-month grouping is applied).
      4. ``overtime_summary``    - cleaned rows with HoursWorked greater than 8.

    Args:
        csv_path: Location of the attendance CSV file.
        delta_root: Directory under which the four Delta tables are saved.

    Returns:
        None. Progress and errors are reported with ``print``; exceptions are caught
        and not re-raised.

    Uses the module-level ``spark`` session.
    """
    # Imported locally so the function does not depend on which cells ran earlier.
    from pyspark.sql import functions as F
    from pyspark.sql.utils import AnalysisException

    try:
        attendance_df = spark.read.option("header", "true").csv(csv_path)
        attendance_df.write.format("delta").mode("overwrite").save(f"{delta_root}/attendance")

        both_times_present = F.col("CheckInTime").isNotNull() & F.col("CheckOutTime").isNotNull()
        cleaned_df = attendance_df.filter(both_times_present)

        worked_seconds = (
            F.unix_timestamp(F.col("CheckOutTime"), 'HH:mm')
            - F.unix_timestamp(F.col("CheckInTime"), 'HH:mm')
        )
        # Rounded to 2 decimals to match the standalone cleaning cell's HoursWorked.
        cleaned_df = cleaned_df.withColumn("HoursWorked", F.round(worked_seconds / 3600, 2))
        cleaned_df.write.format("delta").mode("overwrite").save(f"{delta_root}/cleaned_attendance")

        monthly_summary_df = cleaned_df.groupBy("EmployeeID").agg(
            F.sum("HoursWorked").alias("TotalHoursWorked")
        )
        overtime_df = cleaned_df.filter(F.col("HoursWorked") > 8)

        monthly_summary_df.write.format("delta").mode("overwrite").save(f"{delta_root}/attendance_summary")
        overtime_df.write.format("delta").mode("overwrite").save(f"{delta_root}/overtime_summary")

        print("Attendance pipeline completed successfully.")

    except FileNotFoundError:
        print("CSV file is missing.")
    except AnalysisException as e:
        # Spark signals a missing input path with AnalysisException rather than
        # FileNotFoundError, so the missing-file message has to be produced here.
        message = str(e)
        if "Path does not exist" in message or "PATH_NOT_FOUND" in message:
            print("CSV file is missing.")
        else:
            print(f"Error in pipeline: {e}")
    except Exception as e:
        print(f"Error in pipeline: {e}")

# Execute the pipeline once when this cell runs; it reports its outcome via print.
attendance_pipeline()



In [None]:
# Task 5 Delta Time Travel and History
# Shows the commit history of the cleaned table and loads one historical version of it.
cleaned_table_path = "/content/sample_data/delta/employee_attendance_cleaned"
requested_version = 1

history_df = spark.sql(f"DESCRIBE HISTORY '{cleaned_table_path}'")
history_df.show()

# A table that has been written only once has just version 0, and asking Delta for a
# version that does not exist fails the read. Check the history before time travelling.
available_versions = {row["version"] for row in history_df.select("version").collect()}

if requested_version in available_versions:
    attendance_df = (
        spark.read.format("delta")
        .option("versionAsOf", requested_version)
        .load(cleaned_table_path)
    )
else:
    print(
        f"Version {requested_version} not found for {cleaned_table_path}; "
        f"available versions: {sorted(available_versions)}. Loading the latest version instead."
    )
    attendance_df = spark.read.format("delta").load(cleaned_table_path)
