In [15]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Obtain a Spark session named "ColumnValidation"; getOrCreate() hands back
# an already-running session if there is one instead of starting another.
spark = (
    SparkSession.builder
    .appName("ColumnValidation")
    .getOrCreate()
)

# Read the input file as CSV. The first row supplies the column names and
# Spark infers each column's type from the data (inferSchema), so the
# resulting schema reflects the file's contents rather than a declared one.
df = spark.read.csv(
    path="input_file.txt",
    header=True,
    inferSchema=True,
)

# Define the expected schema: the columns the input file must provide and the
# Spark type each one should have. Every field is declared nullable.
expected_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("salary", DoubleType(), True),
])

# Column names are compared as sets, so column order is ignored and names
# must match exactly (plain, case-sensitive string comparison).
expected_columns = {field.name for field in expected_schema.fields}
actual_columns = set(df.columns)

# Validate columns
missing_columns = expected_columns - actual_columns  # expected but absent from the file
extra_columns = actual_columns - expected_columns  # in the file but not expected

# Print validation results
print("Expected Columns:", expected_columns)
print("Actual Columns:", actual_columns)
print("Missing Columns:", missing_columns)
print("Extra Columns:", extra_columns)

# Check for data type mismatches.
# Only expected columns that are present in the DataFrame are compared;
# absent columns are skipped here. Each entry of the result is a
# (column name, actual type, expected type) tuple.
actual_schema = df.schema
# Build the membership set once instead of rebuilding and scanning
# df.columns for every expected field.
present_columns = set(df.columns)
mismatched_types = [
    (field.name, actual_schema[field.name].dataType, field.dataType)
    for field in expected_schema.fields
    if field.name in present_columns
    and actual_schema[field.name].dataType != field.dataType
]

print("Mismatched Data Types:", mismatched_types)


Expected Columns: {'id', 'age', 'salary', 'name'}
Actual Columns: {'department', 'salary', 'name', 'id', 'age'}
Missing Columns: set()
Extra Columns: {'department'}
Mismatched Data Types: []
