In [2]:

from pyspark.sql.types import StructType, StructField, StringType, DateType, DoubleType, IntegerType, BooleanType, TimestampType
schema = StructType([
    StructField("party_id", StringType(), True),
    StructField("salutation", StringType(), True),
    StructField("first_name", StringType(), True),
    StructField("middle_name", StringType(), True),
    StructField("last_name", StringType(), True),
    StructField("personal_title", StringType(), True),
    StructField("suffix", StringType(), True),
    StructField("nickname", StringType(), True),
    StructField("first_name_local", StringType(), True),
    StructField("middle_name_local", StringType(), True),
    StructField("last_name_local", StringType(), True),
    StructField("other_local", StringType(), True),
    StructField("member_id", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("birth_date", DateType(), True),
    StructField("deceased_date", DateType(), True),
    StructField("height", DoubleType(), True),
    StructField("weight", DoubleType(), True),
    StructField("mothers_maiden_name", StringType(), True),
    StructField("marital_status", StringType(), True),
    StructField("marital_status_enum_id", StringType(), True),
    StructField("social_security_number", StringType(), True),
    StructField("passport_number", StringType(), True),
    StructField("passport_expire_date", DateType(), True),
    StructField("total_years_work_experience", DoubleType(), True),
    StructField("comments", StringType(), True),
    StructField("employment_status_enum_id", StringType(), True),
    StructField("residence_status_enum_id", StringType(), True),
    StructField("occupation", StringType(), True),
    StructField("years_with_employer", IntegerType(), True),
    StructField("months_with_employer", IntegerType(), True),
    StructField("existing_customer", BooleanType(), True),
    StructField("card_id", StringType(), True),
    StructField("last_updated_stamp", TimestampType(), True),
    StructField("last_updated_tx_stamp", TimestampType(), True),
    StructField("created_stamp", TimestampType(), True),
    StructField("created_tx_stamp", TimestampType(), True),
])


In [5]:
from pyspark.sql import SparkSession
import datetime
# Create Spark session
    # 

spark = SparkSession.builder \
    .config('spark.executor.memory','4g')\
    .config("spark.jars", "/jars/postgresql-42.7.5.jar") \
    .config('spark.driver.memory','10g')\
    .appName("Read Parquet") \
    .getOrCreate()
from pyspark.sql.functions import expr

# Read Parquet file
empl_appl_detail = spark.read.parquet("D:/BigData/empfiles/empl_application_detail.parquet")
employment_details = spark.read.parquet('D:/BigData/empfiles/employment_details.parquet')
empl_sal_breakup = spark.read.parquet('D:/BigData/empfiles/employment_salary-breakup.parquet')
party = spark.read.parquet('D:/BigData/empfiles/party.parquet')
person = spark.read.csv("D:/BigData/empfiles/person.csv",header=True,schema=schema)
person=person.select("party_id","first_name","middle_name","last_name","total_years_work_experience")\
      .where("first_name is not null")
application_sandbox = spark.read.parquet('D:/BigData/empfiles/application_sandbox.parquet')
employment_app = spark.read.parquet('D:/BigData/empfiles/employment_app.parquet') \
                 .select("application_id","applying_party_id","status_id","application_date")

    #    and party_id='PR_KV04W9E4VZ'
print(f'spark session created at {datetime.datetime.now()}')

spark session created at 2025-05-17 08:57:57.238964


In [68]:
# employment_app.printSchema()
employment_app.where("applying_party_id='PR_KV04W9E4VZ'") \
    .show(n=10,truncate=False)

+--------------+-----------------+---------+--------------------------+
|application_id|applying_party_id|status_id|application_date          |
+--------------+-----------------+---------+--------------------------+
|APP_BE76T9V8MI|PR_KV04W9E4VZ    |HIRED    |2024-04-24 10:23:22.191166|
+--------------+-----------------+---------+--------------------------+



In [None]:
empl_appl_detail

In [7]:
from pyspark.sql.functions import expr,col,cast
empl_appl_detail=empl_appl_detail\
    .select("application_id","reporting_manager","recomended_ctc",col("approved_ctc").cast(IntegerType())\
      ,"ctc_approved","joining_branch","credit_report","n6")\
    .where(expr("ctc_approved is not null and reporting_manager is not null")) \
        .orderBy(expr("reporting_manager"))    # .show(n=1000)
# employment_details.select("")

empl_sal_breakup=empl_sal_breakup.select( "application_id","basic_monthly","basic_yearly","net_monthly","net_yearly","salary_breakup")
    # .where()
# .show(n=1000)

In [8]:
# PR_KV04W9E4VZ
# "ea.application_id", ,"p.party_id",substring(col("ea.application_date"),1,10), ,"basic_monthly","basic_yearly"
final_dataset=employment_app.alias("ea").join(person.alias("p"),col("ea.applying_party_id") == col("p.party_id"),"inner") \
              .join(empl_sal_breakup.alias("esb"),col("ea.application_id") == col("esb.application_id"),"inner") \
              .join(empl_appl_detail.alias("ead"),col("ea.application_id") == col("ead.application_id"),"inner") \
.select("first_name","middle_name","last_name" ,"ead.recomended_ctc","ead.approved_ctc"\
        ,"net_monthly","net_yearly","ea.status_id") \
        .where("approved_ctc is not null") \
    .orderBy("first_name")
    # .show(n=1000,truncate=False)
    # .where("party_id = 'PR_KV04W9E4VZ'") \


In [10]:
''' Expalin Analyze '''

final_dataset.explain(mode='formatted')

# # empl_appl_detail.printSchema()
# employment_details.printSchema()
# # empl_sal_breakup.printSchema()
# # party.printSchema()
# # person.printSchema()

== Physical Plan ==
AdaptiveSparkPlan (21)
+- Sort (20)
   +- Exchange (19)
      +- Project (18)
         +- BroadcastHashJoin Inner BuildRight (17)
            :- Project (12)
            :  +- BroadcastHashJoin Inner BuildRight (11)
            :     :- Project (7)
            :     :  +- BroadcastHashJoin Inner BuildLeft (6)
            :     :     :- BroadcastExchange (3)
            :     :     :  +- Filter (2)
            :     :     :     +- Scan parquet  (1)
            :     :     +- Filter (5)
            :     :        +- Scan csv  (4)
            :     +- BroadcastExchange (10)
            :        +- Filter (9)
            :           +- Scan parquet  (8)
            +- BroadcastExchange (16)
               +- Project (15)
                  +- Filter (14)
                     +- Scan parquet  (13)


(1) Scan parquet 
Output [3]: [application_id#896, status_id#898, applying_party_id#900]
Batched: true
Location: InMemoryFileIndex [file:/D:/BigData/empfiles/employment_app.pa

In [5]:
# final_dataset.write.csv('file:///D:/BigData/empfiles/salaries.csv')
final_dataset.show(n=1000,truncate=False)


+---------------------------+-----------+------------+--------------+------------+-----------+----------+--------------------+
|first_name                 |middle_name|last_name   |recomended_ctc|approved_ctc|net_monthly|net_yearly|status_id           |
+---------------------------+-----------+------------+--------------+------------+-----------+----------+--------------------+
|ABHISHEK                   |NULL       |NULL        |240000        |240000      |16042      |192500    |HIRED               |
|ADDULA                     |NULL       |ARAVIND     |510000        |510000      |37688      |452256    |HIRED               |
|ANIKET                     |ANAND      |CHABUKSWAR  |650000        |650000      |48035      |576416    |HIRED               |
|ANKIT                      |NULL       |Sharma      |134652        |134652      |11630      |139562    |HIRED               |
|ASHWANI                    |NULL       |KUMAR       |238000        |238000      |15908      |190896    |HIRED 

In [125]:
import pandas
from distutils.core import setup
# Step 1: Convert PySpark DataFrame to Pandas
pandas_df = final_dataset.toPandas()

# Step 2: Save as Excel file
pandas_df.to_excel("D:/salaries.xlsx", index=False)

ModuleNotFoundError: No module named 'distutils'