In [27]:
from IPython.core.interactiveshell import InteractiveShell
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
# Notebook display setting: with "all", IPython shows the value of every
# top-level expression in a cell instead of only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Get the active SparkSession, or start one registered under the app name
# 'extractor2' if none exists yet.
spark = (
    SparkSession.builder
    .appName('extractor2')
    .getOrCreate()
)


In [None]:
# Load the three CSV sources and project jobs_in_data.csv and jobs.csv onto the
# salaries_2 column layout, so that the frames line up column-for-column when
# they are unioned by position later on.

# Schema applied when reading salaries_2.csv; its field order is the target layout.
schema = StructType([
    StructField("work_year", IntegerType(), True),
    StructField("experience_level", StringType(), True),
    StructField("employment_type", StringType(), True),
    StructField("job_title", StringType(), True),
    StructField("salary", IntegerType(), True),
    StructField("salary_currency", StringType(), True),
    StructField("salary_in_usd", IntegerType(), True),
    StructField("employee_residence", StringType(), True),
    StructField("remote_ratio", IntegerType(), True),
    StructField("company_location", StringType(), True),
    StructField("company_size", StringType(), True)
])

# Read the CSV files. salaries_2 uses the explicit schema above; the other two
# let Spark infer the column types from the data.
data1 = spark.read.csv('../../data/raw/salaries_2.csv', header=True, schema=schema)
data2 = spark.read.csv('../../data/raw/jobs_in_data.csv', header=True, inferSchema=True)
# NOTE(review): jobs.csv is resolved relative to the working directory, unlike
# the other inputs under ../../data/raw - confirm this location is intended.
data3 = spark.read.csv('jobs.csv', header=True, inferSchema=True)

# Reorder jobs_in_data to the salaries_2 layout.
# A plain rename of work_setting to remote_ratio would not convert it to the
# integer type that `schema` declares for remote_ratio, so the labels are mapped
# to integers explicitly.
# NOTE(review): assumes work_setting holds the labels "Remote" / "Hybrid" /
# "In-person" and that remote_ratio is a 0-100 percentage - confirm against the
# data. Any other label becomes null.
data2 = data2.select(
    "work_year",
    "experience_level",
    "employment_type",
    "job_title",
    "salary",
    "salary_currency",
    "salary_in_usd",
    "employee_residence",
    F.when(data2["work_setting"] == "Remote", 100)
     .when(data2["work_setting"] == "Hybrid", 50)
     .when(data2["work_setting"] == "In-person", 0)
     .cast(IntegerType())
     .alias("remote_ratio"),
    "company_location",
    "company_size"
)

# Map the jobs.csv columns onto the salaries_2 layout.
data3 = data3.select(
    # Casting date_posted directly to an integer does not give a calendar year,
    # so the year is extracted explicitly.
    # NOTE(review): assumes date_posted is a date/timestamp or an ISO-formatted
    # date string - confirm.
    F.year(data3["date_posted"]).alias("work_year"),
    data3["job_level"].alias("experience_level"),
    data3["job_type"].alias("employment_type"),
    data3["title"].alias("job_title"),
    data3["min_amount"].cast(IntegerType()).alias("salary"),
    data3["currency"].alias("salary_currency"),
    # NOTE(review): max_amount is used as salary_in_usd with no currency
    # conversion - confirm this is acceptable.
    data3["max_amount"].cast(IntegerType()).alias("salary_in_usd"),
    data3["location"].alias("employee_residence"),
    # Scale the remote flag to 0 / 100 (nulls stay null) so it uses the same
    # range as the mapped work_setting values above.
    # NOTE(review): assumes is_remote is a true/false flag - confirm.
    (data3["is_remote"].cast("boolean").cast(IntegerType()) * 100).alias("remote_ratio"),
    data3["location"].alias("company_location"),
    data3["company_num_employees"].alias("company_size")
)




In [None]:
# Load the Parquet and JSON sources. The JSON reader is put in multiline mode.
df_parquet = spark.read.parquet('../../data/raw/data.parquet')
df_json = spark.read.option("multiline", "true").json('../../data/raw/data.json')

# Print the schema of every loaded DataFrame under its own name, so the column
# names, order and types can be compared before the frames are merged.
for label, frame in [
    ("data1", data1),
    ("data2", data2),
    ("data3", data3),
    ("df_parquet", df_parquet),
    ("df_json", df_json),
]:
    print(f"Structure of {label}:")
    frame.printSchema()




In [None]:
# Stack the four sources into a single DataFrame. DataFrame.union pairs columns
# by position, not by name.
# NOTE(review): df_parquet's column order is not checked here - confirm it
# matches the salaries_2 layout. df_json is loaded in an earlier cell but is not
# part of this merge - confirm that is intended.
merged_dataset = (
    data1
    .union(data2)
    .union(data3)
    .union(df_parquet)
)

# Inspect the merged result: schema first, then the first five rows.
print("Structure of merged_dataset:")
merged_dataset.printSchema()
merged_dataset.show(n=5)

In [None]:
# Count the rows of the merged dataset and report the total.
# count() is a Spark action, so this line triggers a job over all the sources.
merged_size = merged_dataset.count()
print(f"Size of the merged dataset: {merged_size} rows")

In [34]:
# Collapse the merged data to a single partition and write it as Parquet,
# replacing whatever already exists at the target path.
(
    merged_dataset
    .coalesce(1)
    .write
    .mode('overwrite')
    .parquet("../../data/processed")
)
