In [36]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from job_scraper import scrape_and_merge_jobs

In [37]:
# Locations passed to the scraper, each written as "City, ST".
joblocations = ["Dallas, TX", "San Francisco, CA", "Los Angeles, CA"]

In [38]:
# Run the scraper over every entry in `joblocations` for the quoted phrase
# "data engineer". The output path is the same jobs.csv that a later cell reads.
# zip_recruiter was commented out of the original site list and stays disabled.
scrape_and_merge_jobs(
    search_term='"data engineer"',
    locations=joblocations,
    site_name=["indeed", "glassdoor"],
    country_indeed='USA',
    results_wanted=40,
    hours_old=512,
    output_file="jobs.csv",
)

2024-10-02 20:28:46,095 - JobSpy - INFO - Indeed search page: 1
2024-10-02 20:28:47,026 - JobSpy - INFO - Glassdoor search page: 1
2024-10-02 20:28:48,445 - JobSpy - INFO - Indeed finished scraping
2024-10-02 20:28:51,935 - JobSpy - INFO - Glassdoor search page: 2
2024-10-02 20:28:52,561 - JobSpy - INFO - Glassdoor finished scraping
2024-10-02 20:28:52,828 - JobSpy - INFO - Indeed search page: 1
2024-10-02 20:28:53,356 - JobSpy - INFO - Glassdoor search page: 1
2024-10-02 20:28:53,872 - JobSpy - INFO - Indeed finished scraping
2024-10-02 20:28:55,497 - JobSpy - INFO - Glassdoor search page: 2
2024-10-02 20:28:56,019 - JobSpy - INFO - Glassdoor finished scraping
2024-10-02 20:28:56,201 - JobSpy - INFO - Indeed search page: 1
2024-10-02 20:28:57,019 - JobSpy - INFO - Glassdoor search page: 1
2024-10-02 20:28:57,205 - JobSpy - INFO - Indeed finished scraping
2024-10-02 20:28:57,549 - JobSpy - INFO - Glassdoor finished scraping


Found 117 jobs in total


In [40]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Start a Spark session, or reuse the one already running in this kernel
spark = SparkSession.builder.getOrCreate()

# Explicit schema for salaries_2, built from (column name, Spark type) pairs.
# Every field is declared nullable.
schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ("work_year", IntegerType()),
        ("experience_level", StringType()),
        ("employment_type", StringType()),
        ("job_title", StringType()),
        ("salary", IntegerType()),
        ("salary_currency", StringType()),
        ("salary_in_usd", IntegerType()),
        ("employee_residence", StringType()),
        ("remote_ratio", IntegerType()),
        ("company_location", StringType()),
        ("company_size", StringType()),
    )
])

# Load the three CSV sources. salaries_2 is read with the explicit schema above;
# the other two let Spark infer column types from the data.
salaries_2 = (
    spark.read
    .options(header=True)
    .schema(schema)
    .csv('../../data/raw/salaries_2.csv')
)
jobs_in_Data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('../../data/raw/jobs_in_data.csv')
)
latestjobs = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('jobs.csv')
)

# Project jobs_in_Data onto the salaries_2 column names and order.
# `work_setting` is exposed under the name `remote_ratio`.
# NOTE(review): only the name is changed here, so the column keeps whatever type
# was inferred for work_setting, while salaries_2 declares remote_ratio as an
# integer. Confirm whether the values should be mapped to numbers.
jobs_in_Data = jobs_in_Data.select(
    "work_year",
    "experience_level",
    "employment_type",
    "job_title",
    "salary",
    "salary_currency",
    "salary_in_usd",
    "employee_residence",
    jobs_in_Data["work_setting"].alias("remote_ratio"),
    "company_location",
    "company_size",
)

# Project the scraped postings onto the salaries_2 column names and order.
#
# Fix: work_year is now the calendar year of `date_posted`. Casting the posting
# date straight to an integer never yields a year: a date string or date casts
# to null, and a timestamp casts to epoch seconds. year(to_date(...)) accepts
# string, date and timestamp inputs alike.
#
# NOTE(review): the mappings below are kept exactly as before. Confirm them:
#   - salary <- min_amount and salary_in_usd <- max_amount, with no currency
#     conversion applied
#   - remote_ratio <- is_remote cast to INT, so presumably 0/1 if is_remote is
#     boolean; check this against the scale used by the other sources
#   - `location` feeds both employee_residence and company_location
latestjobs = latestjobs.selectExpr(
    "year(to_date(date_posted)) AS work_year",
    "job_level AS experience_level",
    "job_type AS employment_type",
    "title AS job_title",
    "CAST(min_amount AS INT) AS salary",
    "currency AS salary_currency",
    "CAST(max_amount AS INT) AS salary_in_usd",
    "location AS employee_residence",
    "CAST(is_remote AS INT) AS remote_ratio",
    "location AS company_location",
    "company_num_employees AS company_size",
)


# Print each DataFrame's schema under a heading so the three can be compared
for label, frame in (
    ("salaries_2", salaries_2),
    ("jobs_in_Data", jobs_in_Data),
    ("latestjobs", latestjobs),
):
    print(f"Structure of {label}:")
    frame.printSchema()

Structure of salaries_2:
root
 |-- work_year: integer (nullable = true)
 |-- experience_level: string (nullable = true)
 |-- employment_type: string (nullable = true)
 |-- job_title: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- salary_currency: string (nullable = true)
 |-- salary_in_usd: integer (nullable = true)
 |-- employee_residence: string (nullable = true)
 |-- remote_ratio: integer (nullable = true)
 |-- company_location: string (nullable = true)
 |-- company_size: string (nullable = true)

Structure of jobs_in_Data:
root
 |-- work_year: integer (nullable = true)
 |-- experience_level: string (nullable = true)
 |-- employment_type: string (nullable = true)
 |-- job_title: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- salary_currency: string (nullable = true)
 |-- salary_in_usd: integer (nullable = true)
 |-- employee_residence: string (nullable = true)
 |-- remote_ratio: string (nullable = true)
 |-- company_location: string (nulla

In [43]:
# Load the two remaining raw sources: a Parquet dataset and a JSON file.
# The JSON reader is set to multiline mode.
df_parquet = spark.read.parquet('../../data/raw/data.parquet')
df_json = (
    spark.read
    .option("multiline", "true")
    .json('../../data/raw/data.json')
)

In [44]:
# Stack all five sources into a single DataFrame.
#
# Fix: DataFrame.union pairs columns purely by POSITION. The Parquet and JSON
# frames are read as-is, so their column order is not guaranteed to match the
# explicitly ordered frames, and a positional union can silently put values in
# the wrong column. unionByName pairs columns by name instead. It raises if a
# source lacks one of the expected column names, which is preferable to
# misaligned data.
merged_dataset = (
    salaries_2
    .unionByName(jobs_in_Data)
    .unionByName(latestjobs)
    .unionByName(df_parquet)
    .unionByName(df_json)
)

# View the structure of the merged dataset
print("Structure of merged_dataset:")
merged_dataset.printSchema()

# Show a few rows of the merged dataset
merged_dataset.show(5)

Structure of merged_dataset:
root
 |-- work_year: string (nullable = true)
 |-- experience_level: string (nullable = true)
 |-- employment_type: string (nullable = true)
 |-- job_title: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- salary_currency: string (nullable = true)
 |-- salary_in_usd: string (nullable = true)
 |-- employee_residence: string (nullable = true)
 |-- remote_ratio: string (nullable = true)
 |-- company_location: string (nullable = true)
 |-- company_size: string (nullable = true)

+---------+----------------+---------------+--------------------+------+---------------+-------------+------------------+------------+----------------+------------+
|work_year|experience_level|employment_type|           job_title|salary|salary_currency|salary_in_usd|employee_residence|remote_ratio|company_location|company_size|
+---------+----------------+---------------+--------------------+------+---------------+-------------+------------------+------------+--------

In [45]:
# Count the rows of the merged dataset and report the total
merged_size = merged_dataset.count()
print("Size of the merged dataset: {} rows".format(merged_size))

Size of the merged dataset: 63554 rows


In [48]:
# Save the merged dataset as Parquet, overwriting any existing output at this path
merged_dataset.write.mode('overwrite').parquet("../../data/processed")
