In [1]:
!pip install dotenv
!pip install sqlalchemy
!pip install pandas



In [1]:
import os
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.exc import SQLAlchemyError
import pandas as pd

# Load credentials from .env
load_dotenv()

# Database connection info
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")

# Create SQLAlchemy engine
engine = create_engine(
    f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)




In [None]:
pd.set_option('display.max_rows', None)

# ======================================================
# 📊 Indeed Jobs - Descriptive Query
# ======================================================

# Business Question:
# What are the most common job titles on Indeed and their average listed salaries?

sql_query = '''
WITH cleaned_salaries AS (
    SELECT 
        job_title,
        CAST(job_salary AS INTEGER) AS salary
    FROM indeed_jobs
    WHERE job_salary ~ '^\d{5,6}$'
),
job_summary AS (
    SELECT 
        job_title,
        COUNT(*) AS job_count,
        AVG(salary) AS avg_salary
    FROM cleaned_salaries
    GROUP BY job_title
)
SELECT 
    job_title,
    job_count,
    ROUND(avg_salary) AS avg_salary
FROM job_summary
ORDER BY job_count DESC
LIMIT 10;
'''

indeed_jobs_summary = pd.read_sql(sql_query, con=engine)
indeed_jobs_summary


Unnamed: 0,job_title,job_count,avg_salary
0,Financial Analyst,7,
1,Corporate Financial Analyst,1,
2,Senior Financial Analyst - Operations,1,
3,Analyst / Associate - Financial Modeling,1,
4,Financial Planning & Analyst,1,
5,Financial Systems Analyst,1,
6,Procurement Financial Analyst,1,
7,FINANCIAL ANALYST,1,
8,Finance Analyst,1,


In [None]:

# Insight:
# Job titles with the highest frequency often have highly variable salary data.

# Recommendation:
# Normalize salary data further or target roles with consistent salary representation.

# Prediction:
# Job roles with frequent listings may face increasing competition and wage pressure.


# ======================================================
# 🕵️ Indeed Jobs - Diagnostic Query
# ======================================================

# Business Question:
# Which companies most frequently post for the top job titles on Indeed?

sql_query = '''
WITH parsed_companies AS (
    SELECT 
        job_title,
        SPLIT_PART(company_name_location, 'Remote', 1) AS company_name
    FROM indeed_jobs
),
ranked_companies AS (
    SELECT 
        job_title,
        TRIM(company_name) AS company_name,
        COUNT(*) AS count,
        RANK() OVER (PARTITION BY job_title ORDER BY COUNT(*) DESC) AS rank
    FROM parsed_companies
    GROUP BY job_title, company_name
)
SELECT 
    job_title,
    company_name,
    count
FROM ranked_companies
WHERE rank = 1
ORDER BY count DESC;
'''

indeed_top_companies = pd.read_sql(sql_query, con=engine)
indeed_top_companies

# Insight:
# The most frequent job titles tend to be associated with specific companies repeatedly.

# Recommendation:
# Consider targeting these employers in job scraping or analysis to identify hiring trends.

# Prediction:
# These companies may reflect strong growth or high employee turnover in key roles.


# Insight:
# Many top job titles are repeatedly posted in specific geographic areas.

# Recommendation:
# Highlight these regions when tailoring content for job seekers.

# Prediction:
# These hubs may grow in demand for remote-to-hybrid roles.
