## Setting Global Config

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

In [2]:
import numpy as np
np.bool = np.bool_

In [3]:
#current notebook name
notebook_name = __session__.replace('.ipynb','')[__session__.rfind('/')+1:] 

In [4]:
# HDFS base paths
hdfs_lakehouse_base_path = 'hdfs://localhost:9000/lakehouse/'
hdfs_warehouse_base_path = 'hdfs://localhost:9000/warehouse'

## Creating SparkSession

In [5]:
import os
dependencies = ["org.apache.spark:spark-avro_2.12:3.5.0",
                "io.delta:delta-iceberg_2.12:3.0.0"]
os.environ['PYSPARK_SUBMIT_ARGS']= f"--packages {','.join(dependencies)} pyspark-shell"
os.environ['PYARROW_IGNORE_TIMEZONE'] = 'true'

In [6]:
from pyspark.sql.session import SparkSession

spark = (SparkSession.builder
    .appName(notebook_name)
    .config("spark.log.level","ERROR")
    .config("spark.sql.warehouse.dir",hdfs_warehouse_base_path)
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .enableHiveSupport()
    .getOrCreate()
)

24/12/15 03:49:33 WARN Utils: Your hostname, osbdet resolves to a loopback address: 127.0.0.1; using 10.0.2.15 instead (on interface enp0s3)
24/12/15 03:49:33 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/osbdet/.jupyter_venv/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/osbdet/.ivy2/cache
The jars for the packages stored in: /home/osbdet/.ivy2/jars
org.apache.spark#spark-avro_2.12 added as a dependency
io.delta#delta-iceberg_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-831fc378-567a-4b20-99a5-031e597c58d7;1.0
	confs: [default]
	found org.apache.spark#spark-avro_2.12;3.5.0 in central
	found org.tukaani#xz;1.9 in central
	found io.delta#delta-iceberg_2.12;3.0.0 in central
	found io.delta#delta-spark_2.12;3.0.0 in central
	found io.delta#delta-storage;3.0.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.scala-lang.modules#scala-collection-compat_2.12;2.1.1 in central
	found com.github.ben-manes.caffeine#caffeine;2.9.3 in central
	found org.checkerframework#checker-qual;3.19.0 in central
	found com.google.errorprone#error_prone_annotations;2.10.0 in central
:: resolution report :: resolve 1755ms :: artifacts dl 92ms
	:: modules in use:
	com.github.ben-m

## Listing current database

In [7]:
%%sparksql
show databases

Execution time: 1.98 seconds


                                                                                

SparkSchemaWidget(nodes=(Node(close_icon='angle-down', close_icon_style='danger', icon='project-diagram', icon…

namespace
default
it_job_ratings
movie_recommender
movielens
pokemon


In [8]:
%%sparksql
use it_job_ratings

Execution time: 0.13 seconds


                                                                                

SparkSchemaWidget(nodes=(Node(close_icon='angle-down', close_icon_style='danger', icon='project-diagram', icon…

In [9]:
%%sparksql
show tables

Execution time: 0.31 seconds


                                                                                

SparkSchemaWidget(nodes=(Node(close_icon='angle-down', close_icon_style='danger', icon='project-diagram', icon…

namespace,tableName,isTemporary
it_job_ratings,job_ratings,False
it_job_ratings,review_scores,False


## Identifying Industry Growth or Declining
based on job postings

In [10]:
highest_growth_industries = spark.sql("select industry, sum(jobs) as job_postings from it_job_ratings.job_ratings group by industry order by sum(jobs) desc limit 10").toPandas()

                                                                                

In [11]:
highest_growth_industries

Unnamed: 0,industry,job_postings
0,Consulting,39185
1,Information Technology,12074
2,Financial Services,3001
3,Business Process Outsourcing,2899
4,Insurance,2693
5,Banking & Financial Services,2513
6,Telecommunications,2011
7,Banking,1922
8,Pharmaceuticals,1554
9,E-Commerce,1313


In [12]:
lowest_growth_industries = spark.sql("select industry, sum(jobs) as job_postings from it_job_ratings.job_ratings group by industry order by sum(jobs) asc limit 10").toPandas()
lowest_growth_industries = lowest_growth_industries.sort_values(by='job_postings', ascending=False)

                                                                                

In [13]:
lowest_growth_industries

Unnamed: 0,industry,job_postings
8,Travel & Tourism,14
9,Aerospace & Defense,14
7,Agriculture Technology,9
6,Electronics Manufacturing,8
5,Oil & Gas,6
0,Government & Defense,0
1,Construction Machinery,0
2,E-Commerce & Logistics,0
3,Non-Profit Organization,0
4,Transportation,0


## Identifying best rated companies

In [14]:
best_ratings_company_industry = spark.sql("""
SELECT company_name, rating, reviews, industry
FROM it_job_ratings.job_ratings
WHERE rating > 4
AND reviews > (
    SELECT AVG(reviews)
    FROM it_job_ratings.job_ratings
    WHERE rating > 4
)
""").toPandas()

                                                                                

In [15]:
best_rated_companies = best_ratings_company_industry[["company_name", "rating", "reviews"]].sort_values(by='reviews', ascending=False).head(10)

In [16]:
best_rated_companies

Unnamed: 0,company_name,rating,reviews
0,Amazon,4.1,23800
1,IBM,4.1,20900
2,iEnergizer,4.7,19500
3,Vodafone Idea,4.1,14800
4,Tata Motors,4.2,11200
35,AU Small Finance Bank,4.3,9400
34,Mahindra & Mahindra,4.1,9400
5,Maruti Suzuki,4.2,7100
6,Ericsson,4.2,6800
7,Shriram Finance,4.1,6700


## Identifying best rated industries

In [17]:
best_rated_industries = best_ratings_company_industry.groupby('industry')[["reviews"]].sum().sort_values('reviews', ascending=False).reset_index().head(10)

In [18]:
best_rated_industries

Unnamed: 0,industry,reviews
0,Automotive,43000
1,Pharmaceuticals,38800
2,Financial Services,33600
3,Information Technology,28000
4,Telecommunications,24800
5,E-Commerce & Cloud Computing,23800
6,Engineering & Construction,19900
7,Customer Support Services,19500
8,Banking,18400
9,Engineering & Electronics,17000


## Identifying best companies to work for (good salary/benefits and work life balance)

In [20]:
spark.sql("""
SELECT 
    rs.company_name, 
    rs.overall_score,
    rs.good_work_life_balance,
    rs.good_salary_and_benefits,
    jr.rating, 
    jr.reviews
FROM 
    it_job_ratings.review_scores rs
JOIN 
    it_job_ratings.job_ratings jr
ON 
    rs.company_name = jr.company_name
WHERE rs.overall_score = 3 AND rs.good_work_life_balance = 1 AND rs.good_salary_and_benefits = 1 AND jr.rating > 4
ORDER BY 
    rs.overall_score DESC,
    jr.reviews DESC,
    jr.rating DESC
    LIMIT 10
""").toPandas()

                                                                                

Unnamed: 0,company_name,overall_score,good_work_life_balance,good_salary_and_benefits,rating,reviews
0,Amazon,3,1,1,4.1,23800
1,Vodafone Idea,3,1,1,4.1,14800
2,Cipla,3,1,1,4.1,6500
3,Intas Pharmaceuticals,3,1,1,4.1,3200
4,Viatris,3,1,1,4.2,2900
5,Ford Motor,3,1,1,4.4,2700
6,PepsiCo,3,1,1,4.1,2400
7,Google,3,1,1,4.4,1800
8,Life Insurance Corporation of India,3,1,1,4.4,1800
9,Parle Agro,3,1,1,4.1,1800


## Identifying most common highly rated factors and critically rated factors

In [23]:
highly_rated = spark.sql("""
SELECT 
    SUM(good_salary_and_benefits) AS good_salary_and_benefits,
    SUM(good_work_life_balance) AS good_work_life_balance,
    SUM(good_job_security) AS good_job_security,
    SUM(good_company_culture) AS good_company_culture,
    SUM(good_promotions_appraisal) AS good_promotions_appraisal,
    SUM(good_skill_development_learning) AS good_skill_development_learning,
    SUM(good_work_satisfaction) AS good_work_satisfaction
FROM it_job_ratings.review_scores
""").toPandas().to_dict(orient='records')[0]

most_common_highly_rated = max(highly_rated, key=highly_rated.get)

print(f"Max highly rated factor: {most_common_highly_rated} with value {highly_rated[most_common_highly_rated]}")

                                                                                

Max highly rated factor: good_job_security with value 322


In [24]:
critically_rated = spark.sql("""
SELECT 
    SUM(bad_salary_and_benefits) AS bad_salary_and_benefits,
    SUM(bad_work_life_balance) AS bad_work_life_balance,
    SUM(bad_job_security) AS bad_job_security,
    SUM(bad_company_culture) AS bad_company_culture,
    SUM(bad_promotions_appraisal) AS bad_promotions_appraisal,
    SUM(bad_skill_development_learning) AS bad_skill_development_learning,
    SUM(bad_work_satisfaction) AS bad_work_satisfaction
FROM it_job_ratings.review_scores
""").toPandas().to_dict(orient='records')[0]

most_common_critically_rated = max(critically_rated, key=critically_rated.get)

print(f"Max critically rated factor: {most_common_critically_rated} with value {critically_rated[most_common_critically_rated]}")

                                                                                

Max critically rated factor: bad_promotions_appraisal with value 357
