In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("sales")
    .getOrCreate()
)

# Input CSV read by the load cell below.
path = "/content/drive/MyDrive/Colab Notebooks/newDataSets/Cleaned_DS_Jobs.csv"



In [3]:
# Read the CSV: the first row supplies column names and Spark infers column types.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(path)
)

# Preview the loaded rows.
df.show()

+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+
|           Job Title|Salary Estimate|Rating|         Location|                Size|   Type of ownership|            Industry|              Sector|job_state|company_age|python|spark|tableau|
+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+
|   Sr Data Scientist|       137-171 |   3.1|     New York, NY|1001 to 5000 empl...|Nonprofit Organiz...|  Insurance Carriers|           Insurance|       NY|         27|     0|    0|      0|
|      Data Scientist|       137-171 |   4.2|    Chantilly, VA|5001 to 10000 emp...|    Company - Public|Research & Develo...|   Business Services|       VA|         52|     0|    0|      0|
|      Data Scientist|       137-171 |   3.8|

In [4]:
# Lower bound of the range: the first run of digits in "Salary Estimate", as an int.
df = df.withColumn(
    "min_salary",
    F.regexp_extract(df["Salary Estimate"], r"(\d+)", 1).cast("int"),
)

In [6]:
# Preview the first five rows.
df.show(n=5)

+-----------------+---------------+------+-------------+--------------------+--------------------+--------------------+-----------------+---------+-----------+------+-----+-------+----------+
|        Job Title|Salary Estimate|Rating|     Location|                Size|   Type of ownership|            Industry|           Sector|job_state|company_age|python|spark|tableau|min_salary|
+-----------------+---------------+------+-------------+--------------------+--------------------+--------------------+-----------------+---------+-----------+------+-----+-------+----------+
|Sr Data Scientist|       137-171 |   3.1| New York, NY|1001 to 5000 empl...|Nonprofit Organiz...|  Insurance Carriers|        Insurance|       NY|         27|     0|    0|      0|       137|
|   Data Scientist|       137-171 |   4.2|Chantilly, VA|5001 to 10000 emp...|    Company - Public|Research & Develo...|Business Services|       VA|         52|     0|    0|      0|       137|
|   Data Scientist|       137-171 |   3.

In [5]:
# Upper bound of the range: the digits that follow the hyphen in "Salary Estimate", as an int.
df = df.withColumn(
    "max_salary",
    F.regexp_extract(df["Salary Estimate"], r"-(\d+)", 1).cast("int"),
)

df.show(n=5)

+-----------------+---------------+------+-------------+--------------------+--------------------+--------------------+-----------------+---------+-----------+------+-----+-------+----------+----------+
|        Job Title|Salary Estimate|Rating|     Location|                Size|   Type of ownership|            Industry|           Sector|job_state|company_age|python|spark|tableau|min_salary|max_salary|
+-----------------+---------------+------+-------------+--------------------+--------------------+--------------------+-----------------+---------+-----------+------+-----+-------+----------+----------+
|Sr Data Scientist|       137-171 |   3.1| New York, NY|1001 to 5000 empl...|Nonprofit Organiz...|  Insurance Carriers|        Insurance|       NY|         27|     0|    0|      0|       137|       171|
|   Data Scientist|       137-171 |   4.2|Chantilly, VA|5001 to 10000 emp...|    Company - Public|Research & Develo...|Business Services|       VA|         52|     0|    0|      0|       1

In [6]:
# Midpoint of the extracted salary range.
df = df.withColumn(
    "average_salary",
    (df["min_salary"] + df["max_salary"]) / 2,
)

df.show(n=5)

+-----------------+---------------+------+-------------+--------------------+--------------------+--------------------+-----------------+---------+-----------+------+-----+-------+----------+----------+--------------+
|        Job Title|Salary Estimate|Rating|     Location|                Size|   Type of ownership|            Industry|           Sector|job_state|company_age|python|spark|tableau|min_salary|max_salary|average_salary|
+-----------------+---------------+------+-------------+--------------------+--------------------+--------------------+-----------------+---------+-----------+------+-----+-------+----------+----------+--------------+
|Sr Data Scientist|       137-171 |   3.1| New York, NY|1001 to 5000 empl...|Nonprofit Organiz...|  Insurance Carriers|        Insurance|       NY|         27|     0|    0|      0|       137|       171|         154.0|
|   Data Scientist|       137-171 |   4.2|Chantilly, VA|5001 to 10000 emp...|    Company - Public|Research & Develo...|Business 

In [7]:
# Ratings of -1 or 0 are replaced with 1; every other rating is kept as-is.
df = df.withColumn(
    "Rating",
    F.when(F.col("Rating").isin(-1, 0), 1).otherwise(F.col("Rating")),
)
df.show()

+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+----------+----------+--------------+
|           Job Title|Salary Estimate|Rating|         Location|                Size|   Type of ownership|            Industry|              Sector|job_state|company_age|python|spark|tableau|min_salary|max_salary|average_salary|
+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+----------+----------+--------------+
|   Sr Data Scientist|       137-171 |   3.1|     New York, NY|1001 to 5000 empl...|Nonprofit Organiz...|  Insurance Carriers|           Insurance|       NY|         27|     0|    0|      0|       137|       171|         154.0|
|      Data Scientist|       137-171 |   4.2|    Chantilly, VA|5001 to 10000 emp...|    

In [8]:
# Find every column that contains at least one null.
#
# The loop variable must be a plain local name. Using `F.col` as the
# comprehension target assigns each column name to the `col` attribute of
# pyspark.sql.functions, which breaks every later `F.col(...)` call
# ("'str' object is not callable").
#
# All null counts are computed in a single aggregation instead of running one
# filter/count job per column. `when` without `otherwise` yields null for
# non-matching rows, and `count` skips nulls, so each aggregate is the number
# of null values in that column.
null_counts = df.select(
    [
        F.count(F.when(df[col_name].isNull(), 1)).alias(col_name)
        for col_name in df.columns
    ]
).first()

null_columns = [col_name for col_name in df.columns if null_counts[col_name] > 0]

print(null_columns)

['Size', 'Type of ownership', 'Industry', 'Sector']


In [9]:
# Replace nulls with the sentinel -1 in every column the null scan reported.
# "Industry" was reported alongside the other three but was previously left
# unfilled, so its nulls survived into the aggregations and the exported CSV.
# A single fillna call with a column -> value mapping handles all of them.
fill_values = {
    "Size": -1,
    "Type of ownership": -1,
    "Industry": -1,
    "Sector": -1,
}
df = df.fillna(fill_values)

df.show(5)

+-----------------+---------------+------+-------------+--------------------+--------------------+--------------------+-----------------+---------+-----------+------+-----+-------+----------+----------+--------------+
|        Job Title|Salary Estimate|Rating|     Location|                Size|   Type of ownership|            Industry|           Sector|job_state|company_age|python|spark|tableau|min_salary|max_salary|average_salary|
+-----------------+---------------+------+-------------+--------------------+--------------------+--------------------+-----------------+---------+-----------+------+-----+-------+----------+----------+--------------+
|Sr Data Scientist|       137-171 |   3.1| New York, NY|1001 to 5000 empl...|Nonprofit Organiz...|  Insurance Carriers|        Insurance|       NY|         27|     0|    0|      0|       137|       171|         154.0|
|   Data Scientist|       137-171 |   4.2|Chantilly, VA|5001 to 10000 emp...|    Company - Public|Research & Develo...|Business 

In [10]:
# Mean of average_salary for each job title.
df_temp = (
    df.groupBy("Job Title")
    .agg(F.avg("average_salary").alias("mean_salary"))
)

df_temp.show()

+--------------------+-----------------+
|           Job Title|      mean_salary|
+--------------------+-----------------+
|Senior Data Scien...|99.33333333333333|
|Clinical Data Ana...|            164.5|
|Senior Business I...|             90.0|
|Data Analyst/Engi...|            115.5|
|Staff BI and Data...|            107.0|
|Intelligence Data...|            90.75|
|Report Writer-Dat...|             92.5|
|Hydrogen/Tritium ...|            148.0|
|Business Intellig...|           109.25|
|        Data Modeler|            154.0|
|Scientist / Group...|            197.5|
|Senior Research S...|            105.0|
|Software Engineer...|            164.5|
|   Sr Data Scientist|           126.75|
|COMPUTER SCIENTIS...|            271.5|
|Data Scientist/Ma...|            125.5|
|Data Scientist - ...|            120.5|
|  Decision Scientist|             94.5|
|Data Scientist - ...|            97.75|
|Data Scientist / ...|            128.5|
+--------------------+-----------------+
only showing top

In [20]:
# Mean of average_salary for each company-size bucket, highest first.
df_avg_salary = (
    df.groupBy("size")
    .agg(F.avg("average_salary").alias("avg_salary"))
    .sort("avg_salary", ascending=False)
)

df_avg_salary.show()

+--------------------+------------------+
|                size|        avg_salary|
+--------------------+------------------+
|             Unknown|143.38235294117646|
|                  -1| 130.7962962962963|
| 51 to 200 employees|      127.83203125|
|5001 to 10000 emp...| 126.6639344262295|
|    10000+ employees|122.80379746835443|
|1001 to 5000 empl...| 121.7548076923077|
|501 to 1000 emplo...|120.93506493506493|
|   1 to 50 employees|             120.0|
|201 to 500 employees| 119.1987951807229|
+--------------------+------------------+



In [11]:
# Export the processed DataFrame as CSV with a header row, replacing any previous output.
df.write.csv("/content/jobs_data.csv", mode="overwrite", header="True")