In [2]:
# Install Java, Spark, and Findspark
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.6/spark-2.4.6-bin-hadoop2.7.tgz
!tar xf spark-2.4.6-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.6-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [Connecting to archive.ubuntu.com (91.189.88.142)] [1 InRelease 0 B/88.7 kB 0% [Connecting to archive.ubuntu.com (91.189.88.142)] [Connecting to cloud.r-pr0% [1 InRelease gpgv 88.7 kB] [Waiting for headers] [Connecting to cloud.r-proj                                                                               Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [1 InRelease gpgv 88.7 kB] [Waiting for headers] [Connecting to cloud.r-proj                                                                               Hit:3 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
0% [1 InRelease gpgv 88.7 kB] [Waiting for headers] [Connecting to cloud.r-proj                                                                               Hit:4 http://archive.ubuntu.com/ubuntu bionic InRelease
0% [1 InRelease gpgv 88.7 kB]

In [3]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Hashing").getOrCreate()

In [4]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover

In [6]:
from pyspark import SparkFiles

df = spark.read.csv(SparkFiles.get("/content/indeed_data_processed_desciption.csv"), sep=",", header=True)

# Show DataFrame
df.show()

+---+--------------------+
|_c0|      jobdescription|
+---+--------------------+
|  0|Slack is hiring a...|
|  1|"At Hub Spot we u...|
|  2|Description UST G...|
|  3|Data Scientist - ...|
|  4|Publishers Cleari...|
|  5|Sr Data Scientist...|
|  6|Data Scientist – ...|
|  7|"COMPANY OVERVIEW...|
|  8|Facebook's missio...|
|  9|Data Science - De...|
| 10|About Paysafe Gro...|
| 11|Firebird AST is s...|
| 12|It’s a new day in...|
| 13|"Duties Summary A...|
| 14|Do you want to tr...|
| 15|Amazon aims to ex...|
| 16|Amazon aims to ex...|
| 17|Job Description W...|
| 18|Applies developed...|
| 19|Introduction As a...|
+---+--------------------+
only showing top 20 rows



In [7]:
tokened = Tokenizer(inputCol="jobdescription", outputCol="words")
tokened_transformed = tokened.transform(df)
tokened_transformed.show()

+---+--------------------+--------------------+
|_c0|      jobdescription|               words|
+---+--------------------+--------------------+
|  0|Slack is hiring a...|[slack, is, hirin...|
|  1|"At Hub Spot we u...|["at, hub, spot, ...|
|  2|Description UST G...|[description, ust...|
|  3|Data Scientist - ...|[data, scientist,...|
|  4|Publishers Cleari...|[publishers, clea...|
|  5|Sr Data Scientist...|[sr, data, scient...|
|  6|Data Scientist – ...|[data, scientist,...|
|  7|"COMPANY OVERVIEW...|["company, overvi...|
|  8|Facebook's missio...|[facebook's, miss...|
|  9|Data Science - De...|[data, science, -...|
| 10|About Paysafe Gro...|[about, paysafe, ...|
| 11|Firebird AST is s...|[firebird, ast, i...|
| 12|It’s a new day in...|[it’s, a, new, da...|
| 13|"Duties Summary A...|["duties, summary...|
| 14|Do you want to tr...|[do, you, want, t...|
| 15|Amazon aims to ex...|[amazon, aims, to...|
| 16|Amazon aims to ex...|[amazon, aims, to...|
| 17|Job Description W...|[job, descript

In [8]:
# Remove stop words
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
removed_frame = remover.transform(tokened_transformed)
removed_frame.show()

+---+--------------------+--------------------+--------------------+
|_c0|      jobdescription|               words|            filtered|
+---+--------------------+--------------------+--------------------+
|  0|Slack is hiring a...|[slack, is, hirin...|[slack, hiring, e...|
|  1|"At Hub Spot we u...|["at, hub, spot, ...|["at, hub, spot, ...|
|  2|Description UST G...|[description, ust...|[description, ust...|
|  3|Data Scientist - ...|[data, scientist,...|[data, scientist,...|
|  4|Publishers Cleari...|[publishers, clea...|[publishers, clea...|
|  5|Sr Data Scientist...|[sr, data, scient...|[sr, data, scient...|
|  6|Data Scientist – ...|[data, scientist,...|[data, scientist,...|
|  7|"COMPANY OVERVIEW...|["company, overvi...|["company, overvi...|
|  8|Facebook's missio...|[facebook's, miss...|[facebook's, miss...|
|  9|Data Science - De...|[data, science, -...|[data, science, -...|
| 10|About Paysafe Gro...|[about, paysafe, ...|[paysafe, group, ...|
| 11|Firebird AST is s...|[firebir

In [9]:
# Run the hashing term frequency
hashing = HashingTF(inputCol="filtered", outputCol="hashedValues", numFeatures=pow(2,20))

# Transform into a DF
hashed_df = hashing.transform(removed_frame)
hashed_df.show()

+---+--------------------+--------------------+--------------------+--------------------+
|_c0|      jobdescription|               words|            filtered|        hashedValues|
+---+--------------------+--------------------+--------------------+--------------------+
|  0|Slack is hiring a...|[slack, is, hirin...|[slack, hiring, e...|(1048576,[3373,45...|
|  1|"At Hub Spot we u...|["at, hub, spot, ...|["at, hub, spot, ...|(1048576,[966,337...|
|  2|Description UST G...|[description, ust...|[description, ust...|(1048576,[966,333...|
|  3|Data Scientist - ...|[data, scientist,...|[data, scientist,...|(1048576,[3373,11...|
|  4|Publishers Cleari...|[publishers, clea...|[publishers, clea...|(1048576,[167,150...|
|  5|Sr Data Scientist...|[sr, data, scient...|[sr, data, scient...|(1048576,[966,547...|
|  6|Data Scientist – ...|[data, scientist,...|[data, scientist,...|(1048576,[1115,33...|
|  7|"COMPANY OVERVIEW...|["company, overvi...|["company, overvi...|(1048576,[966,337...|
|  8|Faceb

In [12]:
# Fit the IDF on the data set 
idf = IDF(inputCol="hashedValues", outputCol="features")
idfModel = idf.fit(hashed_df)
rescaledData = idfModel.transform(hashed_df)

In [13]:
# Display the DataFrame
rescaledData.select("words", "features").show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------