In [1]:
# Install Java, Spark, and Findspark
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.6/spark-2.4.6-bin-hadoop2.7.tgz
!tar xf spark-2.4.6-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.6-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/ InRelease
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com] [Wait0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com] [Connecting to                                                                               Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.142)0% [Connecting to archive.ubuntu.com (91.189.88.142)] [Connecting to security.u                                                                               Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com (91.189.88.142)] [Connecting to security.u                                                                               Hit:4 https://developer.download.nvidia.com/compute/cuda/r

In [2]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Hashing").getOrCreate()

In [3]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover

In [4]:
from pyspark import SparkFiles

df = spark.read.csv(SparkFiles.get("/content/indeed_data_processed.csv"), sep=",", header=True)

df = df.na.drop()

df = df[["jobclass", "jobdescription"]]

# Show DataFrame
df.show()

+--------------+--------------------+
|      jobclass|      jobdescription|
+--------------+--------------------+
|data scientist|Job Overview The ...|
|data scientist|Why choose betwee...|
|data scientist|B S in operations...|
|data scientist|Please make sure ...|
|data scientist|Join a team recog...|
|data scientist|BS in STEM Scienc...|
|data scientist|"About the Team A...|
|data scientist|SUMMARY The CMC D...|
|data scientist|Formation provide...|
|data scientist|MS in STEM Scienc...|
|data scientist|At Varen our perf...|
|data scientist|We are looking fo...|
|data scientist|Overview As an in...|
|data scientist|Degree in a quant...|
|data scientist|About Pinterest M...|
|data scientist|Join us and make ...|
|data scientist|Requisition ID # ...|
|data scientist|Purpose of Job Th...|
|data scientist|By joining the Bi...|
|data scientist|Minimum qualifica...|
+--------------+--------------------+
only showing top 20 rows



In [5]:

from pyspark.sql.functions import length
# Create a length column to be used as a future feature 
data_df = df.withColumn('length', length(df['jobdescription']))
data_df.show()

+--------------+--------------------+------+
|      jobclass|      jobdescription|length|
+--------------+--------------------+------+
|data scientist|Job Overview The ...|  1927|
|data scientist|Why choose betwee...|  2471|
|data scientist|B S in operations...|  1836|
|data scientist|Please make sure ...|  6640|
|data scientist|Join a team recog...|  4478|
|data scientist|BS in STEM Scienc...|  2709|
|data scientist|"About the Team A...|  4435|
|data scientist|SUMMARY The CMC D...|  2196|
|data scientist|Formation provide...|  3969|
|data scientist|MS in STEM Scienc...|  2864|
|data scientist|At Varen our perf...|  1357|
|data scientist|We are looking fo...|  2265|
|data scientist|Overview As an in...|  2968|
|data scientist|Degree in a quant...|  4209|
|data scientist|About Pinterest M...|  3678|
|data scientist|Join us and make ...|  4440|
|data scientist|Requisition ID # ...|  3066|
|data scientist|Purpose of Job Th...|  4128|
|data scientist|By joining the Bi...|  4268|
|data scie

In [6]:

from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
# Create all the features to the data set
pos_neg_to_num = StringIndexer(inputCol='jobclass',outputCol='label')
tokenizer = Tokenizer(inputCol="jobdescription", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')

In [7]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Create feature vectors
clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

In [8]:

# Create a and run a data processing Pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[pos_neg_to_num, tokenizer, stopremove, hashingTF, idf, clean_up])

In [9]:

# Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(data_df)
cleaned = cleaner.transform(data_df)

In [10]:

# Show label and resulting features
cleaned.select(['label', 'features']).show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.0|(262145,[966,1079...|
|  1.0|(262145,[4525,549...|
|  1.0|(262145,[966,1836...|
|  1.0|(262145,[1079,183...|
|  1.0|(262145,[966,1836...|
|  1.0|(262145,[619,966,...|
|  1.0|(262145,[1115,183...|
|  1.0|(262145,[966,1667...|
|  1.0|(262145,[966,1836...|
|  1.0|(262145,[619,966,...|
|  1.0|(262145,[1836,337...|
|  1.0|(262145,[619,966,...|
|  1.0|(262145,[966,1079...|
|  1.0|(262145,[15,966,1...|
|  1.0|(262145,[1232,232...|
|  1.0|(262145,[573,619,...|
|  1.0|(262145,[619,966,...|
|  1.0|(262145,[619,1156...|
|  1.0|(262145,[329,619,...|
|  1.0|(262145,[619,1836...|
+-----+--------------------+
only showing top 20 rows



In [11]:
from pyspark.ml.classification import NaiveBayes
# Break data down into a training set and a testing set
training, testing = cleaned.randomSplit([0.7, 0.3])

# Create a Naive Bayes model and fit training data
nb = NaiveBayes()
predictor = nb.fit(training)

In [12]:
# Tranform the model with the testing data
test_results = predictor.transform(testing)
test_results.show(5)

+------------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|    jobclass|      jobdescription|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+------------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|data analyst|ALLIANCE OF COMMU...|  5680|  0.0|[alliance, of, co...|[alliance, commun...|(262144,[619,966,...|(262144,[619,966,...|(262145,[619,966,...|[-13425.519099713...|[1.0,0.0,0.0,0.0,...|       0.0|
|data analyst|About the Justice...|  7450|  0.0|[about, the, just...|[justice, adminis...|(262144,[619,966,...|(262144,[619,966,...|(262145,[619,966,...|[-24671.443524147..

In [13]:

# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting jobclass was: %f" % acc)

Accuracy of model at predicting jobclass was: 0.879140
