In [1]:
import pandas as pd
import numpy as np
import datetime as dt

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *#avg, count, expr
from pyspark.sql.types import *
from pyspark.ml.feature import StringIndexer, IndexToString, RegexTokenizer,\
    StopWordsRemover, Word2Vec, CountVectorizer, IDF, HashingTF
from pyspark.ml import Pipeline
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
# initialize
sc = pyspark.SparkContext()
spark = SparkSession(sc)
spark.sparkContext.appName = 'nlpHW'
# show the number of cores
print('%d cores'%spark._jsc.sc().getExecutorMemoryStatus().keySet().size())
spark

1 cores


In [3]:
# get the data
fil = '../../data/fake_job_postings.csv'
schem = StructType([StructField('job_id', IntegerType()), StructField('title', StringType()),
                    StructField('location', StringType()), StructField('department', StringType()),
                    StructField('salary_range', StringType()), StructField('company_profile', StringType()),
                    StructField('description', StringType()), StructField('requirements', StringType()),
                    StructField('benefits', StringType()), StructField('telecommuting', IntegerType()),
                    StructField('has_company_logo', IntegerType()), StructField('has_questions', IntegerType()),
                    StructField('employment_type', StringType()), StructField('required_experience', StringType()),
                    StructField('required_education', StringType()), StructField('industry', StringType()),
                    StructField('function', StringType()), StructField('fraudulent', IntegerType())])
jobs = spark.read.format('csv').options(header=True).schema(schem).load(fil)

# talk
cnt = jobs.count()
print('%d records'%cnt)
display(jobs.limit(10).toPandas())

17880 records


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0
5,6,Accounting Clerk,"US, MD,",,,,Job OverviewApex is an environmental consultin...,,,0,0,0,,,,,,0
6,7,Head of Content (m/f),"DE, BE, Berlin",ANDROIDPIT,20000-28000,"Founded in 2009, the Fonpit AG rose with its i...",Your Responsibilities: Manage the English-spea...,Your Know-How: ...,Your Benefits: Being part of a fast-growing co...,0,1,1,Full-time,Mid-Senior level,Master's Degree,Online Media,Management,0
7,8,Lead Guest Service Specialist,"US, CA, San Francisco",,,Airenvy’s mission is to provide lucrative yet ...,Who is Airenvy?Hey there! We are seasoned entr...,"Experience with CRM software, live chat, and p...",Competitive Pay. You'll be able to eat steak e...,0,1,1,,,,,,0
8,9,HP BSM SME,"US, FL, Pensacola",,,Solutions3 is a woman-owned small business who...,Implementation/Configuration/Testing/Training ...,MUST BE A US CITIZEN.An active TS/SCI clearanc...,,0,1,1,Full-time,Associate,,Information Technology and Services,,0
9,10,Customer Service Associate - Part Time,"US, AZ, Phoenix",,,"Novitex Enterprise Solutions, formerly Pitney ...",The Customer Service Associate will be based i...,Minimum Requirements:Minimum of 6 months custo...,,0,1,0,Part-time,Entry level,High School or equivalent,Financial Services,Customer Service,0


In [4]:
# catenate together the text fields
concatCols = [colm.name for colm in jobs.schema if colm.dataType is StringType()]
print('Concatenating %s'%concatCols)
jobs = jobs.select('job_id', 'fraudulent', 'telecommuting', 'has_company_logo', 'has_questions',\
                   concat_ws(' ', *concatCols).alias('text'))
# talk
display(jobs.limit(10).toPandas())

Concatenating ['title', 'location', 'department', 'salary_range', 'company_profile', 'description', 'requirements', 'benefits', 'employment_type', 'required_experience', 'required_education', 'industry', 'function']


Unnamed: 0,job_id,fraudulent,telecommuting,has_company_logo,has_questions,text
0,1,0,0,1,0,"Marketing Intern US, NY, New York Marketing We..."
1,2,0,0,1,0,"Customer Service - Cloud Video Production NZ, ..."
2,3,0,0,1,0,"Commissioning Machinery Assistant (CMA) US, IA..."
3,4,0,0,1,0,"Account Executive - Washington DC US, DC, Wash..."
4,5,0,0,1,1,"Bill Review Manager US, FL, Fort Worth SpotSou..."
5,6,0,0,0,0,"Accounting Clerk US, MD, Job OverviewApex is ..."
6,7,0,0,1,1,"Head of Content (m/f) DE, BE, Berlin ANDROIDPI..."
7,8,0,0,1,1,"Lead Guest Service Specialist US, CA, San F..."
8,9,0,0,1,1,"HP BSM SME US, FL, Pensacola Solutions3 is a w..."
9,10,0,0,1,0,"Customer Service Associate - Part Time US, AZ..."


In [5]:
''' handle missing values '''
# check for missing values
nullCounts = {colm:jobs.select(colm).where(col(colm).isNull()).count() for colm in jobs.columns}
nullCounts = {colm:(ncnt, ncnt/cnt) for (colm, ncnt) in nullCounts.items()}
nullCountsDF = pd.DataFrame(nullCounts).T.reset_index(drop=False).sort_values(1, ascending=False)
nullCountsDF.columns = ['Column', 'Freq.', 'Rel. Freq.']
nullCountsDF = nullCountsDF.merge(pd.DataFrame([[colm.name, colm.dataType] for colm in jobs.schema], columns=['Column', 'Type']),
                                how='inner', on=['Column'])

# talk
display(nullCountsDF)

# drop mostly null columns
dropUs = nullCountsDF.loc[nullCountsDF['Rel. Freq.'] >.06, 'Column'].values.tolist()
print('Dropping %s'%dropUs)

# remove too-empty columns and the remaining nulls
jobs = jobs.drop(*dropUs).dropna(how='any')

# talk some more
print('%d records'%jobs.count())

Unnamed: 0,Column,Freq.,Rel. Freq.,Type
0,telecommuting,1035.0,0.057886,IntegerType
1,fraudulent,914.0,0.051119,IntegerType
2,has_company_logo,887.0,0.049609,IntegerType
3,has_questions,716.0,0.040045,IntegerType
4,job_id,0.0,0.0,IntegerType
5,text,0.0,0.0,StringType


Dropping []


KeyboardInterrupt: 

In [6]:
''' ensure fraudulent is only 0 or 1 '''
# count by state
jobs.groupBy('fraudulent').count().orderBy(col('count').desc()).show()

# remove the bad rows if any
jobs = jobs.where(col('fraudulent').isin(0, 1)).withColumnRenamed('fraudulent', 'label')

# talk
print('%d records'%jobs.count())

+----------+-----+
|fraudulent|count|
+----------+-----+
|         0|16012|
|         1|  832|
+----------+-----+

16844 records


In [11]:
''' get rid of
urls
non-alphanumeric or whitespace chars
get rid of multiplied spaces
'''
# get the regexp for urls
with open('url_regex.txt', 'rt') as f:
    urlRE = f.readline().strip()

jobs = jobs.select('job_id', 'label', 'telecommuting', 'has_company_logo', 'has_questions',
                   regexp_replace(col('text'), urlRE, ' ').alias('text'))\
    .withColumn('text', regexp_replace(col('text'), '[^A-Za-z0-9]', ' '))\
    .withColumn('text', regexp_replace(col('text'), ' +', ' '))

# talk
jobs.show(5)

+------+-----+-------------+----------------+-------------+--------------------+
|job_id|label|telecommuting|has_company_logo|has_questions|                text|
+------+-----+-------------+----------------+-------------+--------------------+
|     1|    0|            0|               1|            0|Marketing Intern ...|
|     2|    0|            0|               1|            0|Customer Service ...|
|     3|    0|            0|               1|            0|Commissioning Mac...|
|     4|    0|            0|               1|            0|Account Executive...|
|     5|    0|            0|               1|            1|Bill Review Manag...|
+------+-----+-------------+----------------+-------------+--------------------+
only showing top 5 rows



In [13]:
''' final words pre-processing '''
# tokenizer
toker = RegexTokenizer(inputCol='text', outputCol='words', pattern='\\W', toLowercase=True)
# stopper
stop = StopWordsRemover(inputCol=toker.getOutputCol(), outputCol='fewer_words')

# pipeline
featEngine = Pipeline(stages=[toker, stop]).fit(jobs)
jobs = featEngine.transform(jobs).select('job_id', 'label', 'telecommuting', 'has_company_logo', 'has_questions', 'fewer_words')

# talk
jobs.show(5, truncate=False)

+------+-----+-------------+----------------+-------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### NLP Feature Engineering

In [None]:
# try the hashing term frequency processor
words = 256 # should be a power of 2
htf = HashingTF(inputCol='fewer_words', outputCol='features', numFeatures=words)
jobsTF = htf.transform(jobs).drop('fewer_words')
jobsTF.show(5, truncate=False)

In [None]:
# try tfidf - requires term frequencies input
idf = IDF(inputCol='tf', outputCol='features')
idfmod = idf.fit(jobsTF.withColumnRenamed('features', 'tf'))
jobsTFIDF = idfmod.transform(jobsTF.withColumnRenamed('features', 'tf')).drop('tf')
jobsTFIDF.show(5, truncate=False)

In [None]:
# try word2vec
vecLen = 10
w2v = Word2Vec(vectorSize=vecLen, minCount=1, inputCol='fewer_words', outputCol='features')
w2vmod = w2v.fit(jobs)
jobsW2V = w2vmod.transform(jobs).drop('fewer_words')
jobsW2V.show(5, truncate=False)

### Try to fit classifiction models to these

In [None]:
# global settings
trainPerc = 0.7
randSeed = 42
acc = MulticlassClassificationEvaluator(metricName='accuracy')

In [None]:
''' term frequency data '''
# split for cross-val
trn, tst = kickTF.select('id', 'label', 'features').randomSplit([trainPerc, 1.0 - trainPerc], seed=randSeed)

# fit a random forest
estim = RandomForestClassifier(numTrees=20)
fitModel = estim.fit(trn)
trainRes = fitModel.evaluate(trn)
trainAcc = acc.evaluate(trainRes.predictions)

# now evaluate test accuracy
testRes = fitModel.transform(tst)
testAcc = acc.evaluate(testRes)

print('Train Accuracy = %0.3f, Test Accuracy = %0.3f'%(trainAcc, testAcc))

In [None]:
''' term frequency data '''
# split for cross-val
trn, tst = kickTFIDF.select('id', 'label', 'features').randomSplit([trainPerc, 1.0 - trainPerc], seed=randSeed)

# fit a random forest
estim = RandomForestClassifier(numTrees=20)
fitModel = estim.fit(trn)
trainRes = fitModel.evaluate(trn)
trainAcc = acc.evaluate(trainRes.predictions)

# now evaluate test accuracy
testRes = fitModel.transform(tst)
testAcc = acc.evaluate(testRes)

print('Train Accuracy = %0.3f, Test Accuracy = %0.3f'%(trainAcc, testAcc))

In [None]:
''' term frequency data '''
# split for cross-val
trn, tst = kickW2V.select('id', 'label', 'features').randomSplit([trainPerc, 1.0 - trainPerc], seed=randSeed)

# fit a random forest
estim = RandomForestClassifier(numTrees=20)
fitModel = estim.fit(trn)
trainRes = fitModel.evaluate(trn)
trainAcc = acc.evaluate(trainRes.predictions)

# now evaluate test accuracy
testRes = fitModel.transform(tst)
testAcc = acc.evaluate(testRes)

print('Train Accuracy = %0.3f, Test Accuracy = %0.3f'%(trainAcc, testAcc))

In [14]:
sc.stop()