In [None]:
import pandas as pd
import numpy as np
import datetime as dt

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *#avg, count, expr
from pyspark.sql.types import *
from pyspark.ml.feature import StringIndexer, IndexToString, RegexTokenizer,\
    StopWordsRemover, Word2Vec, CountVectorizer, IDF, HashingTF, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# lift pandas' row/column display caps so previews are never truncated
for dispOpt in ('display.max_rows', 'display.max_columns'):
    pd.set_option(dispOpt, None)

In [None]:
# initialize: go through the builder so the app name is part of the Spark conf
# before the context starts (assigning sparkContext.appName afterwards only
# changes a Python attribute), and so re-running this cell reuses the live
# session instead of failing on a second SparkContext
spark = SparkSession.builder.appName('nlpHW').getOrCreate()
sc = spark.sparkContext  # kept because the final cell calls sc.stop()
# show the number of cores: defaultParallelism is the public API for this
# (local mode: cores on this machine; cluster: total executor cores, min 2)
print('%d cores'%sc.defaultParallelism)
spark

In [None]:
# location of the raw job-postings CSV (relative to the notebook)
fil = '../../data/fake_job_postings.csv'

# explicit schema, declared as (column, Spark type) pairs in file order
fieldSpecs = [('job_id', IntegerType), ('title', StringType),
              ('location', StringType), ('department', StringType),
              ('salary_range', StringType), ('company_profile', StringType),
              ('description', StringType), ('requirements', StringType),
              ('benefits', StringType), ('telecommuting', IntegerType),
              ('has_company_logo', IntegerType), ('has_questions', IntegerType),
              ('employment_type', StringType), ('required_experience', StringType),
              ('required_education', StringType), ('industry', StringType),
              ('function', StringType), ('fraudulent', IntegerType)]
schem = StructType([StructField(fieldName, fieldType()) for (fieldName, fieldType) in fieldSpecs])

# read with a header row and the schema above
# NOTE(review): no multiLine/escape options are set; if text fields contain
# embedded newlines or quotes, rows will be mis-parsed - confirm against the file
reader = spark.read.schema(schem).option('header', True)
jobs = reader.format('csv').load(fil)

# report the record count and preview the first rows
cnt = jobs.count()
print('%d records'%cnt)
preview = jobs.limit(10).toPandas()
display(preview)

In [None]:
# catenate together the text fields
# isinstance() rather than `is StringType()`: identity only holds while the
# type object happens to be a cached singleton, which is an implementation detail
concatCols = [colm.name for colm in jobs.schema if isinstance(colm.dataType, StringType)]
print('Concatenating %s'%concatCols)
# keep the id, the label and the three flags; every string column is folded
# into one space-separated 'text' column (concat_ws skips null inputs)
jobs = jobs.select('job_id', 'fraudulent', 'telecommuting', 'has_company_logo', 'has_questions',
                   concat_ws(' ', *concatCols).alias('text'))
# talk
display(jobs.limit(10).toPandas())

In [None]:
''' handle missing values '''
# check for missing values: count the nulls of every column in one aggregation
# (a single Spark job) instead of launching one job per column
nullRow = jobs.select([count(when(col(colm).isNull(), 1)).alias(colm) for colm in jobs.columns]).first()
# (null count, share of the raw record count); guard against an empty input
nullCounts = {colm:(nullRow[colm], nullRow[colm]/cnt if cnt else 0.0) for colm in jobs.columns}
nullCountsDF = pd.DataFrame([(colm, ncnt, nfrac) for (colm, (ncnt, nfrac)) in nullCounts.items()],
                            columns=['Column', 'Freq.', 'Rel. Freq.'])\
    .sort_values('Rel. Freq.', ascending=False)
# attach each column's Spark type for reference
nullCountsDF = nullCountsDF.merge(pd.DataFrame([[colm.name, colm.dataType] for colm in jobs.schema], columns=['Column', 'Type']),
                                how='inner', on=['Column'])

# talk
display(nullCountsDF)

# drop mostly null columns: anything whose null share exceeds this threshold
maxNullFrac = .06
dropUs = nullCountsDF.loc[nullCountsDF['Rel. Freq.'] > maxNullFrac, 'Column'].values.tolist()
print('Dropping %s'%dropUs)

# remove too-empty columns and the remaining nulls
jobs = jobs.drop(*dropUs).dropna(how='any')

# talk some more
print('%d records'%jobs.count())

In [None]:
''' ensure fraudulent is only 0 or 1 '''
# frequency of each value found in the target column, most common first
labelFreq = jobs.groupBy('fraudulent').count()
labelFreq.orderBy('count', ascending=False).show()

# keep only rows with a valid binary target, then expose it as 'label'
validRows = jobs.filter(col('fraudulent').isin([0, 1]))
jobs = validRows.withColumnRenamed('fraudulent', 'label')

# report how many records survive
print('%d records'%jobs.count())

In [None]:
''' text clean-up, in order:
replace urls with a space
replace every char that is not a letter or digit with a space
collapse runs of spaces into one
'''
# the url pattern is the first line of a side file next to the notebook
with open('url_regex.txt', 'rt') as reFile:
    urlRE = reFile.readline().strip()

# build the three replacements as one nested column expression
noUrls = regexp_replace(col('text'), urlRE, ' ')
alnumOnly = regexp_replace(noUrls, '[^A-Za-z0-9]', ' ')
singleSpaced = regexp_replace(alnumOnly, ' +', ' ')

jobs = jobs.select('job_id', 'label', 'telecommuting', 'has_company_logo', 'has_questions',
                   singleSpaced.alias('text'))

# preview
jobs.show(5)

In [None]:
# pack the three 0/1 flag columns into a single array column named 'bools'
flagCols = ['telecommuting', 'has_company_logo', 'has_questions']
jobs = jobs.select('job_id', 'label', array(*flagCols).alias('bools'), 'text')

In [None]:
''' final words pre-processing '''
# split the lower-cased text on non-word characters
toker = RegexTokenizer(pattern='\\W', toLowercase=True, inputCol='text', outputCol='words')
# strip stop words from the token list
stop = StopWordsRemover(inputCol='words', outputCol='fewer_words')

# run both stages as one pipeline and keep only the columns used downstream
prepStages = [toker, stop]
featEngine = Pipeline(stages=prepStages).fit(jobs)
tokenized = featEngine.transform(jobs)
jobs = tokenized.select('job_id', 'label', 'bools', 'fewer_words')

# preview without truncating the token lists
jobs.show(5, truncate=False)

### NLP Feature Engineering

In [None]:
# hashed term frequencies: each posting becomes a fixed-width count vector
words = 256 # vector width; should be a power of 2
htf = HashingTF(numFeatures=words, inputCol='fewer_words', outputCol='features')
hashed = htf.transform(jobs)
jobsTF = hashed.drop('fewer_words')
jobsTF.show(5, truncate=False)

In [None]:
# tf-idf on top of the hashed term frequencies
# the TF vectors are exposed as 'tf' so that IDF can write to 'features'
tfInput = jobsTF.withColumnRenamed('features', 'tf')
idf = IDF(inputCol='tf', outputCol='features')
idfmod = idf.fit(tfInput)
jobsTFIDF = idfmod.transform(tfInput).drop('tf')
jobsTFIDF.show(5, truncate=False)

In [None]:
# word2vec features: one fixed-length vector per posting
vecLen = 10
w2v = Word2Vec(inputCol='fewer_words', outputCol='features', vectorSize=vecLen, minCount=1)
w2vmod = w2v.fit(jobs)
embedded = w2vmod.transform(jobs)
jobsW2V = embedded.drop('fewer_words')
jobsW2V.show(5, truncate=False)

### Try to fit classification models to these

In [None]:
# share of the (resampled) data used for training
trainPerc = 0.7
# seed shared by the sampling and the train/test split
randSeed = 20180619
# per-label sampling fractions: keep 25% of label 0 and all of label 1
subsample = {0:0.25, 1:1.0}
# true positive rate of the fraudulent class; metricLabel defaults to 0, which
# would report the rate for the non-fraudulent class instead
acc = MulticlassClassificationEvaluator(metricName='truePositiveRateByLabel', metricLabel=1.0)

In [None]:
''' term frequency data '''
# put the boolean and word vectors together
# the label must not be an input (it leaks the answer to the model); the flags
# live in the 'bools' array, which VectorAssembler cannot take directly, so
# unpack it into numeric columns first (same order the array was built in)
flagCols = ['telecommuting', 'has_company_logo', 'has_questions']
flagged = jobsTF.withColumnRenamed('features', 'feat')\
    .select('job_id', 'label', 'feat', *[col('bools')[idx].alias(flag) for (idx, flag) in enumerate(flagCols)])
vecass = VectorAssembler(inputCols=flagCols + ['feat'], outputCol='features')
jobsTF = vecass.transform(flagged).drop('feat', *flagCols)

# resample by the label, then split for cross-val
# NOTE(review): the test split is drawn from the resampled data, so test
# metrics reflect the resampled class balance, not the raw one
trn, tst = jobsTF.select('job_id', 'label', 'features').sampleBy('label', fractions=subsample, seed=randSeed)\
    .randomSplit([trainPerc, 1.0 - trainPerc], seed=randSeed)

# fit a random forest (seeded so repeated runs build the same trees)
estim = RandomForestClassifier(numTrees=20, seed=randSeed)
fitModel = estim.fit(trn)
trainRes = fitModel.evaluate(trn)
trainAcc = acc.evaluate(trainRes.predictions)

# now evaluate test accuracy
testRes = fitModel.transform(tst)
testAcc = acc.evaluate(testRes)

print('Train True Positive Rate = %0.5f, Test True Positive Rate = %0.5f'%(trainAcc, testAcc))

# show the results for fraudulents
print('Training')
trainRes.predictions.select('job_id', 'label', 'prediction').where(col('label')==1).show(10)
print('Testing')
testRes.select('job_id', 'label', 'prediction').where(col('label')==1).show(10)

In [None]:
''' TFIDF data '''
# put the boolean and word vectors together
# the label must not be an input (it leaks the answer to the model); the flags
# live in the 'bools' array, which VectorAssembler cannot take directly, so
# unpack it into numeric columns first (same order the array was built in)
flagCols = ['telecommuting', 'has_company_logo', 'has_questions']
flagged = jobsTFIDF.withColumnRenamed('features', 'feat')\
    .select('job_id', 'label', 'feat', *[col('bools')[idx].alias(flag) for (idx, flag) in enumerate(flagCols)])
vecass = VectorAssembler(inputCols=flagCols + ['feat'], outputCol='features')
jobsTFIDF = vecass.transform(flagged).drop('feat', *flagCols)

# resample by the label, then split for cross-val
# NOTE(review): the test split is drawn from the resampled data, so test
# metrics reflect the resampled class balance, not the raw one
trn, tst = jobsTFIDF.select('job_id', 'label', 'features').sampleBy('label', fractions=subsample, seed=randSeed)\
    .randomSplit([trainPerc, 1.0 - trainPerc], seed=randSeed)

# fit a random forest (seeded so repeated runs build the same trees)
estim = RandomForestClassifier(numTrees=20, seed=randSeed)
fitModel = estim.fit(trn)
trainRes = fitModel.evaluate(trn)
trainAcc = acc.evaluate(trainRes.predictions)

# now evaluate test accuracy
testRes = fitModel.transform(tst)
testAcc = acc.evaluate(testRes)

print('Train True Positive Rate = %0.5f, Test True Positive Rate = %0.5f'%(trainAcc, testAcc))

# show the results for fraudulents
print('Training')
trainRes.predictions.select('job_id', 'label', 'prediction').where(col('label')==1).show(10)
print('Testing')
testRes.select('job_id', 'label', 'prediction').where(col('label')==1).show(10)

In [None]:
''' word2vec data '''
# put the boolean and word vectors together
# the label must not be an input (it leaks the answer to the model); the flags
# live in the 'bools' array, which VectorAssembler cannot take directly, so
# unpack it into numeric columns first (same order the array was built in)
flagCols = ['telecommuting', 'has_company_logo', 'has_questions']
flagged = jobsW2V.withColumnRenamed('features', 'feat')\
    .select('job_id', 'label', 'feat', *[col('bools')[idx].alias(flag) for (idx, flag) in enumerate(flagCols)])
vecass = VectorAssembler(inputCols=flagCols + ['feat'], outputCol='features')
jobsW2V = vecass.transform(flagged).drop('feat', *flagCols)

# resample by the label, then split for cross-val
# NOTE(review): the test split is drawn from the resampled data, so test
# metrics reflect the resampled class balance, not the raw one
trn, tst = jobsW2V.select('job_id', 'label', 'features').sampleBy('label', fractions=subsample, seed=randSeed)\
    .randomSplit([trainPerc, 1.0 - trainPerc], seed=randSeed)

# fit a random forest (seeded so repeated runs build the same trees)
estim = RandomForestClassifier(numTrees=20, seed=randSeed)
fitModel = estim.fit(trn)
trainRes = fitModel.evaluate(trn)
trainAcc = acc.evaluate(trainRes.predictions)

# now evaluate test accuracy
testRes = fitModel.transform(tst)
testAcc = acc.evaluate(testRes)

print('Train True Positive Rate = %0.5f, Test True Positive Rate = %0.5f'%(trainAcc, testAcc))

# show the results for fraudulents
print('Training')
trainRes.predictions.select('job_id', 'label', 'prediction').where(col('label')==1).show(10)
print('Testing')
testRes.select('job_id', 'label', 'prediction').where(col('label')==1).show(10)

In [None]:
# view feature importances for random forest from word2vec
# importances belong to the fitted model (fitModel); the unfitted estimator
# (estim) has no featureImportances attribute
imports = fitModel.featureImportances.toArray()
# one row per feature index, most important first
imports = pd.DataFrame(data=imports, columns=['Importance']).sort_values(by='Importance', ascending=False)
display(imports)

In [None]:
# shut down the Spark context now that the notebook is finished with it
sc.stop()