In [20]:
import time
from pyspark.sql import SQLContext
from pyspark import SparkContext
sc.stop()
sc = SparkContext()
sqlContext = SQLContext(sc)
data = sqlContext.read.format('com.databricks.spark.csv').options(header ='true',inferschema ='true').load('D:/jcu/5851/A4/unit_2020.csv')
print(data.count())

1200


In [21]:
data = data.select(['unit description','Category'])
data.printSchema()

root
 |-- unit description: string (nullable = true)
 |-- Category: string (nullable = true)



In [4]:
# Show the top 20 group label and count
from pyspark.sql.functions import col
data.groupBy('Category').count().orderBy(col('count').desc()).show()

+--------+-----+
|Category|count|
+--------+-----+
|    EDST|   68|
|    MMBA|   65|
|    LAWS|   48|
|    ACCG|   44|
|    COMP|   43|
|    TRAN|   41|
|    AFCP|   41|
|    STAT|   39|
|    MGMT|   36|
|    PICT|   36|
|    SPED|   32|
|    AFIN|   32|
|    PICX|   32|
|    MEDI|   29|
|    MKTG|   28|
|    PSYN|   27|
|    AFCL|   27|
|    MMCC|   25|
|    PHTY|   25|
|    GMBA|   24|
+--------+-----+
only showing top 20 rows



In [22]:
# NLP method 1 : word count vectors
# ML method 1 : Logistic Regression
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
 
# inputCol: description
# outputCol: stop words removed
regexTokenizer2 = RegexTokenizer(inputCol='unit description', outputCol='words', pattern='\\W')
# stop words
add_stopwords = ['unit','study','course','studies','field','students', 'faculty','staff','be','work','form','this']
stopwords_remover2 = StopWordsRemover(inputCol='words', outputCol='filtered').setStopWords(add_stopwords)
# words vector
count_vectors2 = CountVectorizer(inputCol='filtered', outputCol='features', vocabSize=10000, minDF=5)

from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
label_stringIdx = StringIndexer(inputCol='Category', outputCol='label')
pipeline = Pipeline(stages=[regexTokenizer2, stopwords_remover2, count_vectors2, label_stringIdx])
# fit the pipeline to documents
pipeline_fit = pipeline.fit(data)
dataset = pipeline_fit.transform(data)
#dataset.filter(dataset['Category'] == 'ACST').select(['filtered','features','Category','label']).show(10)

In [23]:
# set seed for reproducibility
# training / test setting，7:3
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
print('Training Dataset Count:{}'.format(trainingData.count()))
print('Test Dataset Count:{}'.format(testData.count()))

Training Dataset Count:829
Test Dataset Count:371


In [28]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
start_time = time.time()
lr = LogisticRegression(maxIter=50, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
#predictions.filter(predictions['Category'] == 'EDST').select('Category','prediction').orderBy('probability', accending=False).show(n=10, truncate=30)

# predictionCol: prediction column
evaluator2 = MulticlassClassificationEvaluator(predictionCol='prediction')
# accuracy
print("Accuracy: " + str(evaluator2.evaluate(predictions)))
end_time = time.time()
print("Running time: " + str(end_time - start_time))

Accuracy: 0.4762085226558802
Running time: 3.90097713470459


In [30]:
# NLP method 1 : word count vectors
# ML method 2 : Naive Bayes
from pyspark.ml.classification import NaiveBayes
start_time = time.time()
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)
predictions = model.transform(testData)
#predictions.filter(predictions['prediction'] == 16) \
#     .select( 'Category', 'probability', 'label', 'prediction') \
#     .orderBy('probability', ascending=False) \
#     .show(n=10, truncate=30)
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction')
print("Accuracy: " + str(evaluator.evaluate(predictions)))
end_time = time.time()
print("Running time: " + str(end_time - start_time))

Accuracy: 0.3633177066932591
Running time: 1.7193336486816406


In [10]:
# NLP method 1 : word count vectors
# ML method 3 : Random Forest
from pyspark.ml.classification import RandomForestClassifier
start_time = time.time()
rf = RandomForestClassifier(labelCol='label', \
                             featuresCol='features', \
                             numTrees=100, \
                             maxDepth=10, \
                             maxBins=64)
# Train model with Training Data
rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testData)
#predictions.filter(predictions['prediction'] == 16) \
#     .select('Category','probability','label','prediction') \
#     .orderBy('probability', ascending=False) \
#     .show(n = 10, truncate = 30)
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction')
print("Accuracy: " + str(evaluator.evaluate(predictions)))
end_time = time.time()
print("Running time: " + str(end_time - start_time))

Accuracy: 0.36603501384644005
Running time: 9.490335464477539


In [11]:
# NLP method 2 : TF-IDF
# ML method 1 : Logistic Regression
from pyspark.ml.feature import HashingTF, IDF
start_time = time.time()
hashingTF = HashingTF(inputCol='filtered', outputCol='rawFeatures', numFeatures=10000)
idf = IDF(inputCol='rawFeatures', outputCol='features', minDocFreq=5)
pipeline = Pipeline(stages=[regexTokenizer2, stopwords_remover2, hashingTF, idf, label_stringIdx])
pipeline_fit = pipeline.fit(data)
dataset = pipeline_fit.transform(data)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
 
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lr_model = lr.fit(trainingData)
predictions = lr_model.transform(testData)
#predictions.filter(predictions['Category'] == 'EDST').select('Category','prediction').\
#orderBy('probability', ascending=False).show(n=10, truncate=30)
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction')
print("Accuracy: " + str(evaluator.evaluate(predictions)))
end_time = time.time()
print("Running time: " + str(end_time - start_time))

Accuracy: 0.4568939323939996
Running time: 8.052201986312866


In [12]:
# NLP method 2 : TF-IDF
# ML method 2 : Naive Bayes
#from pyspark.ml.classification import NaiveBayes
start_time = time.time()
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)
predictions = model.transform(testData)
#predictions.filter(predictions['prediction'] == 16) \
#     .select( 'Category', 'probability', 'label', 'prediction') \
#     .orderBy('probability', ascending=False) \
#     .show(n=10, truncate=30)
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction')
print("Accuracy: " + str(evaluator.evaluate(predictions)))
end_time = time.time()
print("Running time: " + str(end_time - start_time))

Accuracy: 0.44377322242332906
Running time: 3.4365761280059814


In [13]:
# NLP method 2 : TF-IDF
# ML method 3 : Random Forest
#from pyspark.ml.classification import RandomForestClassifier
start_time = time.time()
rf = RandomForestClassifier(labelCol='label', \
                             featuresCol='features', \
                             numTrees=100, \
                             maxDepth=10, \
                             maxBins=64)
# Train model with Training Data
rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testData)
#predictions.filter(predictions['prediction'] == 16) \
#     .select('Category','probability','label','prediction') \
#     .orderBy('probability', ascending=False) \
#     .show(n = 10, truncate = 30)
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction')
print("Accuracy: " + str(evaluator.evaluate(predictions)))
end_time = time.time()
print("Running time: " + str(end_time - start_time))

Accuracy: 0.3558465402119936
Running time: 11.208366632461548


In [18]:
# NLP method 3 : tagging + TF-IDF
# ML method 1 : Logistic Regression
import nltk
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.ml.feature import HashingTF, IDF
#from nltk.stem.porter import *

tags = set(['NN','NNS','NNP','NNPS','JJ','VB','VBG','VBN'])

def pos_tag(text):
    text2 = nltk.word_tokenize(text.lower())
    pos_tags = nltk.pos_tag(text2)
    ret = []
    for word,pos in pos_tags:
        if (pos in tags and word not in add_stopwords):
            ret.append(word)
    ret= sorted(set(ret))
    return ret
udfValueToList = udf(pos_tag, ArrayType(StringType()))
data = data.withColumn('filtered2', udfValueToList('unit description'))


hashingTF = HashingTF(inputCol='filtered2', outputCol='rawFeatures', numFeatures=10000)
idf = IDF(inputCol='rawFeatures', outputCol='features', minDocFreq=5)
pipeline = Pipeline(stages=[hashingTF,idf, label_stringIdx])
pipeline_fit = pipeline.fit(data)
dataset = pipeline_fit.transform(data)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)

In [19]:
# NLP method 3 : tagging + TF-IDF
# ML method 1 : Logistic Regression
start_time = time.time()
lr = LogisticRegression(maxIter=30, regParam=0.3, elasticNetParam=0)
lr_model = lr.fit(trainingData)
predictions = lr_model.transform(testData)
#predictions.filter(predictions['Category'] == 'EDST').select('Category','prediction').\
#orderBy('probability', ascending=False).show(n=10, truncate=30)
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction')
print("Accuracy: " + str(evaluator.evaluate(predictions)))
end_time = time.time()
print("Running time: " + str(end_time - start_time))

Accuracy: 0.4462921180450076
Running time: 217.89178943634033


In [16]:
# NLP method 3 : tagging + TF-IDF
# ML method 2 : Naive Bayes
start_time = time.time()
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)
predictions = model.transform(testData)
#predictions.filter(predictions['Category'] == 'EDST').select('Category','prediction').\
#orderBy('probability', ascending=False).show(n=10, truncate=30)
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction')
print("Accuracy: " + str(evaluator.evaluate(predictions)))
end_time = time.time()
print("Running time: " + str(end_time - start_time))

Accuracy: 0.42009022259059275
Running time: 195.4695110321045


In [17]:
# NLP method 3 : tagging + TF-IDF
# ML method 3 : Random Forest
start_time = time.time()
rf = RandomForestClassifier(labelCol='label', \
                             featuresCol='features', \
                             numTrees=100, \
                             maxDepth=10, \
                             maxBins=64)
# Train model with Training Data
rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testData)
#predictions.filter(predictions['Category'] == 'EDST').select('Category','prediction').\
#orderBy('probability', ascending=False).show(n=10, truncate=30)
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction')
print("Accuracy: " + str(evaluator.evaluate(predictions)))
end_time = time.time()
print("Running time: " + str(end_time - start_time))

Accuracy: 0.3451141350948708
Running time: 343.14731669425964


In [32]:
# NLP method 4 : word2vec
# ML method 1 : Logistic Regression
from pyspark.ml.feature import Word2Vec
#from pyspark.ml.classification import LogisticRegression
 
# inputCol: description
# outputCol: stop words removed
#regexTokenizer = RegexTokenizer(inputCol='unit description', outputCol='words', pattern='\\W')
# stop words
#add_stopwords = ['unit','study','course','studies','field','students', 'faculty','staff','be','work','form','this']
#stopwords_remover = StopWordsRemover(inputCol='words', outputCol='filtered2').setStopWords(add_stopwords)
word2vec = Word2Vec(inputCol="filtered", outputCol="features")
#label_stringIdx = StringIndexer(inputCol='Category', outputCol='label')
pipeline = Pipeline(stages=[regexTokenizer2,stopwords_remover2,word2vec, label_stringIdx])
pipeline_fit = pipeline.fit(data)
dataset = pipeline_fit.transform(data)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)

In [33]:
# NLP method 4 : word2vec
# ML method 1 : Logistic Regression
start_time = time.time()
lr = LogisticRegression(maxIter=100, regParam=0.3, elasticNetParam=0)
lr_model = lr.fit(trainingData)
predictions = lr_model.transform(testData)
#predictions.filter(predictions['Category'] == 'EDST').select('Category','prediction').\
#orderBy('probability', ascending=False).show(n=10, truncate=30)
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction')
print("Accuracy: " + str(evaluator.evaluate(predictions)))
end_time = time.time()
print("Running time: " + str(end_time - start_time))

Accuracy: 0.11781491525923361
Running time: 5.182934045791626


In [161]:
# NLP method 4 : word2vec
# ML method 2 : Naive Bayes
start_time = time.time()
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)
predictions = model.transform(testData)
#predictions.filter(predictions['Category'] == 'EDST').select('Category','prediction').\
#orderBy('probability', ascending=False).show(n=10, truncate=30)
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction')
print("Accuracy: " + str(evaluator.evaluate(predictions)))
end_time = time.time()
print("Running time: " + str(end_time - start_time))

Py4JJavaError: An error occurred while calling o8910.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 296.0 failed 1 times, most recent failure: Lost task 0.0 in stage 296.0 (TID 494, localhost, executor driver): java.lang.IllegalArgumentException: requirement failed: Naive Bayes requires nonnegative feature values but found [0.0494010655498552,0.021606753440239834,0.03914478185899617,-0.010815896261669506,-0.013193660172381912,0.008746300763870422,-0.019435288443241543,-0.05210639878470054,-0.01974282363159143,0.006487035205431252,0.01258337291574953,-0.01793211557116115,0.02767387569546089,0.028060258805110563,0.0021220994130402926,-0.018086374219655837,0.015663163044888788,5.990019655923863E-5,-0.015545790104699307,0.010385167792622673,-0.01011081404108974,-0.005657448878007956,0.0356521494782026,-0.00463165836117696,0.0012773031485266984,0.018086207218536893,0.027082354590854014,-0.004533920131268774,0.02622583066256427,0.035140570712354054,-0.025626410573928955,-0.014307745552972937,0.0409695204828301,0.05059781316934978,0.03229498086768187,0.011430295413268395,0.043372122706849224,0.008638833594111512,-0.0068286227989850234,-0.03399408387295047,-0.010867463607058602,-0.010933131236343485,-0.005080345591299282,0.00528033949114138,-0.021758115425282235,0.023224034459238537,-0.027428383927040383,3.1452208994048054E-4,-0.03956714216700839,0.02657825886710837,0.022845761801810846,0.020180753922685372,-0.026208036883112015,-0.02289906810476735,-0.005707501296381481,-0.045015290526474724,-0.0876712493205321,0.039622954285673063,0.03158680147197495,0.011706670907471085,0.06400539029340001,-0.011364914955891914,0.024470934901981937,0.0059897492956522795,0.0017948999812971557,0.023944155881314953,-5.355746939503512E-4,-0.0043604756489808326,0.04518240590514921,8.217359747027704E-4,-0.020594808064996587,-0.09719260036754498,-0.028972812428544218,0.025573145878237873,-0.015295128809902666,-0.0035200267476716736,0.007756556983756619,-0.017497207413114547,-0.048209637827163594,0.002353464163549733,-0.06571633131868305,0.0663415342798365,-0.025845657593234764,0.019452739978346546,0.03535028442417913,0.029693517266184886,0.015198410638478264,-0.020510618319185298,-0.03502172753634695,-0.010170216632045074,0.0015087818352841453,-0.050863066673866614,-0.030792137170897523,-0.0013738260842997153,-0.017560946552815742,-0.0029132748013133277,0.033888756848712356,0.028960619080758303,0.03062076564424015,0.0026270865444113028].
	at scala.Predef$.require(Predef.scala:224)
	at org.apache.spark.ml.classification.NaiveBayes$.requireNonnegativeValues(NaiveBayes.scala:235)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1$$anonfun$4.apply(NaiveBayes.scala:144)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1$$anonfun$4.apply(NaiveBayes.scala:144)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1$$anonfun$7.apply(NaiveBayes.scala:168)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1$$anonfun$7.apply(NaiveBayes.scala:166)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$aggregateByKey$1$$anonfun$apply$6.apply(PairRDDFunctions.scala:172)
	at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:189)
	at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:188)
	at org.apache.spark.util.collection.AppendOnlyMap.changeValue(AppendOnlyMap.scala:144)
	at org.apache.spark.util.collection.SizeTrackingAppendOnlyMap.changeValue(SizeTrackingAppendOnlyMap.scala:32)
	at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:194)
	at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:62)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1.apply(NaiveBayes.scala:176)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1.apply(NaiveBayes.scala:129)
	at org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:185)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:185)
	at org.apache.spark.ml.classification.NaiveBayes.trainWithLabelCheck(NaiveBayes.scala:129)
	at org.apache.spark.ml.classification.NaiveBayes.train(NaiveBayes.scala:118)
	at org.apache.spark.ml.classification.NaiveBayes.train(NaiveBayes.scala:78)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:82)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.IllegalArgumentException: requirement failed: Naive Bayes requires nonnegative feature values but found [0.0494010655498552,0.021606753440239834,0.03914478185899617,-0.010815896261669506,-0.013193660172381912,0.008746300763870422,-0.019435288443241543,-0.05210639878470054,-0.01974282363159143,0.006487035205431252,0.01258337291574953,-0.01793211557116115,0.02767387569546089,0.028060258805110563,0.0021220994130402926,-0.018086374219655837,0.015663163044888788,5.990019655923863E-5,-0.015545790104699307,0.010385167792622673,-0.01011081404108974,-0.005657448878007956,0.0356521494782026,-0.00463165836117696,0.0012773031485266984,0.018086207218536893,0.027082354590854014,-0.004533920131268774,0.02622583066256427,0.035140570712354054,-0.025626410573928955,-0.014307745552972937,0.0409695204828301,0.05059781316934978,0.03229498086768187,0.011430295413268395,0.043372122706849224,0.008638833594111512,-0.0068286227989850234,-0.03399408387295047,-0.010867463607058602,-0.010933131236343485,-0.005080345591299282,0.00528033949114138,-0.021758115425282235,0.023224034459238537,-0.027428383927040383,3.1452208994048054E-4,-0.03956714216700839,0.02657825886710837,0.022845761801810846,0.020180753922685372,-0.026208036883112015,-0.02289906810476735,-0.005707501296381481,-0.045015290526474724,-0.0876712493205321,0.039622954285673063,0.03158680147197495,0.011706670907471085,0.06400539029340001,-0.011364914955891914,0.024470934901981937,0.0059897492956522795,0.0017948999812971557,0.023944155881314953,-5.355746939503512E-4,-0.0043604756489808326,0.04518240590514921,8.217359747027704E-4,-0.020594808064996587,-0.09719260036754498,-0.028972812428544218,0.025573145878237873,-0.015295128809902666,-0.0035200267476716736,0.007756556983756619,-0.017497207413114547,-0.048209637827163594,0.002353464163549733,-0.06571633131868305,0.0663415342798365,-0.025845657593234764,0.019452739978346546,0.03535028442417913,0.029693517266184886,0.015198410638478264,-0.020510618319185298,-0.03502172753634695,-0.010170216632045074,0.0015087818352841453,-0.050863066673866614,-0.030792137170897523,-0.0013738260842997153,-0.017560946552815742,-0.0029132748013133277,0.033888756848712356,0.028960619080758303,0.03062076564424015,0.0026270865444113028].
	at scala.Predef$.require(Predef.scala:224)
	at org.apache.spark.ml.classification.NaiveBayes$.requireNonnegativeValues(NaiveBayes.scala:235)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1$$anonfun$4.apply(NaiveBayes.scala:144)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1$$anonfun$4.apply(NaiveBayes.scala:144)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1$$anonfun$7.apply(NaiveBayes.scala:168)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1$$anonfun$7.apply(NaiveBayes.scala:166)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$aggregateByKey$1$$anonfun$apply$6.apply(PairRDDFunctions.scala:172)
	at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:189)
	at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:188)
	at org.apache.spark.util.collection.AppendOnlyMap.changeValue(AppendOnlyMap.scala:144)
	at org.apache.spark.util.collection.SizeTrackingAppendOnlyMap.changeValue(SizeTrackingAppendOnlyMap.scala:32)
	at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:194)
	at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:62)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [34]:
# NLP method 4 : word2vec
# ML method 3 : Random Forest
start_time = time.time()
rf = RandomForestClassifier(labelCol='label', \
                             featuresCol='features', \
                             numTrees=100, \
                             maxDepth=10, \
                             maxBins=64)
# Train model with Training Data
rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testData)
#predictions.filter(predictions['Category'] == 'EDST').select('Category','prediction').\
#orderBy('probability', ascending=False).show(n=10, truncate=30)
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction')
print("Accuracy: " + str(evaluator.evaluate(predictions)))
end_time = time.time()
print("Running time: " + str(end_time - start_time))

Accuracy: 0.23010140579723676
Running time: 60.165497064590454


In [35]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
#from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
#from pyspark.ml.classification import LogisticRegression
#from pyspark.ml import Pipeline
#from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
#from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#data = sqlContext.read.format('com.databricks.spark.csv').options(header ='true',inferschema ='true').load('D:/jcu/5851/A4/unit_2020.csv')
#data = data.select(['unit description','Category'])
start_time = time.time()
#regexTokenizer = RegexTokenizer(inputCol='unit description', outputCol='words2', pattern='\\W')
# stop words
#add_stopwords = ['unit','study','course','studies','field','students', 'faculty','staff','be','work','form','this']
#stopwords_remover = StopWordsRemover(inputCol='words2', outputCol='filtered3').setStopWords(add_stopwords)
# words vector
#count_vectors = CountVectorizer(inputCol='filtered3', outputCol='features', vocabSize=10000, minDF=5)
#label_stringIdx = StringIndexer(inputCol='Category', outputCol='label')
pipeline = Pipeline(stages=[regexTokenizer2, stopwords_remover2, count_vectors2, label_stringIdx])

pipeline_fit = pipeline.fit(data)
dataset = pipeline_fit.transform(data)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
lr = LogisticRegression(maxIter=50, regParam=0.3, elasticNetParam=0)
# 为交叉验证创建参数
# ParamGridBuilder：用于基于网格搜索的模型选择的参数网格的生成器
# addGrid：将网格中给定参数设置为固定值
# parameter：正则化参数
# maxIter：迭代次数
# numFeatures：特征值
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.3, 0.5])
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2])
             .addGrid(lr.maxIter, [10, 50, 80])
#              .addGrid(idf.numFeatures, [10, 100, 1000])
             .build())

# 创建五折交叉验证
# estimator：要交叉验证的估计器
# estimatorParamMaps：网格搜索的最优参数
# evaluator：评估器
# numFolds：交叉次数
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction')
cv = CrossValidator(estimator=lr,\
                   estimatorParamMaps=paramGrid,\
                   evaluator=evaluator2,\
                   numFolds=10)
cv_model = cv.fit(trainingData)
predictions = cv_model.transform(testData)
 
# 模型评估

print("Accuracy: " + str(evaluator.evaluate(predictions)))
end_time = time.time()
print("Running time: " + str(end_time - start_time))

Accuracy: 0.48100762430372046
Running time: 1679.3602044582367
