In [1]:
import os

In [2]:
# Echo the SparkContext handle so the cell output shows the active context.
# NOTE(review): `sc` is not created anywhere in this notebook; presumably it is
# injected by the PySpark shell/kernel — confirm before running on a plain kernel.
sc

In [3]:
### Build the `data` DataFrame from the tab-separated source file ###
from pyspark.sql import SQLContext, Row
from pyspark.sql.functions import col

# Read the raw text file and split every line on tab characters.
# NOTE(review): `sc` and `sqlContext` are not defined in this notebook;
# presumably the PySpark shell/kernel provides them — confirm.
lines = sc.textFile("file:///Users/Administrator/Desktop/spark/out3.txt")
parsed = lines.map(lambda line: line.split("\t"))

# The first record is taken as the header; every record equal to it is dropped.
header = parsed.first()
words = parsed.filter(lambda record: record != header)

# Name the four tab-separated fields and preview the result.
data_all_columns = sqlContext.createDataFrame(
    words,
    ["Title", "Description", "Topic", "nT"],
)
data_all_columns.show(5)

# Keep only the three text columns ("nT" is left out), then preview again.
data = data_all_columns.select("Title", "Description", "Topic")
data.show(5)
data.printSchema()

# Number of rows per topic, largest topic first.
data.groupBy("Topic").count().orderBy("count", ascending=False).show()

+--------------------+--------------------+---------------+---+
|               Title|         Description|          Topic| nT|
+--------------------+--------------------+---------------+---+
|"Meek Mill Wishes...|"'I wasn't feelin...|"entertainment"|"1"|
|"A Victim Of Reve...| "Don't leak nudes."|"entertainment"|"1"|
|"Make A Pitcher O...|   "It's wine time."|"entertainment"|"1"|
|"Create A Summer ...|"Which tune will ...|"entertainment"|"1"|
|"Everyone Is A Co...|"Are you a combo ...|"entertainment"|"1"|
+--------------------+--------------------+---------------+---+
only showing top 5 rows

+--------------------+--------------------+---------------+
|               Title|         Description|          Topic|
+--------------------+--------------------+---------------+
|"Meek Mill Wishes...|"'I wasn't feelin...|"entertainment"|
|"A Victim Of Reve...| "Don't leak nudes."|"entertainment"|
|"Make A Pitcher O...|   "It's wine time."|"entertainment"|
|"Create A Summer ...|"Which tune will 

In [5]:
### Feature/label pipeline built on the 'Description' column ###
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import (
    CountVectorizer,
    OneHotEncoder,
    RegexTokenizer,
    StopWordsRemover,
    StringIndexer,
    VectorAssembler,
)

# Stage 1: tokenize each description using the regex "\W" (non-word characters).
regexTokenizer1 = RegexTokenizer(
    inputCol="Description",
    outputCol="words1",
    pattern="\\W",
)

# Stage 2: remove Spark's default English stop words from the tokens.
add_stopwords = StopWordsRemover.loadDefaultStopWords("english")
stopwordsRemover1 = StopWordsRemover(
    inputCol="words1",
    outputCol="filtered1",
    stopWords=add_stopwords,
)

# Stage 3: bag-of-words counts written to "features"
# (vocabSize=10000, minDF=5 as configured here).
countVectors1 = CountVectorizer(
    inputCol="filtered1",
    outputCol="features",
    vocabSize=10000,
    minDF=5,
)

# Stage 4: index the "Topic" strings into a numeric "label" column.
label_stringIdx = StringIndexer(inputCol="Topic", outputCol="label")

# Chain the four stages in the order they must run.
pipeline = Pipeline(
    stages=[
        regexTokenizer1,
        stopwordsRemover1,
        countVectors1,
        label_stringIdx,
    ]
)

In [6]:
# Fit every pipeline stage on `data`, then apply the fitted stages to the
# same rows to obtain the "features" and "label" columns.
# NOTE(review): the pipeline is fitted on all of `data`; the train/test split in
# a later cell is taken from `dataset`, so the fitted stages have already seen
# the rows that end up in the test set.
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)

# Preview the transformed rows.
dataset.show(n=5)

+--------------------+--------------------+---------------+--------------------+--------------------+--------------------+-----+
|               Title|         Description|          Topic|              words1|           filtered1|            features|label|
+--------------------+--------------------+---------------+--------------------+--------------------+--------------------+-----+
|"Meek Mill Wishes...|"'I wasn't feelin...|"entertainment"|[i, wasn, t, feel...|     [wasn, feeling]|(3064,[914,2930],...|  4.0|
|"A Victim Of Reve...| "Don't leak nudes."|"entertainment"|[don, t, leak, nu...|       [leak, nudes]|        (3064,[],[])|  4.0|
|"Make A Pitcher O...|   "It's wine time."|"entertainment"| [it, s, wine, time]|        [wine, time]|   (3064,[10],[1.0])|  4.0|
|"Create A Summer ...|"Which tune will ...|"entertainment"|[which, tune, wil...|[tune, dominate, ...|        (3064,[],[])|  4.0|
|"Everyone Is A Co...|"Are you a combo ...|"entertainment"|[are, you, a, com...|   [combo, bey, r

In [7]:
### Split the transformed rows into training and test sets ###
# Random 70/30 split of `dataset`, with the seed fixed at 100.
trainingData, testData = dataset.randomSplit([0.7, 0.3], seed=100)

# Report how many rows landed in each part.
train_count = trainingData.count()
test_count = testData.count()
print("Training Dataset Count: " + str(train_count))
print("Test Dataset Count: " + str(test_count))

Training Dataset Count: 2807
Test Dataset Count: 1201


In [8]:
# Naive Bayes classifier on the train/test split
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Train on the training rows and score the held-out rows.
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)
predictions = model.transform(testData)

# Show ten rows predicted as class 0, sorted by the "probability" column
# in descending order; long cell values are cut at 30 characters.
(
    predictions
    .where(predictions.prediction == 0)
    .select("Description", "Topic", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30)
)

# Score the predictions. No metricName is passed, so the evaluator's default
# metric is what gets reported (f1 per the Spark docs — confirm for your version).
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

+------------------------------+--------+------------------------------+-----+----------+
|                   Description|   Topic|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|"Diego Costa ensured Arsene...|"sports"|[1.0,6.834478772328512E-19,...|  0.0|       0.0|
|"This will be the first top...|"sports"|[1.0,3.8968441942431376E-19...|  0.0|       0.0|
|"SAN JOSE -- If there were ...|"sports"|[1.0,1.051270096485701E-19,...|  0.0|       0.0|
|"WASHINGTON -- The Washingt...|"sports"|[1.0,2.3636521666498692E-20...|  0.0|       0.0|
|"SHARKS at GOLDEN KNIGHTS10...|"sports"|[1.0,1.9809013430965372E-23...|  0.0|       0.0|
|"LAS VEGAS -- The Vegas Gol...|"sports"|[1.0,6.430321009640053E-28,...|  0.0|       0.0|
|"Philadelphia 76ers  point ...|"sports"|[0.9999999999999998,4.16331...|  0.0|       0.0|
|"The Nashville Predators pl...|"sports"|[0.9999999999999989,2.79997...|  0.0|       0.0|
|"Pascal G

0.7968641494925777

In [9]:
# Logistic regression on the same train/test split
# NOTE: LogisticRegression and MulticlassClassificationEvaluator are not imported
# in this cell; it relies on the imports made in earlier cells.
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)

# `predictions` is rebound here and replaces the frame from the previous model.
predictions = lrModel.transform(testData)

# Show ten rows predicted as class 0, sorted by the "probability" column
# in descending order; long cell values are cut at 30 characters.
(
    predictions
    .where(predictions.prediction == 0)
    .select("Description", "Topic", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30)
)

# Score the predictions. No metricName is passed, so the evaluator's default
# metric is what gets reported (f1 per the Spark docs — confirm for your version).
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

+------------------------------+--------+------------------------------+-----+----------+
|                   Description|   Topic|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|"Diego Costa ensured Arsene...|"sports"|[0.9949220203064796,0.00343...|  0.0|       0.0|
|"Get a report of the Mumbai...|"sports"|[0.9913926136601412,0.00220...|  0.0|       0.0|
|"Steven Gerrard is expected...|"sports"|[0.9862180200619127,0.00447...|  0.0|       0.0|
|"The Ottawa Senators will h...|"sports"|[0.9854441483057239,0.00477...|  0.0|       0.0|
|"Philadelphia 76ers  point ...|"sports"|[0.9848558820974237,0.00379...|  0.0|       0.0|
|"NASHVILLE -- The list of p...|"sports"|[0.9847760351660065,0.00446...|  0.0|       0.0|
|"LAS VEGAS -- The Vegas Gol...|"sports"|[0.9822609595076545,0.00569...|  0.0|       0.0|
|"Bayern Munich defender Jer...|"sports"|[0.9807110655041338,0.00372...|  0.0|       0.0|
|"Cardiff 

0.8418878223488463