In [1]:
import os

In [2]:
sc

In [5]:
###Create Dataframe###
from pyspark.sql import SQLContext, Row
#load the historical file and convert lines to rows
lines = sc.textFile("file:///Users/Administrator/Desktop/spark/out3.txt")
words = lines.map(lambda l: l.split("\t"))
header = words.first()
words = words.filter(lambda x:x != header)
data= sqlContext.createDataFrame(words, ["Title", "Description", "Topic","nT"])
data.show(5)
#omit column "nT" from dataframe
data = data.select(["Title","Description","Topic"])
data.show(5)
data.printSchema()

+--------------------+--------------------+---------------+---+
|               Title|         Description|          Topic| nT|
+--------------------+--------------------+---------------+---+
|"Meek Mill Wishes...|"'I wasn't feelin...|"entertainment"|"1"|
|"A Victim Of Reve...| "Don't leak nudes."|"entertainment"|"1"|
|"Make A Pitcher O...|   "It's wine time."|"entertainment"|"1"|
|"Create A Summer ...|"Which tune will ...|"entertainment"|"1"|
|"Everyone Is A Co...|"Are you a combo ...|"entertainment"|"1"|
+--------------------+--------------------+---------------+---+
only showing top 5 rows

+--------------------+--------------------+---------------+
|               Title|         Description|          Topic|
+--------------------+--------------------+---------------+
|"Meek Mill Wishes...|"'I wasn't feelin...|"entertainment"|
|"A Victim Of Reve...| "Don't leak nudes."|"entertainment"|
|"Make A Pitcher O...|   "It's wine time."|"entertainment"|
|"Create A Summer ...|"Which tune will 

In [6]:
#look at count for topics
from pyspark.sql.functions import col
data.groupBy("Topic") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+---------------+-----+
|          Topic|count|
+---------------+-----+
|       "sports"|  885|
|   "technology"|  743|
|     "business"|  679|
|      "science"|  674|
|"entertainment"|  647|
|       "health"|  380|
+---------------+-----+



In [8]:
###Model Pipeline, 'Title' Feature###
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
# regular expression tokenizer
regexTokenizer1 = RegexTokenizer(inputCol="Title", outputCol="words1", pattern="\\W")
# stop words
add_stopwords =StopWordsRemover.loadDefaultStopWords("english")
stopwordsRemover1 = StopWordsRemover(inputCol="words1", outputCol="filtered1").setStopWords(add_stopwords)
# bag of words count
countVectors1 = CountVectorizer(inputCol="filtered1", outputCol="features", vocabSize=10000, minDF=5)
#String Indexer 
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
label_stringIdx = StringIndexer(inputCol = "Topic", outputCol = "label")
pipeline = Pipeline(stages=[regexTokenizer1,stopwordsRemover1,countVectors1,label_stringIdx])

In [9]:
# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
dataset.show(5)
##predictive modeling, 'Title'##
###Split Data to Training and Test###
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

+--------------------+--------------------+---------------+--------------------+--------------------+--------------------+-----+
|               Title|         Description|          Topic|              words1|           filtered1|            features|label|
+--------------------+--------------------+---------------+--------------------+--------------------+--------------------+-----+
|"Meek Mill Wishes...|"'I wasn't feelin...|"entertainment"|[meek, mill, wish...|[meek, mill, wish...|  (1678,[818],[1.0])|  4.0|
|"A Victim Of Reve...| "Don't leak nudes."|"entertainment"|[a, victim, of, r...|[victim, revenge,...|(1678,[175,1294,1...|  4.0|
|"Make A Pitcher O...|   "It's wine time."|"entertainment"|[make, a, pitcher...|[make, pitcher, s...|(1678,[30,31,83,1...|  4.0|
|"Create A Summer ...|"Which tune will ...|"entertainment"|[create, a, summe...|[create, summer, ...|(1678,[83,125,526...|  4.0|
|"Everyone Is A Co...|"Are you a combo ...|"entertainment"|[everyone, is, a,...|[everyone, combin

In [11]:
###Predictive Modeling###
#Naive Bayes
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)
predictions = model.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("Title","Topic","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)   

+------------------------------+--------+------------------------------+-----+----------+
|                         Title|   Topic|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|"Brighton & Hove Albion 1-0...|"sports"|[0.9999999999993969,2.15979...|  0.0|       0.0|
|"Brighton & Hove Albion 1-0...|"sports"|[0.9999999999993969,2.15979...|  0.0|       0.0|
|"Brighton and Hove Albion F...|"sports"|[0.9999999999992071,7.07767...|  0.0|       0.0|
|"Brighton and Hove Albion F...|"sports"|[0.9999999999992071,7.07767...|  0.0|       0.0|
|"Brighton and Hove Albion 1...|"sports"|[0.9999999875306452,1.03292...|  0.0|       0.0|
|"Brighton and Hove Albion 1...|"sports"|[0.9999999875306452,1.03292...|  0.0|       0.0|
|"West Ham 0 Manchester Unit...|"sports"|[0.9999999678586534,1.88063...|  0.0|       0.0|
|"Where to stream Brighton &...|"sports"|[0.9999999661949992,2.95452...|  0.0|       0.0|
|"Manchest

0.7916678392396832

In [12]:
#Logistic Regression
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("Title","Topic","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

+------------------------------+--------+------------------------------+-----+----------+
|                         Title|   Topic|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|"Brighton and Hove Albion F...|"sports"|[0.9961809602864691,7.31110...|  0.0|       0.0|
|"Brighton and Hove Albion F...|"sports"|[0.9961809602864691,7.31110...|  0.0|       0.0|
|"Brighton & Hove Albion 1-0...|"sports"|[0.996067917718141,0.001187...|  0.0|       0.0|
|"Brighton & Hove Albion 1-0...|"sports"|[0.996067917718141,0.001187...|  0.0|       0.0|
|"Manchester City 0 Huddersf...|"sports"|[0.9861069366627296,0.00248...|  0.0|       0.0|
|"Manchester City 3 Brighton...|"sports"|[0.9700885037373322,0.00532...|  0.0|       0.0|
|"West Ham 0 Manchester Unit...|"sports"|[0.9638525210049624,0.01431...|  0.0|       0.0|
|"Juventus 4 AC Milan 0: Don...|"sports"|[0.960848109454052,0.011764...|  0.0|       0.0|
|"Women's 

0.8260784656783743