In [1]:
import threading

# Helper thread that keeps the Spark StreamingContext from blocking Jupyter
        
class StreamingThread(threading.Thread):
    """Background thread that drives a streaming context.

    The thread's ``run`` starts the wrapped context and then waits on its
    termination, so the blocking wait happens off the caller's thread.

    Attributes are set in ``__init__``:
        ssc: the streaming context object handed in by the caller. It must
            provide ``start()``, ``awaitTermination()`` and
            ``stop(stopSparkContext=..., stopGraceFully=...)``.
    """

    def __init__(self, ssc):
        """Store the streaming context; the thread is not started here."""
        threading.Thread.__init__(self)
        self.ssc = ssc

    def run(self):
        """Thread body: start the context, then block until it terminates."""
        context = self.ssc
        context.start()
        context.awaitTermination()

    def stop(self):
        """Announce the shutdown and stop the streaming context.

        The stop request is graceful and leaves the Spark context running
        (``stopSparkContext=False``).
        """
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

In [2]:
# Display `sc`. It is not defined in this notebook — presumably the SparkContext
# pre-created by the PySpark kernel/shell; TODO confirm.
sc

In [3]:
# Display `spark`. It is not defined in this notebook — presumably the SparkSession
# pre-created by the PySpark kernel/shell (it is used below via `spark.read`); TODO confirm.
spark

In [4]:
import random
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit, lower
from pyspark.sql.types import StringType
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, SQLTransformer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StandardScaler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import col, when

Load in data

In [5]:
# Read the JSON export into a DataFrame.
df = spark.read.json('C:/Users/lenne/anaconda3/envs/AA/Advanced_Analytics/Assignment_3/spark/scripts/data_full.json')

df = (
    df
    # Set these columns aside for now: they may be useful as predictors later,
    # but the notebook did not run with them included.
    # TODO: if "frontpage" is already correct, can "posted_at" be removed, or is it
    # still needed to check whether some rows are actually frontpage or not?
    .drop("url", "domain", "posted_at", "source_title", "title", "comments", "user", "votes")
    # Binary target: 1 where "frontpage" is True, 0 otherwise.
    .withColumn("label", when(df["frontpage"] == True, 1).otherwise(0))
    # The original "frontpage" column is redundant once "label" exists.
    .drop("frontpage")
)

df.show(5)

+--------+--------------------+-----+
|     aid|         source_text|label|
+--------+--------------------+-----+
|39958086|Large Hadron Coll...|    0|
|39958094|Web Mash\n\n<\---...|    0|
|39958109|Blocked\n\n# whoa...|    0|
|39958127|Isaac Asimov obit...|    0|
|39958129|Building Computin...|    0|
+--------+--------------------+-----+
only showing top 5 rows



In [6]:
# Report the DataFrame's dimensions: row count (a Spark action) and number of columns.
num_rows, num_cols = df.count(), len(df.columns)
print("Shape of DataFrame: {} rows, {} columns".format(num_rows, num_cols))

Shape of DataFrame: 5747 rows, 3 columns


In [7]:
# List the column names, one per line.
# The loop variable is deliberately NOT called `col`: that name is the
# pyspark.sql.functions.col function imported at the top of the notebook, and a
# loop variable with the same name would overwrite it with a plain string for
# every later cell.
print("Column Names:")
for column_name in df.columns:
    print(column_name)

Column Names:
aid
source_text
label


Clean the data

In [8]:
# Stage 1: keep a single row per article id.
deduped_by_aid = df.dropDuplicates(['aid'])

# Stage 2: discard rows whose text is the empty string.
with_text = deduped_by_aid.filter(deduped_by_aid['source_text'] != '')

# Stage 3: keep a single row per distinct text.
df = with_text.dropDuplicates(['source_text'])

In [9]:
# Row count after de-duplication and removal of empty texts.
df.count()

5051

In [10]:
# Class distribution: number of rows for each value of "label".
df.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|    1|  846|
|    0| 4205|
+-----+-----+



In [11]:
# Build the normalised text expression step by step, then write it back once.
cleaned_text = lower(df["source_text"])                   # lowercase
cleaned_text = regexp_replace(cleaned_text, "\\n", " ")   # newline -> space
cleaned_text = regexp_replace(cleaned_text, "#", "")      # strip hashtags
cleaned_text = regexp_replace(cleaned_text, "\\s+", " ")  # collapse whitespace runs to one space

df = df.withColumn("source_text", cleaned_text)

In [12]:
# Preview the first 5 rows after the text normalisation above.
df.show(5)

+--------+--------------------+-----+
|     aid|         source_text|label|
+--------+--------------------+-----+
|40015851|"strong focus on ...|    1|
|40025037|"the tourists hav...|    1|
|40076792|           50 ÷ 5 = |    0|
|40050776| lss lock state s...|    0|
|40087126| zen and the art ...|    0|
+--------+--------------------+-----+
only showing top 5 rows



Create a balanced dataset (class-balanced sampling - undersampling of the majority class)

In [13]:
from pyspark.sql.functions import col

In [14]:
# Target number of rows per class and the sampling seed.
n = 800
seed = 42

# Per-label sampling fraction = target size / class size, collected to the driver
# as a {label: fraction} dict. Fractions are capped at 1.0 because sampleBy
# rejects values above 1 (which would occur for a class with fewer than n rows).
fractions = {
    label: min(1.0, fraction)
    for label, fraction in (
        df.groupBy("label").count()
          .withColumn("required_n", n / col("count"))
          .drop("count")
          .rdd.collectAsMap()
          .items()
    )
}

# Stratified sample. It is cached so that every later action (the counts below,
# the train/test split, model fitting) works on the same materialised sample:
# the DataFrame is otherwise recomputed per action, and the upstream
# dropDuplicates does not guarantee a stable choice/order of rows, so the sample
# could differ between actions. Class sizes are approximate, not exactly n.
df_balanced = df.stat.sampleBy("label", fractions, seed).cache()
df_balanced.groupBy("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|    1|  800|
|    0|  801|
+-----+-----+



In [15]:
# Seeded 70/30 random split of the balanced data into training and test sets.
trainingData, testData = df_balanced.randomSplit([0.7, 0.3], seed=42)

# Size of each split (each count triggers a Spark job).
training_count = trainingData.count()
test_count = testData.count()
print("Training Dataset Count: " + str(training_count))
print("Test Dataset Count: " + str(test_count))

Training Dataset Count: 1147
Test Dataset Count: 476


Pipeline

In [16]:
# Tokenizer: splits "source_text" into the "words" column using a regex pattern.
regexTokenizer = RegexTokenizer(
    inputCol="source_text",
    outputCol="words",
    pattern="\\W",
)

# Stop words: load the default English list and remove those tokens ("words" -> "filtered").
stops = StopWordsRemover.loadDefaultStopWords('english')
stopwordsRemover = StopWordsRemover(
    inputCol=regexTokenizer.getOutputCol(),
    outputCol="filtered",
    stopWords=stops,
)

# Bag-of-words counts: turns the filtered tokens into a numerical feature vector.
#   vocabSize: maximum size of the vocabulary
#   minDF:     minimum number of documents a term must appear in to enter the vocabulary
countVectors = CountVectorizer(
    inputCol=stopwordsRemover.getOutputCol(),
    outputCol="rawFeatures",
    vocabSize=10000,
    minDF=10,
)

# IDF (Inverse Document Frequency) weighting of the raw counts ("rawFeatures" -> "features").
#   minDocFreq: minimum number of documents a term must appear in to be included in the IDF calculation
idf = IDF(
    inputCol=countVectors.getOutputCol(),
    outputCol="features",
    minDocFreq=10,
)

# Logistic regression classifier.
#   maxIter:         maximum number of iterations
#   regParam:        regularization parameter
#   elasticNetParam: 0 for an L2 penalty, 1 for an L1 penalty
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.2)

In [17]:
# Chain the stages in execution order: tokenize -> remove stop words -> count -> IDF -> classify.
pipeline = Pipeline(
    stages=[
        regexTokenizer,
        stopwordsRemover,
        countVectors,
        idf,
        lr,
    ]
)

In [18]:
# Hyper-parameter grid for the logistic regression stage:
# 3 regularization strengths x 3 elastic-net mixes = 9 candidate settings.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.3, 0.1, 0.01])
    .addGrid(lr.elasticNetParam, [0, 0.1, 0.25])
    .build()
)

In [19]:
# 3-fold cross-validation of the whole pipeline over the parameter grid, scored
# with a default BinaryClassificationEvaluator.
# The seed fixes the assignment of rows to folds so that model selection is
# reproducible across runs, in line with the seeded sampling and train/test split.
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=3,
                          seed=42)

In [20]:
# Fit the model and find the best set of parameters:
# runs the cross-validated search configured in `crossval` on the training split.
model = crossval.fit(trainingData)

In [21]:
# Take the best model found by the cross-validation and list its fitted stages.
best = model.bestModel
print(best.stages)

[RegexTokenizer_65d1ddf1203b, StopWordsRemover_50391ba4f05b, CountVectorizerModel: uid=CountVectorizer_4adbf538e7d7, vocabularySize=9809, IDFModel: uid=IDF_3d56f470cbf2, numDocs=1147, numFeatures=9809, LogisticRegressionModel: uid=LogisticRegression_5bb519a4aad6, numClasses=2, numFeatures=9809]


Obtain predictions for the test data

In [22]:
# Apply the fitted model to the held-out test split, then list the columns of the result.
prediction = model.transform(testData)
prediction.columns

['aid',
 'source_text',
 'label',
 'words',
 'filtered',
 'rawFeatures',
 'features',
 'rawPrediction',
 'probability',
 'prediction']

In [23]:
# Show the true label next to the model's probability vector and predicted class for 10 rows.
prediction.select('label','probability','prediction').show(10)

+-----+--------------------+----------+
|label|         probability|prediction|
+-----+--------------------+----------+
|    0|[0.39364400718012...|       1.0|
|    0|[0.44279923030141...|       1.0|
|    1|[0.25320673271740...|       1.0|
|    0|[0.60721402697849...|       0.0|
|    1|[0.63915001924698...|       0.0|
|    0|[0.87758914764345...|       0.0|
|    1|[0.61519810514949...|       0.0|
|    1|[0.44937622372760...|       1.0|
|    0|[0.44664033335041...|       1.0|
|    0|[0.50772400102630...|       0.0|
+-----+--------------------+----------+
only showing top 10 rows



Evaluate the model predictions

In [24]:
# Score the test predictions with a BinaryClassificationEvaluator left at its defaults.
# NOTE(review): the default metric should be areaUnderROC — confirm for the installed Spark version.
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(prediction)

0.5594214714901006

In [25]:
# Accuracy = share of test rows whose predicted class equals the true label.
# The built-in evaluator computes it in one pass over `prediction`, so the number
# of correct rows and the total always come from the same evaluation of the data
# (two separate count jobs on `prediction` and `testData` need not agree when the
# underlying DataFrame is recomputed between actions).
accuracy = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
).evaluate(prediction)
accuracy

0.5609243697478992

Save the model locally to access later

In [26]:
# Keep a reference to the fitted cross-validation model under a model-specific name
# (same object as `model`, not a copy).
model_LR = model

In [27]:
# Uncomment to save the fitted model to disk (overwrites anything already saved at this path):
#model_LR.write().overwrite().save('C:/Users/lenne/anaconda3/envs/AA/Advanced_Analytics/Assignment_3/spark/models')