In [1]:
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import pandas as pd
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types as t

import re


In [2]:
spark = SparkSession.builder \
    .appName("Redit Summarization")\
    .master("yarn")\
    .config("spark.driver.memory","16G")\
    .config("spark.kryoserializer.buffer.max", "2000M")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.4.3,org.apache.hadoop:hadoop-aws:2.7.3")\
    .getOrCreate()

:: loading settings :: url = jar:file:/usr/local/spark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ubuntu/.ivy2/cache
The jars for the packages stored in: /home/ubuntu/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-9e911f03-8f18-4b1a-8f57-4be789c78db4;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;3.4.3 in central
	found com.typesafe#config;1.4.1 in central
	found org.rocksdb#rocksdbjni;6.5.3 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.603 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.code.findbugs#annotations;3.0.1 in central
	found net.jcip#jcip-annotations;1.0 in central
	found com.google.code.findbugs#jsr305;3.0.1 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastu

2022-04-26 05:25:32,769 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-04-26 05:25:36,571 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
2022-04-26 05:25:55,506 WARN yarn.Client: Same path resource file:///home/ubuntu/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-3.4.3.jar added multiple times to distributed cache.
2022-04-26 05:25:55,506 WARN yarn.Client: Same path resource file:///home/ubuntu/.ivy2/jars/org.apache.hadoop_hadoop-aws-2.7.3.jar added multiple times to distributed cache.
2022-04-26 05:25:55,506 WARN yarn.Client: Same path resource file:///home/ubuntu/.ivy2/jars/com.typesafe_config-1.4.1.jar added multiple times to distributed cache.
2022-04-26 05:25:55,507 WARN yarn.Client: Sam

In [3]:
document_assembler = DocumentAssembler() \
    .setInputCol("comment") \
    .setOutputCol("document")
    
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")
      
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
    .setInputCols("normalized")\
    .setOutputCol("cleanTokens")\
    .setCaseSensitive(False)

lemma = LemmatizerModel.load('hdfs://namenode:9000/dis_materials/lemma_antbnc_en_2.0.2_2.4_1556480454569') \
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("lemma")

2022-04-26 05:26:26,486 WARN cluster.YarnScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
                                                                                

In [39]:
glove_embeddings = WordEmbeddingsModel().load("hdfs://namenode:9000/dis_materials/glove_100d_en_2.4.0_2.4_1579690104032") \
      .setInputCols(["document",'lemma'])\
      .setOutputCol("embeddings")\
      .setCaseSensitive(False)

embeddingsSentence = SentenceEmbeddings() \
      .setInputCols(["document", "embeddings"]) \
      .setOutputCol("sentence_embeddings") \
      .setPoolingStrategy("AVERAGE")

classsifierdl = ClassifierDLApproach()\
      .setInputCols(["sentence_embeddings"])\
      .setOutputCol("class")\
      .setLabelColumn("subreddit")\
      .setMaxEpochs(20)\
      .setEnableOutputLogs(True)
      #.setOutputLogsPath('logs')

clf_pipeline = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            lemma, 
            glove_embeddings,
            embeddingsSentence,
            classsifierdl])

In [5]:
# Read data
df = spark.read.format("csv").option("header","true").load("hdfs://namenode:9000/dis_materials/data_reddit.csv")

                                                                                

In [6]:
# Helper functions
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"  
def cleanText(text):
    return re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
# Convert function to UDF
cleanTextDF = f.udf(lambda z: cleanText(z))

In [7]:
# drop duplicates
df2 = df.dropDuplicates()

In [8]:
# drop null values
df2 = df2.na.drop(subset=["subreddit","subreddit_id","body","created_utc","ups"])

In [9]:
# drop all moderator rows and deleted comments
df2 = df2.filter((df2.distinguished != "moderator")|(df2.body!="[deleted]"))

In [10]:
df2 = df2.withColumn("ups", df2["ups"].cast(t.IntegerType()))

In [11]:
df2 = df2.withColumn("comments",cleanTextDF(f.col("body")))

In [12]:
df_2 = df2.filter(df2.link_id == df2.parent_id)

In [None]:
# df = df_top_comments_n_replies.groupBy("subreddit","parent_id","comment").agg(f.flattenList(f.collect_list("comment")).alias("comment"))
df_2 = df_2.groupby("subreddit","parent_id","comments").agg(f.concat_ws(", ", f.collect_list(df_2.subreddit)).alias("comment"))

In [13]:
df_grouped = df_2.groupBy("subreddit","parent_id","comment").agg(f.sum("ups").alias("total_ups"))

In [14]:
df_top_channels = df_grouped.groupBy("subreddit").agg(f.sum("total_ups").alias("total_ups"))

In [24]:
df_top_channels = df_top_channels.orderBy(f.desc("total_ups")).limit(90)



In [25]:
# Get a list of all link_ids for top comments
top_channels=df_top_channels.rdd.map(lambda x: x.subreddit).collect()

                                                                                

In [26]:
len(top_channels)

90

In [27]:
df_top_channel_comments = df_2.filter(df_2.subreddit.isin(top_channels))

In [28]:
splits = df_top_channel_comments.randomSplit([1.0, 2.0], 54)

In [20]:
# splits[1].count()

In [21]:
# splits[0].count()

In [29]:
df_train = splits[1]
df_test = splits[0]

In [None]:
df_train.show(10)

In [40]:
%time

clf_pipelineModel = clf_pipeline.fit(df_train.limit(10000))

CPU times: user 5 µs, sys: 2 µs, total: 7 µs
Wall time: 22.9 µs


2022-04-26 08:02:18.159624: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:32] Reading SavedModel from: /tmp/c1b7d068807f_classifier_dl1118724960642505397
2022-04-26 08:02:18.429764: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:55] Reading meta graph with tags { serve }
2022-04-26 08:02:18.429850: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:93] Reading SavedModel debug info (if present) from: /tmp/c1b7d068807f_classifier_dl1118724960642505397
2022-04-26 08:02:19.295378: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:206] Restoring SavedModel bundle.
2022-04-26 08:02:20.736641: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:190] Running initialization op on SavedModel bundle at path: /tmp/c1b7d068807f_classifier_dl1118724960642505397
2022-04-26 08:02:21.181934: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:277] SavedModel load for tags { serve }; Status: success: OK. Took 3022345 microseconds

Training started - epochs: 20 - learning_rate: 0.005 - batch_size: 64 - training_examples: 10000 - classes: 90
Epoch 1/20 - 2.92s - loss: 687.4067 - acc: 0.18229167 - batches: 157
Epoch 2/20 - 2.58s - loss: 687.43915 - acc: 0.18309295 - batches: 157
Epoch 3/20 - 2.64s - loss: 687.43915 - acc: 0.18309295 - batches: 157
Epoch 4/20 - 2.31s - loss: 687.43915 - acc: 0.18309295 - batches: 157
Epoch 5/20 - 2.23s - loss: 687.43915 - acc: 0.18309295 - batches: 157
Epoch 6/20 - 2.30s - loss: 687.43915 - acc: 0.18309295 - batches: 157
Epoch 7/20 - 2.21s - loss: 687.43915 - acc: 0.18309295 - batches: 157
Epoch 8/20 - 2.03s - loss: 687.43915 - acc: 0.18309295 - batches: 157
Epoch 9/20 - 2.30s - loss: 687.43915 - acc: 0.18309295 - batches: 157
Epoch 10/20 - 2.44s - loss: 687.43915 - acc: 0.18309295 - batches: 157
Epoch 11/20 - 1.88s - loss: 687.43915 - acc: 0.18309295 - batches: 157
Epoch 12/20 - 1.81s - loss: 687.43915 - acc: 0.18309295 - batches: 157
Epoch 13/20 - 1.82s - loss: 687.43915 - acc: 0.

In [41]:
# get the predictions on test Set

preds = clf_pipelineModel.transform(df_test)

In [42]:
preds.select("subreddit","comment","class.result").show()

[Stage 101:>                                                        (0 + 1) / 1]

+-------------+--------------------+-----------+
|    subreddit|             comment|     result|
+-------------+--------------------+-----------+
|    AskReddit|smoking tobacco w...|[AskReddit]|
|          nfl|        8pm let s go|[AskReddit]|
|       movies|i assume it was o...|[AskReddit]|
|todayilearned|i did this i am t...|[AskReddit]|
|          nfl|how the fuck do i...|[AskReddit]|
|    AskReddit|                warm|[AskReddit]|
|      atheism|in my nation this...|[AskReddit]|
|          nfl|tonight we re all...|[AskReddit]|
|       videos|at 1 50 inshallah...|[AskReddit]|
|          nfl|it s 8 02 and the...|[AskReddit]|
|       gaming|no amount of rad ...|[AskReddit]|
|          nfl|just saw that 2 f...|[AskReddit]|
|AdviceAnimals|i have a solution...|[AskReddit]|
|          nfl|well this was in ...|[AskReddit]|
|          nfl|three different t...|[AskReddit]|
|    AskReddit|     im from florida|[AskReddit]|
|AdviceAnimals|32 male divorced ...|[AskReddit]|
|        funny|this 

                                                                                