In [1]:
import time
a = time.time()
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import pandas as pd
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types as t

import re


In [2]:
spark = SparkSession.builder \
    .appName("Redit Summarization")\
    .master("yarn")\
    .config("spark.driver.memory","16G")\
    .config("spark.kryoserializer.buffer.max", "2000M")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.4.3,org.apache.hadoop:hadoop-aws:2.7.3")\
    .getOrCreate()

:: loading settings :: url = jar:file:/usr/local/spark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ubuntu/.ivy2/cache
The jars for the packages stored in: /home/ubuntu/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-51db883b-ff6c-4351-aca8-f15eab8aaf05;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;3.4.3 in central
	found com.typesafe#config;1.4.1 in central
	found org.rocksdb#rocksdbjni;6.5.3 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.603 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.code.findbugs#annotations;3.0.1 in central
	found net.jcip#jcip-annotations;1.0 in central
	found com.google.code.findbugs#jsr305;3.0.1 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastu

2022-05-01 20:45:26,116 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-05-01 20:45:29,982 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
2022-05-01 20:45:49,957 WARN yarn.Client: Same path resource file:///home/ubuntu/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-3.4.3.jar added multiple times to distributed cache.
2022-05-01 20:45:49,957 WARN yarn.Client: Same path resource file:///home/ubuntu/.ivy2/jars/org.apache.hadoop_hadoop-aws-2.7.3.jar added multiple times to distributed cache.
2022-05-01 20:45:49,957 WARN yarn.Client: Same path resource file:///home/ubuntu/.ivy2/jars/com.typesafe_config-1.4.1.jar added multiple times to distributed cache.
2022-05-01 20:45:49,957 WARN yarn.Client: Sam

In [3]:
document_assembler = DocumentAssembler() \
    .setInputCol("comment") \
    .setOutputCol("document")
    
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")
      
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
    .setInputCols("normalized")\
    .setOutputCol("cleanTokens")\
    .setCaseSensitive(False)

lemma = LemmatizerModel.load('hdfs://namenode:9000/dis_materials/lemma_antbnc_en_2.0.2_2.4_1556480454569') \
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("lemma")

                                                                                

In [4]:
glove_embeddings = WordEmbeddingsModel().load("hdfs://namenode:9000/dis_materials/glove_100d_en_2.4.0_2.4_1579690104032") \
      .setInputCols(["document",'lemma'])\
      .setOutputCol("embeddings")\
      .setCaseSensitive(False)

embeddingsSentence = SentenceEmbeddings() \
      .setInputCols(["document", "embeddings"]) \
      .setOutputCol("sentence_embeddings") \
      .setPoolingStrategy("AVERAGE")

classsifierdl = ClassifierDLApproach()\
      .setInputCols(["sentence_embeddings"])\
      .setOutputCol("class")\
      .setLabelColumn("subreddit")\
      .setMaxEpochs(20)\
      .setEnableOutputLogs(True)
      #.setOutputLogsPath('logs')

classify_pipeline = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            lemma, 
            glove_embeddings,
            embeddingsSentence,
            classsifierdl])

In [31]:
# Read data
df_all = spark.read.format("csv").option("header","true").load("hdfs://namenode:9000/popular_comments.csv")

In [6]:
df.columns

['created_utc', 'subreddit', 'max_ups', 'parent_id', 'comment']

In [7]:
df = df.filter(df.subreddit!="AskReddit")

In [8]:
df_top_channels = df.groupby("subreddit").agg(f.max("max_ups").alias("max_ups"),f.first("comment").alias("comment"))

In [9]:
df_top_channels = df_top_channels.sort(df_top_channels.max_ups.desc()).limit(90)

In [10]:
splits = df_top_channels.randomSplit([1.0, 2.0], 54)

In [11]:
# splits[1].count()

In [12]:
# splits[0].count()

In [13]:
df_train = splits[1]
df_test = splits[0]

In [14]:
df_train.show(10)



+-------------+-------+--------------------+
|    subreddit|max_ups|             comment|
+-------------+-------+--------------------+
|          AFL|  997.0|wouldn t he have ...|
|AndroidGaming|   99.0|with a misspelled...|
|     ArcherFX|  990.0|no it s just a no...|
|  BasicIncome|   99.0|at 9 12k a kid it...|
|     BayStars|   99.0|                null|
|       Braves|   99.0|the season record...|
|    Cardinals|  996.0|agreed but didn t...|
|CompetitiveHS|   99.0|yeah there is def...|
| CrappyDesign|  994.0|you re supposed t...|
|          DIY|  998.0|           thank you|
+-------------+-------+--------------------+
only showing top 10 rows



                                                                                

In [26]:
clf_pipelineModel = classify_pipeline.fit(df_train.limit(1000))

2022-05-01 20:57:21.269916: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:32] Reading SavedModel from: /tmp/17ec79289490_classifier_dl74573551424602643
2022-05-01 20:57:21.438630: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:55] Reading meta graph with tags { serve }
2022-05-01 20:57:21.438710: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:93] Reading SavedModel debug info (if present) from: /tmp/17ec79289490_classifier_dl74573551424602643
2022-05-01 20:57:22.197162: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:206] Restoring SavedModel bundle.
2022-05-01 20:57:23.133039: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:190] Running initialization op on SavedModel bundle at path: /tmp/17ec79289490_classifier_dl74573551424602643
2022-05-01 20:57:23.348348: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:277] SavedModel load for tags { serve }; Status: success: OK. Took 2078460 microseconds.
    

Training started - epochs: 20 - learning_rate: 0.005 - batch_size: 64 - training_examples: 59 - classes: 59
Epoch 1/20 - 1.50s - loss: 4.0775084 - acc: Infinity - batches: 1
Epoch 2/20 - 0.02s - loss: 4.064014 - acc: Infinity - batches: 1
Epoch 3/20 - 0.02s - loss: 4.030554 - acc: Infinity - batches: 1
Epoch 4/20 - 0.01s - loss: 4.00832 - acc: Infinity - batches: 1
Epoch 5/20 - 0.02s - loss: 3.9699645 - acc: Infinity - batches: 1
Epoch 6/20 - 0.02s - loss: 3.9523711 - acc: Infinity - batches: 1
Epoch 7/20 - 0.01s - loss: 3.9484277 - acc: Infinity - batches: 1
Epoch 8/20 - 0.01s - loss: 3.9277587 - acc: Infinity - batches: 1
Epoch 9/20 - 0.01s - loss: 3.8842115 - acc: Infinity - batches: 1
Epoch 10/20 - 0.01s - loss: 3.8178744 - acc: Infinity - batches: 1
Epoch 11/20 - 0.02s - loss: 3.7282553 - acc: Infinity - batches: 1
Epoch 12/20 - 0.02s - loss: 3.622566 - acc: Infinity - batches: 1
Epoch 13/20 - 0.01s - loss: 3.5182219 - acc: Infinity - batches: 1
Epoch 14/20 - 0.01s - loss: 3.42977

In [18]:
# get the predictions on test Set

preds = clf_pipelineModel.transform(df_test)

In [19]:
preds.select("subreddit","comment","class.result").show()

[Stage 47:>                                                         (0 + 1) / 1]

+--------------------+--------------------+--------------------+
|           subreddit|             comment|              result|
+--------------------+--------------------+--------------------+
|                 AFL|wouldn t he have ...|           [Calgary]|
|            ArcherFX|no it s just a no...|            [redsox]|
|        ArianaGrande|lol why are you s...|         [argentina]|
|           AskTrollX|yeah i think it s...|            [redsox]|
|            Buddhism|nice thanks for t...|        [TeraOnline]|
|           Cardinals|agreed but didn t...|            [redsox]|
|      ComedyCemetery|he s touching his...|            [redsox]|
|        CrappyDesign|you re supposed t...|           [Calgary]|
|          Disneyland|so a cm let only ...|[CyanideandHappin...|
|                 GTA|try doing it simi...|           [Calgary]|
|          Guildwars2|   link for the lazy|            [redsox]|
|        HPfanfiction|thanks both i ve ...|        [TeraOnline]|
|            Hatfilms|it 

                                                                                

In [33]:
# Predict data from Ask Reddit
preds = clf_pipelineModel.transform(df_all.filter(df_all.subreddit=="AskReddit"))

In [34]:
preds.select("subreddit","comment","class.result").show()

[Stage 87:>                                                         (0 + 1) / 1]

+---------+--------------------+-------------------+
|subreddit|             comment|             result|
+---------+--------------------+-------------------+
|AskReddit|those 2 things yo...|           [Braves]|
|AskReddit|never replace at ...|           [Braves]|
|AskReddit|ahah you gave me ...|           [Braves]|
|AskReddit|i ve been through...|           [Braves]|
|AskReddit|what about nyc ho...|[mildlyinfuriating]|
|AskReddit|i m not very happ...|           [Braves]|
|AskReddit|what if i m comin...|           [Braves]|
|AskReddit|based on what you...|           [Braves]|
|AskReddit|i feel like i am ...|           [Braves]|
|AskReddit|i texted my frien...|[DogShowerThoughts]|
|AskReddit|the trouble with ...|           [Braves]|
|AskReddit|god why did you h...|           [Braves]|
|AskReddit|well life in sfr ...|           [Braves]|
|AskReddit|can confirm rolle...|           [Braves]|
|AskReddit|car financing use...|           [Braves]|
|AskReddit|i think this is k...|           [Br

                                                                                

In [22]:
b = time.time()
b-a

333.4869456291199