In [1]:
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import pandas as pd
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types as t

import re


In [2]:
spark = SparkSession.builder \
    .appName("Redit Summarization")\
    .master("yarn")\
    .config("spark.driver.memory","16G")\
    .config("spark.kryoserializer.buffer.max", "2000M")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.4.3,org.apache.hadoop:hadoop-aws:2.7.3")\
    .getOrCreate()

:: loading settings :: url = jar:file:/usr/local/spark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ubuntu/.ivy2/cache
The jars for the packages stored in: /home/ubuntu/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-01604744-4c25-4e5f-bf55-1570e0f1d0c1;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;3.4.3 in central
	found com.typesafe#config;1.4.1 in central
	found org.rocksdb#rocksdbjni;6.5.3 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.603 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.code.findbugs#annotations;3.0.1 in central
	found net.jcip#jcip-annotations;1.0 in central
	found com.google.code.findbugs#jsr305;3.0.1 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastu

2022-04-30 21:01:04,618 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-04-30 21:01:06,595 WARN util.Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
2022-04-30 21:01:08,407 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
2022-04-30 21:01:28,839 WARN yarn.Client: Same path resource file:///home/ubuntu/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-3.4.3.jar added multiple times to distributed cache.
2022-04-30 21:01:28,839 WARN yarn.Client: Same path resource file:///home/ubuntu/.ivy2/jars/org.apache.hadoop_hadoop-aws-2.7.3.jar added multiple times to distributed cache.
2022-04-30 21:01:28,840 WARN yarn.Client: Same path resource file:///home/ubuntu/.ivy2/jars/com.type

In [6]:
document_assembler = DocumentAssembler() \
    .setInputCol("comment") \
    .setOutputCol("document")
    
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")
      
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
    .setInputCols("normalized")\
    .setOutputCol("cleanTokens")\
    .setCaseSensitive(False)

lemma = LemmatizerModel.load('hdfs://namenode:9000/dis_materials/lemma_antbnc_en_2.0.2_2.4_1556480454569') \
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("lemma")

                                                                                

In [7]:
glove_embeddings = WordEmbeddingsModel().load("hdfs://namenode:9000/dis_materials/glove_100d_en_2.4.0_2.4_1579690104032") \
      .setInputCols(["document",'lemma'])\
      .setOutputCol("embeddings")\
      .setCaseSensitive(False)

embeddingsSentence = SentenceEmbeddings() \
      .setInputCols(["document", "embeddings"]) \
      .setOutputCol("sentence_embeddings") \
      .setPoolingStrategy("AVERAGE")

classsifierdl = ClassifierDLApproach()\
      .setInputCols(["sentence_embeddings"])\
      .setOutputCol("class")\
      .setLabelColumn("subreddit")\
      .setMaxEpochs(20)\
      .setEnableOutputLogs(True)
      #.setOutputLogsPath('logs')

classify_pipeline = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            lemma, 
            glove_embeddings,
            embeddingsSentence,
            classsifierdl])

In [8]:
# Read data
df = spark.read.format("csv").option("header","true").load("hdfs://namenode:9000/popular_comments.csv")

In [9]:
df.columns

['created_utc', 'subreddit', 'max_ups', 'parent_id', 'comment']

In [10]:
df = df.filter(df.subreddit!="AskReddit")

In [12]:
df_top_channels = df.groupby("subreddit").agg(f.max("max_ups").alias("max_ups"),f.first("comment").alias("comment"))

In [20]:
df_top_channels = df_top_channels.sort(df_top_channels.max_ups.desc()).limit(90)

In [21]:
splits = df_top_channels.randomSplit([1.0, 2.0], 54)

In [None]:
# splits[1].count()

In [None]:
# splits[0].count()

In [22]:
df_train = splits[1]
df_test = splits[0]

In [None]:
df_train.show(10)

In [23]:
%time

clf_pipelineModel = classify_pipeline.fit(df_train.limit(10000))

CPU times: user 4 µs, sys: 3 µs, total: 7 µs
Wall time: 23.1 µs


2022-04-30 21:16:32.041409: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:32] Reading SavedModel from: /tmp/6aba8b3d1b29_classifier_dl229465463461666819
2022-04-30 21:16:32.169477: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:55] Reading meta graph with tags { serve }
2022-04-30 21:16:32.169553: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:93] Reading SavedModel debug info (if present) from: /tmp/6aba8b3d1b29_classifier_dl229465463461666819
2022-04-30 21:16:32.169691: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-30 21:16:32.903993: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:206] Restoring SavedModel bundle.
2022-04-30 21:1

Py4JJavaError: An error occurred while calling o146.fit.
: java.util.NoSuchElementException: key not found: Alex."
	at scala.collection.MapLike.default(MapLike.scala:236)
	at scala.collection.MapLike.default$(MapLike.scala:235)
	at scala.collection.AbstractMap.default(Map.scala:65)
	at scala.collection.MapLike.apply(MapLike.scala:144)
	at scala.collection.MapLike.apply$(MapLike.scala:143)
	at scala.collection.AbstractMap.apply(Map.scala:65)
	at com.johnsnowlabs.ml.tensorflow.ClassifierDatasetEncoder.$anonfun$encodeTags$1(ClassifierDatasetEncoder.scala:40)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
	at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:198)
	at com.johnsnowlabs.ml.tensorflow.ClassifierDatasetEncoder.encodeTags(ClassifierDatasetEncoder.scala:38)
	at com.johnsnowlabs.ml.tensorflow.TensorflowClassifier.train(TensorflowClassifier.scala:70)
	at com.johnsnowlabs.nlp.annotators.classifier.dl.ClassifierDLApproach.train(ClassifierDLApproach.scala:430)
	at com.johnsnowlabs.nlp.annotators.classifier.dl.ClassifierDLApproach.train(ClassifierDLApproach.scala:121)
	at com.johnsnowlabs.nlp.AnnotatorApproach._fit(AnnotatorApproach.scala:69)
	at com.johnsnowlabs.nlp.AnnotatorApproach.fit(AnnotatorApproach.scala:75)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)


In [None]:
# get the predictions on test Set

preds = clf_pipelineModel.transform(df_test)

In [None]:
preds.select("subreddit","comment","class.result").show()