In [1]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.base import *
from sparknlp.annotator import *
import gc
gc.enable()
import re

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
spark = sparknlp.start(gpu=False)
sparknlp.version()

'2.5.5'

In [3]:
# Import Spark NLP 
from sparknlp.base import *
from sparknlp.annotator import *


In [20]:
sdf = spark.read.parquet('toxicity.parquet')

In [21]:
document_assembler = DocumentAssembler() \
    .setInputCol("comment_text") \
    .setOutputCol("document")\
    .setCleanupMode('shrink') 

In [22]:
document_assembler.transform(sdf).select(F.explode("document.result"),'toxic').show(truncate=170)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|                                                                                                                                                                       col|toxic|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|                                                                                                                              COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK|    1|
|Hey... what is it.. @ | talk . What is it... an exclusive group of some WP TALIBANS...who are good at destroying, self-appointed purist who GANG UP any one who asks th...|    1|
|                                                                                                        

In [28]:
def print_latest_log(path):
   
    from pathlib import Path
    import pendulum
    
    files=[*Path(path).iterdir()]
    files=[(file,file.stat().st_ctime) for file in files]
    
    a=sorted(files, key=lambda x:x[1],reverse=True)[0]
    
    with open(a[0],'r') as f:
        b=f.readlines()
    
    print(pendulum.from_timestamp(a[1],tz=pendulum.local_timezone()).to_cookie_string(),"\n")
    
    [print(b_) for b_ in b]
    
    return

In [25]:
document_assembler = DocumentAssembler() \
    .setInputCol("comment_text") \
    .setOutputCol("document")\
    .setCleanupMode('shrink') 

use = UniversalSentenceEncoder.pretrained()\
 .setInputCols(["document"])\
 .setOutputCol("sentence_embeddings")


sentimentClassifier = SentimentDLApproach()\
      .setInputCols("sentence_embeddings")\
      .setOutputCol("prediction")\
      .setLabelColumn("toxic")\
      .setBatchSize(128)\
      .setMaxEpochs(10)\
      .setDropout(0.7)\
      .setValidationSplit(0.2)\


clf_pipeline = Pipeline(
    stages=[document_assembler, 
            use,
            sentimentClassifier
           ])

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [26]:
clf_pipelineModel = clf_pipeline.fit(sdf)

In [29]:
path = "/home/aytu/annotator_logs"
print_latest_log(path)

Wednesday, 12-Aug-2020 12:58:14 CDT 

Training started - total epochs: 5 - learning rate: 0.0035 - batch size: 32 - training examples: 27530

Epoch 0/5 - 3.724994516%.2fs - loss: 369.49588 - accuracy: 0.87215847 - validation: 89.27404 - batches: 861

Epoch 1/5 - 3.115798814%.2fs - loss: 347.8156 - accuracy: 0.90617734 - validation: 90.255066 - batches: 861

Epoch 2/5 - 3.110044603%.2fs - loss: 339.1991 - accuracy: 0.9194041 - validation: 89.96076 - batches: 861

Epoch 3/5 - 3.107633562%.2fs - loss: 333.6996 - accuracy: 0.9277253 - validation: 90.05886 - batches: 861

Epoch 4/5 - 3.106006482%.2fs - loss: 327.55127 - accuracy: 0.93277615 - validation: 90.15697 - batches: 861



In [30]:
document_assembler = DocumentAssembler() \
    .setInputCol("comment_text") \
    .setOutputCol("document")\
    .setCleanupMode('shrink') 

tokenizer = Tokenizer() \
  .setInputCols(["document"]) \
  .setOutputCol("token")
    
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")\
    .setLowercase(True)

stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)

lemma = LemmatizerModel.pretrained('lemma_antbnc') \
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("lemma")

bert = BertEmbeddings.pretrained('bert_base_uncased', 'en') \
      .setInputCols("document", "lemma") \
      .setOutputCol("embeddings")\
      .setPoolingLayer(0) # default 0

embeddingsSentence = SentenceEmbeddings() \
      .setInputCols(["document", "embeddings"]) \
      .setOutputCol("sentence_embeddings") \
      .setPoolingStrategy("AVERAGE")



classsifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("prediction")\
  .setLabelColumn("toxic")\
  .setMaxEpochs(5)\
  .setEnableOutputLogs(True)\
  .setBatchSize(32)\
  .setValidationSplit(0.1)\
  .setDropout(0.75)\
  .setLr(0.0035)\
  #.setOutputLogsPath('logs')

clf_pipeline = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            lemma, 
            bert,
            embeddingsSentence,
            classsifierdl
           ])

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
bert_base_uncased download started this may take some time.
Approximate size to download 392.5 MB
[OK!]


In [31]:
clf_pipelineModel = clf_pipeline.fit(sdf)

In [32]:
print_latest_log(path)

Wednesday, 12-Aug-2020 19:58:56 CDT 

Training started - total epochs: 5 - learning rate: 0.0035 - batch size: 32 - training examples: 27530

Epoch 0/5 - 3.765714683%.2fs - loss: 372.9806 - accuracy: 0.87136626 - validation: 89.24133 - batches: 861

Epoch 1/5 - 3.104467391%.2fs - loss: 342.52994 - accuracy: 0.9053052 - validation: 89.37214 - batches: 861

Epoch 2/5 - 3.106660562%.2fs - loss: 335.56335 - accuracy: 0.9181686 - validation: 89.30673 - batches: 861

Epoch 3/5 - 3.164794216%.2fs - loss: 331.0061 - accuracy: 0.92659885 - validation: 89.27404 - batches: 861

Epoch 4/5 - 3.222160764%.2fs - loss: 325.64188 - accuracy: 0.9313227 - validation: 89.30673 - batches: 861

