In [None]:
import findspark

# findspark.init() adds the local Spark installation to sys.path, so it has to
# run BEFORE anything that imports pyspark (sparknlp does so at import time).
findspark.init()

from sparknlp import *

# Alternative kept for reference: build the session by hand (original note:
# use the builder to stop intermediate output from being logged).
# SparkSession.builder \
#             .appName("Spark NLP") \
#             .master("local[*]") \
#             .config("spark.driver.memory", "16G") \
#             .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
#             .config("spark.kryoserializer.buffer.max", "2000M") \
#             .config("spark.driver.maxResultSize", "0") \
#             .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.2.0") \
#             .getOrCreate()

# `start` is provided by the sparknlp star import above.
spark = start()
print(f'spark version {spark.version}')


In [None]:
# Load the pipe-separated history file (first row is a header) and rename its
# columns to kind / text / url.
history = spark.read.csv(
    'sample_history.csv', sep='|', header=True
).toDF("kind", "text", "url")
print(f'history count: {history.count()}')

# Keep only the rows whose kind is 'Searched'.
search_history = history.where(history["kind"] == 'Searched')
print(f'search_history count: {search_history.count()}')


In [None]:
from datetime import datetime, date
from pyspark.sql import *

from pyspark.sql.functions import *
from pyspark.ml import *
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.pretrained import *
from sparknlp.base import *

# Stage 1: read the "text" column and emit a "document" column.
document = DocumentAssembler().setInputCol("text").setOutputCol("document")

# Stage 2: pretrained sentence embeddings computed over the whole document.
# A sentence detector could be inserted before this stage if we wanted to
# train on, and get predictions for, each individual sentence instead.
sentence_encoder = UniversalSentenceEncoder.pretrained()
sentence_encoder = sentence_encoder.setInputCols(["document"])
sentence_encoder = sentence_encoder.setOutputCol("sentence_embeddings")

# Stage 3: single-label classifier; the classes/labels/categories are read
# from the "category" column.
classsifierdl = ClassifierDLApproach().setInputCols(["sentence_embeddings"])
classsifierdl = classsifierdl.setLabelColumn("category")
classsifierdl = classsifierdl.setOutputCol("predicted_category")
classsifierdl = classsifierdl.setMaxEpochs(80).setEnableOutputLogs(True)

# Multi-label alternative (not used for now).
# Training samples would look like this:
#   `I look great in the mirror | [narcissistic, positive]`
# It does the same job as the classifier above, but is trained on several
# labels per sample and can predict several labels per sample. I do not think
# we want multi-label predictions yet, although the potential is there, e.g.
# to express more complicated emotions:
#   `How do I not get fired from my job? | [fear, anger, sadness]`
# multi_label_classifier = (
#     MultiClassifierDLApproach()
#     .setInputCols(["sentence_embeddings"])
#     .setLabelColumn("category")
#     .setOutputCol("predicted_category")
#     .setMaxEpochs(80)
#     .setEnableOutputLogs(True)
# )

# Chain the three stages in order.
pipeline = Pipeline(stages=[document, sentence_encoder, classsifierdl])


In [None]:
# Definitely feels like multi-label would work much better.
# That's not just for training, but also for predicting:
# what if something is both dumb and fun at the same time?
#
# Each entry is [text, label].
all_samples = [
    # Work
    ['work', 'Work'],
    ['how to make a python3 spark dataframe?', 'Work'],
    ['i got tired after my presentation in front of colleagues', 'Work'],
    # Fun
    ['fun', 'Fun'],
    ['where to go out and eat with friends?', 'Fun'],
    ['Best things to bring to a beach party', 'Fun'],
    ['Fun things to do after a stressful day', 'Fun'],
    # Dumb
    # We actually do not want to label Dumb, as being dumb
    ['What does 2 + 2 equals', 'Dumb'],
    ['Is the earth flat?', 'Dumb'],
    # Smart
    ['How do I save for retirement?', 'Smart'],
    ['How do I stay healthy?', 'Smart'],
    # Violent
    ['Martial arts around me', 'Violent'],
    ['How do I beat somebody up?', 'Violent'],
    ['Murder videos', 'Violent'],
    # Illegal
    ['How do I break the law and get away with it?', 'Illegal'],
    ['How to I buy drugs?', 'Illegal'],
    # Medical
    ['My stomach hurts what do I do?', 'Medical'],
    ['I have a hard time focusing, do I have ADHD?', 'Medical'],
    # Sexual
    ['How do I use a condom?', 'Sexual'],
    ['Sex toys', 'Sexual'],
    ['How do I please my partner?', 'Sexual'],
    # Romantic
    ['Romantic movies', 'Romantic'],
    ['buy rose petals around me', 'Romantic'],
    # Incoherent (disabled)
    # ['adjf;qjdf asd fakldffalksdfj asd fasdkfljalkfd', 'Incoherent'],
    # ['I am loooooeee aaaa confiss fond confused not a ssssentence why is this ap', 'Incoherent'],
]

# Unique labels in order of first appearance. dict.fromkeys de-duplicates while
# preserving insertion order, so the list is identical on every kernel restart
# (list(set(...)) ordering varies with string hash randomisation).
all_labels = list(dict.fromkeys(label for _, label in all_samples))


In [None]:

# Work/fun classifier: a small hand-labelled training set, grouped by label.
_work_queries = [
    'how to make a python3 spark dataframe?',
    'swift objective-c inter operation',
    'bootstrapping deployment pipeline',
    'aws route 53 domain configuration',
]
_fun_queries = [
    'where to go out and eat with friends?',
    'Best things to bring to a beach party',
    'Were to buy a sex doll?',
    'Is not yawning in response a sign of a maniac',
]

# Same [text, label] rows as before: all 'work' rows first, then all 'fun' rows.
work_fun_samples = (
    [[query, 'work'] for query in _work_queries]
    + [[query, 'fun'] for query in _fun_queries]
)

# Name the two columns the way the pipeline expects them ("text" is the
# assembler input, "category" the classifier label column), then fit.
work_fun_frame = spark.createDataFrame(work_fun_samples).toDF("text", "category")
work_fun_classifier = pipeline.fit(work_fun_frame)


In [None]:
# Apply the fitted work/fun classifier to the search history.
work_fun_labeled = work_fun_classifier.transform(search_history)

# Element 0 of the predicted_category column.
# To understand the schema, use work_fun_labeled.printSchema().
prediction = work_fun_labeled["predicted_category"].getItem(0)


def category_score_column(category: str):
    """Build a numeric column holding the prediction score for ``category``.

    Looks up the ``category`` key in the metadata of the module-level
    ``prediction`` column and names the result ``<category>_score``.

    Spark NLP annotation metadata is a string-to-string map, so the raw value
    is text (small scores may be printed like ``1.2E-4``). It is cast to double
    so that sorting and comparing the score behave numerically rather than
    lexicographically.

    Args:
        category: Label whose score should be extracted, e.g. ``"work"``.

    Returns:
        A Spark ``Column`` of type double aliased ``f"{category}_score"``.
    """
    return (
        prediction.metadata.getItem(category)
        .cast("double")
        .alias(f"{category}_score")
    )


# Compact view of the predictions: truncated query text, predicted label and
# the score of the "work" label.
work_fun_labeled_pretty = (
    work_fun_labeled.select(
        [
            # Spark substring positions are 1-based; take the first 50
            # characters and keep a readable column name instead of the
            # auto-generated "substring(text, ...)" header.
            work_fun_labeled.text.substr(1, 50).alias("text"),
            prediction.result.alias("predicted_category"),
            category_score_column("work")
        ]
    )
)


# Everything that was not predicted as 'work'.
nonwork_searches = work_fun_labeled_pretty.filter(
    work_fun_labeled_pretty.predicted_category != 'work')
print(f'Non work related searches: {nonwork_searches.count()}')

# Show roughly 10% of the rows, lowest work score first.
# - A fixed seed makes the sample repeatable for the same input data.
# - The sort key is cast to double so the ordering is numeric; the cast is a
#   no-op when the score column is already numeric.
work_fun_labeled_pretty_sampled = (
    work_fun_labeled_pretty
    .sample(fraction=0.1, seed=42)
    .orderBy(work_fun_labeled_pretty.work_score.cast("double"))
)
work_fun_labeled_pretty_sampled.show(1000, False)
