In [76]:
# Notebook display setup (added by Simon so SQL views from Spark render properly):
#   * the injected CSS rule stops <pre> output from wrapping, so wide
#     `DataFrame.show()` tables keep their column alignment;
#   * ast_node_interactivity = "all" makes every top-level expression in a cell
#     display its value, not only the last one.
# `HTML` and `display` are taken from the public `IPython.display` module, and
# `display` is imported explicitly instead of relying on the name IPython
# injects into the interactive namespace.
from IPython.display import HTML, display
from IPython.core.interactiveshell import InteractiveShell

display(HTML("<style>pre { white-space: pre !important; }</style>"))
InteractiveShell.ast_node_interactivity = "all"

In [77]:
# Import Spark NLP   
#from sparknlp import DocumentAssembler
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.embeddings import *
from sparknlp.base import DocumentAssembler, Finisher
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.common import RegexRule

# Start a Spark session through Spark NLP; `spark` is the session handle used
# below to read the review data.
spark = sparknlp.start()
# Print both library versions so the environment is recorded with the output.
print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

Spark NLP version:  2.4.1
Apache Spark version:  2.4.4


In [78]:
#Imports
import sys

import pyspark
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.sql.functions import array_contains
from pyspark.ml import Pipeline, PipelineModel

from pyspark.sql import functions as F
from pyspark.sql import SQLContext
# Get the active SparkContext (creating one if none exists) and wrap it in a
# SQLContext.
# NOTE(review): neither `sc` nor `sqlContext` is referenced in the cells visible
# here — confirm they are still needed.
sc = pyspark.SparkContext.getOrCreate()
sqlContext = SQLContext(sc)


In [79]:
# Load the labelled development reviews from CSV. The first row is used as the
# header and column types are inferred; `multiLine=True` allows a quoted field
# (e.g. the review text) to span several lines, and `escape='"'` sets the
# double quote as the escape character inside quoted fields.
df_dev = spark.read.csv("/project/review_dev_labelled.csv", escape='"', multiLine=True,
     inferSchema=True, header=True)

# The last header is read as "label\r" (trailing carriage return — presumably
# the file uses CRLF line endings; TODO confirm), so rename it to plain "label".
df_dev = df_dev.withColumnRenamed("label\r", "label")
df_dev.show()

+---+--------------------+----+----------------+-----+--------------------+-----+--------------------+------+--------------------+-------------+-------+-----+
|  #|         business_id|cool|            date|funny|           review_id|stars|                text|useful|             user_id|      index_1|index_2|label|
+---+--------------------+----+----------------+-----+--------------------+-----+--------------------+------+--------------------+-------------+-------+-----+
|  0|XFhnPpO2x0nS-lVkq...|   1|14/02/2017 16:50|    0|o6VZ0cQEfIBvGW15u...|    2|I knew it was a k...|     0|dk5yqirSwDH7Cpp52...|1080000000000|      0|  1.0|
|  1|SRCepG6PbMvwKcfIU...|   0|11/12/2015 02:47|    0|PBy3gB_D1xEiBJDrw...|    2|Average place, bu...|     0|xmtWxX67wF7afssQR...| 730000000000|      1|  1.0|
|  2|pneBBrIzd3SCbbglY...|   2|15/05/2015 19:18|    0|AeyK7AEtbfubw1aD0...|    4|This is a great o...|     4|X4k8ugJaihHVLHq32...| 490000000000|      2|  0.0|
|  3|8enR8Wc0ot9L41JY1...|   0|15/09/2017 13:1

In [80]:
#df_dev.schema.names
#df_dev.printSchema()

## Create Clean Tokens

In [81]:
# Build the token-cleaning pipeline: text -> document -> sentence -> token ->
# normal -> lemma -> cleanTokens. Each stage reads the column written by the
# previous one.

# Wrap the raw review text as a document annotation ("shrink" cleanup mode).
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
    .setCleanupMode("shrink")
)

# Split documents into sentences.
sentence_detector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Split sentences into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
    # Disabled tokenizer options, kept for reference:
    # .setSplitChars(['-'])
    # .setContextChars(['(', ')', '?', '!'])
)

# Normalise the tokens, lowercasing them.
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setLowercase(True)
    .setOutputCol("normal")
)

# Lemmatise with the pretrained 'lemma_antbnc' model.
lemmatizer = (
    LemmatizerModel.pretrained('lemma_antbnc')
    .setInputCols(["normal"])
    .setOutputCol("lemma")
    # Disabled custom dictionary, kept for reference:
    # .setDictionary("/tmp/lemmas_small.txt", key_delimiter="->", value_delimiter="\t")
)

# Remove stop words, matching case-insensitively.
stop_words_cleaner = (
    StopWordsCleaner()
    .setInputCols(["lemma"])
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

# Convert the cleanTokens annotations into a plain column, without metadata.
finisher = (
    Finisher()
    .setInputCols(["cleanTokens"])
    .setIncludeMetadata(False)
    .setOutputCols(["cleanTokens"])
)

pipeline = Pipeline(
    stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        normalizer,
        lemmatizer,
        stop_words_cleaner,
        finisher,
    ]
)

# Fit on the dev set and transform that same set; preview the first 5 rows.
normalized_data = pipeline.fit(df_dev).transform(df_dev)
normalized_data.show(5)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
+---+--------------------+----+----------------+-----+--------------------+-----+--------------------+------+--------------------+-------------+-------+-----+--------------------+
|  #|         business_id|cool|            date|funny|           review_id|stars|                text|useful|             user_id|      index_1|index_2|label|         cleanTokens|
+---+--------------------+----+----------------+-----+--------------------+-----+--------------------+------+--------------------+-------------+-------+-----+--------------------+
|  0|XFhnPpO2x0nS-lVkq...|   1|14/02/2017 16:50|    0|o6VZ0cQEfIBvGW15u...|    2|I knew it was a k...|     0|dk5yqirSwDH7Cpp52...|1080000000000|      0|  1.0|[know, kitschy, p...|
|  1|SRCepG6PbMvwKcfIU...|   0|11/12/2015 02:47|    0|PBy3gB_D1xEiBJDrw...|    2|Average place, bu...|     0|xmtWxX67wF7afssQR...| 730000000000|      1|  1.0|[average, place, ...|
|

In [82]:
from pyspark.sql.functions import concat_ws

# Join each review's cleanTokens array into one comma-separated string and show
# the new column untruncated.
normalized_data = normalized_data.withColumn(
    "cleanTokens_n",
    concat_ws(",", "cleanTokens"),
)
normalized_data.select("cleanTokens_n").show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [83]:
# Preview the first 5 rows of normalized_data.
normalized_data.show(5)

+---+--------------------+----+----------------+-----+--------------------+-----+--------------------+------+--------------------+-------------+-------+-----+--------------------+--------------------+
|  #|         business_id|cool|            date|funny|           review_id|stars|                text|useful|             user_id|      index_1|index_2|label|         cleanTokens|       cleanTokens_n|
+---+--------------------+----+----------------+-----+--------------------+-----+--------------------+------+--------------------+-------------+-------+-----+--------------------+--------------------+
|  0|XFhnPpO2x0nS-lVkq...|   1|14/02/2017 16:50|    0|o6VZ0cQEfIBvGW15u...|    2|I knew it was a k...|     0|dk5yqirSwDH7Cpp52...|1080000000000|      0|  1.0|[know, kitschy, p...|know,kitschy,plac...|
|  1|SRCepG6PbMvwKcfIU...|   0|11/12/2015 02:47|    0|PBy3gB_D1xEiBJDrw...|    2|Average place, bu...|     0|xmtWxX67wF7afssQR...| 730000000000|      1|  1.0|[average, place, ...|average,place,wis

## Get Number of Tokens

In [84]:
# Add n_clean_tokens: the size of each review's cleanTokens array.
normalized_data = normalized_data.withColumn('n_clean_tokens', F.expr('size(cleanTokens)'))
normalized_data.show(5)

+---+--------------------+----+----------------+-----+--------------------+-----+--------------------+------+--------------------+-------------+-------+-----+--------------------+--------------------+--------------+
|  #|         business_id|cool|            date|funny|           review_id|stars|                text|useful|             user_id|      index_1|index_2|label|         cleanTokens|       cleanTokens_n|n_clean_tokens|
+---+--------------------+----+----------------+-----+--------------------+-----+--------------------+------+--------------------+-------------+-------+-----+--------------------+--------------------+--------------+
|  0|XFhnPpO2x0nS-lVkq...|   1|14/02/2017 16:50|    0|o6VZ0cQEfIBvGW15u...|    2|I knew it was a k...|     0|dk5yqirSwDH7Cpp52...|1080000000000|      0|  1.0|[know, kitschy, p...|know,kitschy,plac...|            65|
|  1|SRCepG6PbMvwKcfIU...|   0|11/12/2015 02:47|    0|PBy3gB_D1xEiBJDrw...|    2|Average place, bu...|     0|xmtWxX67wF7afssQR...| 73000

In [85]:
#normalized_data.select('n_clean_tokens').show()

## Get Sentiment per review

In [86]:
# Lexicon-based sentiment per review. These stages run on the raw "text" column
# and, after the Finisher, leave one new column on normalized_data:
# "lex_sentiment".
# NOTE: this cell re-binds document_assembler, sentence_detector, tokenizer,
# lemmatizer, finisher and pipeline, replacing the objects created in the
# "Create Clean Tokens" cell.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    # Set explicitly (previously left to the annotator's default) so the column
    # name visibly matches what sentence_detector reads.
    .setOutputCol("document")
)

sentence_detector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Parenthesised chain replaces a trailing backslash that continued into a
# whitespace-only line.
lemmatizer = (
    LemmatizerModel.pretrained('lemma_antbnc')
    .setInputCols(["token"])
    .setOutputCol("lemma")
)

# Sentiment lexicon source: Chen, Y., & Skiena, S. (2014). Building Sentiment
# Lexicons for All Major Languages. In ACL (2) (pp. 383-389).
# The dictionary file is read with "," as its delimiter.
sentiment_detector = (
    SentimentDetector()
    .setInputCols(["lemma", "sentence"])
    .setOutputCol("sentiment_score")
    .setDictionary("./sentiment_lexicon.txt", ",")
)

# Expose the sentiment annotations as the plain column "lex_sentiment".
finisher = (
    Finisher()
    .setInputCols(["sentiment_score"])
    .setOutputCols(["lex_sentiment"])
)

pipeline = Pipeline(stages=[
    document_assembler,
    sentence_detector,
    tokenizer,
    lemmatizer,
    sentiment_detector,
    finisher,
])

# Fit and transform the same frame, re-binding normalized_data to the result.
normalized_data = pipeline.fit(normalized_data).transform(normalized_data)
normalized_data.show()

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
+---+--------------------+----+----------------+-----+--------------------+-----+--------------------+------+--------------------+-------------+-------+-----+--------------------+--------------------+--------------+-------------+
|  #|         business_id|cool|            date|funny|           review_id|stars|                text|useful|             user_id|      index_1|index_2|label|         cleanTokens|       cleanTokens_n|n_clean_tokens|lex_sentiment|
+---+--------------------+----+----------------+-----+--------------------+-----+--------------------+------+--------------------+-------------+-------+-----+--------------------+--------------------+--------------+-------------+
|  0|XFhnPpO2x0nS-lVkq...|   1|14/02/2017 16:50|    0|o6VZ0cQEfIBvGW15u...|    2|I knew it was a k...|     0|dk5yqirSwDH7Cpp52...|1080000000000|      0|  1.0|[know, kitschy, p...|know,kitschy,plac...|           

## Get Sentiment ratio from the sum of all sentences for each review (needs to be fixed)

In [87]:
### NEEDS TO BE FIXED — original author's marker, kept on purpose: the ratio
### logic in this cell is still flagged as work in progress.
from pyspark.sql import functions as F

# Run the pretrained "analyze_sentiment" pipeline on the review text.
# NOTE: this re-binds `pipeline`, replacing the Pipeline built in the previous cell.
pipeline = PretrainedPipeline("analyze_sentiment", lang="en")
sentiment_data = pipeline.annotate(normalized_data, column='text')

# Count the entries of sentiment.result equal to "positive" / "negative".
sentiment_data = sentiment_data.withColumn('pos_cnt', F.expr('size(filter(sentiment.result, x -> x in ("positive")))'))
sentiment_data = sentiment_data.withColumn('neg_cnt', F.expr('size(filter(sentiment.result, x -> x in ("negative")))'))

# pos_prc = pos_cnt / (pos_cnt + neg_cnt). When both counts are 0 the division
# produces null (Spark's default, non-ANSI behaviour), which is replaced by 0.
# The fill is restricted to pos_prc: a bare fillna(0) would also overwrite nulls
# in every other numeric column (e.g. turn a missing `label` into 0).
sentiment_data = sentiment_data.withColumn(
    "pos_prc",
    sentiment_data["pos_cnt"] / (sentiment_data["pos_cnt"] + sentiment_data["neg_cnt"]),
).fillna(0, subset=["pos_prc"])

sentiment_data.show(5)
#sentiment_data.select('sentiment.result').show(5)

analyze_sentiment download started this may take some time.
Approx size to download 4.9 MB
[OK!]
+---+--------------------+----+----------------+-----+--------------------+-----+--------------------+------+--------------------+-------------+-------+-----+--------------------+--------------------+--------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------+-------+------------------+
|  #|         business_id|cool|            date|funny|           review_id|stars|                text|useful|             user_id|      index_1|index_2|label|         cleanTokens|       cleanTokens_n|n_clean_tokens|lex_sentiment|            document|            sentence|               token|             checked|           sentiment|pos_cnt|neg_cnt|           pos_prc|
+---+--------------------+----+----------------+-----+--------------------+-----+--------------------+------+--------------------+-------------+-------+-----+-----

## Snorkel trials


In [89]:
#Try to apply Labeling function
import pyspark.sql.functions as F
from snorkel.labeling import LabelModel
from snorkel.labeling.apply.spark import SparkLFApplier
import pandas as pd
import numpy as np
#from snorkel.labeling import ,LFAnalysis
from snorkel.labeling import LFAnalysis
from pyspark.sql import Row
from snorkel.labeling.lf import labeling_function
#from snorkel.labeling.lf.nlp_spark import spark_nlp_labeling_function
from snorkel.preprocess import preprocessor

# Integer votes returned by the labeling functions defined below.
ABSTAIN = -1   # the labeling function gives no vote for the row
NEGATIVE = 0
POSITIVE = 1

In [92]:
# Preview the first 5 rows of sentiment_data.
sentiment_data.show(5)

+---+--------------------+----+----------------+-----+--------------------+-----+--------------------+------+--------------------+-------------+-------+-----+--------------------+--------------------+--------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------+-------+------------------+
|  #|         business_id|cool|            date|funny|           review_id|stars|                text|useful|             user_id|      index_1|index_2|label|         cleanTokens|       cleanTokens_n|n_clean_tokens|lex_sentiment|            document|            sentence|               token|             checked|           sentiment|pos_cnt|neg_cnt|           pos_prc|
+---+--------------------+----+----------------+-----+--------------------+-----+--------------------+------+--------------------+-------------+-------+-----+--------------------+--------------------+--------------+-------------+--------------------+----------

In [113]:
# Show the first element of the lex_sentiment array for the first 5 rows; this
# is the value the keyword_disgusting labeling function compares against.
sentiment_data.select(sentiment_data.lex_sentiment[0]).show(5)

+----------------+
|lex_sentiment[0]|
+----------------+
|        positive|
|        positive|
|        positive|
|        positive|
|        positive|
+----------------+
only showing top 5 rows



In [114]:
from snorkel.labeling import LabelingFunction


@labeling_function()
def short_comment(x):
    """Vote NEGATIVE for short reviews, otherwise abstain.

    Heuristic: negative comments are often short, such as 'foods were insane'.
    A review counts as short when ``x.n_clean_tokens`` is below 50.
    """
    if x.n_clean_tokens < 50:
        return NEGATIVE
    return ABSTAIN

@labeling_function()
def keyword_disgusting(x):
    """Vote POSITIVE when the lexicon sentiment of the review is 'positive'.

    Despite its name, this function does no keyword matching: it looks only at
    the first entry of ``x.lex_sentiment``. The name is kept so that existing
    references to this labeling function keep working.

    Returns POSITIVE if that first entry equals 'positive', and ABSTAIN in every
    other case — including when ``lex_sentiment`` is missing or empty, which
    would otherwise raise while indexing.
    """
    sentiment = x.lex_sentiment
    if not sentiment:
        return ABSTAIN
    return POSITIVE if sentiment[0] == 'positive' else ABSTAIN


In [115]:
# Apply both labeling functions to the rows of sentiment_data (via its RDD) to
# build the label matrix L_dev.
lfs = [short_comment,keyword_disgusting]
applier = SparkLFApplier(lfs)
L_dev = applier.apply(sentiment_data.rdd)
# Gold labels taken from the hand-labelled "label" column.
# NOTE(review): collect() returns Row objects, so this array presumably has
# shape (n, 1) with float values rather than a flat integer vector — confirm
# lf_summary treats it as intended.
g_label =np.array(sentiment_data.select('label').collect())
# Summarise the labeling functions against the gold labels; being the last
# expression of the cell, the summary table is displayed.
LFAnalysis(L_dev, lfs).lf_summary(g_label)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
short_comment,0,[0],0.61,0.544,0.544,195,110,0.639344
keyword_disgusting,1,[1],0.89,0.544,0.544,166,279,0.373034


## Appendix

In [15]:
#get business_id
#df_dev.select('business_id').rdd.flatMap(lambda x: x).collect()

['XFhnPpO2x0nS-lVkqZSf_Q',
 'SRCepG6PbMvwKcfIUjk8bQ',
 'pneBBrIzd3SCbbglYRTBZA',
 '8enR8Wc0ot9L41JY1FxMlA',
 'HhVmDybpU7L50Kb5A0jXTg',
 'LUDX--wfStrKavGyitk4nA',
 'AED5R0yXVzJvWAGEqWBG4A',
 'cxsN3mkioAXuOLKy04QpyA',
 'vfwTrfrZ3FWhVkyLjAggLQ',
 'dsYjWjV0vr1RgSJ78B7K1w',
 'Jm_R0IhwFlfCAV3Fk6NTWg',
 'WFB1fn8rWNukmmIfTg6AMw',
 'qQXc9D6PV1Y8RhIBZpj-Jg',
 'xiWmyJyTa7o1H6uUNH2dEw',
 'ZAuAwz1ijuydn5yL3q3rzA',
 'TT2bNjvnVNFOanDNYnAffQ',
 'bPcqucuuClxYrIM8xWoArg',
 'aqwn3jdQLCWnb59gHK4nAw',
 'RThuRlhMdowhtY8vITEIzQ',
 'xfWdUmrz2ha3rcigyITV0g',
 'F1PMcEIg-FHZvwhiUoFEgw',
 '4g7vpvqPMgWGwge1w5JSAA',
 'fI9ErCUGY8rXRPBbatcxMA',
 'iag4htR8F7FmE9zpz7_GMw',
 'yxdmX8dXk5Se9zJ4WAW1pw',
 '5shgJB7a-2_gdnzc0gsOtg',
 '3oajqiPFhYQJsHHiVCchEQ',
 'BedDWXrINOyEjP8NTXoQ-Q',
 'ApHiz1xxENwjkp8MKrE4Dg',
 'NblDoJBEwhkJyvAuxzh4rg',
 '3Jq5LfJ5fmJ5KmuA6VHmzw',
 'zaLFPpm-t89qSDOaFVgwDQ',
 'rq5dgoksPHkJwJNQKlGQ7w',
 'GQ5f1D9CbFLXkBAlGR7riA',
 'F3oDNZGUJBIHHy4wQ5jRGw',
 '1qwkbN3wu3AGDxMTXVz2gA',
 'CdpBVSkL2Pljj-yP0ihb_w',
 