In [1]:
import findspark
findspark.init()

In [3]:
from pyspark.sql.types import *
import sparknlp
spark = sparknlp.start()

In [119]:
from pyspark.ml.feature import RegexTokenizer

sentenceDataFrame = spark.createDataFrame([
    (0, "Hi I heard about Spark hear"),
    (1, "I wish Java could use case classes"),
    (2, "Logistic,regression,models,are,neat")
], ["id", "sentence"])

sentenceDataFrame.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hi I heard about ...|
|  1|I wish Java could...|
|  2|Logistic,regressi...|
+---+--------------------+



In [120]:
regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
regexTokenized = regexTokenizer.transform(sentenceDataFrame)
regexTokenized.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|
|  1|I wish Java could...|[i, wish, java, c...|
|  2|Logistic,regressi...|[logistic, regres...|
+---+--------------------+--------------------+



In [121]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

countTokens = udf(lambda words: len(words), IntegerType())
tok=regexTokenized.select("sentence", "words").withColumn("tokens", countTokens(col("words")))
tok.show(truncate=False)


+-----------------------------------+------------------------------------------+------+
|sentence                           |words                                     |tokens|
+-----------------------------------+------------------------------------------+------+
|Hi I heard about Spark hear        |[hi, i, heard, about, spark, hear]        |6     |
|I wish Java could use case classes |[i, wish, java, could, use, case, classes]|7     |
|Logistic,regression,models,are,neat|[logistic, regression, models, are, neat] |5     |
+-----------------------------------+------------------------------------------+------+



In [122]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

countTokens = udf(lambda words: len(words), IntegerType())
tokenized = tokenizer.transform(sentenceDataFrame)
tokenized.select("sentence", "words")\
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

+-----------------------------------+------------------------------------------+------+
|sentence                           |words                                     |tokens|
+-----------------------------------+------------------------------------------+------+
|Hi I heard about Spark hear        |[hi, i, heard, about, spark, hear]        |6     |
|I wish Java could use case classes |[i, wish, java, could, use, case, classes]|7     |
|Logistic,regression,models,are,neat|[logistic,regression,models,are,neat]     |1     |
+-----------------------------------+------------------------------------------+------+



# StopWordsRemover

In [123]:
from pyspark.ml.feature import StopWordsRemover


remover = StopWordsRemover(inputCol="words", outputCol="filtered")
remover.transform(tok).show(truncate=False)

+-----------------------------------+------------------------------------------+------+------------------------------------+
|sentence                           |words                                     |tokens|filtered                            |
+-----------------------------------+------------------------------------------+------+------------------------------------+
|Hi I heard about Spark hear        |[hi, i, heard, about, spark, hear]        |6     |[hi, heard, spark, hear]            |
|I wish Java could use case classes |[i, wish, java, could, use, case, classes]|7     |[wish, java, use, case, classes]    |
|Logistic,regression,models,are,neat|[logistic, regression, models, are, neat] |5     |[logistic, regression, models, neat]|
+-----------------------------------+------------------------------------------+------+------------------------------------+



In [124]:
#df=tokenizer.transform(sentenceDataFrame)
#df.show()

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.embeddings import *

from pyspark.sql.functions import udf, col, array
from pyspark.sql.types import *

#from sparknlp.base import LightPipeline

document_assembler = DocumentAssembler().setInputCol("sentence").setOutputCol("document")

tokenizer = Tokenizer().setInputCols("document").setOutputCol("token")

bert_pipeline = Pipeline().setStages([document_assembler,tokenizer])

print(bert_pipeline.fit(sentenceDataFrame).transform(sentenceDataFrame).printSchema())

df=bert_pipeline.fit(sentenceDataFrame).transform(sentenceDataFrame)

df.select("token.result").show(truncate=False)
#df.withColumn('squared', col('token')).show()
#df.withColumn("squared", array(df["token"])).printSchema


root
 |-- id: long (nullable = true)
 |-- sentence: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- token: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value:

In [179]:
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.embeddings import *

data = [
  ("New York is the greatest city in the world great List ", 0),
  ("The beauty of Paris is vast", 1),
  ("The Centre Pompidou is in Paris", 1)
]

df = spark.createDataFrame(data, ["text","label"])


document_assembler = DocumentAssembler()\
  .setInputCol("text")\
  .setOutputCol("document")

tokenizer = Tokenizer().setInputCols(["document"])\
  .setOutputCol("token")
 
word_embeddings = BertEmbeddings.pretrained('bert_base_cased', 'en')\
  .setInputCols(["document", "token"])\
  .setOutputCol("embeddings")


bert_pipeline = Pipeline().setStages(
  [
    document_assembler,
    tokenizer
      #,
    #word_embeddings
  ]
)

df_bert = bert_pipeline.fit(df).transform(df)
#display(df_bert)
df_bert.show()

bert_base_cased download started this may take some time.
Approximate size to download 389.2 MB
[OK!]
+--------------------+-----+--------------------+--------------------+
|                text|label|            document|               token|
+--------------------+-----+--------------------+--------------------+
|New York is the g...|    0|[[document, 0, 53...|[[token, 0, 2, Ne...|
|The beauty of Par...|    1|[[document, 0, 26...|[[token, 0, 2, Th...|
|The Centre Pompid...|    1|[[document, 0, 30...|[[token, 0, 2, Th...|
+--------------------+-----+--------------------+--------------------+



In [143]:
from sparknlp.annotator import Stemmer, Lemmatizer
stemmer = Stemmer().setInputCols(['token']).setOutputCol('stems_annotations')
bert_pipeline = Pipeline().setStages(
  [
    document_assembler,
    tokenizer,
    stemmer
  ]
)

df_bert = bert_pipeline.fit(df).transform(df)
#display(df_bert)
#df_bert.show()
print(df_bert.printSchema())
df_bert.select("token.result","stems_annotations.result").show(truncate=False)

root
 |-- text: string (nullable = true)
 |-- label: long (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- token: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: 

In [144]:
lemmatizer = LemmatizerModel.pretrained(name="lemma_antbnc", lang="en").setInputCols(['token']).setOutputCol('lemma_annotations')

bert_pipeline = Pipeline().setStages(
  [
    document_assembler,
    tokenizer,
    lemmatizer
  ]
)

df_bert = bert_pipeline.fit(df).transform(df)
df_bert.select("lemma_annotations.result").show(truncate=False)


lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
+-------------------------------------------------------------------------------+
|result                                                                         |
+-------------------------------------------------------------------------------+
|[New, York, be, the, great, city, in, the, world, great, Listing, List, Listed]|
|[The, beauty, of, Paris, be, vast]                                             |
|[The, Centre, Pompidou, be, in, Paris]                                         |
+-------------------------------------------------------------------------------+



In [207]:
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.embeddings import *
from pyspark.sql.types import *


import sparknlp
spark = sparknlp.start()

sentenceDataFrame = spark.createDataFrame([
    ("abc cde",1),("eefg efa efb",2)
], ["text","v"])

sentenceDataFrame.show()

# Spell check (Distance)

#df = spark.createDataFrame([("abc cde"),("eefg efa efb")], ["names"])

nlpPipeline = Pipeline().setStages([
  DocumentAssembler().setInputCol("text").setOutputCol("document"),
  Tokenizer().setInputCols("document").setOutputCol("tokens")])

df_bert = bert_pipeline.fit(sentenceDataFrame).transform(sentenceDataFrame)
df_bert.show()


+------------+---+
|        text|  v|
+------------+---+
|     abc cde|  1|
|eefg efa efb|  2|
+------------+---+

+------------+---+--------------------+--------------------+
|        text|  v|            document|               token|
+------------+---+--------------------+--------------------+
|     abc cde|  1|[[document, 0, 6,...|[[token, 0, 2, ab...|
|eefg efa efb|  2|[[document, 0, 11...|[[token, 0, 3, ee...|
+------------+---+--------------------+--------------------+

