In [1]:
import sparknlp
from sparknlp.annotator import *
from sparknlp.base import *
from pyspark.sql.types import StringType

In [2]:
# Obtain the session object from Spark NLP's helper; the rest of the notebook uses it as `spark`.
spark = sparknlp.start()

# Report the library versions this notebook was executed against.
for label, version in (
    ("Spark NLP version: ", sparknlp.version()),
    ("Apache Spark version: ", spark.version),
):
    print(label, version)

Spark NLP version:  2.4.1
Apache Spark version:  2.4.4


In [5]:
# Two short sample sentences used as the demo corpus.
sample_texts = [
    "Cloud computing is benefiting major manufacturing companies compute Listing Listed List",
    "Big data cloud computing cyber security machine learning",
]

# Build a DataFrame from the strings and name its single column "text".
dfTest = spark.createDataFrame(sample_texts, StringType()).toDF("text")

# truncate=False prints each sentence in full.
dfTest.show(truncate=False)

+---------------------------------------------------------------------------------------+
|text                                                                                   |
+---------------------------------------------------------------------------------------+
|Cloud computing is benefiting major manufacturing companies compute Listing Listed List|
|Big data cloud computing cyber security machine learning                               |
+---------------------------------------------------------------------------------------+



In [6]:
# Stage 1: read the raw "text" column. The output column is named explicitly
# (instead of relying on the annotator's default) so it visibly matches the
# "document" input the tokenizer expects below.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Stage 3: n-grams of 2 consecutive tokens.
bigrams = (
    NGramGenerator()
    .setInputCols(["token"])
    .setOutputCol("bigrams")
    .setN(2)
)

# Stage 4: n-grams of 3 consecutive tokens.
# NOTE: despite the "_cum" suffix, cumulative mode is not enabled on this
# generator; the "trigrams" column contains 3-token n-grams only. The variable
# name is kept unchanged in case other cells refer to it.
trigrams_cum = (
    NGramGenerator()
    .setInputCols(["token"])
    .setOutputCol("trigrams")
    .setN(3)
)

# Assemble the stages in dependency order: document -> token -> n-grams.
pipeline = Pipeline(stages=[
    document_assembler,
    tokenizer,
    bigrams,
    trigrams_cum,
])

In [7]:
# Fit the pipeline on the sample DataFrame; the fitted model is reused by later cells.
model = pipeline.fit(dataset=dfTest)

# Apply the fitted pipeline to the same DataFrame to add the annotation columns.
prediction = model.transform(dataset=dfTest)

In [8]:
# Show the generated bigram strings for both rows, cutting each cell at 60 characters.
(
    prediction
    .select("bigrams.result")
    .show(n=2, truncate=60)
)

+------------------------------------------------------------+
|                                                      result|
+------------------------------------------------------------+
|[Cloud computing, computing is, is benefiting, benefiting...|
|[Big data, data cloud, cloud computing, computing cyber, ...|
+------------------------------------------------------------+



In [9]:
# Show the generated trigram strings for both rows, cutting each cell at 60 characters.
(
    prediction
    .select("trigrams.result")
    .show(n=2, truncate=60)
)

+------------------------------------------------------------+
|                                                      result|
+------------------------------------------------------------+
|[Cloud computing is, computing is benefiting, is benefiti...|
|[Big data cloud, data cloud computing, cloud computing cy...|
+------------------------------------------------------------+



In [10]:
from sparknlp.base import LightPipeline

# Wrap the fitted pipeline so it can annotate a plain Python string directly.
light_model = LightPipeline(model)

text = "Cloud computing is benefiting major manufacturing companies"

# `annotate` returns a dict keyed by each stage's output column name.
result = light_model.annotate(text)
result

{'document': ['Cloud computing is benefiting major manufacturing companies'],
 'token': ['Cloud',
  'computing',
  'is',
  'benefiting',
  'major',
  'manufacturing',
  'companies'],
 'bigrams': ['Cloud computing',
  'computing is',
  'is benefiting',
  'benefiting major',
  'major manufacturing',
  'manufacturing companies'],
 'trigrams': ['Cloud computing is',
  'computing is benefiting',
  'is benefiting major',
  'benefiting major manufacturing',
  'major manufacturing companies']}

In [13]:
# Iterating a dict yields its keys, i.e. the output column names of the pipeline.
list(result)

['document', 'token', 'bigrams', 'trigrams']

In [14]:
# Bigrams produced for the single annotated sentence.
result["bigrams"]

['Cloud computing',
 'computing is',
 'is benefiting',
 'benefiting major',
 'major manufacturing',
 'manufacturing companies']

In [15]:
# Trigrams produced for the single annotated sentence.
result["trigrams"]

['Cloud computing is',
 'computing is benefiting',
 'is benefiting major',
 'benefiting major manufacturing',
 'major manufacturing companies']