In [1]:
# env : pixlake
# we focuing on pyspark dataframe processing
# documentation https://spark.apache.org/docs/2.4.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame
%load_ext autoreload
%autoreload 2

In [2]:
# make you auto compeletion faster
# https://stackoverflow.com/questions/40536560/ipython-and-jupyter-autocomplete-not-working
%config Completer.use_jedi = False

In [3]:
import os
import sys
from os.path import join as PJ
def get_workstation_spark_path(where_are_you : str) -> str:
    if where_are_you == 'titan':
        return '/home/data/ryanchao2012/lib'
    elif where_are_you == 'thor':
        return '/opt/spark/versions'
    else:
        raise ValueError("wrong work station name")

spark_path = get_workstation_spark_path('thor')

print('You have pyspark version : ', os.listdir(spark_path))
os.environ['PYSPARK_PYTHON'] = sys.executable
# spark-2.3, spark-2.4
# os.environ['SPARK_HOME'] = os.path.join(spark
os.environ['SPARK_HOME'] = '/opt/spark/versions/spark-3.1.2-bin-hadoop2.7/'

You have pyspark version :  ['spark-2.3', 'spark-3.1.2-bin-hadoop2.7', 'spark-3.0', 'spark-3.0.1-bin-hadoop2.7', 'spark-2.3.4-bin-hadoop2.7', 'spark-2.4.7-bin-hadoop2.7', 'spark-2.4', 'spark-3.1']


In [4]:
os.environ['SPARK_HOME']

'/opt/spark/versions/spark-3.1.2-bin-hadoop2.7/'

In [5]:
from os.path import join
import pandas as pd
from pyspark.sql import SparkSession as Session
from pyspark.sql import DataFrame
from pyspark import SparkConf as Conf
from pyspark.sql import functions as F, Window as W, types as T
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
C = F.col

In [6]:
conf = (Conf()
    .set('spark.sql.sources.partitionOverwriteMode', 'dynamic')
    .set('spark.driver.memory', '4g')
    .set('spark.driver.maxResultSize', '1g')
   )

In [7]:
spark = (Session
     .builder
     .appName('pyspark-challenge-nlp')
     .master('local[2]')
     .config(conf=conf)
     .getOrCreate())

21/08/03 10:27:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/08/03 10:27:59 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
21/08/03 10:27:59 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [8]:
spark

# CounterVectorizer | Hashing TF

https://towardsdatascience.com/countvectorizer-hashingtf-e66f169e2d4e

1. **input** : array of tokens
2. **output** : vector (dense or sparse)

e.g.

```
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)
```

calculate via all the corpus(the word column)

In [9]:
data = [
    (0, "PYTHON HIVE HIVE".split(" ")),
    (1, "JAVA JAVA SQL".split(" ")),
]

cols = ["id","words"]

df = spark.createDataFrame(data, cols)
df.show(truncate=False)
df.printSchema()

[Stage 0:>                                                          (0 + 1) / 1]

+---+--------------------+
|id |words               |
+---+--------------------+
|0  |[PYTHON, HIVE, HIVE]|
|1  |[JAVA, JAVA, SQL]   |
+---+--------------------+

root
 |-- id: long (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)



                                                                                

In [10]:
## CounterVectorlizerModel

from pyspark.ml.feature import CountVectorizer
# CountVectorizer??

# minTF=1.0,
# minDF=1.0,
# vocabSize=262144,
# binary=False,
# inputCol=None,
# outputCol=None,

In [11]:
cv = CountVectorizer(inputCol="words",outputCol="features")

model = cv.fit(df)

res = model.transform(df)

res.printSchema()

res.show(truncate=False)

print('you can check the vocabulary : ',sorted(model.vocabulary))
print('the order per row (follow the counts)')

for row in df.rdd.toLocalIterator():
    print(row.id, ':', set(row.words), sep=' ')
    
# Term    Freq Index
# HIVE     2     0
# JAVA     2     1
# PYTHON      1     2
# SQL   1     3

# (4,             [1, 2], [2.0, 1.0])
# (vector_legnth, vector index)

root
 |-- id: long (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)

+---+--------------------+-------------------+
|id |words               |features           |
+---+--------------------+-------------------+
|0  |[PYTHON, HIVE, HIVE]|(4,[0,3],[2.0,1.0])|
|1  |[JAVA, JAVA, SQL]   |(4,[1,2],[2.0,1.0])|
+---+--------------------+-------------------+

you can check the vocabulary :  ['HIVE', 'JAVA', 'PYTHON', 'SQL']
the order per row (follow the counts)
0 : {'HIVE', 'PYTHON'}
1 : {'SQL', 'JAVA'}


In [12]:
# sorted(model.vocabulary)
# model.params

In [13]:
# dir(cv)
# dir(model)

In [14]:
from pyspark.ml.feature import HashingTF

In [15]:
ht = HashingTF(inputCol="words",outputCol="features")

ht_res = ht.transform(df)

ht_res.printSchema()
ht_res.show(truncate=False)

root
 |-- id: long (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)

+---+--------------------+----------------------------------+
|id |words               |features                          |
+---+--------------------+----------------------------------+
|0  |[PYTHON, HIVE, HIVE]|(262144,[129668,191247],[2.0,1.0])|
|1  |[JAVA, JAVA, SQL]   |(262144,[53343,256570],[2.0,1.0]) |
+---+--------------------+----------------------------------+



# TFIDF

https://spark.apache.org/docs/2.3.4/api/python/pyspark.ml.html?highlight=idf#pyspark.ml.feature.IDF

$t$ : term

$d$ : document

$D$ : corpus

$TF(t, d)$ - Term frequency : number of times that term $t$ appears in document $d$

$DF(t, D) : $ - document frequency : number of documents that contains term $t$

$IDF(t, D) = log \frac{|D| + 1}{DF(t, D) + 1}$ - numerical measure of how much information a term $t$ provides in corpus $D$

* a term appears in all documents, $IDF(t, D) = 0$

* smoothing term is applied to avoid dividibng by zero for terms outside the corpus(unseen testing set)

## Toy Sample I

https://runawayhorse001.github.io/LearningApacheSpark/pyspark.pdf

page 93

In [16]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import IDF

In [17]:
sentence = [
    (0, "Python python Spark Spark"),
    (1, "Python SQL")
]

col = ["doc_id","document"]

df = spark.createDataFrame(sentence, col)
df.printSchema()
df.show(truncate=False)

root
 |-- doc_id: long (nullable = true)
 |-- document: string (nullable = true)

+------+-------------------------+
|doc_id|document                 |
+------+-------------------------+
|0     |Python python Spark Spark|
|1     |Python SQL               |
+------+-------------------------+



Calculate the idf of each terms by hand

$t$ : (normalized) term : $\text{{python, spark, sql}}$

$d_{i}$ : document : $d_1, d_2$

$D$ : copus = ${d_1, d_2}$

----------------------------------------------------------------

$DF(\text{python}, D) = 2$

$DF(\text{spark}, D) = 1$

$DF(\text{sql}, D) = 1$

$DF(\text{python}, D) = log_{2} \frac{2+1}{2+1} = 0$ - `python` appears in every document

$DF(\text{spark}, D) = log_{2} \frac{2+1}{1+1} = log_{2} \frac{3}{2}$ - 0.405

$DF(\text{sql}, D) = log_{2} \frac{2+1}{1+1} = log_{2} \frac{3}{2}$ - 0.405


In [18]:
import numpy as np

np.log(3/2)

0.4054651081081644

In [19]:
# TODO

# It's wrong, we need idf only

tokenizer = Tokenizer(inputCol="document",
                      outputCol="term")

vectorizer = CountVectorizer(inputCol="term",
                             outputCol="raw_features")

idf = IDF(inputCol="raw_features",outputCol="features")

pipeline = Pipeline(stages=[tokenizer, vectorizer, idf])

model = pipeline.fit(df)

In [20]:
model.transform(df).show(vertical=True, truncate=False)

-RECORD 0------------------------------------------
 doc_id       | 0                                  
 document     | Python python Spark Spark          
 term         | [python, python, spark, spark]     
 raw_features | (3,[0,1],[2.0,2.0])                
 features     | (3,[0,1],[0.0,0.8109302162163288]) 
-RECORD 1------------------------------------------
 doc_id       | 1                                  
 document     | Python SQL                         
 term         | [python, sql]                      
 raw_features | (3,[0,2],[1.0,1.0])                
 features     | (3,[0,2],[0.0,0.4054651081081644]) 



### IDF Only

In [21]:
df.show(truncate=False)

+------+-------------------------+
|doc_id|document                 |
+------+-------------------------+
|0     |Python python Spark Spark|
|1     |Python SQL               |
+------+-------------------------+



In [22]:
tokenizer = Pipeline(stages=[tokenizer])

model = tokenizer.fit(df)

doc_term_sdf = (
    model.transform(df)
    .withColumn('term',F.explode_outer('term'))
)

print("doc_term dataframe : ")

doc_term_sdf.show(truncate=False)

def compute_idf(doc_term_sdf : DataFrame,
                doc_id_col : str,
                term_col : str,
                verbose : bool = True
               ):
    """
    Input : should be term level dataframe 
            with document content and doc_id
    
    +------+-------------------------+
    |doc_id|document                 |
    +------+-------------------------+
    |0     |Python python Spark Spark|
    |1     |Python SQL               |
    +------+-------------------------+
    
    Out : term level idf values
    
    +------+--------------------------+-----+------------------+
    |term  |document_frequency_by_term|n_doc|idf               |
    +------+--------------------------+-----+------------------+
    |sql   |1                         |2    |0.5849625007211562|
    |spark |1                         |2    |0.5849625007211562|
    |python|2                         |2    |0.0               |
    +------+--------------------------+-----+------------------+
    
    TODO add formula
    
    """
    
    n_document = doc_term_sdf.select(doc_id_col).distinct().count()
    
    if verbose:
        print('n document : ',n_document)
    
    idf : DataFrame = (
    doc_term_sdf
    .groupBy(term_col)
    # well, id means document_id
    .agg(F.countDistinct(doc_id_col).alias("document_frequency_by_term"))
    .withColumn('n_doc', F.lit(n_document).cast('integer'))
    .withColumn("idf",
                F.log2(
                    (C("n_doc") + F.lit(1)) / (C("document_frequency_by_term") + F.lit(1))
                )
               )
    )
    
    return idf
    
    
idf_sdf = compute_idf(doc_term_sdf, 'doc_id', 'term')
idf_sdf.show()

doc_term_idf_sdf = (
    doc_term_sdf
    .join(
        idf_sdf.drop('n_doc'),
    on='term',
    how='left')
)

doc_term_idf_sdf.show(vertical=True,truncate=False)

doc_term dataframe : 
+------+-------------------------+------+
|doc_id|document                 |term  |
+------+-------------------------+------+
|0     |Python python Spark Spark|python|
|0     |Python python Spark Spark|python|
|0     |Python python Spark Spark|spark |
|0     |Python python Spark Spark|spark |
|1     |Python SQL               |python|
|1     |Python SQL               |sql   |
+------+-------------------------+------+



                                                                                

n document :  2




+------+--------------------------+-----+------------------+
|  term|document_frequency_by_term|n_doc|               idf|
+------+--------------------------+-----+------------------+
|   sql|                         1|    2|0.5849625007211562|
| spark|                         1|    2|0.5849625007211562|
|python|                         2|    2|               0.0|
+------+--------------------------+-----+------------------+

-RECORD 0-----------------------------------------------
 term                       | sql                       
 doc_id                     | 1                         
 document                   | Python SQL                
 document_frequency_by_term | 1                         
 idf                        | 0.5849625007211562        
-RECORD 1-----------------------------------------------
 term                       | spark                     
 doc_id                     | 0                         
 document                   | Python python Spark Spark 
 d

## Toy Sample III (PoI)

In [23]:
from pixlake.tables.warehouse.log.mainstream.meta import poi

  from pixlake.tables.warehouse.log.mainstream.meta import poi


In [25]:
# poi_sdf = ( poi
#             .load(daterange=[20210530, 20210530])
#           )

In [27]:
# (
#     poi_sdf
#     .where(C("category_id") == 0)
# #     .orderBy(C("name"))
#     .limit(5)
#     .select(['name'])
# ).show(truncate=False)


sentence = [
    (0, "江蘇老趙刀切麵"),
    (1, "石研室 石頭火鍋"),
    (2, "石二鍋"),
    (3, "肉多多火鍋"),
    (4, "Vivienne Westwood Cafe")
]

col = ["doc_id","document"]

df = spark.createDataFrame(sentence, col)
df.printSchema()
df.show(truncate=False)


root
 |-- doc_id: long (nullable = true)
 |-- document: string (nullable = true)

+------+----------------------+
|doc_id|document              |
+------+----------------------+
|0     |江蘇老趙刀切麵        |
|1     |石研室 石頭火鍋       |
|2     |石二鍋                |
|3     |肉多多火鍋            |
|4     |Vivienne Westwood Cafe|
+------+----------------------+



In [28]:
tokenizer = Tokenizer(inputCol="document",
                      outputCol="term")

vectorizer = CountVectorizer(inputCol="term",
                             outputCol="raw_features")

idf = IDF(inputCol="raw_features",outputCol="features")

pipeline = Pipeline(stages=[tokenizer, vectorizer, idf])

model = pipeline.fit(df)

In [29]:
res = model.transform(df)

res.printSchema()

res.show(vertical=True, truncate=False)

root
 |-- doc_id: long (nullable = true)
 |-- document: string (nullable = true)
 |-- term: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- raw_features: vector (nullable = true)
 |-- features: vector (nullable = true)

-RECORD 0------------------------------------------------------------------------------
 doc_id       | 0                                                                      
 document     | 江蘇老趙刀切麵                                                         
 term         | [江蘇老趙刀切麵]                                                       
 raw_features | (8,[3],[1.0])                                                          
 features     | (8,[3],[1.0986122886681098])                                           
-RECORD 1------------------------------------------------------------------------------
 doc_id       | 1                                                                      
 document     | 石研室 石頭火鍋                                     

## idf only

In [30]:
from pixlake.nlp.utils import unigram
df.show(truncate=False)

+------+----------------------+
|doc_id|document              |
+------+----------------------+
|0     |江蘇老趙刀切麵        |
|1     |石研室 石頭火鍋       |
|2     |石二鍋                |
|3     |肉多多火鍋            |
|4     |Vivienne Westwood Cafe|
+------+----------------------+



In [31]:
doc_term_sdf = (
    df
    .withColumn('term',F.udf(unigram,
                             returnType="array<string>")("document", F.lit(True))
               )
    .withColumn('term',F.explode_outer('term'))
)


doc_term_sdf.show(truncate=False)


    
idf_sdf = compute_idf(doc_term_sdf, 'doc_id', 'term')

doc_term_idf_sdf = (
    doc_term_sdf
    .join(
        idf_sdf.drop('n_doc'),
    on='term',
    how='left')
)

(
    doc_term_idf_sdf
    .orderBy("idf")
#     .orderBy("doc_id",'document')
).show(vertical=True,truncate=False)

+------+---------------+----+
|doc_id|document       |term|
+------+---------------+----+
|0     |江蘇老趙刀切麵 |江  |
|0     |江蘇老趙刀切麵 |蘇  |
|0     |江蘇老趙刀切麵 |老  |
|0     |江蘇老趙刀切麵 |趙  |
|0     |江蘇老趙刀切麵 |刀  |
|0     |江蘇老趙刀切麵 |切  |
|0     |江蘇老趙刀切麵 |麵  |
|1     |石研室 石頭火鍋|石  |
|1     |石研室 石頭火鍋|研  |
|1     |石研室 石頭火鍋|室  |
|1     |石研室 石頭火鍋|石  |
|1     |石研室 石頭火鍋|頭  |
|1     |石研室 石頭火鍋|火  |
|1     |石研室 石頭火鍋|鍋  |
|2     |石二鍋         |石  |
|2     |石二鍋         |二  |
|2     |石二鍋         |鍋  |
|3     |肉多多火鍋     |肉  |
|3     |肉多多火鍋     |多  |
|3     |肉多多火鍋     |多  |
+------+---------------+----+
only showing top 20 rows

n document :  5




-RECORD 0--------------------------------------------
 term                       | 鍋                     
 doc_id                     | 3                      
 document                   | 肉多多火鍋             
 document_frequency_by_term | 3                      
 idf                        | 0.5849625007211562     
-RECORD 1--------------------------------------------
 term                       | 鍋                     
 doc_id                     | 2                      
 document                   | 石二鍋                 
 document_frequency_by_term | 3                      
 idf                        | 0.5849625007211562     
-RECORD 2--------------------------------------------
 term                       | 鍋                     
 doc_id                     | 1                      
 document                   | 石研室 石頭火鍋        
 document_frequency_by_term | 3                      
 idf                        | 0.5849625007211562     
-RECORD 3-------------------------------------



# ngram

* we use `unigram` in pixlake ( a simple tokenizer to split english word and chinese word, skip sapce

## sklearn ngram

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
# from pixlake.nlp.utils import ZH_PATTERNS

In [33]:

corpus = [
    'This is the first document.',
#     '江蘇老趙刀切麵',
]




vectorizer2 = CountVectorizer(
    analyzer='word',
     # you gonna need a tokenizer if you wanna dealing with chinese
    ngram_range=(3, 3)
)
X2 = vectorizer2.fit_transform(corpus)
print(vectorizer2.get_feature_names())
print(X2.toarray())


['is the first', 'the first document', 'this is the']
[[1 1 1]]


## pyspark NGram


input : array of tokens
output : array of Ngram(space seperated)


e.g.

```
root
 |-- doc_id: long (nullable = true)
 |-- token: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- bi-gram: array (nullable = true)
 |    |-- element: string (containsNull = false)
```

if we wanna build `NGram range`, union_dataframe is needed due to it's fux sized range

```
-RECORD 6----------------------------------------------------------------
 doc_id   | 6                                                            
 document | mo-mo-paradise                                               
 token    | [mo-mo-paradise]                                             
 bi-gram  | []
```

You gonna do some filtering each `i^th` gram

In [34]:
from pyspark.ml.feature import NGram

ngram = NGram(n=2, inputCol='token',outputCol='bi-gram')

fivegram = NGram(n=5, inputCol='token',outputCol='5-gram')

sentence = [
    (0, "江蘇老趙刀切麵"),
    (1, "石研室 石頭火鍋"),
    (2, "石二鍋"),
    (3, "しゃぶしゃぶ温野菜日本涮涮鍋專門店"),
    (4, "Vivienne Westwood Cafe"),
    (5, "婧 shabu"),
    (6, 'mo-mo-paradise'),
    (7, 'mo mo paradise')
]


col = ["doc_id","document"]

# space-separated by default

df = (
    spark.createDataFrame(sentence, col)
    .withColumn('token', F.udf(unigram,'array<string>')('document', F.lit(True)))
)

bigram_df = ngram.transform(df).cache()
bigram_df.printSchema()
bigram_df.show(vertical=True,truncate=False)

fivegram.transform(df).show(vertical=True, truncate=False)

root
 |-- doc_id: long (nullable = true)
 |-- document: string (nullable = true)
 |-- token: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- bi-gram: array (nullable = true)
 |    |-- element: string (containsNull = true)

-RECORD 0-------------------------------------------------------------------------------------------
 doc_id   | 0                                                                                       
 document | 江蘇老趙刀切麵                                                                          
 token    | [江, 蘇, 老, 趙, 刀, 切, 麵]                                                            
 bi-gram  | [江 蘇, 蘇 老, 老 趙, 趙 刀, 刀 切, 切 麵]                                              
-RECORD 1-------------------------------------------------------------------------------------------
 doc_id   | 1                                                                                       
 document | 石研室 石頭火鍋                                               

## ngram by window func

input : token - array of string
output : Ngram - array of string

```
root
 |-- doc_id: long (nullable = true)
 |-- token: string (nullable = true)
 |-- bi-gram: array (nullable = true)
 |    |-- element: string (containsNull = true)
```

1. compare to `Pyspark NGram` - the NGram displayed at `row` level not `column` level

It's fix size range, if you wanna implement NGram range, you still needs union_dataframe

and perform some filtering

In [35]:
sentence = [
    (0, "江蘇老趙刀切麵"),
    (1, "石研室 石頭火鍋"),
    (2, "石二鍋"),
    (3, "しゃぶしゃぶ温野菜日本涮涮鍋專門店"),
    (4, "Vivienne Westwood Cafe"),
    (5, "婧 shabu"),
    (6, 'mo-mo-paradise'),
    (7, 'mo mo paradise')
]

col = ["doc_id","document"]

df = spark.createDataFrame(sentence, col)
df.printSchema()
df.show(truncate=False)


bigram_span_in_doc = (
    W.partitionBy('doc_id')
     .orderBy('token_id')
#      .rowsBetween(0, 2) 
     .rowsBetween(0, 1) 
    
    
#     |2     |石二鍋     |0       |石    |石二鍋    | 4gram for 3 words, jusrt filter them =)
)

bigram_df = (
    df
    .withColumn('unigram', F.udf(unigram, "array<string>")("document", F.lit(True)))
    .select(
        '*',
        F.posexplode_outer('unigram').alias('token_id','token')
    )
    .drop('unigram')
    .withColumn('bi-gram', F.collect_list(C("token")).over(bigram_span_in_doc))
    .withColumn('length_of_bi-gram', F.size(C("bi-gram")))
)
#     .withColumn('bi-gram',F.concat_ws('',C("bi-gram")))

bigram_df.printSchema()
bigram_df.show(n=30,truncate=False)

root
 |-- doc_id: long (nullable = true)
 |-- document: string (nullable = true)

+------+----------------------------------+
|doc_id|document                          |
+------+----------------------------------+
|0     |江蘇老趙刀切麵                    |
|1     |石研室 石頭火鍋                   |
|2     |石二鍋                            |
|3     |しゃぶしゃぶ温野菜日本涮涮鍋專門店|
|4     |Vivienne Westwood Cafe            |
|5     |婧 shabu                          |
|6     |mo-mo-paradise                    |
|7     |mo mo paradise                    |
+------+----------------------------------+

root
 |-- doc_id: long (nullable = true)
 |-- document: string (nullable = true)
 |-- token_id: integer (nullable = true)
 |-- token: string (nullable = true)
 |-- bi-gram: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- length_of_bi-gram: integer (nullable = false)

+------+----------------------------------+--------+--------------+------------------+-----------------+
|doc_id|document    

### creating ngram_groups 

In [36]:
from pixlake.etl.snippets import union_dataframes

def build_ngram(
                sdf : DataFrame,
                doc_id_col : str,
                document_col : str,
                output_col : str ,
                ngram_range : list):
    """
    Example
    
        sentence = [
        (0, "江蘇老趙刀切麵"),
        (1, "石研室 石頭火鍋"),
        (2, "石二鍋"),
        (3, "しゃぶしゃぶ温野菜日本涮涮鍋專門店"),
        (4, "Vivienne Westwood Cafe")
        ]

        col = ["doc_id","document"]

        df = spark.createDataFrame(sentence, col)
        
        +------+----------------------+
        |doc_id|document              |
        +------+----------------------+
        |0     |江蘇老趙刀切麵               |
        |1     |石研室 石頭火鍋              |
        |2     |石二鍋                   |
        |3     |しゃぶしゃぶ温野菜日本涮涮鍋專門店     |
        |4     |Vivienne Westwood Cafe|
        +------+----------------------+
        
        ngram_sdf = build_ngram(df,
            doc_id_col='doc_id',
            document_col='document',
            output_col='ngram',
            ngram_range = [2,3]
           )

        (
            ngram_sdf
            .select(
                'doc_id',
                'document',
                C('ngram.n').alias('n'),
                C('ngram.ngram').alias('ngram'),
            )
            .orderBy(C('doc_id'),C('n').desc())
        ).show()
        
        +------+--------+---+---------+
        |doc_id|document|  n|    ngram|
        +------+--------+---+---------+
        |     0| 江蘇老趙刀切麵|  3|[蘇, 老, 趙]|
        |     0| 江蘇老趙刀切麵|  3|[刀, 切, 麵]|
        |     0| 江蘇老趙刀切麵|  3|[趙, 刀, 切]|
        |     0| 江蘇老趙刀切麵|  3|[老, 趙, 刀]|
        |     0| 江蘇老趙刀切麵|  3|[江, 蘇, 老]|
        |     0| 江蘇老趙刀切麵|  2|   [切, 麵]|
        |     0| 江蘇老趙刀切麵|  2|   [江, 蘇]|
        |     0| 江蘇老趙刀切麵|  2|   [老, 趙]|
        |     0| 江蘇老趙刀切麵|  2|   [趙, 刀]|
        |     0| 江蘇老趙刀切麵|  2|   [刀, 切]|
        |     0| 江蘇老趙刀切麵|  2|   [蘇, 老]|
        |     1|石研室 石頭火鍋|  3|[石, 頭, 火]|
        |     1|石研室 石頭火鍋|  3|[研, 室, 石]|
        |     1|石研室 石頭火鍋|  3|[室, 石, 頭]|
        |     1|石研室 石頭火鍋|  3|[石, 研, 室]|
        |     1|石研室 石頭火鍋|  3|[頭, 火, 鍋]|
        |     1|石研室 石頭火鍋|  2|   [室, 石]|
        |     1|石研室 石頭火鍋|  2|   [石, 頭]|
        |     1|石研室 石頭火鍋|  2|   [火, 鍋]|
        |     1|石研室 石頭火鍋|  2|   [頭, 火]|
        +------+--------+---+---------+
        
    """
    
    ngram_start, ngram_end = ngram_range[0], ngram_range[1] # complement
    print(f'ngram_start : {ngram_start}, ngram_end : {ngram_end}')
    
    unigram_sdf = (
        sdf
        .withColumn('unigram', F.udf(unigram, "array<string>")(document_col, F.lit(True)))
        .withColumn('n', F.lit(1))
        .select(
            '*',
            F.posexplode_outer('unigram').alias('token_id','token')
        )
        .drop('unigram')
    )
    
    _sdf_list = []
    
    _sdf_list.append(
                unigram_sdf
                .withColumn(output_col,F.struct(C("n"),
                                                C("token"))
                           )
        )

#     unigram_sdf.show()  

    if ngram_end == 1:
        raise NotImplementedError('NotImplementedError')
#         return union_dataframes()
    
    else:
        for i in range(ngram_start, ngram_end + 1):
            print(f'get {i} grams excution plan...') #2

            token_span = (
                    W.partitionBy(doc_id_col)
                     .orderBy('token_id')
                     .rowsBetween(0, i - 1)
                )
            ngram_sdf = (
                unigram_sdf
                # TODO add ngram position_id
                .withColumn("n", F.lit(i))
                .withColumn(output_col, F.collect_list(C("token")).over(token_span))
                .withColumn('length_of_ngram', F.size(output_col))
#                 .withColumn(output_col,F.concat_ws('',C(output_col)))
                .withColumn(output_col, F.struct(C("n"),C(output_col)))
                .where(C('length_of_ngram') == i)
                .drop('length_of_ngram')
            )
               
#             ngram_sdf.show(truncate=False)
            
            _sdf_list.append(ngram_sdf)
        
    _sdf_list = [sdf.drop('token_id','token','n') for sdf in _sdf_list]
    
    if ngram_start > 1:
        return union_dataframes(_sdf_list[1:])
    else:
        return union_dataframes(_sdf_list)

In [37]:
df.show()

+------+----------------------------------+
|doc_id|                          document|
+------+----------------------------------+
|     0|                    江蘇老趙刀切麵|
|     1|                   石研室 石頭火鍋|
|     2|                            石二鍋|
|     3|しゃぶしゃぶ温野菜日本涮涮鍋專門店|
|     4|              Vivienne Westwood...|
|     5|                          婧 shabu|
|     6|                    mo-mo-paradise|
|     7|                    mo mo paradise|
+------+----------------------------------+



In [38]:
ngram_sdf = build_ngram(df,
            doc_id_col='doc_id',
            document_col='document',
            output_col='ngram',
            ngram_range = [2,3]
           )

ngram_start : 2, ngram_end : 3
get 2 grams excution plan...
get 3 grams excution plan...


In [39]:
(
    ngram_sdf
    .select(
        'doc_id',
        'document',
        C('ngram.n').alias('n'),
        C('ngram.ngram').alias('ngram'),
    )
    .orderBy(C('doc_id'),C('n').desc())
).show()
# ).show(n=200,vertical=True, truncate=False)
# ).count()



+------+---------------+---+------------+
|doc_id|       document|  n|       ngram|
+------+---------------+---+------------+
|     0| 江蘇老趙刀切麵|  3|[趙, 刀, 切]|
|     0| 江蘇老趙刀切麵|  3|[蘇, 老, 趙]|
|     0| 江蘇老趙刀切麵|  3|[刀, 切, 麵]|
|     0| 江蘇老趙刀切麵|  3|[江, 蘇, 老]|
|     0| 江蘇老趙刀切麵|  3|[老, 趙, 刀]|
|     0| 江蘇老趙刀切麵|  2|    [刀, 切]|
|     0| 江蘇老趙刀切麵|  2|    [蘇, 老]|
|     0| 江蘇老趙刀切麵|  2|    [切, 麵]|
|     0| 江蘇老趙刀切麵|  2|    [江, 蘇]|
|     0| 江蘇老趙刀切麵|  2|    [老, 趙]|
|     0| 江蘇老趙刀切麵|  2|    [趙, 刀]|
|     1|石研室 石頭火鍋|  3|[石, 頭, 火]|
|     1|石研室 石頭火鍋|  3|[研, 室, 石]|
|     1|石研室 石頭火鍋|  3|[室, 石, 頭]|
|     1|石研室 石頭火鍋|  3|[石, 研, 室]|
|     1|石研室 石頭火鍋|  3|[頭, 火, 鍋]|
|     1|石研室 石頭火鍋|  2|    [研, 室]|
|     1|石研室 石頭火鍋|  2|    [室, 石]|
|     1|石研室 石頭火鍋|  2|    [石, 頭]|
|     1|石研室 石頭火鍋|  2|    [頭, 火]|
+------+---------------+---+------------+
only showing top 20 rows



                                                                                

## ngram by slice


Do not support column =(



In [45]:
sentence = [
    (0, "江蘇老趙刀切麵"),
    (1, "石研室 石頭火鍋"),
    (2, "石二鍋"),
    (3, "しゃぶしゃぶ温野菜日本涮涮鍋專門店"),
    (4, "Vivienne Westwood Cafe"),
    (5, "婧 shabu"),
    (6, 'mo-mo-paradise'),
    (7, 'mo mo paradise')
]

col = ["doc_id","document"]

df = spark.createDataFrame(sentence, col)
(
    df
    .withColumn('token', F.udf(unigram, 'array<string>')('document',F.lit(True)))
    .withColumn("l_token", F.size("token"))
#     .withColumn('slice_0_2', F.slice("x", 0, 2)) # Unexpected value for start in function slice: SQL array indices start at 1.
    .withColumn('slice_1_2', F.slice("token", 1, 2))
    .withColumn('slice_1_3', F.slice("token", 1, 3))
    # py4j.Py4JException: Method slice([class org.apache.spark.sql.Column, class java.lang.Integer, class org.apache.spark.sql.Column]) does not exist
#     .withColumn('slice_1_N', F.slice("token", 1, C("l_token"))
#                )
).show()

+------+----------------------------------+-----------------------------+-------+--------------------+----------------------+
|doc_id|                          document|                        token|l_token|           slice_1_2|             slice_1_3|
+------+----------------------------------+-----------------------------+-------+--------------------+----------------------+
|     0|                    江蘇老趙刀切麵|   [江, 蘇, 老, 趙, 刀, 切...|      7|            [江, 蘇]|          [江, 蘇, 老]|
|     1|                   石研室 石頭火鍋|   [石, 研, 室, 石, 頭, 火...|      7|            [石, 研]|          [石, 研, 室]|
|     2|                            石二鍋|                 [石, 二, 鍋]|      3|            [石, 二]|          [石, 二, 鍋]|
|     3|しゃぶしゃぶ温野菜日本涮涮鍋專門店|[しゃぶしゃぶ, 温, 野, 菜,...|     12|  [しゃぶしゃぶ, 温]|[しゃぶしゃぶ, 温, 野]|
|     4|              Vivienne Westwood...|         [Vivienne, Westwo...|      3|[Vivienne, Westwood]|  [Vivienne, Westwo...|
|     5|                          婧 shabu|                  [婧, shabu]|      2| 

# LSH API

* Min-Hash for Jaccard (token 沒有權重的概念在裡面)
* Bucketed Random Projection for L2 (Threshold 很難切，因為你不知道要取多少)
* signed random projection for cosine ( spark 沒有官方實作 )