In [1]:
# env : pixlake
# we focuing on pyspark dataframe processing
# documentation https://spark.apache.org/docs/2.4.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame
%load_ext autoreload
%autoreload 2

In [2]:
# make you auto compeletion faster
# https://stackoverflow.com/questions/40536560/ipython-and-jupyter-autocomplete-not-working
%config Completer.use_jedi = False

In [3]:
import os
import sys
from os.path import join as PJ
def get_workstation_spark_path(where_are_you : str) -> str:
    if where_are_you == 'titan':
        return '/home/data/ryanchao2012/lib'
    elif where_are_you == 'thor':
        return '/opt/spark/versions'
    else:
        raise ValueError("wrong work station name")

spark_path = get_workstation_spark_path('thor')

print('You have pyspark version : ', os.listdir(spark_path))
os.environ['PYSPARK_PYTHON'] = sys.executable
# spark-2.3, spark-2.4
os.environ['SPARK_HOME'] = os.path.join(spark_path,'spark-2.3')

You have pyspark version :  ['spark-2.3', 'spark-3.0', 'spark-3.0.1-bin-hadoop2.7', 'spark-2.3.4-bin-hadoop2.7', 'spark-2.4.7-bin-hadoop2.7', 'spark-2.4']


In [4]:
os.environ['SPARK_HOME']

'/opt/spark/versions/spark-2.3'

In [5]:
from os.path import join
import pandas as pd
from pyspark.sql import SparkSession as Session
from pyspark.sql import DataFrame
from pyspark import SparkConf as Conf
from pyspark.sql import functions as F, Window as W, types as T
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
C = F.col

In [6]:
conf = (Conf()
    .set('spark.sql.sources.partitionOverwriteMode', 'dynamic')
    .set('spark.driver.memory', '4g')
    .set('spark.driver.maxResultSize', '1g')
   )

In [7]:
spark = (Session
     .builder
     .appName('pyspark-challenge-nlp')
     .master('local[2]')
     .config(conf=conf)
     .getOrCreate())

In [8]:
spark

# CounterVectorizer | Hashing TF

https://towardsdatascience.com/countvectorizer-hashingtf-e66f169e2d4e


In [9]:
data = [
    (0, "PYTHON HIVE HIVE".split(" ")),
    (1, "JAVA JAVA SQL".split(" ")),
]

cols = ["id","words"]

df = spark.createDataFrame(data, cols)
df.show(truncate=False)
df.printSchema()

+---+--------------------+
|id |words               |
+---+--------------------+
|0  |[PYTHON, HIVE, HIVE]|
|1  |[JAVA, JAVA, SQL]   |
+---+--------------------+

root
 |-- id: long (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [10]:
## CounterVectorlizerModel

from pyspark.ml.feature import CountVectorizer
# CountVectorizer??

# minTF=1.0,
# minDF=1.0,
# vocabSize=262144,
# binary=False,
# inputCol=None,
# outputCol=None,

In [11]:
cv = CountVectorizer(inputCol="words",outputCol="features")

model = cv.fit(df)

res = model.transform(df)

res.show(truncate=False)

print('you can check the vocabulary : ',sorted(model.vocabulary))
print('the order per row (follow the counts)')

for row in df.rdd.toLocalIterator():
    print(row.id, ':', set(row.words), sep=' ')
    
# Term    Freq Index
# HIVE     2     0
# JAVA     2     1
# PYTHON      1     2
# SQL   1     3

# (4,             [1, 2], [2.0, 1.0])
# (vector_legnth, vector index)

+---+--------------------+-------------------+
|id |words               |features           |
+---+--------------------+-------------------+
|0  |[PYTHON, HIVE, HIVE]|(4,[0,3],[2.0,1.0])|
|1  |[JAVA, JAVA, SQL]   |(4,[1,2],[2.0,1.0])|
+---+--------------------+-------------------+

you can check the vocabulary :  ['HIVE', 'JAVA', 'PYTHON', 'SQL']
the order per row (follow the counts)
0 : {'PYTHON', 'HIVE'}
1 : {'JAVA', 'SQL'}


In [12]:
# sorted(model.vocabulary)
# model.params

In [13]:
# dir(cv)
# dir(model)

In [14]:
from pyspark.ml.feature import HashingTF

In [15]:
ht = HashingTF(inputCol="words",outputCol="features")

ht_res = ht.transform(df)

ht_res.show(truncate=False)

+---+--------------------+----------------------------------+
|id |words               |features                          |
+---+--------------------+----------------------------------+
|0  |[PYTHON, HIVE, HIVE]|(262144,[129668,134160],[2.0,1.0])|
|1  |[JAVA, JAVA, SQL]   |(262144,[53343,167238],[2.0,1.0]) |
+---+--------------------+----------------------------------+



# TFIDF

https://spark.apache.org/docs/2.3.4/api/python/pyspark.ml.html?highlight=idf#pyspark.ml.feature.IDF

$t$ : term

$d$ : document

$D$ : corpus

$TF(t, d)$ - Term frequency : number of times that term $t$ appears in document $d$

$DF(t, D) : $ - document frequency : number of documents that contains term $t$

$IDF(t, D) = log \frac{|D| + 1}{DF(t, D) + 1}$ - numerical measure of how much information a term $t$ provides in corpus $D$

* a term appears in all documents, $IDF(t, D) = 0$

* smoothing term is applied to avoid dividibng by zero for terms outside the corpus(unseen testing set)

## Toy Sample I

https://runawayhorse001.github.io/LearningApacheSpark/pyspark.pdf

page 93

In [20]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import IDF

In [61]:
sentence = [
    (0, "Python python Spark Spark"),
    (1, "Python SQL")
]

col = ["doc_id","document"]

df = spark.createDataFrame(sentence, col)
df.printSchema()
df.show(truncate=False)

root
 |-- doc_id: long (nullable = true)
 |-- document: string (nullable = true)

+------+-------------------------+
|doc_id|document                 |
+------+-------------------------+
|0     |Python python Spark Spark|
|1     |Python SQL               |
+------+-------------------------+



Calculate the idf of each terms by hand

$t$ : (normalized) term : $\text{{python, spark, sql}}$

$d_{i}$ : document : $d_1, d_2$

$D$ : copus = ${d_1, d_2}$

----------------------------------------------------------------

$DF(\text{python}, D) = 2$

$DF(\text{spark}, D) = 1$

$DF(\text{sql}, D) = 1$

$DF(\text{python}, D) = log_{2} \frac{2+1}{2+1} = 0$ - `python` appears in every document

$DF(\text{spark}, D) = log_{2} \frac{2+1}{1+1} = log_{2} \frac{3}{2}$ - 0.405

$DF(\text{sql}, D) = log_{2} \frac{2+1}{1+1} = log_{2} \frac{3}{2}$ - 0.405


In [62]:
import numpy as np

np.log(3/2)

0.4054651081081644

In [63]:
# TODO

# It's wrong, we need idf only

tokenizer = Tokenizer(inputCol="document",
                      outputCol="term")

vectorizer = CountVectorizer(inputCol="term",
                             outputCol="raw_features")

idf = IDF(inputCol="raw_features",outputCol="features")

pipeline = Pipeline(stages=[tokenizer, vectorizer, idf])

model = pipeline.fit(df)

In [64]:
model.transform(df).show(vertical=True, truncate=False)

-RECORD 0------------------------------------------
 doc_id       | 0                                  
 document     | Python python Spark Spark          
 term         | [python, python, spark, spark]     
 raw_features | (3,[0,1],[2.0,2.0])                
 features     | (3,[0,1],[0.0,0.8109302162163288]) 
-RECORD 1------------------------------------------
 doc_id       | 1                                  
 document     | Python SQL                         
 term         | [python, sql]                      
 raw_features | (3,[0,2],[1.0,1.0])                
 features     | (3,[0,2],[0.0,0.4054651081081644]) 



### IDF Only

In [66]:
df.show(truncate=False)

+------+-------------------------+
|doc_id|document                 |
+------+-------------------------+
|0     |Python python Spark Spark|
|1     |Python SQL               |
+------+-------------------------+



In [75]:
tokenizer = Pipeline(stages=[tokenizer])

model = tokenizer.fit(df)

doc_term_sdf = (
    model.transform(df)
    .withColumn('term',F.explode_outer('term'))
)

print("doc_term dataframe : ")

doc_term_sdf.show(truncate=False)

def compute_idf(doc_term_sdf : DataFrame,
                doc_id_col : str,
                term_col : str,
                verbose : bool = True
               ):
    """
    Input : should be term level dataframe 
            with document content and doc_id
    
    +------+-------------------------+
    |doc_id|document                 |
    +------+-------------------------+
    |0     |Python python Spark Spark|
    |1     |Python SQL               |
    +------+-------------------------+
    
    Out : term level idf values
    
    +------+--------------------------+-----+------------------+
    |term  |document_frequency_by_term|n_doc|idf               |
    +------+--------------------------+-----+------------------+
    |sql   |1                         |2    |0.5849625007211562|
    |spark |1                         |2    |0.5849625007211562|
    |python|2                         |2    |0.0               |
    +------+--------------------------+-----+------------------+
    
    TODO add formula
    
    """
    
    n_document = doc_term_sdf.select(doc_id_col).distinct().count()
    
    if verbose:
        print('n document : ',n_document)
    
    idf : DataFrame = (
    doc_term_sdf
    .groupBy(term_col)
    # well, id means document_id
    .agg(F.countDistinct(doc_id_col).alias("document_frequency_by_term"))
    .withColumn('n_doc', F.lit(n_document).cast('integer'))
    .withColumn("idf",
                F.log2(
                    (C("n_doc") + F.lit(1)) / (C("document_frequency_by_term") + F.lit(1))
                )
               )
    )
    
    return idf
    
    
idf_sdf = compute_idf(doc_term_sdf, 'doc_id', 'term')
idf_sdf.show()

doc_term_idf_sdf = (
    doc_term_sdf
    .join(
        idf_sdf.drop('n_doc'),
    on='term',
    how='left')
)

doc_term_idf_sdf.show(vertical=True,truncate=False)

doc_term dataframe : 
+------+-------------------------+------+
|doc_id|document                 |term  |
+------+-------------------------+------+
|0     |Python python Spark Spark|python|
|0     |Python python Spark Spark|python|
|0     |Python python Spark Spark|spark |
|0     |Python python Spark Spark|spark |
|1     |Python SQL               |python|
|1     |Python SQL               |sql   |
+------+-------------------------+------+

n document :  2
+------+--------------------------+-----+------------------+
|  term|document_frequency_by_term|n_doc|               idf|
+------+--------------------------+-----+------------------+
|   sql|                         1|    2|0.5849625007211562|
| spark|                         1|    2|0.5849625007211562|
|python|                         2|    2|               0.0|
+------+--------------------------+-----+------------------+

-RECORD 0-----------------------------------------------
 term                       | sql                       

## Toy Sample III (PoI)

In [76]:
from pixlake.tables.warehouse.log.mainstream.meta import poi

In [77]:
poi_sdf = ( poi
            .load(daterange=[20210530, 20210530])
          )

In [89]:
(
    poi_sdf
    .where(C("category_id") == 0)
#     .orderBy(C("name"))
    .limit(5)
    .select(['name'])
).show(truncate=False)


sentence = [
    (0, "江蘇老趙刀切麵"),
    (1, "石研室 石頭火鍋"),
    (2, "石二鍋"),
    (3, "肉多多火鍋"),
    (4, "Vivienne Westwood Cafe")
]

col = ["doc_id","document"]

df = spark.createDataFrame(sentence, col)
df.printSchema()
df.show(truncate=False)


+----------------------+
|name                  |
+----------------------+
|江蘇老趙刀切麵               |
|石研室 石頭火鍋              |
|嘉味仙麻油雞腿庫飯             |
|Vivienne Westwood Cafe|
|五鮮級平價鍋物               |
+----------------------+

root
 |-- doc_id: long (nullable = true)
 |-- document: string (nullable = true)

+------+----------------------+
|doc_id|document              |
+------+----------------------+
|0     |江蘇老趙刀切麵               |
|1     |石研室 石頭火鍋              |
|2     |石二鍋                   |
|3     |肉多多火鍋                 |
|4     |Vivienne Westwood Cafe|
+------+----------------------+



In [90]:
tokenizer = Tokenizer(inputCol="document",
                      outputCol="term")

vectorizer = CountVectorizer(inputCol="term",
                             outputCol="raw_features")

idf = IDF(inputCol="raw_features",outputCol="features")

pipeline = Pipeline(stages=[tokenizer, vectorizer, idf])

model = pipeline.fit(df)

In [91]:
res = model.transform(df)

res.printSchema()

res.show(vertical=True, truncate=False)

root
 |-- doc_id: long (nullable = true)
 |-- document: string (nullable = true)
 |-- term: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- raw_features: vector (nullable = true)
 |-- features: vector (nullable = true)

-RECORD 0------------------------------------------------------------------------------
 doc_id       | 0                                                                      
 document     | 江蘇老趙刀切麵                                                                
 term         | [江蘇老趙刀切麵]                                                              
 raw_features | (8,[3],[1.0])                                                          
 features     | (8,[3],[1.0986122886681098])                                           
-RECORD 1------------------------------------------------------------------------------
 doc_id       | 1                                                                      
 document     | 石研室 石頭火鍋                       

## idf only

In [92]:
from pixlake.nlp.utils import unigram
df.show(truncate=False)

+------+----------------------+
|doc_id|document              |
+------+----------------------+
|0     |江蘇老趙刀切麵               |
|1     |石研室 石頭火鍋              |
|2     |石二鍋                   |
|3     |肉多多火鍋                 |
|4     |Vivienne Westwood Cafe|
+------+----------------------+



In [97]:
doc_term_sdf = (
    df
    .withColumn('term',F.udf(unigram,
                             returnType="array<string>")("document", F.lit(True))
               )
    .withColumn('term',F.explode_outer('term'))
)


doc_term_sdf.show(truncate=False)


    
idf_sdf = compute_idf(doc_term_sdf, 'doc_id', 'term')

doc_term_idf_sdf = (
    doc_term_sdf
    .join(
        idf_sdf.drop('n_doc'),
    on='term',
    how='left')
)

(
    doc_term_idf_sdf
    .orderBy("idf")
#     .orderBy("doc_id",'document')
).show(vertical=True,truncate=False)

+------+--------+----+
|doc_id|document|term|
+------+--------+----+
|0     |江蘇老趙刀切麵 |江   |
|0     |江蘇老趙刀切麵 |蘇   |
|0     |江蘇老趙刀切麵 |老   |
|0     |江蘇老趙刀切麵 |趙   |
|0     |江蘇老趙刀切麵 |刀   |
|0     |江蘇老趙刀切麵 |切   |
|0     |江蘇老趙刀切麵 |麵   |
|1     |石研室 石頭火鍋|石   |
|1     |石研室 石頭火鍋|研   |
|1     |石研室 石頭火鍋|室   |
|1     |石研室 石頭火鍋|石   |
|1     |石研室 石頭火鍋|頭   |
|1     |石研室 石頭火鍋|火   |
|1     |石研室 石頭火鍋|鍋   |
|2     |石二鍋     |石   |
|2     |石二鍋     |二   |
|2     |石二鍋     |鍋   |
|3     |肉多多火鍋   |肉   |
|3     |肉多多火鍋   |多   |
|3     |肉多多火鍋   |多   |
+------+--------+----+
only showing top 20 rows

n document :  5
-RECORD 0--------------------------------------------
 term                       | 鍋                      
 doc_id                     | 1                      
 document                   | 石研室 石頭火鍋               
 document_frequency_by_term | 3                      
 idf                        | 0.5849625007211562     
-RECORD 1--------------------------------------------
 term                       