In [1]:
# env : pixlake
# we focuing on pyspark dataframe processing
# documentation https://spark.apache.org/docs/2.4.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame
%load_ext autoreload
%autoreload 2

In [2]:
# make you auto compeletion faster
# https://stackoverflow.com/questions/40536560/ipython-and-jupyter-autocomplete-not-working
%config Completer.use_jedi = False

In [3]:
import os
import sys

def get_workstation_spark_path(where_are_you : str) -> str:
    if where_are_you == 'titan':
        return '/home/data/ryanchao2012/lib'
    elif where_are_you == 'thor':
        return '/opt/spark/versions'
    else:
        raise ValueError("wrong work station name")

spark_path = get_workstation_spark_path('thor')

print('You have pyspark version : ', os.listdir(spark_path))
os.environ['PYSPARK_PYTHON'] = sys.executable
# spark-2.3, spark-2.4
os.environ['SPARK_HOME'] = os.path.join(spark_path,'spark-2.3')

You have pyspark version :  ['spark-2.3', 'spark-3.0', 'spark-3.0.1-bin-hadoop2.7', 'spark-2.3.4-bin-hadoop2.7', 'spark-2.4.7-bin-hadoop2.7', 'spark-2.4']


In [4]:
os.environ['SPARK_HOME']

'/opt/spark/versions/spark-2.3'

In [5]:
from os.path import join
import pandas as pd
from pyspark.sql import SparkSession as Session
from pyspark.sql import DataFrame
from pyspark import SparkConf as Conf
from pyspark.sql import functions as F, Window as W, types as T
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
C = F.col

In [6]:
conf = (Conf()
    .set('spark.sql.sources.partitionOverwriteMode', 'dynamic')
    .set('spark.driver.memory', '4g')
    .set('spark.driver.maxResultSize', '1g')
   )

In [7]:
spark = (Session
     .builder
     .appName('pyspark-challenge-nlp')
     .master('local[2]')
     .config(conf=conf)
     .getOrCreate())

In [8]:
spark

# CounterVectorizer | Hashing TF

https://towardsdatascience.com/countvectorizer-hashingtf-e66f169e2d4e


In [44]:
data = [
    (0, "PYTHON HIVE HIVE".split(" ")),
    (1, "JAVA JAVA SQL".split(" ")),
]

cols = ["id","words"]

df = spark.createDataFrame(data, cols)
df.show(truncate=False)
df.printSchema()

+---+--------------------+
|id |words               |
+---+--------------------+
|0  |[PYTHON, HIVE, HIVE]|
|1  |[JAVA, JAVA, SQL]   |
+---+--------------------+

root
 |-- id: long (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [34]:
## CounterVectorlizerModel

from pyspark.ml.feature import CountVectorizer
# CountVectorizer??

# minTF=1.0,
# minDF=1.0,
# vocabSize=262144,
# binary=False,
# inputCol=None,
# outputCol=None,

In [41]:
cv = CountVectorizer(inputCol="words",outputCol="features")

model = cv.fit(df)

res = model.transform(df)

res.show(truncate=False)

print('you can check the vocabulary : ',sorted(model.vocabulary))
print('the order per row (follow the counts)')

for row in df.rdd.toLocalIterator():
    print(row.id, ':', set(row.words), sep=' ')
    
# Term    Freq Index
# HIVE     2     0
# JAVA     2     1
# PYTHON      1     2
# SQL   1     3

# (4,             [1, 2], [2.0, 1.0])
# (vector_legnth, vector index)

+---+--------------------+-------------------+
|id |words               |features           |
+---+--------------------+-------------------+
|0  |[PYTHON, HIVE, HIVE]|(4,[0,3],[2.0,1.0])|
|1  |[JAVA, JAVA, SQL]   |(4,[1,2],[2.0,1.0])|
+---+--------------------+-------------------+

you can check the vocabulary :  ['HIVE', 'JAVA', 'PYTHON', 'SQL']
the order per row (follow the counts)
0 : {'PYTHON', 'HIVE'}
1 : {'SQL', 'JAVA'}


In [43]:
# sorted(model.vocabulary)
# model.params

In [42]:
# dir(cv)
# dir(model)

In [46]:
from pyspark.ml.feature import HashingTF

In [48]:
ht = HashingTF(inputCol="words",outputCol="features")

ht_res = ht.transform(df)

ht_res.show(truncate=False)

+---+--------------------+----------------------------------+
|id |words               |features                          |
+---+--------------------+----------------------------------+
|0  |[PYTHON, HIVE, HIVE]|(262144,[129668,134160],[2.0,1.0])|
|1  |[JAVA, JAVA, SQL]   |(262144,[53343,167238],[2.0,1.0]) |
+---+--------------------+----------------------------------+

