In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session with master 'local' and app name 'nlp-tools'.
spark = (
    SparkSession.builder
    .master('local')
    .appName('nlp-tools')
    .getOrCreate()
)

In [2]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [None]:
# Sample sentences keyed by id: two are space-separated, one is comma-separated.
df = spark.createDataFrame(
    data=[
        (0, 'Hi I heard about Spark'),
        (1, 'I wish Java could use case classes'),
        (2, 'Logistic,regression,models,are,neat'),
    ],
    schema=['id', 'sentence'],
)
df.show()

In [None]:
?Tokenizer

In [5]:
# Plain tokenizer: reads the 'sentence' column and writes its output to 'words'.
tokenizer = Tokenizer(inputCol='sentence', outputCol='words')

In [None]:
?RegexTokenizer

In [7]:
# Regex-driven tokenizer over the same input/output columns, configured with
# the non-word-character pattern.
regex_tokenizer = RegexTokenizer(inputCol='sentence', outputCol='words', pattern='\\W')

In [None]:
?udf

In [9]:
# UDF returning the number of elements in an array column as an integer.
# A null array yields null instead of raising TypeError from len(None)
# inside the Python worker.
count_tokens = udf(
    lambda words: len(words) if words is not None else None,
    IntegerType()
)

In [10]:
# Run the plain tokenizer over the sample sentences and print the untruncated rows.
df_tokenized = tokenizer.transform(dataset=df)
df_tokenized.show(truncate=False)

In [None]:
# Append a 'tokens' column holding the length of each row's 'words' array.
df_tokenized_1 = df_tokenized.withColumn(
    'tokens',
    count_tokens(col('words'))
)
df_tokenized_1.show(truncate=False)

In [None]:
# Same sentences, this time split with the regex tokenizer.
df_tokenized_regex = regex_tokenizer.transform(dataset=df)
df_tokenized_regex.show(truncate=False)

In [None]:
# Token counts for the regex-tokenized rows, for comparison with df_tokenized_1.
df_tokenized_regex_1 = df_tokenized_regex.withColumn(
    'tokens',
    count_tokens(col('words'))
)
df_tokenized_regex_1.show(truncate=False)

In [None]:
from pyspark.ml.feature import StopWordsRemover

In [None]:
# Pre-tokenized sentences (column 'raw') used as input for stop-word removal.
df_1 = spark.createDataFrame(
    data=[
        (0, ['I', 'saw', 'the', 'red', 'balloon']),
        (1, ['Mary', 'had', 'a', 'little', 'lamb']),
    ],
    schema=['id', 'raw'],
)

In [None]:
?StopWordsRemover

In [None]:
# Stop-word filter: reads token arrays from 'raw' and writes the result to 'filtered'.
stop_word_remover = StopWordsRemover(inputCol='raw', outputCol='filtered')

In [None]:
# Apply the stop-word remover configured above and print the untruncated rows.
# (The original referenced an undefined name `remover`, which raises NameError
# on a fresh kernel; the transformer is bound as `stop_word_remover`.)
stop_word_remover.transform(df_1).show(truncate=False)

In [None]:
from pyspark.ml.feature import NGram

In [None]:
# Pre-tokenized sentences (column 'words') used as input for n-gram generation.
df_2 = spark.createDataFrame(
    data=[
        (0, ['Hi', 'I', 'heard', 'about', 'Spark']),
        (1, ['I', 'wish', 'Java', 'could', 'use', 'case', 'classes']),
        (2, ['Logistic', 'regression', 'models', 'are', 'neat']),
    ],
    schema=['id', 'words'],
)

In [None]:
?NGram

In [None]:
# NGram transformer with n=2: reads 'words' and writes to 'ngrams'.
ngram = NGram(n=2, inputCol='words', outputCol='ngrams')

In [None]:
# Generate n-grams for the token DataFrame built above.
# (The original passed an undefined name `wordDataFrame`, which raises NameError
# on a fresh kernel; the token DataFrame is bound as `df_2`.)
df_ngram = ngram.transform(df_2)

In [None]:
# Display only the generated 'ngrams' column, without truncating cell contents.
df_ngram.select('ngrams').show(truncate=False)

In [None]:
from pyspark.ml.feature import (
    HashingTF,
    IDF,
    Tokenizer
)

In [None]:
# Labelled sentences (float label + raw text) for the TF-IDF example.
df_sentence = spark.createDataFrame(
    data=[
        (0.0, 'Hi I heard about Spark'),
        (0.0, 'I wish Java could use case classes'),
        (1.0, 'Logistic regression models are neat'),
    ],
    schema=['label', 'sentence'],
)
df_sentence.show()

In [None]:
?HashingTF

In [None]:
?IDF

In [None]:
# Tokenize the labelled sentences into a 'words' column for the TF step.
# Note: this rebinds `tokenizer` with the same column configuration as before.
tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
df_words = tokenizer.transform(dataset=df_sentence)
df_words.show()

In [None]:
# HashingTF with 20 features: reads 'words' and writes to 'rawFeatures'.
tf = HashingTF(inputCol='words', outputCol='rawFeatures', numFeatures=20)

In [None]:
# Compute term-frequency vectors for the tokenized sentences.
# (The original passed an undefined name `wordsData`, which raises NameError
# on a fresh kernel; the tokenized DataFrame is bound as `df_words`.)
df_tf = tf.transform(df_words)

In [None]:
# IDF estimator: reads 'rawFeatures' and writes the rescaled vectors to 'features'.
idf = IDF(inputCol='rawFeatures', outputCol='features')

In [None]:
# Fit the IDF estimator on the term-frequency vectors, then apply the fitted
# model to that same DataFrame.
idf_model = idf.fit(dataset=df_tf)
df_tf_idf = idf_model.transform(dataset=df_tf)

In [None]:
# Show each label next to its TF-IDF feature vector.
# (The original referenced an undefined name `rescaledData`, which raises
# NameError on a fresh kernel; the IDF output is bound as `df_tf_idf`.)
df_tf_idf.select('label', 'features').show()

In [None]:
from pyspark.ml.feature import CountVectorizer

In [None]:
# Two tiny token lists (built by splitting on a single space) for the
# CountVectorizer example.
df_3 = spark.createDataFrame(
    data=[
        (0, 'a b c'.split(' ')),
        (1, 'a b b c a'.split(' ')),
    ],
    schema=['id', 'words'],
)

In [None]:
?CountVectorizer

In [None]:
# CountVectorizer estimator: reads 'words', writes to 'features', configured
# with vocabSize=3 and minDF=2.0.
cv = CountVectorizer(inputCol='words', outputCol='features', vocabSize=3, minDF=2.0)

In [None]:
# Fit the CountVectorizer on the token lists to obtain a fitted model.
cv_model = cv.fit(dataset=df_3)

In [None]:
# Apply the fitted CountVectorizer model to the same token lists.
cv_res = cv_model.transform(dataset=df_3)

In [None]:
# Print the CountVectorizer output without truncating cell contents.
cv_res.show(truncate=False)