## PySpark - Natural Language Processing

In [None]:
!pip install pyspark

In [1]:
from pyspark.sql import SparkSession

# Create the Spark session (or reuse an already running one) under the
# application name "NLP".
spark = (
    SparkSession.builder
    .appName("NLP")
    .getOrCreate()
)

## Tokenizer

_**Documentación Tokenizer:** https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.Tokenizer.html_

_**Documentación RegexTokenizer:** https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.ml.feature.RegexTokenizer.html_

In [2]:
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import RegexTokenizer

from pyspark.sql.functions import col
from pyspark.sql.functions import size
from pyspark.sql.functions import udf

from pyspark.sql.types import IntegerType

In [13]:
# Toy corpus of (id, sentence) pairs: two Spanish sentences, one
# comma-separated list of terms and one English sentence.
corpus_rows = [
    (0, "Hola mundo me gusta python y pyspark"),
    (1, "PySpark no me convence mucho"),
    (2, "Logistic,regression,models,decision,trees,gradient,boosting,classifier"),
    (3, "Hello world i like python and pyspark"),
]

data = spark.createDataFrame(data=corpus_rows, schema=["id", "sentence"])

In [14]:
# Print the corpus without cutting long sentences (20 rows is show()'s default).
data.show(n=20, truncate=False)

+---+----------------------------------------------------------------------+
|id |sentence                                                              |
+---+----------------------------------------------------------------------+
|0  |Hola mundo me gusta python y pyspark                                  |
|1  |PySpark no me convence mucho                                          |
|2  |Logistic,regression,models,decision,trees,gradient,boosting,classifier|
|3  |Hello world i like python and pyspark                                 |
+---+----------------------------------------------------------------------+



In [15]:
# Whitespace tokenizer: lower-cases each sentence and splits it on whitespace,
# so comma-separated text stays a single token.
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

# Regex tokenizer: splits on every non-word character. The embedded (?U) flag
# enables Java's UNICODE_CHARACTER_CLASS, so accented letters (á, é, ñ, ...)
# count as word characters instead of acting as delimiters that would cut
# Spanish words in half.
regex_tokenizer = RegexTokenizer(
    inputCol="sentence",
    outputCol="words",
    pattern="(?U)\\W",
)


def count_token(words):
    """Return a Column holding the number of elements of the array column ``words``.

    Uses Spark's built-in ``size`` instead of a Python UDF wrapping ``len``, so
    the count is evaluated by Spark itself rather than by shipping every row
    to a Python worker.

    Parameters
    ----------
    words : Column or str
        Array column (or its name) whose elements are counted.
    """
    return size(words)

In [16]:
# Apply the whitespace tokenizer to the corpus.
tokenized = tokenizer.transform(data)

# Show each sentence next to its tokens and the token count, as a pandas frame.
(
    tokenized
    .select("sentence", "words")
    .withColumn("tokens", count_token(col("words")))
    .toPandas()
)

Unnamed: 0,sentence,words,tokens
0,Hola mundo me gusta python y pyspark,"[hola, mundo, me, gusta, python, y, pyspark]",7
1,PySpark no me convence mucho,"[pyspark, no, me, convence, mucho]",5
2,"Logistic,regression,models,decision,trees,grad...","[logistic,regression,models,decision,trees,gra...",1
3,Hello world i like python and pyspark,"[hello, world, i, like, python, and, pyspark]",7


In [17]:
# Apply the regex tokenizer to the same corpus.
regex_tokenized = regex_tokenizer.transform(data)

# Same report as for the whitespace tokenizer, printed by Spark untruncated.
(
    regex_tokenized
    .select("sentence", "words")
    .withColumn("tokens", count_token(col("words")))
    .show(truncate=False)
)

+----------------------------------------------------------------------+-------------------------------------------------------------------------------+------+
|sentence                                                              |words                                                                          |tokens|
+----------------------------------------------------------------------+-------------------------------------------------------------------------------+------+
|Hola mundo me gusta python y pyspark                                  |[hola, mundo, me, gusta, python, y, pyspark]                                   |7     |
|PySpark no me convence mucho                                          |[pyspark, no, me, convence, mucho]                                             |5     |
|Logistic,regression,models,decision,trees,gradient,boosting,classifier|[logistic, regression, models, decision, trees, gradient, boosting, classifier]|8     |
|Hello world i like python and pyspark  

### StopWords
_**Documentación:** https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.StopWordsRemover.html_

In [19]:
from pyspark.ml.feature import StopWordsRemover

# Custom stop-word list: only "y" and "me" are filtered out.
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
    stopWords=["y", "me"],
)

# Compare the token lists before and after stop-word removal.
without_stopwords = remover.transform(regex_tokenized)
without_stopwords.select("words", "filtered").show(truncate=False)

+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+
|words                                                                          |filtered                                                                       |
+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+
|[hola, mundo, me, gusta, python, y, pyspark]                                   |[hola, mundo, gusta, python, pyspark]                                          |
|[pyspark, no, me, convence, mucho]                                             |[pyspark, no, convence, mucho]                                                 |
|[logistic, regression, models, decision, trees, gradient, boosting, classifier]|[logistic, regression, models, decision, trees, gradient, boosting, classifier]|
|[hello, world, i, like, pyt

## n-grams
_**Documentación:** https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.ml.feature.NGram.html_

In [22]:
from pyspark.ml.feature import NGram

# Build trigrams (n = 3) from the regex-tokenized words.
n_gram = NGram(n=3, inputCol="words", outputCol="n_gram")

n_gram_data = n_gram.transform(regex_tokenized)

# Each row lists the space-joined runs of three consecutive tokens.
(
    n_gram_data
    .select("n_gram")
    .show(truncate=False)
)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------+
|n_gram                                                                                                                                                         |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[hola mundo me, mundo me gusta, me gusta python, gusta python y, python y pyspark]                                                                             |
|[pyspark no me, no me convence, me convence mucho]                                                                                                             |
|[logistic regression models, regression models decision, models decision trees, decision trees gradient, trees gradient boosting, gradient boosting classifier]|
|[hello world i, world i lik

### TF-IDF
_**Documentación HashingTF:** https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.HashingTF.html_

_**Documentación IDF:** https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.IDF.html_

In [23]:
from pyspark.ml.feature import HashingTF
from pyspark.ml.feature import IDF

# Tokenize the corpus again as the input of the TF-IDF pipeline.
# NOTE(review): the whitespace Tokenizer keeps the comma-separated sentence
# (id 2) as one single token; confirm whether RegexTokenizer was intended here.
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)

words_data = tokenizer.transform(data)
words_data.show(truncate=False)

+---+----------------------------------------------------------------------+------------------------------------------------------------------------+
|id |sentence                                                              |words                                                                   |
+---+----------------------------------------------------------------------+------------------------------------------------------------------------+
|0  |Hola mundo me gusta python y pyspark                                  |[hola, mundo, me, gusta, python, y, pyspark]                            |
|1  |PySpark no me convence mucho                                          |[pyspark, no, me, convence, mucho]                                      |
|2  |Logistic,regression,models,decision,trees,gradient,boosting,classifier|[logistic,regression,models,decision,trees,gradient,boosting,classifier]|
|3  |Hello world i like python and pyspark                                 |[hello, world, i, like, 

In [24]:
# Term frequencies via the hashing trick, using a 20-dimensional vector.
# NOTE(review): Spark's docs advise a power of two for numFeatures so that
# tokens map evenly onto the vector indices; 20 is kept as in the original.
hashingTF = HashingTF(inputCol="words", outputCol="raw_features", numFeatures=20)

featurized_data = hashingTF.transform(words_data)

# Show the tokens next to their sparse term-frequency vectors.
(
    featurized_data
    .select("words", "raw_features")
    .show(truncate=False)
)

+------------------------------------------------------------------------+-----------------------------------------------+
|words                                                                   |raw_features                                   |
+------------------------------------------------------------------------+-----------------------------------------------+
|[hola, mundo, me, gusta, python, y, pyspark]                            |(20,[1,8,9,19],[1.0,1.0,2.0,3.0])              |
|[pyspark, no, me, convence, mucho]                                      |(20,[1,4,9,16],[1.0,2.0,1.0,1.0])              |
|[logistic,regression,models,decision,trees,gradient,boosting,classifier]|(20,[17],[1.0])                                |
|[hello, world, i, like, python, and, pyspark]                           |(20,[0,1,9,10,11,16],[1.0,1.0,2.0,1.0,1.0,1.0])|
+------------------------------------------------------------------------+-----------------------------------------------+



In [25]:
# Rescale the raw term frequencies with inverse document frequency weights.
idf = IDF(inputCol="raw_features", outputCol="features")

# Fit the IDF weights on the corpus, then apply them to that same corpus.
idf_model = idf.fit(featurized_data)
rescaled_data = idf_model.transform(featurized_data)

(
    rescaled_data
    .select("raw_features", "features")
    .show(truncate=False)
)

+-----------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
|raw_features                                   |features                                                                                                                                   |
+-----------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
|(20,[1,8,9,19],[1.0,1.0,2.0,3.0])              |(20,[1,8,9,19],[0.22314355131420976,0.9162907318741551,0.44628710262841953,2.7488721956224653])                                            |
|(20,[1,4,9,16],[1.0,2.0,1.0,1.0])              |(20,[1,4,9,16],[0.22314355131420976,1.8325814637483102,0.22314355131420976,0.5108256237659907])                                            |
|(20,[17],[1.0])                                |(

## CountVectorizer

_**Documentación:** https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.CountVectorizer.html_

In [26]:
from pyspark.ml.feature import CountVectorizer

# Two tiny pre-tokenized documents built from the terms a, b and c.
df = spark.createDataFrame(
    data=[
        (0, ["a", "b", "c"]),
        (1, ["a", "b", "b", "c", "a"]),
    ],
    schema=["id", "words"],
)

# Vocabulary capped at 3 terms, with a minimum document frequency of 2.
cv = CountVectorizer(
    inputCol="words",
    outputCol="features",
    vocabSize=3,
    minDF=2.0,
)

# Learn the vocabulary, then encode every document as a term-count vector.
model = cv.fit(df)
result = model.transform(df)

result.show(truncate=False)

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+



In [None]:
################################################################################################################################