# Text processing

## Initialize Spark

In [1]:
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Use the getOrCreate entry points so this cell can be re-executed safely:
# constructing SparkContext directly raises when a context is already active
# in the current process.
sc = SparkContext.getOrCreate(conf=SparkConf())
spark = SparkSession.builder.getOrCreate()

## Dummy data

In [57]:
import pandas as pd

# Two short sentences used as toy input for the rest of the notebook.
sample_texts = [
    "I like playing?and drinking",
    "I like coding.",
]
pdf = pd.DataFrame({'texts': sample_texts})

# Convert the pandas frame to a Spark DataFrame and preview it.
df = spark.createDataFrame(pdf)
df.show()

+--------------------+
|               texts|
+--------------------+
|I like playing?an...|
|      I like coding.|
+--------------------+



## Clean text

In [58]:
import re
import unidecode
from pyspark.sql.functions import udf
from pyspark.sql.types import *

def clean(text):
    """Normalize a raw text string before tokenization.

    Steps, in order: transliterate accented characters with ``unidecode``,
    lower-case, drop literal backslash-n sequences, drop digits, join
    hyphen-divided words, and collapse every run of non-word characters
    into a single space.

    Args:
        text: The raw string, or None (how a NULL column value reaches a
            Python UDF).

    Returns:
        The normalized string without leading or trailing whitespace, or
        None when ``text`` is None.
    """
    if text is None:
        # Propagate NULL instead of failing inside unidecode.
        return None
    text = unidecode.unidecode(text)  # remove accents
    text = text.lower()
    # Raw string: this removes the two-character sequence backslash + 'n'.
    # Real newline characters are handled by the \W+ substitution below.
    text = text.replace(r'\n', '')
    text = re.sub(r'\d+', '', text)  # remove digits
    text = re.sub(r'[.]?-[.]?', '', text)  # concatenate divided words
    # One space per run of non-word characters; this already guarantees that
    # no two spaces are adjacent, so no separate space-squeezing pass is needed.
    text = re.sub(r'\W+', ' ', text)
    # Strip the edges: a leading space would otherwise become an empty first
    # token when the result is split on whitespace.
    return text.strip()

# Wrap the Python cleaner as a Spark UDF that produces a string column.
user_def_fun = udf(clean, StringType())

# Add the normalized text next to the raw column and preview the result.
df = df.withColumn("cleaned", user_def_fun(df["texts"]))
df.show()

+--------------------+--------------------+
|               texts|             cleaned|
+--------------------+--------------------+
|I like playing?an...|i like playing an...|
|      I like coding.|      i like coding |
+--------------------+--------------------+



## Tokenization

In [59]:
from pyspark.ml.feature import Tokenizer

# Tokenizer lower-cases the input string and then splits it on whitespace.
tokenizer = Tokenizer().setInputCol("cleaned").setOutputCol("tokens")
df = tokenizer.transform(df)
df.show(n=2, truncate=False)

+---------------------------+---------------------------+---------------------------------+
|texts                      |cleaned                    |tokens                           |
+---------------------------+---------------------------+---------------------------------+
|I like playing?and drinking|i like playing and drinking|[i, like, playing, and, drinking]|
|I like coding.             |i like coding              |[i, like, coding]                |
+---------------------------+---------------------------+---------------------------------+



## Stopwords Removal

In [60]:
from pyspark.ml.feature import StopWordsRemover

# No explicit stop-word list is passed, so the remover's default list applies.
stopword_removal = (
    StopWordsRemover()
    .setInputCol('tokens')
    .setOutputCol('refined_tokens')
)
df = stopword_removal.transform(df)

df.show(n=2, truncate=False)

+---------------------------+---------------------------+---------------------------------+-------------------------+
|texts                      |cleaned                    |tokens                           |refined_tokens           |
+---------------------------+---------------------------+---------------------------------+-------------------------+
|I like playing?and drinking|i like playing and drinking|[i, like, playing, and, drinking]|[like, playing, drinking]|
|I like coding.             |i like coding              |[i, like, coding]                |[like, coding]           |
+---------------------------+---------------------------+---------------------------------+-------------------------+



## Stemming vs lemmatization

In [61]:
%%time
from nltk.stem.snowball import SnowballStemmer

def list_stemmer(words):
    """Stem every token in a list with NLTK's English Snowball stemmer.

    Args:
        words: List of token strings, or None (how a NULL array column
            reaches a Python UDF).

    Returns:
        A new list holding the stem of each token in the same order, or
        None when ``words`` is None.
    """
    if words is None:
        # Pass NULL through rather than raising TypeError while iterating.
        return None
    stemmer = SnowballStemmer(language='english')
    return [stemmer.stem(word) for word in words]

# Register the stemmer as a UDF that returns an array of strings.
stemming = udf(list_stemmer, ArrayType(StringType()))

# Stem the stop-word-free tokens and compare input and output side by side.
df = df.withColumn("stem", stemming(df["refined_tokens"]))
df.select('refined_tokens', 'stem').show(n=2, truncate=False)

+--------------------+-------------------+
|      refined_tokens|               stem|
+--------------------+-------------------+
|[like, playing, d...|[like, play, drink]|
|      [like, coding]|       [like, code]|
+--------------------+-------------------+

CPU times: user 9.56 ms, sys: 3.8 ms, total: 13.4 ms
Wall time: 3.67 s


In [None]:
import nltk
# Fetch the 'wordnet' resource through NLTK's downloader.
# NOTE(review): presumably required by the WordNetLemmatizer used in the next cell — confirm.
nltk.download('wordnet')

In [62]:
%%time
from nltk.stem import WordNetLemmatizer

def list_lemmatizer(words):
    """Lemmatize every token in a list with NLTK's WordNetLemmatizer.

    ``lemmatize`` is called without a part-of-speech argument, so the
    lemmatizer's default part of speech is used for every token.

    Args:
        words: List of token strings, or None (how a NULL array column
            reaches a Python UDF).

    Returns:
        A new list holding the lemma of each token in the same order, or
        None when ``words`` is None.
    """
    if words is None:
        # Pass NULL through rather than raising TypeError while iterating.
        return None
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in words]

# Register the lemmatizer as a UDF that returns an array of strings.
lemmatization = udf(list_lemmatizer, ArrayType(StringType()))

# Lemmatize the stop-word-free tokens and compare input and output side by side.
df = df.withColumn("lem", lemmatization(df["refined_tokens"]))
df.select('refined_tokens', 'lem').show(n=2, truncate=False)

+--------------------+--------------------+
|      refined_tokens|                 lem|
+--------------------+--------------------+
|[like, playing, d...|[like, playing, d...|
|      [like, coding]|      [like, coding]|
+--------------------+--------------------+

CPU times: user 14.7 ms, sys: 0 ns, total: 14.7 ms
Wall time: 5.2 s


## TF-IDF

In [63]:
from pyspark.ml.feature import HashingTF, IDF

# Hash each stemmed token into one of 2**18 (= 262144) feature slots.
hashing_vec = HashingTF(inputCol='stem',
                        outputCol='tf_features',
                        numFeatures=2 ** 18)

hashing_df = hashing_vec.transform(df)
hashing_df.select('stem', 'tf_features').show(n=2, truncate=False)

+-------------------+--------------------------------------------+
|stem               |tf_features                                 |
+-------------------+--------------------------------------------+
|[like, play, drink]|(262144,[33140,123981,208258],[1.0,1.0,1.0])|
|[like, code]       |(262144,[93284,208258],[1.0,1.0])           |
+-------------------+--------------------------------------------+



In [64]:
tf_idf_vec = IDF(inputCol='tf_features', outputCol='tf_idf_features')

# Fit the IDF weights on the hashed term frequencies, then rescale those same vectors.
tf_idf_model = tf_idf_vec.fit(hashing_df)
tf_idf_df = tf_idf_model.transform(hashing_df)
tf_idf_df.select('stem', 'tf_idf_features').show(n=2, truncate=False)

+-------------------+--------------------------------------------------------------------------+
|stem               |tf_idf_features                                                           |
+-------------------+--------------------------------------------------------------------------+
|[like, play, drink]|(262144,[33140,123981,208258],[0.4054651081081644,0.4054651081081644,0.0])|
|[like, code]       |(262144,[93284,208258],[0.4054651081081644,0.0])                          |
+-------------------+--------------------------------------------------------------------------+

