# Text processing

## Init spark

In [1]:
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

sc = SparkContext(conf=SparkConf())
spark = SparkSession(sparkContext=sc)

## Dummy data

In [None]:
import pandas as pd

pdf = pd.DataFrame({
        'texts': ["I like playing?",
                  "I like coding."]
    })
    
df = spark.createDataFrame(pdf)
df.show()

## Clean text

In [20]:
import re
import unidecode
from pyspark.sql.functions import udf
from pyspark.sql.types import *

def clean(text):
    text= unidecode.unidecode(text) # remove accents
    text = text.lower()
    text = text.replace(r'\n','') # remove newline sign
    text = re.sub(r'\d+', '', text) # remove digits 
    text = re.sub(r'[.]?-[.]?', '', text) # concatenate divided words
    text = re.sub(r'[\W]+',' ', text) # replace non-alphanum with space  
    text = re.sub(' +', ' ', text) # replace multiple spaces with single space 
    return text

user_def_fun = udf(f=clean, returnType=ArrayType(StringType()))

df = df.withColumn("cleaned", user_def_fun("texts"))
df.show()

+---------------+-------+------------------+---------------+------------+---------------+
|          texts|cleaned|            tokens| refined_tokens|        stem|            lem|
+---------------+-------+------------------+---------------+------------+---------------+
|I like playing?|   null|[i, like, playing]|[like, playing]|[like, play]|[like, playing]|
| I like coding.|   null| [i, like, coding]| [like, coding]|[like, code]| [like, coding]|
+---------------+-------+------------------+---------------+------------+---------------+



## Tokenization

In [None]:
from pyspark.ml.feature import Tokenizer

# A tokenizer that converts the input string to lowercase and then
# splits it by white spaces.
tokenizer = Tokenizer(inputCol="cleaned", outputCol="tokens")
df = tokenizer.transform(df)
df.show()

## Stopwords Removal

In [None]:
from pyspark.ml.feature import StopWordsRemover

stopword_removal = StopWordsRemover(inputCol='tokens', 
                                    outputCol='refined_tokens')
df = stopword_removal.transform(df)

df.show()

## Stemming vs lemmatization

In [None]:
import nltk
nltk.download('wordnet')

In [23]:
%%time
from nltk.stem.snowball import SnowballStemmer

def list_stemmer(words):
    stemmer = SnowballStemmer(language='english')
    return [stemmer.stem(word) for word in words]

stemming = udf(list_stemmer, returnType=ArrayType(StringType()))

df = df.withColumn("stem", stemming("refined_tokens"))
df.select(['refined_tokens','stem']).show()

+---------------+------------+
| refined_tokens|        stem|
+---------------+------------+
|[like, playing]|[like, play]|
| [like, coding]|[like, code]|
+---------------+------------+

CPU times: user 8.91 ms, sys: 5.54 ms, total: 14.4 ms
Wall time: 2.12 s


In [25]:
%%time
from nltk.stem import WordNetLemmatizer

def list_lemmatizer(words):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in words]

lemmatization = udf(list_lemmatizer, returnType=ArrayType(StringType()))

df = df.withColumn("lem", lemmatization("refined_tokens"))
df.select(['refined_tokens','lem']).show()

+---------------+---------------+
| refined_tokens|            lem|
+---------------+---------------+
|[like, playing]|[like, playing]|
| [like, coding]| [like, coding]|
+---------------+---------------+

CPU times: user 14.5 ms, sys: 844 µs, total: 15.3 ms
Wall time: 4.83 s


# TF-IDF

In [36]:
from pyspark.ml.feature import HashingTF, IDF

hashing_vec = HashingTF(numFeatures=262144,
                        inputCol='stem',
                        outputCol='tf_features')

hashing_df = hashing_vec.transform(df)
hashing_df.select(['stem','tf_features']).show(4,False)

+------------+----------------------------------+
|stem        |tf_features                       |
+------------+----------------------------------+
|[like, play]|(262144,[123981,208258],[1.0,1.0])|
|[like, code]|(262144,[93284,208258],[1.0,1.0]) |
+------------+----------------------------------+



In [38]:
tf_idf_vec = IDF(inputCol='tf_features',
               outputCol='tf_idf_features')

tf_idf_df = tf_idf_vec.fit(hashing_df).transform(hashing_df)
tf_idf_df.select(['stem','tf_idf_features']).show(4,False)

+------------+-------------------------------------------------+
|stem        |tf_idf_features                                  |
+------------+-------------------------------------------------+
|[like, play]|(262144,[123981,208258],[0.4054651081081644,0.0])|
|[like, code]|(262144,[93284,208258],[0.4054651081081644,0.0]) |
+------------+-------------------------------------------------+



In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["like playing",
          "like coding"]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

print(vectorizer.get_feature_names())

['coding', 'like', 'playing']


In [40]:
X.toarray()

array([[0.        , 0.57973867, 0.81480247],
       [0.81480247, 0.57973867, 0.        ]])

In [41]:
print(X)

  (0, 2)	0.8148024746671689
  (0, 1)	0.5797386715376657
  (1, 0)	0.8148024746671689
  (1, 1)	0.5797386715376657
