# Text processing

## Initialize Spark

In [1]:
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# getOrCreate() reuses an already-running context/session, so re-running this
# cell no longer fails with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=SparkConf())
spark = SparkSession.builder.getOrCreate()

## Dummy data

In [None]:
import pandas as pd

# Two one-sentence documents used as the running example.
pdf = pd.DataFrame({'texts': ["I like playing?", "I like coding."]})

# Convert the pandas frame into a Spark DataFrame and print it.
df = spark.createDataFrame(pdf)
df.show()

## Clean text

In [None]:
import re
import unidecode
from pyspark.sql.functions import udf

def clean(text):
    """Normalize one raw text value into lowercase, space-separated words.

    Steps, in order: transliterate to ASCII, lowercase, drop escaped-newline
    sequences (a backslash followed by "n"), drop digits, drop hyphens
    together with an adjacent dot on either side, replace every run of
    non-word characters with one space, and trim both ends.

    Args:
        text: The string to clean, or None (Spark hands null cells to a
            Python UDF as None).

    Returns:
        The cleaned string, or None when `text` is None.
    """
    if text is None:
        # Pass nulls through instead of raising inside the UDF.
        return None
    text = unidecode.unidecode(text)  # transliterate accents to ASCII
    text = text.lower()
    # r'\n' is the two-character sequence backslash + "n" (an escaped newline
    # left in the data), not a real line break; real line breaks are turned
    # into a space by the \W substitution below.
    text = text.replace(r'\n', '')
    text = re.sub(r'\d+', '', text)  # remove digits
    text = re.sub(r'[.]?-[.]?', '', text)  # join hyphen-divided words
    # Each maximal run of non-word characters (spaces included) becomes one
    # space, so no repeated spaces can remain after this step.
    text = re.sub(r'\W+', ' ', text)
    # Trim the ends: a leading space would otherwise produce an empty first
    # token when the text is later split on whitespace.
    return text.strip()

# Expose `clean` to Spark as a user-defined function.
user_def_fun = udf(clean)

# Store the normalized text in a "cleaned" column next to the raw "texts".
cleaned_col = user_def_fun("texts")
df = df.withColumn("cleaned", cleaned_col)
df.show()

## Tokenization

In [None]:
from pyspark.ml.feature import Tokenizer

# Tokenizer lowercases the input string and splits it on whitespace.
tokenizer = Tokenizer(inputCol="cleaned", outputCol="tokens")
# transform() refuses to run when its output column already exists, so drop
# any "tokens" column left by an earlier run of this cell; drop() is a no-op
# when the column is absent.
df = tokenizer.transform(df.drop("tokens"))
df.show()

## Stopwords Removal

In [None]:
from pyspark.ml.feature import StopWordsRemover

# Filter stop words out of "tokens" into a new "refined_tokens" column.
stopword_removal = StopWordsRemover(inputCol='tokens',
                                    outputCol='refined_tokens')
# transform() refuses to run when its output column already exists, so drop
# any "refined_tokens" column left by an earlier run of this cell; drop() is
# a no-op when the column is absent.
df = stopword_removal.transform(df.drop('refined_tokens'))

df.show()

## Stemming vs lemmatization

In [None]:
import nltk
# Download the WordNet corpus via NLTK's downloader; the WordNetLemmatizer
# used later in this notebook reads it.
nltk.download('wordnet')

In [12]:
%%time
from nltk.stem.snowball import SnowballStemmer

def list_stemmer(words):
    """Return the English Snowball stem of every token in `words`, in order."""
    stem = SnowballStemmer(language='english').stem
    return list(map(stem, words))

# Declare the return type explicitly: udf() defaults to StringType, which
# would store the stemmed list as one flat string rather than an array of
# tokens that later stages can consume.
stemming = udf(list_stemmer, returnType='array<string>')

df = df.withColumn("stem", stemming("refined_tokens"))
df.select(['refined_tokens','stem']).show()

+---------------+------------+
| refined_tokens|        stem|
+---------------+------------+
|[like, playing]|[like, play]|
| [like, coding]|[like, code]|
+---------------+------------+

CPU times: user 15 ms, sys: 2.3 ms, total: 17.3 ms
Wall time: 1.71 s


In [13]:
%%time
from nltk.stem import WordNetLemmatizer

def list_lemmatizer(words):
    """Return the WordNet lemma of every token in `words`, in order."""
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in words:
        lemmas.append(wordnet.lemmatize(token))
    return lemmas

# Declare the return type explicitly: udf() defaults to StringType, which
# would store the lemmatized list as one flat string rather than an array of
# tokens that later stages can consume.
lemmatization = udf(list_lemmatizer, returnType='array<string>')

df = df.withColumn("lem", lemmatization("refined_tokens"))
df.select(['refined_tokens','lem']).show()

+---------------+---------------+
| refined_tokens|            lem|
+---------------+---------------+
|[like, playing]|[like, playing]|
| [like, coding]| [like, coding]|
+---------------+---------------+

CPU times: user 10 ms, sys: 4.54 ms, total: 14.6 ms
Wall time: 3.33 s


## TF-IDF

In [None]:
# NOTE(review): select() is called with no columns and its result is not
# assigned; this looks like a leftover scratch cell — confirm it can be removed.
df.select()

In [15]:
from pyspark.ml.feature import HashingTF, IDF

# Hash every token list into a term-frequency vector of size 4.
hashing_vec = HashingTF(
    inputCol='refined_tokens',
    outputCol='tf_features',
    numFeatures=4,
)

hashing_df = hashing_vec.transform(df)
# hashing_df.select(['refined_tokens','tf_features']).show(2,False)

In [121]:
# Fit IDF on the hashed term frequencies, then apply the fitted model to the
# same frame.
tf_idf_vec = IDF(inputCol='tf_features', outputCol='tf_idf_features')
idf_model = tf_idf_vec.fit(hashing_df)

tf_idf_df = idf_model.transform(hashing_df)
tf_idf_df.select('refined_tokens', 'tf_idf_features').show(n=4, truncate=False)

+---------------+----------------------------------+
|refined_tokens |tf_idf_features                   |
+---------------+----------------------------------+
|[like, playing]|(4,[1,2],[0.4054651081081644,0.0])|
|[like, coding] |(4,[2,3],[0.0,0.4054651081081644])|
+---------------+----------------------------------+



In [118]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The same two documents as the refined_tokens rows, joined back into strings,
# so the scikit-learn result can be compared with the Spark one.
corpus = ["like playing",
          "like coding"]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases that predate it.
# tolist() converts the returned array to plain strings so the printed list
# keeps its usual form.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

['coding', 'like', 'playing']


In [91]:
# Densify the sparse TF-IDF matrix so every document/term weight is visible.
X.toarray()

array([[0.        , 0.57973867, 0.81480247],
       [0.81480247, 0.57973867, 0.        ]])

In [92]:
# Printing the sparse matrix lists only its stored (row, column) weight entries.
print(X)

  (0, 2)	0.8148024746671689
  (0, 1)	0.5797386715376657
  (1, 0)	0.8148024746671689
  (1, 1)	0.5797386715376657
