# Text processing

## Init spark

In [1]:
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

sc = SparkContext(conf=SparkConf())
spark = SparkSession(sparkContext=sc)

## Dummy data

In [266]:
import pandas as pd

pdf = pd.DataFrame({
        'texts': ["I like playing?",
                  "I like coding."]
    })
    
df = spark.createDataFrame(pdf)
df.show()

+---------------+
|          texts|
+---------------+
|I like playing?|
| I like coding.|
+---------------+



## Clean text

In [267]:
import re
import unidecode
from pyspark.sql.functions import udf

def clean(text):
    text= unidecode.unidecode(text) # remove accents
    text = text.lower()
    text = text.replace(r'\n','') # remove newline sign
    text = re.sub(r'\d+', '', text) # remove digits 
    text = re.sub(r'[.]?-[.]?', '', text) # concatenate divided words
    text = re.sub(r'[\W]+',' ', text) # replace non-alphanum with space  
    text = re.sub(' +', ' ', text) # replace multiple spaces with single space 
    return text

user_def_fun = udf(clean)

df = df.withColumn("cleaned", user_def_fun("texts"))
df.show()

+---------------+---------------+
|          texts|        cleaned|
+---------------+---------------+
|I like playing?|i like playing |
| I like coding.| i like coding |
+---------------+---------------+



## Tokenization

In [268]:
from pyspark.ml.feature import Tokenizer

# A tokenizer that converts the input string to lowercase and then
# splits it by white spaces.
tokenizer = Tokenizer(inputCol="cleaned", outputCol="tokens")
df = tokenizer.transform(df)
df.show()

+---------------+---------------+------------------+
|          texts|        cleaned|            tokens|
+---------------+---------------+------------------+
|I like playing?|i like playing |[i, like, playing]|
| I like coding.| i like coding | [i, like, coding]|
+---------------+---------------+------------------+



## Stopwords Removal

In [269]:
from pyspark.ml.feature import StopWordsRemover

stopword_removal = StopWordsRemover(inputCol='tokens', 
                                    outputCol='refined_tokens')
refined_df = stopword_removal.transform(df)

refined_df.show()

+---------------+---------------+------------------+---------------+
|          texts|        cleaned|            tokens| refined_tokens|
+---------------+---------------+------------------+---------------+
|I like playing?|i like playing |[i, like, playing]|[like, playing]|
| I like coding.| i like coding | [i, like, coding]| [like, coding]|
+---------------+---------------+------------------+---------------+



## Stemming and lemmatization

In [None]:
import nltk
nltk.download('wordnet')

In [277]:
lemma('dogs')

'dog'

In [278]:
from nltk.stem import WordNetLemmatizer

def lemma(x):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(i) for i in x]
           
# lem_words = filtered_data.map(lemma)


lemmatization = udf(lambda x: lemma(x))

df = df.withColumn("lem", lemmatization("refined_tokens"))
df.show()

AnalysisException: cannot resolve '`refined_tokens`' given input columns: [cleaned, texts, tokens];;
'Project [texts#1029, cleaned#1038, tokens#1054, <lambda>('refined_tokens) AS lem#1117]
+- Project [texts#1029, cleaned#1038, UDF(cleaned#1038) AS tokens#1054]
   +- Project [texts#1029, clean(texts#1029) AS cleaned#1038]
      +- LogicalRDD [texts#1029], false


# TF-IDF

In [119]:
from pyspark.ml.feature import HashingTF, IDF

hashing_vec = HashingTF(numFeatures=4,
                        inputCol='refined_tokens',
                        outputCol='tf_features')

hashing_df = hashing_vec.transform(refined_df)
hashing_df.select(['refined_tokens','tf_features']).show(2,False)

+---------------+-------------------+
|refined_tokens |tf_features        |
+---------------+-------------------+
|[like, playing]|(4,[1,2],[1.0,1.0])|
|[like, coding] |(4,[2,3],[1.0,1.0])|
+---------------+-------------------+



In [121]:
tf_idf_vec = IDF(inputCol='tf_features',
               outputCol='tf_idf_features')

tf_idf_df = tf_idf_vec.fit(hashing_df).transform(hashing_df)
tf_idf_df.select(['refined_tokens','tf_idf_features']).show(4,False)

+---------------+----------------------------------+
|refined_tokens |tf_idf_features                   |
+---------------+----------------------------------+
|[like, playing]|(4,[1,2],[0.4054651081081644,0.0])|
|[like, coding] |(4,[2,3],[0.0,0.4054651081081644])|
+---------------+----------------------------------+



In [118]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["like playing",
          "like coding"]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

print(vectorizer.get_feature_names())

['coding', 'like', 'playing']


In [91]:
X.toarray()

array([[0.        , 0.57973867, 0.81480247],
       [0.81480247, 0.57973867, 0.        ]])

In [92]:
print(X)

  (0, 2)	0.8148024746671689
  (0, 1)	0.5797386715376657
  (1, 0)	0.8148024746671689
  (1, 1)	0.5797386715376657
