In [14]:
# !pip install googletrans==4.0.0-rc1
# !pip install pyspark py4j
# !pip install findspark

In [88]:
from googletrans import Translator
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import pandas as pd
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import split
from pyspark.sql.functions import explode, col
from pyspark.sql.functions import regexp_extract
from pyspark.sql.functions import lower
from pyspark.ml.feature import StopWordsRemover

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("PySpark - ContandoPalavras").getOrCreate()
sc = spark.sparkContext

spark

In [104]:
df = spark.read.text("C:/Users/cedua/CDIA - PUCSP/PROJETO PySpark (SAVINO)/Contador-de-palavras---PySpark/data/fitnessandpower.txt")
df.show()
type(df)

+--------------------+
|               value|
+--------------------+
|       articles_text|
|"When you want to...|
|These powders com...|
|ARE THERE ANY DIF...|
|Yes! Whey protein...|
|This means that t...|
|DO THEY HAVE THE ...|
|No! Whey protein ...|
|DO THEY TASTE THE...|
|Nope! Whey protei...|
|DO THEY CONTAIN T...|
|No! 100 grams of ...|
|ARE THEY ABSORBED...|
|No. Whey protein ...|
|All in all, you g...|
|If you want to ha...|
|Not as well as th...|
|                    |
|For the latest ne...|
|                    |
+--------------------+
only showing top 20 rows



pyspark.sql.dataframe.DataFrame

In [87]:
########################################################################################################################

In [20]:
#### EXEMPLO 1 - MAP REDUCE
dados=["B","B","B","C","C","C","D","D","D D"]

input_file=sc.parallelize(dados,3)
counts = input_file.map(lambda word:(word, 1)) \
                     .reduceByKey(lambda a, b: a + b)

result=counts.collect()
print(result) 
df=pd.DataFrame(result)
df

[('B', 3), ('C', 3), ('D', 2), ('D D', 1)]


Unnamed: 0,0,1
0,B,3
1,C,3
2,D,2
3,D D,1


In [22]:
#### EXEMPLO 2 - MAP REDUCE
dados=["B","B","B","C","C","C","D"]
item=["B","D"]
input_file=sc.parallelize(dados,3)
counts = (input_file.map(lambda word:(word, 1) if (word in item) else (word,0)) 
                     .reduceByKey(lambda a, b: a + b))

result=counts.collect()
print(result)
df=pd.DataFrame(result)
# df.query("qtde>0")
df

[('B', 3), ('C', 0), ('D', 1)]


Unnamed: 0,0,1
0,B,3
1,C,0
2,D,1


In [27]:
#########################################################################################################################

In [63]:
# def translate_text(text):
#     translator = Translator()
#     translation = translator.translate(text, dest="pt")
#     return translation.text

In [71]:
# # Cria a função UDF (User Defined Function) para aplicar a tradução
# translate_udf = udf(translate_text, StringType())

# # Aplica a tradução no DataFrame usando a função UDF
# df = df.withColumn("translated_text", translate_udf(df['value']))
# # translated_df = translated_df.drop("articles_text")

# # Mostra o DataFrame com os textos traduzidos
# df.show()
# type(df)

### Tratamento - PySpark

In [105]:
#spliting the df by space " " 
df = df.select(
split(df['value'], ' ').alias('words')
)

df.show()
type(df)

+--------------------+
|               words|
+--------------------+
|     [articles_text]|
|["When, you, want...|
|[These, powders, ...|
|[ARE, THERE, ANY,...|
|[Yes!, Whey, prot...|
|[This, means, tha...|
|[DO, THEY, HAVE, ...|
|[No!, Whey, prote...|
|[DO, THEY, TASTE,...|
|[Nope!, Whey, pro...|
|[DO, THEY, CONTAI...|
|[No!, 100, grams,...|
|[ARE, THEY, ABSOR...|
|[No., Whey, prote...|
|[All, in, all,, y...|
|[If, you, want, t...|
|[Not, as, well, a...|
|                  []|
|[For, the, latest...|
|                  []|
+--------------------+
only showing top 20 rows



pyspark.sql.dataframe.DataFrame

In [106]:
stopwords_remover = StopWordsRemover(inputCol="words", outputCol="words2")

# Aplique a remoção de stopwords
df = stopwords_remover.transform(df)
df = df.drop("words")
df.show()
type(df)

+--------------------+
|              words2|
+--------------------+
|     [articles_text]|
|["When, want, put...|
|[powders, come, k...|
|[DIFFERENCES, WAY...|
|[Yes!, Whey, prot...|
|[means, particles...|
|[NUTRITIONAL, VAL...|
|[No!, Whey, prote...|
|      [TASTE, SAME?]|
|[Nope!, Whey, pro...|
|[CONTAIN, AMOUNT,...|
|[No!, 100, grams,...|
|   [ABSORBED, SAME?]|
|[No., Whey, prote...|
|[all,, get, make,...|
|[want, benefits, ...|
|[well, isolate,, ...|
|                  []|
|[latest, news, up...|
|                  []|
+--------------------+
only showing top 20 rows



pyspark.sql.dataframe.DataFrame

In [107]:
#removing from lists
df = df.select(
explode(col("words2")).alias("words3")
)

df.show()
type(df)

+-------------+
|       words3|
+-------------+
|articles_text|
|        "When|
|         want|
|          put|
|       muscle|
|     increase|
|      protein|
|      intake,|
|          one|
|         best|
|      choices|
|          buy|
|         whey|
|      protein|
|      powder.|
|     However,|
|         even|
|      picking|
|         whey|
|     protein,|
+-------------+
only showing top 20 rows



pyspark.sql.dataframe.DataFrame

In [108]:
#droping blank spaces
df = df.where(
col('words3') != ''
)

df.show()
type(df)

+-------------+
|       words3|
+-------------+
|articles_text|
|        "When|
|         want|
|          put|
|       muscle|
|     increase|
|      protein|
|      intake,|
|          one|
|         best|
|      choices|
|          buy|
|         whey|
|      protein|
|      powder.|
|     However,|
|         even|
|      picking|
|         whey|
|     protein,|
+-------------+
only showing top 20 rows



pyspark.sql.dataframe.DataFrame

In [109]:
#removing punctuation
df = df.select(
regexp_extract(
col('words3'), #cleaning words2 column
'[A-z]+', #selecting all the words
0).alias('words4')
)

df.show()
type(df)

+-------------+
|       words4|
+-------------+
|articles_text|
|         When|
|         want|
|          put|
|       muscle|
|     increase|
|      protein|
|       intake|
|          one|
|         best|
|      choices|
|          buy|
|         whey|
|      protein|
|       powder|
|      However|
|         even|
|      picking|
|         whey|
|      protein|
+-------------+
only showing top 20 rows



pyspark.sql.dataframe.DataFrame

In [110]:
#every word to lower case
df = df.select(
lower(
col('words4')
).alias('words5')
)

df.show()
type(df)

+-------------+
|       words5|
+-------------+
|articles_text|
|         when|
|         want|
|          put|
|       muscle|
|     increase|
|      protein|
|       intake|
|          one|
|         best|
|      choices|
|          buy|
|         whey|
|      protein|
|       powder|
|      however|
|         even|
|      picking|
|         whey|
|      protein|
+-------------+
only showing top 20 rows



pyspark.sql.dataframe.DataFrame