In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import pandas as pd
import pyspark
from pyspark.sql.functions import split
from pyspark.sql.functions import explode, col, trim, rtrim, ltrim
from pyspark.sql.functions import regexp_replace, regexp_extract
from pyspark.sql.functions import lower
from pyspark.sql.functions import desc
from pyspark.ml.feature import StopWordsRemover
import re
import glob
import os
import shutil
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cedua\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
def count_words_EN(txt_path):
    """Count the non-stop-word English words of a text file.

    Every line of the file is lower-cased and split on whitespace; each
    resulting token is reduced to its first run of ASCII letters (so
    ``"don't"`` is counted as ``"don"`` and a token without any letter is
    discarded). Words found in Spark's default English stop-word list are
    dropped before counting.

    Parameters
    ----------
    txt_path : str or path-like
        Location of the text file; handed to ``spark.read.text``.

    Returns
    -------
    pandas.DataFrame
        Columns ``words8`` (the word) and ``count`` (number of occurrences),
        sorted by ``count`` in descending order.
    """
    # Reuses the running session if there is one.
    spark = SparkSession.builder.appName("PySpark-ContandoPalavras").getOrCreate()

    # One row per line of the file, in a single column named "value".
    lines = spark.read.text(f"{txt_path}")

    # One lower-cased row per token. Splitting on any run of whitespace (not
    # only a single space) keeps tab-separated words from being glued into
    # one token, of which only the first word would survive the extraction.
    tokens = lines.select(
        explode(split(lower(col("value")), r"\s+")).alias("token"))

    # Keep the first run of letters of each token. The text is already
    # lower-cased, so [a-z] is enough; an [A-z] range would additionally
    # match the characters [ \ ] ^ _ ` that sit between "Z" and "a" in ASCII.
    words = tokens.select(
        regexp_extract(col("token"), "[a-z]+", 0).alias("words8"))

    # regexp_extract yields "" when nothing matches: drop those rows.
    words = words.where(col("words8") != "")

    # Filter stop words with a plain membership test. This avoids wrapping
    # every word in a one-element array for StopWordsRemover and reading it
    # back with [0], which yields NULL for removed words and then tempts a
    # comparison against the *string* "null" (dropping the real word "null").
    stop_words = sorted(
        {word.lower() for word in StopWordsRemover.loadDefaultStopWords("english")})
    words = words.where(~col("words8").isin(stop_words))

    # Occurrences per word, most frequent first, collected to the driver.
    counts = words.groupBy("words8").count().orderBy(desc("count"))
    return counts.toPandas()

In [60]:
def count_words_PT(txt_path):
    """Count the non-stop-word Portuguese words of a text file.

    Every line of the file is lower-cased and split on whitespace; a fixed
    set of punctuation characters is stripped from each token. Tokens that
    end up empty, words in Spark's default Portuguese stop-word list, and the
    extra words "pode" and "ser" are dropped before counting.

    Parameters
    ----------
    txt_path : str or path-like
        Location of the text file; handed to ``spark.read.text``.

    Returns
    -------
    pandas.DataFrame
        Columns ``words10`` (the word) and ``count`` (number of occurrences),
        sorted by ``count`` in descending order.
    """
    # Reuses the running session if there is one.
    spark = SparkSession.builder.appName("PySpark-ContandoPalavras").getOrCreate()

    # One row per line of the file, in a single column named "value".
    lines = spark.read.text(f"{txt_path}")

    # One lower-cased row per token. Splitting on any run of whitespace (not
    # only a single space) keeps tab-separated words from being glued together.
    tokens = lines.select(
        explode(split(lower(col("value")), r"\s+")).alias("token"))

    # Punctuation characters stripped from every token.
    punc = r"""[:!?.,"'"\"\/]"""
    # Blanks out words containing "é".
    # NOTE(review): this discards words such as "café" or "médico", not only
    # the verb "é" — kept as it was, confirm that this is really intended.
    e_acute_rem = r"\b(\w*é\w*)"

    words = tokens.select(
        regexp_replace(
            regexp_replace(col("token"), punc, ""),
            e_acute_rem,
            "",
        ).alias("words10"))

    # Tokens made only of removed characters are now empty: drop them.
    words = words.where(col("words10") != "")

    # "pode" and "ser" are excluded as whole words, together with the default
    # stop words. Deleting them with regexp_replace would also eat the
    # substring inside longer words ("poder" -> "r", "observar" -> "obvar").
    # A membership test also avoids the one-element-array round trip through
    # StopWordsRemover and the comparison against the *string* "null" that it
    # invites.
    stop_words = {
        word.lower() for word in StopWordsRemover.loadDefaultStopWords("portuguese")}
    stop_words.update(("pode", "ser"))
    words = words.where(~col("words10").isin(sorted(stop_words)))

    # Occurrences per word, most frequent first, collected to the driver.
    counts = words.groupBy("words10").count().orderBy(desc("count"))
    return counts.toPandas()

#### Saving the word counts of all the .txt files to CSV files for future analysis and dashboard creation

##### **EN**

In [11]:
# Directory that holds the English source texts
folder_path = 'C:/Users/cedua/CDIA - PUCSP/PROJETO PySpark - SAVINO/Contador-de-palavras---PySpark/data/data_en'

# For every .txt file: count its words and store the table beside it,
# under the same base name with a .csv extension.
txt_pattern = os.path.join(folder_path, '*.txt')
for txt_file in glob.glob(txt_pattern):
    df_pandas = count_words_EN(txt_file)
    csv_base = os.path.splitext(txt_file)[0]
    df_pandas.to_csv(f'{csv_base}.csv', sep=';', index=False)

##### **PT**

In [61]:
# Directory that holds the Portuguese source texts
folder_path = 'C:/Users/cedua/CDIA - PUCSP/PROJETO PySpark - SAVINO/Contador-de-palavras---PySpark/data/data_pt'

# For every .txt file: count its words and store the table beside it,
# under the same base name with a .csv extension.
txt_pattern = os.path.join(folder_path, '*.txt')
for txt_file in glob.glob(txt_pattern):
    df_pandas = count_words_PT(txt_file)
    csv_base = os.path.splitext(txt_file)[0]
    df_pandas.to_csv(f'{csv_base}.csv', sep=';', index=False)