In [1]:
import sys
import glob
from pathlib import Path

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    explode,
    split,
    regexp_replace,
    lower
)

In [2]:
# Glob selecting the input lyric text files, and the CSV file that receives
# the final word counts.
SONG_LYRICS_FILES = "/home/jovyan/work/data/song_lyrics/*.txt"
OUTPUT_FILE = "/home/jovyan/work/data/word_counts.csv"

# Both patterns are handed to Spark's split()/regexp_replace(), which use
# Java regular expressions. In Java, \w, \W and \s are ASCII-only unless the
# UNICODE_CHARACTER_CLASS flag is enabled, so without the leading (?U) any
# accented letter would be treated as a delimiter ("café" -> "caf").
SPLIT_PATTERN = (
    r"(?U)"             # Unicode-aware \w / \W / \s
    r"[\s_]+"           # runs of whitespace and/or underscores
    r"|[^\w-'`*:$&.]+"  # runs of anything that is not a word char or - ' ` * : $ & .
    r"|\.\.\.*"         # two or more consecutive dots (ellipsis)
)
# Strips non-word characters from the beginning and the end of a token.
REPLACE_PATTERN = r"(?U)^\W+|\W+$"

In [3]:
# Fail fast if there is nothing to read or nowhere to write the result.
files = glob.glob(SONG_LYRICS_FILES)
output_file_dir_path = Path(OUTPUT_FILE).parent

if len(files) == 0:
    # The glob matched no input files.
    sys.exit(f"Path does not exist: {SONG_LYRICS_FILES}")
elif not output_file_dir_path.exists():
    # The directory that should hold the output CSV is missing.
    sys.exit(f"Path does not exist: {output_file_dir_path}")

In [4]:
# Create (or reuse) the session for the "word-count" application on the
# master at spark://spark-master:7077.
spark = (
    SparkSession
    .builder
    .appName("word-count")
    .master("spark://spark-master:7077")
    .getOrCreate()
)

In [5]:
# Load every matching lyrics file with the text source; each input line
# becomes one row in a single string column named "value".
song_lyrics_df = spark.read.text(SONG_LYRICS_FILES)

In [6]:
# Tokenise the lyrics into one cleaned, lower-cased word per row:
#   1. split each line on SPLIT_PATTERN and emit one row per piece,
#   2. trim the piece using REPLACE_PATTERN and lower-case it,
#   3. drop pieces that ended up empty.
words_df = (
    song_lyrics_df
    .select(explode(split(col("value"), SPLIT_PATTERN)).alias("word"))
    .select(lower(regexp_replace(col("word"), REPLACE_PATTERN, "")).alias("word"))
    .filter(col("word") != "")
)

In [7]:
# Number of occurrences of each distinct word, most frequent first.
# Words with the same count are ordered alphabetically: sorting on "count"
# alone leaves the relative order of ties unspecified, which would make the
# exported CSV differ from run to run.
counted_unique_words_df = (
    words_df
    .groupBy("word")
    .count()
    .orderBy(col("count").desc(), col("word").asc())
)

In [8]:
# Total number of word rows (every occurrence, not distinct words).
print("total_words")
total_words = words_df.count()
total_words

total_words


1348941

In [9]:
# Preview the ten most frequent words without truncating the cell values.
print("word_counts")
counted_unique_words_df.show(n=10, truncate=False)

word_counts
+----+-----+
|word|count|
+----+-----+
|the |49546|
|i   |46973|
|you |42494|
|and |29429|
|to  |26780|
|a   |26118|
|me  |21103|
|my  |19522|
|it  |17884|
|in  |17048|
+----+-----+
only showing top 10 rows



In [10]:
# Collect the aggregated counts to the driver as a pandas DataFrame and write
# them to a single CSV file (no index column).
word_counts_pdf = counted_unique_words_df.toPandas()
word_counts_pdf.to_csv(OUTPUT_FILE, sep=",", index=False)

In [11]:
# Shut down the Spark session now that the results have been written.
spark.stop()