In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.functions import (
    col,
    concat,
    explode,
    lead,
    lit,
    monotonically_increasing_id,
    regexp_replace,
    split,
)

In [2]:
# --- Input / output locations -------------------------------------------------
# Glob selecting every lyrics text file that is read into Spark.
SONG_LYRICS_FILES = "/home/jovyan/work/data/song_lyrics/*.txt"
# Destination of the final bigram -> count table.
OUTPUT_FILE = "/home/jovyan/work/data/bigrams_counts.csv"

# --- Tokenisation regexes (passed to Spark's split / regexp_replace) ----------
# A token boundary is any of:
#   * a run of whitespace and/or underscores,
#   * a run of characters that are neither word characters nor one of
#     - ' ` * : $ & .
#   * two or more consecutive dots (an ellipsis).
SPLIT_PATTERN = r"[\s_]+|[^\w-'`*:$&.]+|\.\.\.*"
# Matches a leading or a trailing run of non-word characters, so it can be
# replaced with "" to trim punctuation from both ends of a token.
REPLACE_PATTERN = r"^\W+|\W+$"

In [3]:
# Configure the session step by step: point it at the standalone cluster
# master, give the application a name, then create the session (or reuse one
# that already exists in this kernel).
session_builder = SparkSession.builder
session_builder = session_builder.master("spark://spark-master:7077")
session_builder = session_builder.appName("bigram-count")
spark = session_builder.getOrCreate()

In [4]:
# Read every file matched by SONG_LYRICS_FILES with the "text" source, which
# yields one row per line in a single string column named "value".
lyrics_text_reader = spark.read.format("text")
song_lyrics_df = lyrics_text_reader.load(SONG_LYRICS_FILES)

In [5]:
# One row per token: split each lyric line on SPLIT_PATTERN and explode the
# resulting array.
raw_token = explode(split(col("value"), SPLIT_PATTERN))
# The same token with leading/trailing non-word characters removed.
trimmed_token = regexp_replace(col("word"), REPLACE_PATTERN, "")

words_df = (
    song_lyrics_df
    .withColumn("word", raw_token)
    .withColumn("word", trimmed_token)
    # Trimming can leave nothing behind (e.g. a token made only of
    # punctuation); such empty tokens are not words.
    .filter(col("word") != "")
    .select("word")
)

In [6]:
# Build bigrams by pairing every word with the word that follows it.
#
# lead() needs a window with a meaningful sort key. Ordering by a constant
# (lit(1)) gives every row the same key, so Spark is free to hand the rows to
# lead() in any order and the resulting "neighbours" are not guaranteed to be
# adjacent words. Instead, tag each word with an id that increases in the order
# rows are produced and sort on that.
# NOTE(review): this assumes no shuffle happens between the file scan and this
# point, so that the id order follows the order words appear in the files.
ordered_words_df = words_df.withColumn(
    "_word_order", monotonically_increasing_id()
)

# The window is still un-partitioned, so all rows are processed in a single
# partition and bigrams continue to span line and file boundaries, as before.
window_spec_source_order = Window.orderBy(col("_word_order"))
# Previous name of the window spec, kept so any cell still using it works.
window_spec_fake_order = window_spec_source_order

bigrams_df = (
    ordered_words_df
    .withColumn(
        "word",
        concat(
            col("word"),
            lit(" "),
            lead(col("word"), 1, None).over(window_spec_source_order),
        ),
    )
    # concat() returns NULL when lead() has no following word (the last row),
    # so this removes the one incomplete bigram.
    .dropna(subset=["word"])
    # Drop the helper ordering column: the result keeps its single "word" column.
    .select("word")
)

In [7]:
# Count how many times each distinct bigram occurs, most frequent first.
bigram_groups = bigrams_df.groupBy("word")
bigram_counts = bigram_groups.count()
counted_unique_bigrams_df = bigram_counts.sort(col("count").desc())


In [8]:
# Label the output, then display the number of bigram rows (duplicates
# included) as the cell's result.
print("total_bigrams")
total_bigrams = bigrams_df.count()
total_bigrams

total_bigrams


1348940

In [9]:
# Label the output, then print the ten most frequent bigrams without
# truncating long cell values.
print("bigrams_counts")
counted_unique_bigrams_df.show(n=10, truncate=False)

bigrams_counts
+--------+-----+
|word    |count|
+--------+-----+
|in the  |4917 |
|on the  |2390 |
|I don't |2094 |
|I know  |2076 |
|to the  |1933 |
|And I   |1793 |
|I got   |1791 |
|of the  |1482 |
|in my   |1477 |
|you know|1466 |
+--------+-----+
only showing top 10 rows



In [10]:
# Bring the aggregated counts to the driver as a pandas DataFrame, then write
# them to OUTPUT_FILE as comma-separated values without the pandas index.
bigram_counts_pdf = counted_unique_bigrams_df.toPandas()
bigram_counts_pdf.to_csv(OUTPUT_FILE, sep=",", index=False)

In [11]:
# All results have been displayed and written out above; stop the Spark session.
spark.stop()