In [47]:
from pyspark.sql import SparkSession

import assignment3.data_cleaner as dc
import importlib

# from pyspark.sql.functions

In [53]:
# Re-import data_cleaner so edits made to the module since the first import take effect
importlib.reload(dc)

# Start (or reuse) the Spark session, then read the text file and cache it
spark = (
    SparkSession.builder
    .appName("word_pairs")
    .getOrCreate()
)
hamlet_df = (
    spark.read
    .text("hamlet.txt")
    .cache()
)
print(hamlet_df.count())
hamlet_df.show()

# Run the cleaning step with splitting/exploding into words switched off
hamlet_rows_df = dc.clean_dataset(hamlet_df, should_split_explode=False)
print(hamlet_rows_df.count())

5899
+--------------------+
|               value|
+--------------------+
|             HAMLET,|
|  PRINCE OF DENMARK.|
|              ACT I.|
|Scene I.—ELSINORE...|
|Francisco on his ...|
|                    |
|   Ber. Who's there?|
|                    |
|Fran. (R.) Nay, a...|
|                    |
|Ber. Long live th...|
|                    |
|               Fran.|
|           Bernardo?|
|                    |
|                Ber.|
|                 He.|
|                    |
|Fran. You come mo...|
|                    |
+--------------------+
only showing top 20 rows

3062


In [24]:
from pyspark.sql.functions import explode, split
# Split each row's `word` string on single spaces and emit one row per token
hamlet_exploded_df = hamlet_rows_df.select(
    explode(
        split(hamlet_rows_df["word"], " ")
    ).alias("word")
)
hamlet_exploded_df.show()

+---------+
|     word|
+---------+
|   hamlet|
|   prince|
|       of|
|  denmark|
|      act|
|        i|
|    scene|
|ielsinore|
|        a|
| platform|
|   before|
|      the|
|   castle|
|    night|
|francisco|
|       on|
|      his|
|     post|
|    enter|
|       to|
+---------+
only showing top 20 rows



In [27]:
from itertools import combinations_with_replacement
# Toy example: emit each unordered pair min(count_x, count_y) times
a = ['a', 'b', 'c', 'a']

# Tally occurrences; keys end up in first-seen order
alist = {}
for word in a:
    alist[word] = alist.get(word, 0) + 1

pair_list = list(combinations_with_replacement(alist.keys(), 2))

newlist = []
for pair in pair_list:
    left, right = pair
    repeats = min(alist[left], alist[right])
    newlist += [f"({left}, {right})"] * repeats
print(newlist)

['(a, a)', '(a, a)', '(a, b)', '(a, c)', '(b, b)', '(b, c)', '(c, c)']


In [43]:
# Get the list of words and convert them to pairs

from pyspark.sql.functions import udf, size
from pyspark.sql.types import ArrayType, StringType
from itertools import combinations_with_replacement

def create_pairs(sentence):
    """Build every unordered word pair for one line of text.

    The line is split on single spaces and empty tokens (produced by
    consecutive spaces) are dropped.  The remaining words are sorted so a
    pair is always emitted in one canonical order, e.g. "(of, the)" and
    never "(the, of)".  Pairs are drawn with replacement, so every word is
    also paired with itself.  Repeated words are NOT de-duplicated: each
    occurrence takes part in the combinations.

    Args:
        sentence: Space-separated words, or None (a null column value
            reaches a Python UDF as None).

    Returns:
        A list of strings formatted as "(first, second)".  The list is
        empty when the input is None or contains no words.
    """
    # Without this guard a null row raises AttributeError on .split
    if sentence is None:
        return []

    all_words = sorted(word for word in sentence.split(' ') if word != '')

    # combinations_with_replacement keeps the sorted order inside each pair
    pairs = combinations_with_replacement(all_words, 2)

    return [f"({first}, {second})" for first, second in pairs]

# Wrap the pairing function as a UDF that returns an array of strings
create_pairs_udf = udf(create_pairs, ArrayType(StringType()))
pairs_df = hamlet_rows_df.select(
    create_pairs_udf(hamlet_rows_df["word"]).alias("pairs")
)

# Keep only rows whose pair list is present and non-empty
has_pairs = pairs_df["pairs"].isNotNull() & (size(pairs_df["pairs"]) > 0)
pairs_df_cleaned = pairs_df.filter(has_pairs)

# Flatten the arrays so each pair gets its own row
all_pairs_df = pairs_df_cleaned.select(
    explode(pairs_df_cleaned["pairs"]).alias("pair")
)
all_pairs_df.show()

+--------------------+
|                pair|
+--------------------+
|    (hamlet, hamlet)|
|            (of, of)|
|        (of, prince)|
|       (of, denmark)|
|    (prince, prince)|
|   (prince, denmark)|
|  (denmark, denmark)|
|          (act, act)|
|            (act, i)|
|              (i, i)|
|(ielsinore, ielsi...|
| (ielsinore, castle)|
|  (ielsinore, scene)|
|    (ielsinore, the)|
|      (ielsinore, a)|
|(ielsinore, platf...|
| (ielsinore, before)|
|  (ielsinore, night)|
|    (castle, castle)|
|     (castle, scene)|
+--------------------+
only showing top 20 rows



In [45]:
# Tally each distinct pair and display the 20 most frequent

(
    all_pairs_df
    .groupBy('pair')
    .count()
    .orderBy('count', ascending=False)
    .limit(20)
    .show()
)

+------------+-----+
|        pair|count|
+------------+-----+
|  (the, the)|  756|
|  (and, and)|  621|
|    (to, to)|  547|
|    (of, of)|  525|
|      (a, a)|  395|
|    (in, in)|  356|
|  (you, you)|  356|
|      (i, i)|  349|
|    (my, my)|  338|
|   (of, the)|  325|
|    (is, is)|  283|
|    (it, it)|  282|
|(that, that)|  264|
|  (ham, ham)|  261|
|  (the, and)|  231|
|  (not, not)|  226|
|(this, this)|  223|
|   (to, the)|  216|
|(with, with)|  215|
|  (his, his)|  211|
+------------+-----+



In [46]:
# Stop the Spark session now that the notebook is done with it
spark.stop()