# DSCI 617 - Homework 02
**Jeffery Boczkaja**

In [0]:
from pyspark.sql import SparkSession
import pandas as pd
from string import punctuation

In [0]:
# Obtain the Spark session: getOrCreate() returns the active session if one
# already exists and builds a new one otherwise.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

# SparkContext handle; the cells below use it to read text files into RDDs.
sc = spark.sparkContext

## Problem 1: Word Count

In [0]:
# Load the text as an RDD with one element per line of the file.
ws_lines = sc.textFile("/FileStore/tables/shakespeare_complete.txt")

# Tokenize: break each line on spaces and on the listed separator characters,
# then normalize every token (trim surrounding punctuation, trim surrounding
# digits, remove apostrophes, lowercase) and drop tokens that end up empty.
#
# The result is cached because it feeds several separate actions (count,
# distinct, sample, word frequencies, ...). Without caching, every one of
# those actions would re-read and re-tokenize the whole file.
ws_words = (ws_lines
           .flatMap(lambda line: line.split(' '))
           .flatMap(lambda word: word.split('-'))
           .flatMap(lambda word: word.split('_'))
           .flatMap(lambda word: word.split('.'))
           .flatMap(lambda word: word.split(','))
           .flatMap(lambda word: word.split(':'))
           .flatMap(lambda word: word.split('|'))
           .flatMap(lambda word: word.split('\t'))
           .map(lambda word: word.strip(punctuation).strip('0123456789')
                              .replace("'", "").lower())
           .filter(lambda word: word != '')
           .cache()
          )

# The distinct words are reused by later cells as well (longest-word search
# and sorting), so they are kept in memory too.
dist_words = ws_words.distinct().cache()
print("Total Number of Words:      ", ws_words.count())
print("Number of Distinct Words:   ", dist_words.count())

Total Number of Words:       887279
Number of Distinct Words:    25362


In [0]:
# Print a small random sample of the tokenized words (each word is kept with
# probability 0.0001). The fixed seed makes the sample reproducible across
# runs for the same input and partitioning; without it every run prints a
# different set of words.
s_words = ws_words.sample(withReplacement=False, fraction=0.0001, seed=1).collect()

for word in s_words:
    print(word)

and
were
do
it
the
chastity
me
good
to
of
step
prophesy
me
lend
city
o
but
the
the
set
taen
than
am
catch
in
waste
no
you
o
oratory
and
false
for
margaret
whose
a
will
the
but
of
gods
you
shalt
holy
can
legitimate
some
drum
a
on
your
you
exeunt
hour
i
to
pity
and
good
is
i
both
as
it
by
we
a
the
that
do
romeo
their
montagues
cousin
thank
exit
you
goes
will
lion
to
what
or
be
my
where
kin


## Problem 2: Longest Words

In [0]:
def longer_string(s1: str, s2: str) -> str:
    """Return the longer of two strings.

    When both strings have the same length, the one that compares greater
    lexicographically is returned. If the strings are equal, ``s2`` is
    returned.
    """
    # Comparing (length, text) tuples ranks by length first and falls back
    # to lexicographic order only when the lengths tie.
    return s1 if (len(s1), s1) > (len(s2), s2) else s2

# Sanity-check longer_string on a few known pairs before using it on the RDD.
# The results were previously computed but never inspected; asserting them
# makes the check meaningful without changing the cell's printed output.
test_strings = [("Michigan", "Indiana"), ("Wisconsin", "Oklahoma"), ("Mississippi", "Maine")]
results = [longer_string(s1, s2) for s1, s2 in test_strings]
assert results == ["Michigan", "Wisconsin", "Mississippi"], results

# Fold the distinct words pairwise with longer_string, leaving the single
# longest word (ties broken toward the lexicographically greater word).
longest_word = dist_words.reduce(longer_string)

print("Longest word:", longest_word)

Longest word: honorificabilitudinitatibus


In [0]:
# Order the distinct words from longest to shortest: sorting ascending on the
# negated length puts the longest words first.
longest_words_rdd = dist_words.sortBy(lambda w: -len(w))

# Only the first 20 entries of that ordering are needed.
top_20_longest_words = longest_words_rdd.take(20)

print("The 20 longest words:")
for long_word in top_20_longest_words:
    print(long_word)

The 20 longest words:
honorificabilitudinitatibus
anthropophaginian
undistinguishable
indistinguishable
northamptonshire
superserviceable
incomprehensible
prognostication
unreconciliable
interrogatories
gioucestershire
extraordinarily
particularities
praeclarissimus
impossibilities
misconstruction
flibbertigibbet
circumscription
disproportioned
uncomprehensive


## Problem 3: Word Frequency

In [0]:
# Pair every word with a count of 1 so the counts can be summed per word.
pairs = ws_words.map(lambda w: (w, 1))

# Sum the 1s for each word, then rank the (word, count) pairs from the most
# frequent word to the least frequent.
word_counts = (
    pairs
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda kv: kv[1], ascending=False)
)

# Bring the 20 most frequent words to the driver and show them as a table.
top_20_word_counts = word_counts.take(20)
df_word_counts = pd.DataFrame(top_20_word_counts, columns=["Word", "Count"])
df_word_counts

Unnamed: 0,Word,Count
0,the,27379
1,and,26082
2,i,20717
3,to,19661
4,of,17474
5,a,14723
6,you,13630
7,my,12489
8,in,10996
9,that,10915


## Problem 4: Removing Stop Words

In [0]:
# Load the stop-word file as an RDD with one element per line and report its size.
sw_rdd = sc.textFile("/FileStore/tables/stopwords.txt")
sw_count = sw_rdd.count()
print(sw_count)

# Show a random sample of the stop words (each kept with probability 0.05).
# The fixed seed makes the printed sample reproducible across runs for the
# same input and partitioning.
sampled_sw = sw_rdd.sample(withReplacement=False, fraction=0.05, seed=1).collect()
print(sampled_sw)

# Bring the full stop-word list to the driver for use in later filtering.
sw = sw_rdd.collect()

668
['act', 'am', 'become', 'cause', 'come', 'couldnt', 'd', 'end', 'fifth', 'former', 'furthermore', 'importance', 'invention', 'ltd', 'more', 'mrs', 'near', 'no', 'normally', 'obtained', 'particularly', 'plus', 'q', 'rd', 'seem', 'someone', 'still', 'thru', 'up', 'various', 'wasnt', 'whereafter', 'whomever', 'would', 'z']


In [0]:
# `sw` is a list, so `word not in sw` would scan the whole list for every
# word in the corpus. Converting it to a set once makes each membership
# test a constant-time hash lookup while keeping the same filtering result.
sw_set = set(sw)

# Keep only the words that are not stop words, then count the distinct ones.
ws_words_f = ws_words.filter(lambda word: word not in sw_set)
dist_words_f = ws_words_f.distinct()
print("Distinct Non-SW: ", dist_words_f.count())

Distinct Non-SW:  24840


In [0]:
# Same frequency ranking as before, this time over the words that survived
# the stop-word filter.
pairs = ws_words_f.map(lambda token: (token, 1))

# Total the occurrences of each remaining word.
word_counts = pairs.reduceByKey(lambda x, y: x + y)

# Rank by count, highest first.
word_counts = word_counts.sortBy(lambda item: item[1], ascending=False)

# Collect the top 20 and display them as a pandas table.
top_20_word_counts = word_counts.take(20)
df_word_counts = pd.DataFrame(
    top_20_word_counts,
    columns=["Word", "Count"],
)
df_word_counts

Unnamed: 0,Word,Count
0,will,4977
1,thy,4034
2,thee,3180
3,lord,3062
4,king,2871
5,good,2834
6,sir,2763
7,well,2553
8,enter,2350
9,love,2109


## Problem 5: Diamonds Dataset


In [0]:
# Read the diamonds file as an RDD of raw text lines (one element per line).
diamonds_raw = sc.textFile("/FileStore/tables/diamonds.txt")

# Number of lines in the file.
diamonds_line_total = diamonds_raw.count()
print(diamonds_line_total)

53941


In [0]:
# Peek at the first five raw lines to see the file layout before parsing.
first_five_elements = diamonds_raw.take(5)

for raw_line in first_five_elements:
    print(raw_line)

carat	cut	color	clarity	depth	table	price	x	y	z
0.23	Ideal	E	SI2	61.5	55	326	3.95	3.98	2.43
0.21	Premium	E	SI1	59.8	61	326	3.89	3.84	2.31
0.23	Good	E	VS1	56.9	65	327	4.05	4.07	2.31
0.29	Premium	I	VS2	62.4	58	334	4.2	4.23	2.63


In [0]:
def process_row(row):
    """Parse one tab-separated line of the diamonds file into typed values.

    The first ten tab-separated fields are converted, in order, to:
    carat (float), cut (str), color (str), clarity (str), depth (float),
    table (float), price (int), x (float), y (float), z (float).
    Any fields beyond the tenth are ignored.

    Returns:
        A list of the ten converted values.

    Raises:
        IndexError: if the line has fewer than ten fields.
        ValueError: if a numeric field cannot be converted.
    """
    # One converter per column, in file order:
    # carat, cut, color, clarity, depth, table, price, x, y, z
    converters = (float, str, str, str, float, float, int, float, float, float)

    fields = row.split('\t')

    # Index explicitly rather than zipping: a short row must raise IndexError
    # instead of silently producing a shorter list.
    return [convert(fields[position]) for position, convert in enumerate(converters)]

# The first line of the file holds the column names rather than data.
header = diamonds_raw.first()

# Drop every line equal to the header, leaving only data rows.
diamonds_without_header = diamonds_raw.filter(lambda line: line != header)

# Convert each remaining text line into a list of typed values.
diamonds = diamonds_without_header.map(process_row)

# Show the first five parsed rows.
first_five_diamonds = diamonds.take(5)
for parsed_row in first_five_diamonds:
    print(parsed_row)

[0.23, 'Ideal', 'E', 'SI2', 61.5, 55.0, 326, 3.95, 3.98, 2.43]
[0.21, 'Premium', 'E', 'SI1', 59.8, 61.0, 326, 3.89, 3.84, 2.31]
[0.23, 'Good', 'E', 'VS1', 56.9, 65.0, 327, 4.05, 4.07, 2.31]
[0.29, 'Premium', 'I', 'VS2', 62.4, 58.0, 334, 4.2, 4.23, 2.63]
[0.31, 'Good', 'J', 'SI2', 63.3, 58.0, 335, 4.34, 4.35, 2.75]


## Problem 6: Grouped Means

In [0]:
# Running totals per cut: key is the cut (column 1); value is
# (sum of carat [column 0], sum of price [column 6], number of rows).
cut_totals = (
    diamonds
    .map(lambda row: (row[1], (row[0], row[6], 1)))
    .reduceByKey(lambda acc, nxt: (acc[0] + nxt[0], acc[1] + nxt[1], acc[2] + nxt[2]))
)

# Turn each total into a (cut, count, mean carat, mean price) row, with both
# means rounded to two decimal places, and bring the rows to the driver.
cut_summary = (
    cut_totals
    .map(lambda kv: (kv[0], kv[1][2], round(kv[1][0] / kv[1][2], 2), round(kv[1][1] / kv[1][2], 2)))
    .collect()
)

cut_df = pd.DataFrame(
    cut_summary,
    columns=["Cut", "Count", "Mean_Carat", "Mean_Price"],
)
cut_df

Unnamed: 0,Cut,Count,Mean_Carat,Mean_Price
0,Premium,13791,0.89,4584.26
1,Good,4906,0.85,3928.86
2,Very Good,12082,0.81,3981.76
3,Fair,1610,1.05,4358.76
4,Ideal,21551,0.7,3457.54
