In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pandas
import matplotlib
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

spark_session = SparkSession\
        .builder\
        .master("spark://192.168.2.113:7077") \
        .appName("AdityaShirke_A")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.shuffle.service.enabled", True)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.executor.cores",2)\
        .config("spark.cores.max",4)\
        .config("spark.driver.port",9998)\
        .config("spark.blockManager.port",10005)\
        .getOrCreate()

In [2]:
sc = spark_session.sparkContext
#A.1.1 Read the English transcripts with Spark, and count the number of lines.
lines_sv_en_en = sc.textFile("hdfs://192.168.2.113:9000/europarl/europarl-v7.sv-en.en")
linecount_sv_en_en = lines_sv_en_en.count()
linecount_sv_en_en

1862234

In [3]:
#A.1.2 Do the same with the other language (so that you have a separate lineage of RDDs for each).
lines_sv_en_sv = sc.textFile("hdfs://192.168.2.113:9000/europarl/europarl-v7.sv-en.sv").cache()
linecount_nl_en_sv = lines_sv_en_sv.count()
linecount_nl_en_sv

#A.1.3 Verify that the line counts are the same for the two languages.
if (linecount_sv_en_en == linecount_nl_en_sv):
    print('Verified: Line counts are the same for the two languages.')

Verified: Line counts are the same for the two languages.


In [4]:
#A.1.4 Count the number of partitions.
lines_sv_en_en.getNumPartitions()

2

In [5]:
#A.1.4 Count the number of partitions.
lines_sv_en_sv.getNumPartitions()

3

In [6]:
lines_sv_en_en.take(10)

['Resumption of the session',
 'I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.',
 "Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.",
 'You have requested a debate on this subject in the course of the next few days, during this part-session.',
 "In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union.",
 "Please rise, then, for this minute' s silence.",
 "(The House rose and observed a minute' s silence)",
 'Madam President, on a point of order.',
 'You will be aware from the press and television that there have be

In [7]:
#A2.1 function to Pre-process the text from both RDDs by doing the following: Lowercase the text and Tokenize the text (split on space)
def lowercase_split(rdd):
    return rdd.map(lambda line: line.lower().split(' '))

#A.2.2 Inspect 10 entries from each of your RDDs to verify your pre-processing.
preprocessed_text_sv_en_en = lowercase_split(lines_sv_en_en)
print(preprocessed_text_sv_en_en.take(10))
print(preprocessed_text_sv_en_en.count())

#A.2.2 Inspect 10 entries from each of your RDDs to verify your pre-processing.
preprocessed_text_sv_en_sv = lowercase_split(lines_sv_en_sv)
print(preprocessed_text_sv_en_sv.take(10))
print(preprocessed_text_sv_en_sv.count())

#A.2.3 Verify that the line counts still match after the pre-processing.
if (preprocessed_text_sv_en_en.count() == preprocessed_text_sv_en_sv.count()):
    print('Verified: Line counts still match after the pre-processing.')

[['resumption', 'of', 'the', 'session'], ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], ['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', 'requested,', 'on', 'behalf', 'of', '

In [8]:
#A.3.1 Use Spark to compute the 10 most frequently according words in the English language corpus. Repeat for the other language.
import string #to remove punctuations
#reference: https://medium.com/@gulcanogundur/pyspark-word-count-b099106135a7 for punctuations and lambda use. We can still use manual method of defining regex which is just the same regex we have in string.punctuations.

def lowercase_split_map(rdd):
    split_lines = rdd.flatMap(lambda line: line.lower().strip(string.punctuation).split(' '))
    mapped = split_lines.filter(lambda x: x.isalpha()) \
                .map(lambda words: (words.strip(string.punctuation), 1), split_lines) \
                .reduceByKey(lambda a, b: a + b) \
                .sortByKey()
    return mapped

word_count_sv_en_en = lowercase_split_map(lines_sv_en_en)
word_count_sv_en_en = word_count_sv_en_en.map(lambda x:(x[1],x[0]))
#A.3.2 Verify that your results are reasonable.
print(word_count_sv_en_en.sortByKey(False).take(10))

word_count_sv_en_sv = lowercase_split_map(lines_sv_en_sv)
word_count_sv_en_sv = word_count_sv_en_sv.map(lambda x:(x[1],x[0]))
#A.3.2 Verify that your results are reasonable.
print(word_count_sv_en_sv.sortByKey(False).take(10))


[(3502214, 'the'), (1660403, 'of'), (1541401, 'to'), (1288421, 'and'), (1087073, 'in'), (805133, 'that'), (773554, 'a'), (759645, 'is'), (536037, 'for'), (522932, 'we')]
[(1706328, 'att'), (1344865, 'och'), (1052440, 'i'), (942048, 'det'), (913312, 'som'), (912272, 'för'), (739577, 'av'), (696798, 'är'), (620508, 'en'), (540272, 'vi')]


In [9]:
#Question A.4
#A.4.1 Use this parallel corpus to mine some translations in the form of word pairs, for the two
#languages. Do this by pairing words found on short lines with the same number of words
#respectively. We (incorrectly) assume the words stay in the same order when translated.

#1. Key the lines by their line number (hint: ZipWithIndex()).
en_1 = lines_sv_en_en.zipWithIndex()
sv_1 = lines_sv_en_sv.zipWithIndex()

#2. Swap the key and value - so that the line number is the key.
en_2 = en_1.map(lambda x:(x[1],x[0]))
sv_2 = sv_1.map(lambda x:(x[1],x[0]))

#3. Join the two RDDs together according to the line number key, so you have pairs of matching lines.
joined_en2_sv2 = en_2.join(sv_2)

#print(sv_2.take(10))

print(joined_en2_sv2.take(10))



[(0, ('Resumption of the session', 'Återupptagande av sessionen')), (5, ("Please rise, then, for this minute' s silence.", 'Jag ber er resa er för en tyst minut.')), (10, ("Would it be appropriate for you, Madam President, to write a letter to the Sri Lankan President expressing Parliament's regret at his and the other violent deaths in Sri Lanka and urging her to do everything she possibly can to seek a peaceful reconciliation to a very difficult situation?", 'Skulle det vara möjligt för er, fru talman, att skriva ett brev till den srilankesiska presidenten i vilket parlamentets beklagande uttrycks över hans och de övriga brutala dödsfallen i Sri Lanka och uppmanar henne att göra allt som står i hennes makt för att få en fredlig lösning på en mycket komplicerad situation?')), (15, ('My question relates to something that will come up on Thursday and which I will then raise again.', 'Min fråga har att göra med något som kommer att behandlas på torsdag och som jag då kommer att ta upp ig

In [10]:
#4. Filter to exclude line pairs that have an empty/missing “corresponding” sentence
joined_rdd = joined_en2_sv2 #count: 186223
joined_rdd = joined_rdd.filter(lambda x: (len(x[1][0].split()) != 0 )) #count:1850885
joined_rdd_f1 = joined_rdd.filter(lambda x: (len(x[1][1].split()) != 0 )) #count:1848423


In [11]:
#5. Filter to leave only pairs of sentences with a small number of words per sentence.
joined_rdd_f2 = joined_rdd_f1.filter(lambda x: (len(x[1][0].split()) == 7)) #count with word count = 10 :47029, #count with word count = 5 :23483 #count with word count = 7 :35146
print(joined_rdd_f2.count())

#6. Filter to leave only pairs of sentences with the same number of words in each sentence.
joined_rdd_f3 = joined_rdd_f2.filter(lambda x: (len(x[1][0].split()) == len(x[1][1].split()))) #count with word count = 10 :9175, #count with word count = 5 :8952, #count with word count = 7 :9545 (selected)
print(joined_rdd_f3.count())

print(joined_rdd_f3.take(10))


35146
9545
[(1825, ('We await the outcome with great interest.', 'Vi väntar på resultaten med stort intresse.')), (2770, ('The Commission, like others, has urged dialogue.', 'Kommissionen har, liksom andra, uppmanat till dialog.')), (3345, ('This communication must contain a clear timetable.', 'Detta meddelande måste innehålla en klar tidtabell.')), (4005, ('The laws against racism must be harmonised.', 'Det krävs harmonisering av lagstiftningen mot rasism.')), (4640, ('It is also a matter of conviction.', 'Det är också en fråga om övertygelse.')), (5275, ('That is why he deserves special mention!', 'Därför vill jag här särskilt nämna det!')), (5760, ('This was an important consideration for us.', 'Detta var ett viktigt ställningstagande för oss.')), (6865, ('It has no right to do so.', 'Det tillkommer inte EU att göra så.')), (7755, ('The second subject is flexibility and simplification.', 'Den andra frågan gäller flexibilitet och förenkling.')), (9205, ('We have seen vote after vote 

In [12]:
#7. For each sentence pair, map so that you pair each (in order) word in the two
#sentences. We no longer need the line numbers. (hint: use python’s built in zip() function)
zipped_pair = joined_rdd_f3.map(lambda x:(x[1][0],x[1][1]))
#print(zipped_pair.take(10))

zipped_pair_2 = zipped_pair.map(lambda line: (line[0].split(), line[1].split()))
#print(zipped_pair_2.take(10))

#Checking how data looks on separate basis for creating lambda query
zipped_pair_en = zipped_pair.map(lambda x:x[0].split())
zipped_pair_sv = zipped_pair.map(lambda x:x[1].split())
#print(zipped_pair_en.take(10))
#print(zipped_pair_sv.take(10))

#One way
#zipped_pair_combine_1 = zipped_pair.map(lambda x: [(x[i].split(), x[i+1].split(), 1) for i in range(0, len(x) - 1)])
#print(zipped_pair_combine_1.take(10))

#Another way which is given as hint in assignment - Going with this
zipped_pair_combine = zipped_pair_en.zip(zipped_pair_sv)
#print(zipped_pair_combine.take(10))

zipped_pair_word_pair = zipped_pair_combine.flatMap(lambda x: [((x[0][i], x[1][i]), 1) for i in range(0, len(x[0]) - 1)])
print(zipped_pair_word_pair.take(10))



[(('We', 'Vi'), 1), (('await', 'väntar'), 1), (('the', 'på'), 1), (('outcome', 'resultaten'), 1), (('with', 'med'), 1), (('great', 'stort'), 1), (('The', 'Kommissionen'), 1), (('Commission,', 'har,'), 1), (('like', 'liksom'), 1), (('others,', 'andra,'), 1)]


In [13]:
#8. Use reduce to count the number of occurrences of the word-translation-pairs.
zipped_pair_word_pair_reducer = zipped_pair_word_pair.reduceByKey(lambda a, b: a + b)
zipped_pair_word_pair_reducer_t = zipped_pair_word_pair_reducer.map(lambda x:(x[1],x[0]))

#9. Print some of the most frequently occurring pairs of words.
print(zipped_pair_word_pair_reducer_t.sortByKey(False).take(20))

#Do your translations seem reasonable? -> Translation seems reasonable most of the times.


[(1289, ('is', 'är')), (890, ('I', 'Jag')), (875, ('We', 'Vi')), (564, ('Question', 'Fråga')), (559, ('No', 'nr')), (545, ('and', 'och')), (517, ('by', 'från')), (513, ('a', 'en')), (469, ('not', 'inte')), (417, ('have', 'har')), (346, ('It', 'Det')), (339, ('a', 'ett')), (320, ('in', 'i')), (299, ('to', 'att')), (298, ('for', 'för')), (288, ('This', 'Detta')), (268, ('That', 'Det')), (267, ('must', 'måste')), (252, ('are', 'är')), (239, ('this', 'detta'))]
