In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the SparkSession, registered under the application name "PART 1"
spark = (
    SparkSession.builder
    .appName("PART 1")
    .getOrCreate()
)



Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/21 01:30:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Read both text files
# `.en` is the English side and `.el` the Greek side of the same corpus.
# Later cells join the two RDDs on line position, so line i of one file is
# expected to be the translation of line i of the other.
english_rdd = spark.sparkContext.textFile("inputs/el-en/europarl-v7.el-en.en")
greek_rdd = spark.sparkContext.textFile("inputs/el-en/europarl-v7.el-en.el")



In [3]:
# Define useful functions

# This function takes as arguments an rdd and the language of this rdd
# and prints the number of lines and partitions 

def read_and_count_lines(rdd, language):
    """Print how many lines and how many partitions `rdd` has.

    Args:
        rdd: object exposing `count()` and `getNumPartitions()`.
        language: label inserted into the printed messages.

    Returns:
        None; the two figures are written to stdout.
    """
    # Evaluated left to right: the line count first, then the partition count.
    total_lines, total_partitions = rdd.count(), rdd.getNumPartitions()
    print(
        f'Number of lines for {language}= {total_lines}',
        f'Number of partitions for {language}= {total_partitions}',
        sep="\n",
    )
    
def preprocess(rdd):
    """Lowercase every line of `rdd` and split it into whitespace-separated tokens.

    Args:
        rdd: RDD whose elements are strings (one line of text each).

    Returns:
        A new RDD whose elements are lists of lowercase tokens; a blank
        line becomes an empty list.
    """
    # Lowercase first, then split on any run of whitespace.
    return rdd.map(lambda line: line.lower().split())

#### Counting the number of lines and partitions

In [4]:
# Question A1
# Report line and partition counts for each side of the corpus
for corpus, label in ((english_rdd, "english"), (greek_rdd, "greek")):
    read_and_count_lines(corpus, label)

                                                                                

Number of lines for english= 1235976
Number of partitions for english= 6




Number of lines for greek= 1235976
Number of partitions for greek= 12


                                                                                

#### Lowercasing and tokenizing the texts

In [5]:
# Apply the preprocessing to each of the RDDs
# NOTE: this rebinds both names from RDDs of raw strings to RDDs of token
# lists. preprocess calls line.lower(), which only works on strings, so this
# cell must not be run a second time without re-running the read cell first.
english_rdd = preprocess(english_rdd)
greek_rdd = preprocess(greek_rdd)

In [6]:
# Inspect entries
# NOTE: a notebook cell renders only its last expression, so the English
# sample is computed but never shown; only the Greek sample is displayed.
english_rdd.take(10)
greek_rdd.take(10)

[['επαvάληψη', 'της', 'συvσδoυ'],
 ['κηρύσσω',
  'την',
  'επανάληψη',
  'της',
  'συνόδου',
  'του',
  'ευρωπαϊκού',
  'κοινοβουλίου',
  'η',
  'οποία',
  'είχε',
  'διακοπεί',
  'την',
  'παρασκευή',
  '17',
  'δεκεμβρίου',
  'και',
  'σας',
  'απευθύνω',
  'ξανά',
  'τις',
  'θερμές',
  'ευχές',
  'μου,',
  'ελπίζοντας',
  'να',
  'περάσατε',
  'καλά',
  'στις',
  'διακοπές.'],
 ['όπως',
  'μπορέσατε',
  'να',
  'διαπιστώσετε,',
  'ο',
  'περίφημος',
  '"ιός',
  'του',
  'έτους',
  '2000"',
  'δεν',
  'εμφανίσθηκε.',
  'αντιθέτως,',
  'οι',
  'πολίτες',
  'ορισμένων',
  'χωρών',
  'μας',
  'υπήρξαν',
  'θύματα',
  'φυσικών',
  'καταστροφών,',
  'οι',
  'οποίες',
  'ήταν',
  'όντως',
  'φοβερές.'],
 ['επιθυμείτε',
  'μία',
  'συζήτηση',
  'επί',
  'του',
  'θέματος',
  'τις',
  'επόμενες',
  'ημέρες,',
  'κατά',
  'τη',
  'διάρκεια',
  'της',
  'τρέχουσας',
  'περιόδου',
  'συνόδου.'],
 ['επί',
  'του',
  'παρόντος',
  'θα',
  'ήθελα,',
  'όπως',
  'μου',
  'ζήτησαν',
  'ορισμένοι',


In [7]:
# Verify that line counts are matching after the preprocessing
english_line_count = english_rdd.count()
greek_line_count = greek_rdd.count()

# Fail loudly on a mismatch: the translation pairs below are built by joining
# the two corpora on line position, which is meaningless if they are not
# aligned. Reporting both counts makes the discrepancy easy to diagnose.
assert english_line_count == greek_line_count, (
    f"Line counts differ: english={english_line_count}, greek={greek_line_count}"
)
print("Line counts are matching")



Line counts are matching


                                                                                

#### Counting the 10 most frequent words

In [8]:
def word_count(rdd):
    """Count how often each word occurs in an RDD of tokenised sentences.

    Args:
        rdd: RDD whose elements are lists of words.

    Returns:
        A list of (word, count) tuples collected to the driver, in no
        particular order.
    """
    per_word_totals = (
        rdd.flatMap(lambda tokens: tokens)               # one element per word
        .map(lambda token: (token, 1))                   # map: emit (word, 1)
        .reduceByKey(lambda left, right: left + right)   # reduce: sum per word
    )
    return per_word_totals.collect()


    

In [9]:
print("For the English text, the top 10 words are: ")
word_counts = word_count(english_rdd)
# Most frequent first: sorting on the negated count gives descending order
word_counts_sorted = sorted(word_counts, key=lambda entry: -entry[1])



For the English text, the top 10 words are: 


                                                                                

In [10]:
# Show only the words (not their counts) of the ten most frequent entries
print([word for word, _count in word_counts_sorted[:10]])


['the', 'of', 'to', 'and', 'in', 'that', 'a', 'is', 'for', 'we']


In [11]:
print("For the Greek text, the top 10 words are: ")
word_counts = word_count(greek_rdd)
# Most frequent first: sorting on the negated count gives descending order
word_counts_sorted = sorted(word_counts, key=lambda entry: -entry[1])
# Show only the words (not their counts) of the ten most frequent entries
print([word for word, _count in word_counts_sorted[:10]])


For the Greek text, the top 10 words are: 


                                                                                

['να', 'και', 'της', 'την', 'το', 'η', 'για', 'των', 'του', 'που']


#### Finding translation pairs

In [54]:
# Keying RDD lines by number
# zipWithIndex pairs every element with its position: (tokens, line_number)
rdd_en_indexed = english_rdd.zipWithIndex()
rdd_el_indexed = greek_rdd.zipWithIndex()


                                                                                

In [55]:
# Peek at the first indexed English lines; each entry is (tokens, line_number)
rdd_en_indexed.take(5)

[(['resumption', 'of', 'the', 'session'], 0),
 (['i',
   'declare',
   'resumed',
   'the',
   'session',
   'of',
   'the',
   'european',
   'parliament',
   'adjourned',
   'on',
   'friday',
   '17',
   'december',
   '1999,',
   'and',
   'i',
   'would',
   'like',
   'once',
   'again',
   'to',
   'wish',
   'you',
   'a',
   'happy',
   'new',
   'year',
   'in',
   'the',
   'hope',
   'that',
   'you',
   'enjoyed',
   'a',
   'pleasant',
   'festive',
   'period.'],
  1),
 (['although,',
   'as',
   'you',
   'will',
   'have',
   'seen,',
   'the',
   'dreaded',
   "'millennium",
   "bug'",
   'failed',
   'to',
   'materialise,',
   'still',
   'the',
   'people',
   'in',
   'a',
   'number',
   'of',
   'countries',
   'suffered',
   'a',
   'series',
   'of',
   'natural',
   'disasters',
   'that',
   'truly',
   'were',
   'dreadful.'],
  2),
 (['you',
   'have',
   'requested',
   'a',
   'debate',
   'on',
   'this',
   'subject',
   'in',
   'the',
   'course',
  

In [56]:
# Swap (tokens, line_number) -> (line_number, tokens) so that the line number
# becomes the key used by the join
rdd_en_keyed, rdd_el_keyed = (
    indexed.map(lambda entry: (entry[1], entry[0]))
    for indexed in (rdd_en_indexed, rdd_el_indexed)
)


In [57]:
# Join the RDDs according to the line number key
# Each result is (line_number, (english_tokens, greek_tokens)). The joined
# RDD is not ordered by line number, so take(5) shows arbitrary lines.
joined_rdd = rdd_en_keyed.join(rdd_el_keyed)
joined_rdd.take(5)

                                                                                

[(1422,
  (['the',
    'situation',
    'as',
    'far',
    'as',
    'self-employed',
    'people',
    'are',
    'concerned',
    'is',
    'that',
    'current',
    'community',
    'invested',
    'legal',
    'rights',
    'do',
    'not',
    'afford',
    'third-country',
    'nationals',
    'the',
    'right',
    'to',
    'provide',
    'cross-border',
    'services.'],
   ['στην',
    'περίπτωση',
    'των',
    'αυτοαπασχολούμενων',
    'η',
    'κατάσταση',
    'έχει',
    'ως',
    'εξής:',
    'το',
    'κοινοτικό',
    'κεκτημένο',
    'στην',
    'σημερινή',
    'του',
    'μορφή',
    'δεν',
    'προβλέπει',
    'για',
    'τους',
    'υπηκόους',
    'τρίτων',
    'χωρών',
    'το',
    'δικαίωμα',
    'να',
    'παρέχουν',
    'διασυνοριακές',
    'υπηρεσίες.'])),
 (1926,
  (['i',
    'would',
    'like',
    'to',
    'remind',
    'the',
    'house',
    'that',
    'at',
    'second',
    'reading',
    'parliament',
    'adopted',
    'this',
    'report',
  

In [58]:
# Remove pairs with missing sentences
# Both sides are token lists (preprocess splits every line with str.split()),
# so a missing sentence is an empty list, not an empty string. Comparing a
# list against '' is always unequal and would let every pair through, so the
# check is on the token count instead.
filtered_rdd = joined_rdd.filter(lambda x: len(x[1][0]) > 0 and len(x[1][1]) > 0)
filtered_rdd.take(5)


                                                                                

[(1422,
  (['the',
    'situation',
    'as',
    'far',
    'as',
    'self-employed',
    'people',
    'are',
    'concerned',
    'is',
    'that',
    'current',
    'community',
    'invested',
    'legal',
    'rights',
    'do',
    'not',
    'afford',
    'third-country',
    'nationals',
    'the',
    'right',
    'to',
    'provide',
    'cross-border',
    'services.'],
   ['στην',
    'περίπτωση',
    'των',
    'αυτοαπασχολούμενων',
    'η',
    'κατάσταση',
    'έχει',
    'ως',
    'εξής:',
    'το',
    'κοινοτικό',
    'κεκτημένο',
    'στην',
    'σημερινή',
    'του',
    'μορφή',
    'δεν',
    'προβλέπει',
    'για',
    'τους',
    'υπηκόους',
    'τρίτων',
    'χωρών',
    'το',
    'δικαίωμα',
    'να',
    'παρέχουν',
    'διασυνοριακές',
    'υπηρεσίες.'])),
 (1926,
  (['i',
    'would',
    'like',
    'to',
    'remind',
    'the',
    'house',
    'that',
    'at',
    'second',
    'reading',
    'parliament',
    'adopted',
    'this',
    'report',
  

In [59]:
# Keep only sentence pairs where BOTH sides have strictly more than
# `min_words` and strictly fewer than `max_words` tokens
min_words = 2
max_words = 5
filtered_rdd = filtered_rdd.filter(
    lambda x: min_words < len(x[1][0]) < max_words
    and min_words < len(x[1][1]) < max_words
)

filtered_rdd.count()

                                                                                

13742

In [60]:
# Leave only pairs of sentences with the same number of words in each
# (words are paired by position afterwards, so both sides need equal length)

filtered_rdd = filtered_rdd.filter(lambda x: len(x[1][0]) == len(x[1][1]))
filtered_rdd.take(5)

                                                                                

[(114660,
  (['decision', 'on', 'urgent', 'procedure'],
   ['απόφαση', 'επί', 'του', 'κατεπείγοντος'])),
 (180072,
  (['this', 'has', 'to', 'change!'], ['αυτό', 'πρέπει', 'να', 'αλλάξει!'])),
 (142956,
  (['bösch', 'report', '(a5-0393/2001)'],
   ['έκθεση', 'bφsch', '(a5-0393/2001)'])),
 (28800,
  (['papayannakis', 'report', '(a5-0088/2000)'],
   ['έκθεση', 'παπαγιαννάκη', '(α5-0088/2000)'])),
 (142578,
  (['watson', 'report', '(a5-0397/2001)'],
   ['έκθεση', 'watson', '(a5-0397/2001)']))]

In [61]:
# Within each sentence pair, match the i-th English word with the i-th Greek word.
# The pairing is purely positional, so it is only correct when both
# sentences use the same word order.
# NOTE: this rebinds `filtered_rdd` from (line_number, (en_tokens, el_tokens))
# records to lists of (en_word, el_word) tuples. Re-running the cell on its
# own output would not fail; it would silently zip the characters of two words.
filtered_rdd = filtered_rdd.map(lambda x: list(zip(x[1][0], x[1][1])))
filtered_rdd.take(50)
                                   

                                                                                

[[('decision', 'απόφαση'),
  ('on', 'επί'),
  ('urgent', 'του'),
  ('procedure', 'κατεπείγοντος')],
 [('this', 'αυτό'), ('has', 'πρέπει'), ('to', 'να'), ('change!', 'αλλάξει!')],
 [('bösch', 'έκθεση'),
  ('report', 'bφsch'),
  ('(a5-0393/2001)', '(a5-0393/2001)')],
 [('papayannakis', 'έκθεση'),
  ('report', 'παπαγιαννάκη'),
  ('(a5-0088/2000)', '(α5-0088/2000)')],
 [('watson', 'έκθεση'),
  ('report', 'watson'),
  ('(a5-0397/2001)', '(a5-0397/2001)')],
 [('report', 'έκθεση'),
  ('karas', 'karas'),
  ('(a5-0031/2002)', '(a5-0031/2002)')],
 [('contradictions?', 'αντιφάσεις;'), ('surely', 'μάλλον'), ('not!', 'όχι!')],
 [('international', 'διεθνές'),
  ('criminal', 'ποινικό'),
  ('court', 'δικαστήριο')],
 [('that', 'αυτό'),
  ('is', 'είναι'),
  ('totally', 'παντελώς'),
  ('incorrect.', 'ανακριβές.')],
 [('thank', 'ευχαριστώ,'),
  ('you,', 'επίτροπε'),
  ('commissioner', 'antonio'),
  ('vitorino.', 'vitorino.')],
 [('thank', 'ευχαριστώ'),
  ('you,', 'πολύ,'),
  ('madam', 'κυρία'),
  ('commis

In [62]:
# Count the number of occurrences of the word-translation-pairs.

# Mapping phase: flatten the per-sentence lists into individual
# (english_word, greek_word) pairs and emit ((english_word, greek_word), 1).
# Reduce phase: sum the ones per pair. One reduceByKey is enough: afterwards
# every key is unique, so a second pass cannot change the result.
# NOTE: this rebinds `filtered_rdd` from per-sentence pair lists to per-pair
# counts, so do not re-run this cell without re-running the cells above.
filtered_rdd = (
    filtered_rdd
    .flatMap(lambda sentence: sentence)
    .map(lambda pair: (pair, 1))
    .reduceByKey(lambda x, y: x + y)
)

# Sort the counts for word pairs in descending order
sorted_counts = filtered_rdd.sortBy(lambda x: x[1], ascending=False)




                                                                                

In [64]:
# Number of distinct (english_word, greek_word) pairs
sorted_counts.count()

8944

In [65]:
# Fetch top 30 pairs
# sorted_counts is ordered by descending count, so the first 30 entries are
# the most frequent pairs.
top_30_pairs = sorted_counts.take(30)
top_30_pairs

[(('is', 'είναι'), 814),
 (('that', 'αυτό'), 470),
 (('-', '-'), 443),
 (('this', 'αυτό'), 361),
 (('mr', 'κύριε'), 327),
 (('thank', 'σας'), 326),
 (('of', 'των'), 297),
 (('thank', 'ευχαριστώ'), 207),
 (('report:', 'έκθεση'), 204),
 (('you,', 'πολύ,'), 193),
 (('you,', 'ευχαριστώ,'), 191),
 (('written', 'γραπτές'), 181),
 (('142)', '142)'), 162),
 (('statements', 'δηλώσεις'), 159),
 (('the', 'συνοπτικών'), 157),
 (('minutes', 'πρακτικών'), 154),
 (('(rule', '(άρθρο'), 153),
 (('and', 'και'), 150),
 (('approval', 'έγκριση'), 135),
 (('the', 'το'), 125),
 (('the', 'η'), 118),
 (('mrs', 'κυρία'), 110),
 (('of', 'της'), 105),
 (('unacceptable.', 'απαράδεκτο.'), 105),
 (('important.', 'σημαντικό.'), 96),
 (('order', 'διάταξη'), 92),
 (('business', 'εργασιών'), 91),
 (('subject:', 'θέμα:'), 90),
 (('minutes', 'τα'), 90),
 (('were', 'συνοπτικά'), 90)]

### Results

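Most of the extracted translation pairs are accurate (e.g. `is` → `είναι`, `and` → `και`). However, the pairing is purely positional (the i-th English word is matched with the i-th Greek word), and Greek syntax and word order differ from English, so some of the pairs are inaccurate (e.g. `thank` → `σας`, `the` → `συνοπτικών`).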