# A1

In [1]:
from pyspark.sql import SparkSession 
spark_session = SparkSession.builder \
   .master("spark://192.168.2.156:7077") \
   .appName("alexanderSundquist_A1") \
   .config("spark.dynamicAllocation.enabled", True) \
   .config("spark.dynamicAllocation.shuffleTracking.enabled", True) \
   .config("spark.shuffle.service.enabled", False) \
   .config("spark.dynamicAllocation.executorIdleTimeout", "30s") \
   .config("spark.cores.max", 2) \
   .config("spark.driver.port",9999)\
   .config("spark.blockManager.port",10005)\
   .getOrCreate() 

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/05 12:30:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/03/05 12:30:42 WARN Utils: Service 'sparkDriver' could not bind on port 9999. Attempting port 10000.
25/03/05 12:30:43 WARN StandaloneSchedulerBackend: Dynamic allocation enabled without spark.executor.cores explicitly set, you may get more executors allocated than expected. It's recommended to set spark.executor.cores explicitly. Please check SPARK-30299 for more details.


In [3]:
# RDD API
spark_context = spark_session.sparkContext
spark_context.setLogLevel("ERROR")

In [4]:
# Read Swedish and its English counterpart from HDFS
en_file = spark_context.textFile("hdfs://192.168.2.156:9000/data/europarl/europarl-v7.sv-en.en")
sv_file = spark_context.textFile("hdfs://192.168.2.156:9000/data/europarl/europarl-v7.sv-en.sv")

In [4]:
en_file.first()

                                                                                

'Resumption of the session'

In [5]:
sv_file.first()

                                                                                

'Återupptagande av sessionen'

In [6]:
print("Line count Swedish file", sv_file.count())
print("Line count English file", en_file.count())

                                                                                

Line count Swedish file 1862234




Line count English file 1862234


                                                                                

In [7]:
# Count number of partitions 
en_file.getNumPartitions()

2

In [8]:
sv_file.getNumPartitions()

3

# A2

In [9]:
first_line = sv_file.first()
first_line_lower = first_line.lower()
print(first_line_lower)

[Stage 4:>                                                          (0 + 1) / 1]

återupptagande av sessionen


                                                                                

In [5]:
import re
def remove_special_char(file):
    return file.map(lambda line: re.sub("[^A-Za-z0-9 -åäöÅÄÖ]+",'', line))

In [6]:
def lower_case(file):
    return file.map(lambda line: line.lower())

In [7]:
def tokenize(file):
    return file.map(lambda line: line.split(" "))

In [8]:
sv_preprocessed = remove_special_char(sv_file)
sv_preprocessed = lower_case(sv_preprocessed)
sv_preprocessed = tokenize(sv_preprocessed)

en_preprocessed = remove_special_char(en_file)
en_preprocessed = lower_case(en_preprocessed)
en_preprocessed = tokenize(en_preprocessed)

In [9]:
print("Swedish preprocessed: ", sv_preprocessed.take(10))
print("English preprocessed: ", en_preprocessed.take(10))

                                                                                

Swedish preprocessed:  [['återupptagande', 'av', 'sessionen'], ['jag', 'förklarar', 'europaparlamentets', 'session', 'återupptagen', 'efter', 'avbrottet', 'den', '17', 'december.', 'jag', 'vill', 'på', 'nytt', 'önska', 'er', 'ett', 'gott', 'nytt', 'år', 'och', 'jag', 'hoppas', 'att', 'ni', 'haft', 'en', 'trevlig', 'semester.'], ['som', 'ni', 'kunnat', 'konstatera', 'ägde', '"den', 'stora', 'år', '2000-buggen"', 'aldrig', 'rum.', 'däremot', 'har', 'invånarna', 'i', 'ett', 'antal', 'av', 'våra', 'medlemsländer', 'drabbats', 'av', 'naturkatastrofer', 'som', 'verkligen', 'varit', 'förskräckliga.'], ['ni', 'har', 'begärt', 'en', 'debatt', 'i', 'ämnet', 'under', 'sammanträdesperiodens', 'kommande', 'dagar.'], ['till', 'dess', 'vill', 'jag', 'att', 'vi,', 'som', 'ett', 'antal', 'kolleger', 'begärt,', 'håller', 'en', 'tyst', 'minut', 'för', 'offren', 'för', 'bl.a.', 'stormarna', 'i', 'de', 'länder', 'i', 'europeiska', 'unionen', 'som', 'drabbats.'], ['jag', 'ber', 'er', 'resa', 'er', 'för', 'e

[Stage 1:>                                                          (0 + 1) / 1]

English preprocessed:  [['resumption', 'of', 'the', 'session'], ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], ['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', 'requested,', 

                                                                                

In [14]:
print("Line count Swedish file", sv_preprocessed.count())
print("Line count English file", en_preprocessed.count())

                                                                                

Line count Swedish file 1862234


[Stage 8:>                                                          (0 + 2) / 2]

Line count English file 1862234


                                                                                

# A3

In [10]:
import re
def remove_special_char(file):
    return file.map(lambda line: re.sub("[^A-Za-z0-9 åäöÅÄÖ]+",'', line))

def lower_case(file):
    return file.map(lambda line: line.lower())

# Easier to use flatmap to break out of lists that are kept with map()
def tokenize(file):
    return file.flatMap(lambda line: line.split(" "))

sv_pre = remove_special_char(sv_file)
sv_pre = lower_case(sv_pre)
sv_pre = tokenize(sv_pre)

en_pre = remove_special_char(en_file)
en_pre = lower_case(en_pre)
en_pre = tokenize(en_pre)

In [23]:
# MapReduce to count occurences of each word

en_words = en_pre.map(lambda w: (w,1))
en_counts = en_words.reduceByKey(lambda a, b: a + b)

sv_words = sv_pre.map(lambda w: (w,1))
sv_counts = sv_words.reduceByKey(lambda a, b: a + b)

print(en_counts.take(5))
print(sv_counts.take(5))

                                                                                

[('resumption', 527), ('of', 1662006), ('i', 504521), ('declare', 1386), ('european', 270336)]




[('session', 771), ('efter', 42231), ('avbrottet', 293), ('december', 6019), ('vill', 131230)]


                                                                                

In [24]:
sv_words.take(5)

                                                                                

[('återupptagande', 1),
 ('av', 1),
 ('sessionen', 1),
 ('jag', 1),
 ('förklarar', 1)]

In [17]:
# Calculate 10 most reacurring words in sv file

sorted_sv_counts = sv_counts.sortBy(lambda w: w[1], ascending=False)
sorted_sv_counts.take(10)

                                                                                

[('att', 1709939),
 ('och', 1350379),
 ('i', 1054253),
 ('det', 952991),
 ('som', 917591),
 ('för', 915081),
 ('av', 740724),
 ('är', 701842),
 ('en', 636829),
 ('vi', 546072)]

In [18]:
# Calculate total words in sv file

from operator import add

sv_words = sv_file.map(lambda line: line.split(" "))

sv_word_counts = sv_words.map(lambda w: len(w))

sv_total_words = sv_word_counts.reduce(add)

print(f"total words= {sv_total_words}")  



total words= 41604741


                                                                                

In [19]:
# Calculate 10 most reacurring words in en file
sorted_en_counts = en_counts.sortBy(lambda w: w[1], ascending=False)
sorted_en_counts.take(10)

[('the', 3505175),
 ('of', 1662006),
 ('to', 1543746),
 ('and', 1318369),
 ('in', 1088902),
 ('that', 839083),
 ('is', 774941),
 ('a', 774545),
 ('for', 538192),
 ('we', 526488)]

In [20]:
# Calculate total words for en file
from operator import add

en_words = en_file.map(lambda line: line.split(" "))

en_word_counts = en_words.map(lambda w: len(w))

en_total_words = en_word_counts.reduce(add)

print(f"total words= {en_total_words}")  

[Stage 28:>                                                         (0 + 2) / 2]

total words= 45778381


                                                                                

# A4

## 1

In [11]:
sv_1 = sv_preprocessed.zipWithIndex()
en_1 = en_preprocessed.zipWithIndex()

                                                                                

## 2

In [12]:
sv_2 = sv_1.map(lambda x: (x[1], x[0]))
en_2 = en_1.map(lambda x: (x[1], x[0]))

## 3

In [17]:
sv_en_3 = sv_2.join(en_2)

In [14]:
sv_en_3.take(3)

                                                                                

[(6120,
  (['jag',
    'ser',
    'inget',
    'skäl',
    'till',
    'varför',
    'man',
    'inte',
    'skulle',
    'kunna',
    'organisera',
    'ett',
    'tillståndssystem',
    'för',
    'uppsamlingsplatser',
    'över',
    'hela',
    'europa,',
    'för',
    'att',
    'skrota',
    'de',
    '8',
    'till',
    '9',
    'miljoner',
    'fordon',
    'som',
    'man',
    'årligen',
    'gör',
    'sig',
    'av',
    'med',
    'inom',
    'europeiska',
    'unionen.'],
   ['i',
    'see',
    'no',
    'reason',
    'why',
    'licensing',
    'arrangements',
    'cannot',
    'be',
    'organised',
    'across',
    'europe',
    'for',
    'collection',
    'centres',
    'to',
    'scrap',
    'the',
    '8',
    'to',
    '9',
    'million',
    'vehicles',
    'which',
    'are',
    'disposed',
    'of',
    'within',
    'the',
    'european',
    'union',
    'on',
    'an',
    'annual',
    'basis.'])),
 (10775,
  (['i',
    'och',
    'med',
    'maastrich

## 4

In [18]:
sv_en_4 = sv_en_3.filter(lambda x: x[1][1] != [''] and x[1][0] != [''])

In [25]:
# Calculate total words in sv file

from operator import add

word_count = sv_en_4.map(lambda w: len(w))

total_words = word_count.reduce(add)

print(f"total words= {total_words}")  



total words= 3696846


                                                                                

In [26]:
print("Line count sv en removed empty lines", sv_en_4.count())




Line count sv en removed empty lines 1848423


                                                                                

In [27]:
print("Line count sv en not removed empty lines: ", sv_en_3.count())



Line count sv en not removed empty lines:  1862234


                                                                                

In [28]:
print(sv_en_3.count()-sv_en_4.count())



13811


                                                                                

In [29]:
sv_file.filter(lambda x: x == "").count()

                                                                                

2462

In [30]:
sv_preprocessed.filter(lambda x: x == ['']).count()

                                                                                

2462

In [31]:
en_file.filter(lambda x: x == "").count()

                                                                                

11349

In [32]:
en_preprocessed.filter(lambda x: x == ['']).count()

                                                                                

11349

In [33]:
sv_file.take(3)

                                                                                

['Återupptagande av sessionen',
 'Jag förklarar Europaparlamentets session återupptagen efter avbrottet den 17 december. Jag vill på nytt önska er ett gott nytt år och jag hoppas att ni haft en trevlig semester.',
 'Som ni kunnat konstatera ägde "den stora år 2000-buggen" aldrig rum. Däremot har invånarna i ett antal av våra medlemsländer drabbats av naturkatastrofer som verkligen varit förskräckliga.']

In [34]:
sv_preprocessed.take(3)

                                                                                

[['återupptagande', 'av', 'sessionen'],
 ['jag',
  'förklarar',
  'europaparlamentets',
  'session',
  'återupptagen',
  'efter',
  'avbrottet',
  'den',
  '17',
  'december.',
  'jag',
  'vill',
  'på',
  'nytt',
  'önska',
  'er',
  'ett',
  'gott',
  'nytt',
  'år',
  'och',
  'jag',
  'hoppas',
  'att',
  'ni',
  'haft',
  'en',
  'trevlig',
  'semester.'],
 ['som',
  'ni',
  'kunnat',
  'konstatera',
  'ägde',
  '"den',
  'stora',
  'år',
  '2000-buggen"',
  'aldrig',
  'rum.',
  'däremot',
  'har',
  'invånarna',
  'i',
  'ett',
  'antal',
  'av',
  'våra',
  'medlemsländer',
  'drabbats',
  'av',
  'naturkatastrofer',
  'som',
  'verkligen',
  'varit',
  'förskräckliga.']]

## 5

In [19]:
sv_en_5 = sv_en_4.filter(lambda x: len(x[1][1]) < 10 and len(x[1][0]) < 10) 

In [20]:
sv_en_5.count()

                                                                                

200796

## 6

In [21]:
sv_en_6 = sv_en_5.filter(lambda x: len(x[1][1]) == len(x[1][0])) 

In [22]:
print(f"Line count sv_en_6 {sv_en_6.count()}")



Line count sv_en_6 73844


                                                                                

## 7

In [37]:
sv_en_6.take(3)

                                                                                

[(45385,
  (['den', 'första', 'frågan', 'handlar', 'om', 'kommittförfarandet.'],
   ['the', 'first', 'question', 'refers', 'to', 'commitology.'])),
 (206300,
  (['omröstningen',
    'kommer',
    'att',
    'äga',
    'rum',
    'i',
    'morgon',
    'kl.',
    '11.00.'],
   ['the', 'vote', 'will', 'take', 'place', 'tomorrow', 'at', '11', 'a.m.'])),
 (269015,
  (['parlamentarisk',
    'kontroll',
    'och',
    'en',
    'medlagstiftandebefogenhet',
    'är',
    'också',
    'oumbärligt.'],
   ['central',
    'is',
    'therefore',
    'also',
    'parliamentary',
    'control',
    'and',
    'co-legislation.']))]

In [15]:
# Gives unworkable output, eventhough it works.
def zip_lists(list_1, list_2):
    translation_list = []
    for last, first in zip(list_1, list_2):
        pair_list = [last,first]
        translation_list.append(pair_list)
    return translation_list

In [25]:
sv_en_7 = sv_en_6.flatMap(lambda x: zip(x[1][0], x[1][1]))

In [26]:
sv_en_7.take(3)

                                                                                

[('jag', 'i'), ('hoppas', 'hope'), ('vi', 'we')]

## 8

In [29]:
# MapReduce to count occurences of each word pair
from operator import add

wp_map = sv_en_7.map(lambda wp: (wp,1))

wp_counts = wp_map.reduceByKey(add)

print(wp_counts.take(5))



[(('-', '-'), 643), (('betänkande:', 'report:'), 253), (('är', 'is'), 10040), (('den', 'the'), 1532), (('andra', 'second'), 260)]


                                                                                

In [31]:
# Calculate 10 most reacurring words in sv file

sorted_counts = wp_counts.sortBy(lambda w: w[1], ascending=False)


                                                                                