In [1]:
from pyspark.sql import SparkSession


In [2]:
# Settings applied to the session builder, in addition to the master URL and
# the application name.
# NOTE(review): the driver log for this cell warns that dynamic allocation is
# enabled without spark.executor.cores being set explicitly (SPARK-30299).
# Consider pinning it once the intended executor size is confirmed.
spark_options = {
    "spark.dynamicAllocation.enabled": True,
    "spark.dynamicAllocation.shuffleTracking.enabled": True,
    "spark.shuffle.service.enabled": False,
    "spark.dynamicAllocation.executorIdleTimeout": "30s",
    "spark.cores.max": 2,
}

builder = SparkSession.builder.master("spark://192.168.2.156:7077").appName(
    "Alexnader_partA"
)
for option_name, option_value in spark_options.items():
    builder = builder.config(option_name, option_value)

# Reuses an already-running session if there is one, otherwise starts a new one.
spark_session = builder.getOrCreate()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/11 20:26:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/03/11 20:26:17 WARN StandaloneSchedulerBackend: Dynamic allocation enabled without spark.executor.cores explicitly set, you may get more executors allocated than expected. It's recommended to set spark.executor.cores explicitly. Please check SPARK-30299 for more details.


In [3]:
# The SparkContext behind the session gives access to the RDD API.
sc = spark_session.sparkContext

# Read the two europarl-v7.sv-en files from HDFS, one RDD of text lines each:
# the ".en" file first, the ".sv" file second.
rdd_en, rdd_other = (
    sc.textFile("hdfs://192.168.2.156:9000/data/europarl/europarl-v7.sv-en.en"),
    sc.textFile("hdfs://192.168.2.156:9000/data/europarl/europarl-v7.sv-en.sv"),
)

In [4]:

# Count the lines in each language file.
count_en = rdd_en.count()
count_other = rdd_other.count()
print("English lines:", count_en)
print("Other language lines:", count_other)

# Further down, the two files are paired line-by-line through their positional
# index. That pairing is only meaningful when both files have the same number
# of lines, so fail loudly here instead of silently producing shifted pairs.
assert count_en == count_other, (
    f"Parallel corpus is misaligned: {count_en} English lines vs "
    f"{count_other} lines in the other language"
)




English lines: 1862234
Other language lines: 1862234


                                                                                

In [5]:
# Report how many partitions each input RDD was split into.
for label, rdd in (
    ("Partitions (English):", rdd_en),
    ("Partitions (Other):", rdd_other),
):
    print(label, rdd.getNumPartitions())

Partitions (English): 2
Partitions (Other): 3


In [6]:
def preprocess_line(line):
    """Lowercase a line of text and split it into tokens on whitespace.

    Punctuation is left attached to the neighbouring token. An empty or
    whitespace-only line yields an empty list.
    """
    return line.lower().split()

In [7]:
# Turn every line of both corpora into a list of lowercase tokens.
rdd_en_clean, rdd_other_clean = (
    rdd.map(preprocess_line) for rdd in (rdd_en, rdd_other)
)

In [8]:
# Show the first five tokenised lines from each language as a sanity check.
for label, rdd_clean in (
    ("English sample:", rdd_en_clean),
    ("Other language sample:", rdd_other_clean),
):
    print(label, rdd_clean.take(5))

English sample: [['resumption', 'of', 'the', 'session'], ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], ['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', 'requested,', 'on', '

[Stage 3:>                                                          (0 + 1) / 1]

Other language sample: [['återupptagande', 'av', 'sessionen'], ['jag', 'förklarar', 'europaparlamentets', 'session', 'återupptagen', 'efter', 'avbrottet', 'den', '17', 'december.', 'jag', 'vill', 'på', 'nytt', 'önska', 'er', 'ett', 'gott', 'nytt', 'år', 'och', 'jag', 'hoppas', 'att', 'ni', 'haft', 'en', 'trevlig', 'semester.'], ['som', 'ni', 'kunnat', 'konstatera', 'ägde', '"den', 'stora', 'år', '2000-buggen"', 'aldrig', 'rum.', 'däremot', 'har', 'invånarna', 'i', 'ett', 'antal', 'av', 'våra', 'medlemsländer', 'drabbats', 'av', 'naturkatastrofer', 'som', 'verkligen', 'varit', 'förskräckliga.'], ['ni', 'har', 'begärt', 'en', 'debatt', 'i', 'ämnet', 'under', 'sammanträdesperiodens', 'kommande', 'dagar.'], ['till', 'dess', 'vill', 'jag', 'att', 'vi,', 'som', 'ett', 'antal', 'kolleger', 'begärt,', 'håller', 'en', 'tyst', 'minut', 'för', 'offren', 'för', 'bl.a.', 'stormarna', 'i', 'de', 'länder', 'i', 'europeiska', 'unionen', 'som', 'drabbats.']]


                                                                                

In [9]:
# Emit one (token, 1) pair for every token of every English line, then add up
# the ones per token to get the number of occurrences of each token.
word_counts_en = rdd_en_clean.flatMap(
    lambda tokens: [(token, 1) for token in tokens]
).reduceByKey(lambda left, right: left + right)


In [10]:
# Order by negated count so that the ten most frequent tokens come first.
top_10_en = word_counts_en.takeOrdered(
    10,
    key=lambda word_and_count: -word_and_count[1],
)
print("Top 10 words (English):", top_10_en)

[Stage 4:>                                                          (0 + 2) / 2]

Top 10 words (English): [('the', 3498574), ('of', 1659884), ('to', 1539823), ('and', 1288620), ('in', 1086089), ('that', 797576), ('a', 773812), ('is', 758087), ('for', 534270), ('we', 522879)]


                                                                                

In [11]:
# zipWithIndex yields (tokens, line_index); swap the two so that each record is
# (line_index, tokens) and the line index can be used as the key.
rdd_en_indexed, rdd_other_indexed = (
    rdd_clean.zipWithIndex().map(
        lambda tokens_and_index: (tokens_and_index[1], tokens_and_index[0])
    )
    for rdd_clean in (rdd_en_clean, rdd_other_clean)
)


                                                                                

In [12]:
# Pair up the two languages on the line index:
# each record becomes (line_index, (english_tokens, other_tokens)).
joined_rdd = rdd_en_indexed.join(rdd_other_indexed)


In [13]:
# Keep only the line pairs in which both sides contain at least one token.
joined_rdd_nonempty = joined_rdd.filter(
    lambda record: all(len(tokens) > 0 for tokens in record[1])
)

In [14]:
# Keep only short sentences: fewer than 10 tokens on both sides, i.e. the
# longer of the two sides must still be below 10.
joined_rdd_short = joined_rdd_nonempty.filter(
    lambda record: max(len(record[1][0]), len(record[1][1])) < 10
)

In [15]:
# Keep only the pairs whose two sides have exactly the same number of tokens,
# so that tokens can be matched up position by position.
joined_rdd_same_length = joined_rdd_short.filter(
    lambda record: len(record[1][1]) == len(record[1][0])
)

In [16]:
# Match the tokens of the two sides position by position and flatten the result.
# Example record:  (index, (["hello", "world"], ["hej", "världen"]))
# produces:        ("hello", "hej"), ("world", "världen")
word_pairs_rdd = joined_rdd_same_length.flatMap(
    lambda record: zip(*record[1])
)


In [17]:
# Count how many times each (english_token, other_token) pair occurs.
pair_counts = word_pairs_rdd.map(
    lambda word_pair: (word_pair, 1)
).reduceByKey(lambda total, increment: total + increment)


In [18]:
# Order by negated count so that the 20 most frequent pairs come first.
top_word_pairs = pair_counts.takeOrdered(
    20,
    key=lambda pair_and_count: -pair_and_count[1],
)
print("Most common word-translation pairs:", top_word_pairs)




Most common word-translation pairs: [(('is', 'är'), 10070), (('we', 'vi'), 5539), (('i', 'jag'), 5040), (('this', 'detta'), 3257), (('closed.', 'avslutad.'), 2980), (('and', 'och'), 2926), (('a', 'en'), 2892), (('it', 'det'), 2868), (('that', 'det'), 2807), (('not', 'inte'), 2652), (('(applause)', '(applåder)'), 2548), (('.', '.'), 2158), (('have', 'har'), 1981), (('will', 'att'), 1963), (('in', 'i'), 1932), (('a', 'ett'), 1877), (('the', 'omröstningen'), 1820), (('are', 'är'), 1787), (('vote', 'kommer'), 1769), (('there', 'det'), 1675)]


                                                                                

In [19]:
# Shut the session down now that the analysis is finished.
spark_session.stop()
