## Initializing Spark

In [1]:
from pyspark import SparkConf, SparkContext

In [2]:
conf = SparkConf().setAppName("appName").setMaster("local")
sc = SparkContext()

22/10/26 18:08:26 WARN Utils: Your hostname, Alexs-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 10.156.3.142 instead (on interface en0)
22/10/26 18:08:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/26 18:08:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Loading data

In [3]:
quijote_rdd = sc.textFile("./data/quijote.text")

In [4]:
quijote_rdd.take(5)

                                                                                

['The Project Gutenberg EBook of Don Quijote, by Miguel de Cervantes Saavedra',
 '',
 'This eBook is for the use of anyone anywhere at no cost and with',
 'almost no restrictions whatsoever.  You may copy it, give it away or',
 're-use it under the terms of the Project Gutenberg License included']

# Word Count

## Create map function

In [5]:
def count_words(line):
    if len(line) == 0:
        return []
    else:
        word_list = line.strip().lower().split()
        word_dict = {}
        for word in word_list:
            if word in word_dict:
                word_dict[word] += 1
            else:
                word_dict[word] = 1
        return list(word_dict.items())

In [6]:
count_words('no otra alguna, podáis imprimir el dicho libro, intitulado El ingenioso')

[('no', 1),
 ('otra', 1),
 ('alguna,', 1),
 ('podáis', 1),
 ('imprimir', 1),
 ('el', 2),
 ('dicho', 1),
 ('libro,', 1),
 ('intitulado', 1),
 ('ingenioso', 1)]

## Testing the function

In [7]:
quijote_rdd.flatMap(count_words).collect()

[('the', 1),
 ('project', 1),
 ('gutenberg', 1),
 ('ebook', 1),
 ('of', 1),
 ('don', 1),
 ('quijote,', 1),
 ('by', 1),
 ('miguel', 1),
 ('de', 1),
 ('cervantes', 1),
 ('saavedra', 1),
 ('this', 1),
 ('ebook', 1),
 ('is', 1),
 ('for', 1),
 ('the', 1),
 ('use', 1),
 ('of', 1),
 ('anyone', 1),
 ('anywhere', 1),
 ('at', 1),
 ('no', 1),
 ('cost', 1),
 ('and', 1),
 ('with', 1),
 ('almost', 1),
 ('no', 1),
 ('restrictions', 1),
 ('whatsoever.', 1),
 ('you', 1),
 ('may', 1),
 ('copy', 1),
 ('it,', 1),
 ('give', 1),
 ('it', 1),
 ('away', 1),
 ('or', 1),
 ('re-use', 1),
 ('it', 1),
 ('under', 1),
 ('the', 2),
 ('terms', 1),
 ('of', 1),
 ('project', 1),
 ('gutenberg', 1),
 ('license', 1),
 ('included', 1),
 ('with', 1),
 ('this', 1),
 ('ebook', 1),
 ('or', 1),
 ('online', 1),
 ('at', 1),
 ('www.gutenberg.net', 1),
 ('title:', 1),
 ('don', 1),
 ('quijote', 1),
 ('author:', 1),
 ('miguel', 1),
 ('de', 1),
 ('cervantes', 1),
 ('saavedra', 1),
 ('posting', 1),
 ('date:', 1),
 ('april', 1),
 ('27,', 1

## Apply map reduce

In [8]:
word_count_rdd = quijote_rdd.flatMap(count_words).reduceByKey(lambda x,y: x+y)

In [9]:
word_count_rdd.take(10)

                                                                                

[('project', 84),
 ('gutenberg', 25),
 ('ebook', 8),
 ('of', 118),
 ('don', 2643),
 ('quijote,', 531),
 ('saavedra', 3),
 ('this', 47),
 ('is', 25),
 ('use', 14)]

# Max Word

In [10]:
word_count_rdd.reduce(lambda x, y: x if x[1] > y[1] else y)

('que', 19470)

# Inverted Index

In [11]:
quijote_rdd.zipWithIndex().take(10)

[('The Project Gutenberg EBook of Don Quijote, by Miguel de Cervantes Saavedra',
  0),
 ('', 1),
 ('This eBook is for the use of anyone anywhere at no cost and with', 2),
 ('almost no restrictions whatsoever.  You may copy it, give it away or', 3),
 ('re-use it under the terms of the Project Gutenberg License included', 4),
 ('with this eBook or online at www.gutenberg.net', 5),
 ('', 6),
 ('', 7),
 ('Title: Don Quijote', 8),
 ('', 9)]

In [12]:
def map_create_inverted_index(line):
    index = line[1]
    line = line[0]
    if len(line) == 0:
        return []
    else:
        word_list = list(set(line.strip().lower().split())) # Convert to set in order to remove duplicates
        tuple_word_index = [(word, [index]) for word in word_list]
        return tuple_word_index
    
def reduce_create_inverted_index(indices_of_w1, indices_of_w2):
    return indices_of_w1 + indices_of_w2
    

In [13]:
map_create_inverted_index(('The Project Gutenberg EBook of Don Quijote, by Miguel de Cervantes Saavedra',0))

[('don', [0]),
 ('of', [0]),
 ('de', [0]),
 ('the', [0]),
 ('miguel', [0]),
 ('cervantes', [0]),
 ('ebook', [0]),
 ('gutenberg', [0]),
 ('project', [0]),
 ('by', [0]),
 ('saavedra', [0]),
 ('quijote,', [0])]

In [14]:
inverted_index = quijote_rdd.zipWithIndex().flatMap(map_create_inverted_index) \
.reduceByKey(reduce_create_inverted_index)

In [15]:
inverted_index.take(5)

                                                                                

[('don',
  [0,
   8,
   18,
   36,
   124,
   155,
   176,
   202,
   225,
   339,
   351,
   391,
   427,
   446,
   479,
   484,
   506,
   519,
   527,
   546,
   565,
   603,
   610,
   650,
   661,
   732,
   739,
   755,
   776,
   816,
   873,
   877,
   887,
   907,
   924,
   930,
   944,
   949,
   960,
   969,
   988,
   999,
   1008,
   1014,
   1017,
   1055,
   1084,
   1086,
   1100,
   1109,
   1123,
   1127,
   1137,
   1140,
   1166,
   1175,
   1188,
   1193,
   1195,
   1198,
   1199,
   1203,
   1219,
   1254,
   1270,
   1276,
   1277,
   1285,
   1297,
   1305,
   1316,
   1321,
   1350,
   1355,
   1363,
   1376,
   1380,
   1387,
   1404,
   1424,
   1445,
   1495,
   1516,
   1530,
   1533,
   1540,
   1545,
   1553,
   1555,
   1590,
   1620,
   1623,
   1626,
   1704,
   1780,
   1802,
   1899,
   1920,
   1924,
   1932,
   1936,
   1952,
   1953,
   1974,
   1997,
   2002,
   2014,
   2029,
   2033,
   2039,
   2047,
   2054,
   2061,
   2071,
   2091,
   2

## Check lines of Max Word

In [16]:
import os
os.environ["PYTHONHASHSEED"]=str('myseed') # Set seed to maintain reproducibility accross clusters

In [22]:
inverted_index.lookup("que")

[[42,
  45,
  47,
  48,
  49,
  57,
  67,
  70,
  72,
  73,
  76,
  78,
  79,
  80,
  82,
  84,
  85,
  86,
  87,
  89,
  91,
  95,
  96,
  97,
  100,
  121,
  123,
  126,
  127,
  128,
  129,
  133,
  139,
  141,
  142,
  149,
  150,
  152,
  156,
  157,
  160,
  162,
  163,
  164,
  168,
  169,
  170,
  171,
  173,
  175,
  176,
  179,
  180,
  181,
  185,
  186,
  189,
  190,
  191,
  200,
  202,
  203,
  206,
  207,
  208,
  213,
  214,
  216,
  217,
  221,
  223,
  227,
  232,
  233,
  234,
  235,
  236,
  237,
  238,
  239,
  240,
  243,
  244,
  245,
  256,
  257,
  261,
  266,
  269,
  271,
  275,
  280,
  281,
  282,
  287,
  288,
  290,
  292,
  293,
  294,
  297,
  298,
  299,
  301,
  302,
  303,
  306,
  309,
  315,
  317,
  318,
  319,
  320,
  323,
  325,
  332,
  334,
  341,
  342,
  343,
  344,
  346,
  357,
  363,
  365,
  369,
  371,
  372,
  396,
  402,
  407,
  410,
  411,
  414,
  416,
  420,
  421,
  423,
  431,
  432,
  439,
  440,
  451,
  454,
  475,
  480,
  

In [26]:
len(inverted_index.lookup("que")[0])

15220