***Important*** DO NOT CLEAR THE OUTPUT OF THIS NOTEBOOK AFTER EXECUTION!!!

In [1]:
# if the following command generates an error, you probably didn't enable 
# the cluster security option "Allow API access to all Google Cloud services"
# under Manage Security → Project Access when setting up the cluster
!gcloud dataproc clusters list --region us-central1

NAME          PLATFORM  WORKER_COUNT  PREEMPTIBLE_WORKER_COUNT  STATUS   ZONE           SCHEDULED_DELETE
cluster-f1c0  GCE       3                                       RUNNING  us-central1-a


# Imports & Setup

In [2]:
!pip install -q google-cloud-storage==1.43.0
!pip install -q graphframes

[0m

In [3]:
import pyspark
import sys
from collections import Counter, OrderedDict, defaultdict
import itertools
from itertools import islice, count, groupby
import pandas as pd
import os
import re
from operator import itemgetter
import nltk
from nltk.stem.porter import *
from nltk.corpus import stopwords
from time import time
from pathlib import Path
import pickle
import pandas as pd
from google.cloud import storage

import hashlib
def _hash(s):
    return hashlib.blake2b(bytes(s, encoding='utf8'), digest_size=5).hexdigest()

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# if nothing prints here you forgot to include the initialization script when starting the cluster
!ls -l /usr/lib/spark/jars/graph*

-rw-r--r-- 1 root root 247882 Dec 22 18:53 /usr/lib/spark/jars/graphframes-0.8.2-spark3.1-s_2.12.jar


In [5]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf, SparkFiles
from pyspark.sql import SQLContext
from graphframes import *

In [6]:
spark

In [7]:
# Put your bucket name below and make sure you can access it without an error
bucket_name = 'wikidata_208373274' 
full_path = f"gs://{bucket_name}/"
paths=[]

client = storage.Client()
blobs = client.list_blobs(bucket_name)
for b in blobs:
    if b.name != 'graphframes.sh':
        paths.append(full_path+b.name)

***GCP setup is complete!*** If you got here without any errors you've earned 10 out of the 35 points of this part.

# Reading the parquet file of the data

Here, we read the entire corpus to an rdd, directly from Google Storage Bucket and use your code from Colab to construct an inverted index.

In [8]:
parquetFile = spark.read.parquet(*paths)

                                                                                

We will count the number of pages to make sure we are looking at the entire corpus. The number of pages should be more than 6M

In [9]:
# Count number of wiki pages
parquetFile.count()

                                                                                

6348910

Let's import the inverted index module. Note that you need to use the staff-provided version called `inverted_index_gcp.py`, which contains helper functions to writing and reading the posting files similar to the Colab version, but with writing done to a Google Cloud Storage bucket.

In [10]:
parquetFile.show()



+-------+--------------------+--------------------+--------------------+
|     id|               title|                text|         anchor_text|
+-------+--------------------+--------------------+--------------------+
|4045403|Foster Air Force ...|'''Foster Air For...|[{1176764, Tactic...|
|4045413|     Torino Palavela|'''Palavela''', f...|[{77743, 2006 Win...|
|4045419|   Mad About the Boy|"'''Mad About the...|[{34028256, Joyce...|
|4045426|       Shayne Breuer|'''Shayne Breuer'...|[{1838386, Woodvi...|
|4045432|         Parantaka I|'''Parantaka Chol...|[{1511716, Aditya...|
|4045456|Arundel (UK Parli...|'''Arundel''' was...|[{4665376, Arunde...|
|4045466|     Andrew Martinez|'''Luis Andrew Ma...|[{4860, Berkeley,...|
|4045471|    Vancouver VooDoo|The '''Vancouver ...|[{32706, Vancouve...|
|4045479|     Invisible plane|The '''Invisible ...|[{2260539, Ross A...|
|4045516|    Shopping channel|'''Shopping chann...|[{592899, special...|
|4045519|      Turgay (river)|The '''Turgay''' ...|

                                                                                

# Building a DataFrame of the relevant documents

In [11]:
relevant_queries_to_doc_ids_dict = {"best marvel movie": [60283633, 61073786, 5676692, 56289553, 60774345, 27306717, 61592102, 42163310, 60952488, 36439749, 48530084, 10589717, 29129051, 59892, 612052, 44254295, 878659, 54653881, 51430647, 66111204, 22114132, 55935213, 41677925, 17296107, 61651800, 9110929, 67055, 37497391, 60744481, 65464184, 41974496, 60616450, 60463979, 65967176, 57069491, 46208997, 22144990, 62372638, 1074657, 44240443, 33463661, 41974555, 43603241, 33700618, 5027882, 66423851, 60754840], "How do kids come to world?": [15474, 1357127, 636806, 43033258, 6271835, 56480301, 23133297, 615418, 73165, 24470328, 1833777, 1380383, 79449, 4827661, 387703, 18863597, 36827305, 494299, 194687, 5591344, 48490547, 634139, 42072639, 44311171, 29384326, 1908019, 296627, 11263877, 101942, 2045465, 56921904, 128987, 22888933, 1072968, 25490788, 83449, 884998, 1151454, 30640885, 35072597, 2535885, 30861, 51046955, 13603, 3060346, 88380, 19698110, 72214, 6236554, 46105], "Information retrieval": [1897206, 10179411, 25130414, 5818361, 1185840, 20948989, 48317971, 509628, 494528, 11486091, 50716473, 24963841, 296950, 35804330, 261193, 15271, 39000674, 19988623, 38156944, 36794719, 731640, 14109784, 10328235, 25935906, 16635934, 33407925, 743971, 3781784, 14343887, 57312392, 24997830, 442684, 7872152, 14473878, 25959000, 9511414], "LinkedIn": [3591502, 55679006, 970755, 36070366, 63641225, 41726116, 51562019, 35549457, 21179478, 62976368, 27769500, 57147095, 31403505, 22291643, 50191962], "How to make coffee?": [4506407, 321546, 37249793, 17668101, 26731675, 6887661, 1566948, 5612891, 211895, 68117784, 4604645, 47660, 3757402, 273707, 8866584, 5964683, 49099835, 28890200, 53151326, 300805, 1623162, 3775558, 273700, 667037, 5212064, 6826364, 63534797, 54459918, 604727, 30860428, 2461806, 3639440, 2929216, 12343966, 408360, 63520964, 838057, 6332026, 19619306, 215424, 482824, 38579961, 8728856, 2165666, 3785715, 366244, 1646753, 31824340], "Ritalin": [649100, 8802530, 6428730, 608718, 13594085, 66391, 25164479, 24754461, 22611786, 964614, 7432624, 5721484, 57068567, 1333695, 4387617, 463961, 23891416, 56961277, 47956615, 4726434, 52780757, 50762105, 40542151, 1186041, 10671710, 7594242, 57762, 2580091, 159284, 2495940, 6281833, 45690249, 1546447, 32325617, 205878, 1790029, 5497377], "How to make wine at home?": [373172, 3602925, 20790067, 223834, 15468138, 3398365, 61014433, 19600890, 927688, 146918, 22216378, 1417287, 13824744, 57098, 3276784, 466664, 41337483, 1031040, 36029170, 29324283, 26924822, 31505523, 13532634, 4378282, 1045027, 1455948, 14825456, 485220, 37468361, 1041458, 8177057, 2866516, 31704630, 21991369, 4554556, 713636, 8608425, 20810258, 22777652, 1039412, 32961, 8778890, 683094, 19561784, 6032951, 10998, 5222577, 7414829, 20185928, 8318345], "Most expensive city in the world": [33508970, 3602421, 94167, 24724090, 30057, 220886, 31453, 19058, 31326350, 32706, 645042, 3928523, 18402, 34374079, 522934, 13476079, 2376810, 36511, 172538, 15218891, 390875, 22309, 12521, 65708464, 1664254, 35368654, 19004, 309890, 27862, 27318, 45470, 10992, 53446, 19261, 19189, 3848717, 11947794, 49749249, 7780, 14900757, 9299090, 26976, 49728, 63946361, 302201], "India": [141896, 14745, 24452, 265059, 14597, 13890, 42737, 2377, 1186115, 6825785, 26457880, 1472206, 17359901, 37756, 53707, 315776, 4208015, 295335, 14598, 1996872, 764545, 1108803, 3574003, 678583, 7564733, 37534, 2198463, 720414, 6622547, 1683930, 231623, 17774253, 14533, 19189, 275047, 20611562, 43281, 17719886, 10710364, 5864614, 3315459, 14580, 47905, 3799826, 553883, 375986, 408215], "how to make money fast?": [67987778, 12789839, 5624681, 44379765, 400777, 47720307, 45332, 1531043, 48732, 7322279, 51895777, 65228, 60739751, 21175589, 846772, 9833167, 22226313, 63809606, 35666788, 1527716, 4416646, 23830729, 264058, 32595633, 1335238, 12020461, 1793651, 1370831, 63121, 2913859, 42994, 4090453, 17418777, 5145001, 43250171, 8957449, 43030666, 473309, 624998, 7555986, 22156522, 13681, 29681566, 17362858, 19390, 407288, 1276547, 2763667], "Netflix": [65595607, 34075129, 50602056, 65741484, 32670973, 61972257, 66174045, 47048067, 49016960, 63732884, 175537, 56312051, 65073808, 59629338, 54671372, 56312054, 50276542, 57041239, 66422422, 67450679, 66299065, 9399111, 50137861, 40030145], "Apple computer": [254496, 50865995, 5285468, 5653238, 3356874, 345676, 2275, 4478297, 2593693, 3608414, 18640, 248101, 15183570, 20647724, 1159939, 17826747, 619983, 856, 46728817, 2116, 1492625, 77118, 32327247, 15357987, 400593, 17997437, 1005263, 345354, 2020710, 660310, 1344, 19006979, 15295713, 2786155, 2117, 21694, 233780, 5078775, 73262, 21347643, 27848, 548115], "The Simpsons": [19293758, 1424178, 74813, 1625137, 34519668, 4939408, 11028525, 49387265, 4939471, 292279, 60534017, 9306179, 33350134, 4939519, 1466966, 4939306, 4939444, 140332, 4939501, 29838, 5451605, 19266557, 3038969, 14040227, 4939334, 188572, 10765975, 22423628, 4776930], "World cup": [32516422, 42931572, 2996777, 33727, 183628, 60637832, 8821389, 16842834, 22230053, 1166428, 29868391, 64467696, 4743361, 13327177, 61269058, 26814387, 62528055, 10822574, 3482503, 36581929, 8258172, 16966712, 39302261, 244862, 67608822, 1853149, 39812824, 55490096, 2150801, 8734046, 32352129, 16383, 59707, 19537336, 3556431, 17742072, 11370, 656933, 168079, 41648358, 4723188, 1248592], "How to lose weight?": [400199, 1151047, 791546, 67730903, 27300359, 84252, 26639763, 8581665, 1148926, 64543917, 6319249, 2029766, 56885915, 11665493, 1958879, 28396636, 56435, 2883760, 31429041, 32051848, 277790, 11884255, 49051658, 1017976, 42528947, 1149933, 65004286, 4748844, 44442017, 35281209, 40925771, 30687447, 11249433, 45280337, 17659030, 8460, 3549164, 727293, 28541957, 12523816, 33825347, 18168862, 9972157, 410007, 27148738], "Java": [1179384, 17521476, 5516020, 5863400, 15628, 4093054, 135063, 663788, 9845, 1455590, 3901428, 731735, 1079500, 24920873, 11125049, 7955681, 38321273, 456722, 15881, 16389, 26257672, 43284, 651278, 127604, 43826, 314356, 53078721, 611589, 1131136, 230828, 417018, 42870, 69336, 4718446, 1414212, 7811267, 42871, 40659966, 13593, 1326984, 453584, 320443, 30120784, 7771171, 269441, 4294832], "Air Jordan": [3647739, 3890370, 6722408, 105344, 18998781, 1371219, 60601430, 7851893, 28155315, 1394509, 4253801, 36916362, 265033, 23353937, 13365219, 20455, 3097723, 50066979, 51546226, 2310146, 67838974, 9998569, 62741501, 58209447], "how to deal with depression?": [2721889, 13190302, 63499429, 16360289, 39218436, 33310173, 2367697, 57688, 20529621, 4041101, 49233423, 2685269, 840273, 25258288, 43600438, 60611538, 19283335, 18550003, 33255495, 19356, 60457349, 2891701, 66811, 34753948, 43875835, 42730418, 717119, 1295947, 18176448, 2353519, 1879108, 14325087, 3440273, 175357, 16407460, 3762294, 4531, 19064282, 52316, 8389, 255475, 341658, 20448627, 22481627, 21211994, 5144613, 30846934, 1500618, 234796], "How do you make gold": [323246, 5580137, 1686492, 1385632, 23290471, 6890967, 15739, 39740796, 62929, 1020809, 251087, 6109962, 6996576, 402244, 2015573, 20063724, 1230653, 180211, 7133952, 23324, 12240, 1291393, 3519942, 12095348, 44712684, 27119, 886856, 18300514, 25918508, 37412, 2526649, 39639653, 390698, 1356272, 10865561, 1386629, 5024105, 3706246, 67110306, 2732267, 15457257, 56226, 19074264, 63280480, 1581831, 45756, 2927992, 27345986, 152176], "Marijuana": [60920, 52227830, 22707918, 4512923, 68188835, 28985374, 31188467, 52184272, 52209782, 27202445, 20481920, 1481886, 19920359, 2331004, 19357, 44975261, 145891, 28572685, 20566488, 37646421, 383537, 20866399, 53836251, 150113, 53871120, 19760623, 3045683, 8596369, 1227367, 168917, 14942276, 48640150, 52342272, 52356241, 56078060, 38310, 175440, 53897655, 52228042, 52183794, 11164587, 168915, 48920848, 47227709, 23154203, 184488], "How to make hummus": [2322115, 9513043, 3260137, 7329519, 3736012, 8559295, 164311, 13607, 682549, 7489122, 289691, 24230253, 22736969, 49643204, 52682605, 3841447, 3099917, 47863605, 5033181, 1626287, 64051004, 42947658, 11287682, 11447140, 23619350, 5334377, 2243880, 1039663, 53350936, 3508935, 62166289, 20657443, 56494240, 4925720, 38936168, 82789, 48876576, 75065, 607255, 11577897, 453166, 57146, 14320, 2578570, 3548013], "Winter": [1511596, 52709838, 1673945, 17349106, 8521120, 3060382, 16615604, 8438818, 8351234, 979072, 64928991, 36439749, 1817908, 1971153, 30276826, 4538366, 1298502, 22933429, 6511088, 38950, 19431459, 1221144, 19938267, 1843684, 34069, 6201653, 33672235, 3548574, 1372169, 65601132, 38416091, 1088531, 2020857, 316711, 34061, 9825536, 22190045, 43343961, 1632099, 962053, 961505], "Rick and Morty": [54046846, 49170369, 55339286, 62417830, 54251265, 41283158, 65819511, 49029294, 67520032, 49134382, 42311608, 63656330, 52261594, 41185040, 43794572, 64413225, 49131135, 49127974, 43794574, 63656361, 49128142, 41699729, 51759111, 47762921, 55708102, 63656365, 55339303, 57390230, 67830379, 61805032, 26091326, 54802759], "Natural Language processing": [252008, 18784729, 42799166, 57932194, 18863997, 13805160, 27857167, 4561188, 43771647, 61603971, 1661566, 43561218, 1936537, 11147298, 27837170, 6650456, 563439, 5561, 10235, 360030, 64695824, 32472154, 2891758, 21173, 56142183, 21652, 40573, 14003441, 60360004, 20892159, 32707853, 67147, 98778, 37764426, 301999, 53358397, 36323189, 62026514], "World Cup 2022": [42931572, 27007503, 60637832, 29868391, 64467696, 4743361, 60410401, 61269058, 62528055, 10822574, 36581929, 45271353, 67608822, 27226732, 61715824, 57240806, 2150801, 64999924, 16383, 57918704, 66040086, 66040084, 3556431, 17742072, 51765484, 11370, 64999764, 57918711, 1248592], "Dolly the sheep": [9146, 39379960, 48188481, 9649607, 192685, 1751707, 16285933, 45485344, 8716, 52793670, 1731036, 168927, 2372209, 1140293, 2828101, 56398129, 42555506, 932553, 7932132, 14020881, 915258, 6910, 14094, 1567101, 1321047, 1631732, 17842616, 913362, 8394105, 63031051, 6832430, 2082914, 38889846, 1962277, 17158563, 1632972, 1857574, 66603787, 53431353, 12054042, 383180], "Ciggarets": [5801264, 2236126, 30942, 3015678, 55822753, 56132631, 10833234, 55836725, 280437, 11996885, 2672131, 4576717, 43345713, 46734540, 50164035, 4870997, 489585, 25913130, 2418612, 11938696, 56178521, 2761281, 3915251, 9254970, 1556887, 655861, 52958915, 8655214, 20587357, 73298, 56107088, 63616836, 5892113, 17596651, 56000054, 38327, 6003061, 14501317, 2199688, 13834142, 35077599, 1910732, 312963, 2627188, 54258598, 2536648, 1287604, 32695480], "What is the best place to live in?": [53487, 22989, 37321573, 80735, 3138, 645042, 33018516, 2694428, 124779, 1242998, 3928523, 844, 16760693, 58586, 60333700, 8522, 53837, 3708, 200427, 13602714, 42881894, 47744894, 24534207, 99648, 1664254, 28139692, 309890, 199292, 31885991, 311130, 5391, 5201333, 47789, 5407, 19159283, 109780, 1649321, 784781, 32950054, 48461477, 26976, 23189729, 37325161, 22912415, 1998, 3367760, 34361], "Elon musk": [52247588, 43407192, 8046414, 48778030, 31406060, 51237650, 21821257, 36971117, 65175052, 2614738, 66405413, 55947330, 5533631, 65212863, 53215263, 832774, 909036, 4335905, 50399439, 51714024, 803102, 45111627, 53615490, 9988187, 48795986], "How do you breed flowers?": [30876044, 4576465, 16128216, 33336442, 26537, 57374888, 63180590, 68213121, 893280, 31552410, 3288269, 224785, 1183979, 1104639, 1028614, 19049100, 277231, 42680256, 1392524, 407234, 18967, 55819873, 63539530, 57141131, 56170677, 233609, 3514423, 200646, 39683, 66556, 6614349, 41244, 5902061, 167906, 35646178, 63484108, 13799261, 18691124, 4226137, 1390689, 2327234, 33131935, 630109, 76143, 18952271, 1071613, 971961]}
relevant_doc_ids = {doc_id for doc_ids_list in relevant_queries_to_doc_ids_dict.values() for doc_id in doc_ids_list}

In [12]:
relevant_docs = parquetFile.filter(col("id").isin(relevant_doc_ids))

In [13]:
relevant_docs.show()



+-------+--------------------+--------------------+--------------------+
|     id|               title|                text|         anchor_text|
+-------+--------------------+--------------------+--------------------+
|4090453|         Money Train|'''''Money Train'...|[{23718565, Josep...|
|4093054|     Java annotation|In the Java compu...|[{15881, Java com...|
|4208015|         British Raj| 

The '''British...|[{4721, Imperial ...|
|4226137|     Hybrid tea rose|'''Hybrid tea''' ...|[{29640296, garde...|
|4253801|      Jumpman (logo)|The "'''Jumpman''...|[{1394509, Air Jo...|
|4294832|Java collections ...|thumb|java.util.C...|[{15881, Java}, {...|
|4335905|       Talulah Riley|'''Talulah Jane R...|[{22635683, St Tr...|
|4378282|Secondary ferment...|right|thumb|400px...|[{1455948, macera...|
|4387617|Grapefruit–drug i...|thumb|500px
Some ...|[{1099396, intera...|
|4416646|           Don Lapre|'''Donald D. Lapr...|[{19356538, Provi...|
|4478297|Timeline of Macin...|This '''timeline ...|

                                                                                

In [14]:
print(len(relevant_doc_ids) == relevant_docs.count())



True


                                                                                

# Writing and Reading the relevant DataFrame as a CSV file

In [15]:
pandas_relevant_docs = relevant_docs.toPandas()
pandas_relevant_docs.to_csv("/home/dataproc/test.csv", index=False)

                                                                                

In [16]:
pandas_copy_of_relevant_docs = pd.read_csv("/home/dataproc/test.csv")
copy_of_relevant_docs = spark.createDataFrame(pandas_copy_of_relevant_docs)
copy_of_relevant_docs.show()

22/12/22 21:28:53 WARN org.apache.spark.scheduler.TaskSetManager: Stage 13 contains a task of very large size (2021 KiB). The maximum recommended task size is 1000 KiB.


+-------+--------------------+--------------------+--------------------+
|     id|               title|                text|         anchor_text|
+-------+--------------------+--------------------+--------------------+
|4090453|         Money Train|'''''Money Train'...|[Row(id=23718565,...|
|4093054|     Java annotation|In the Java compu...|[Row(id=15881, te...|
|4208015|         British Raj| 

The '''British...|[Row(id=4721, tex...|
|4226137|     Hybrid tea rose|'''Hybrid tea''' ...|[Row(id=29640296,...|
|4253801|      Jumpman (logo)|The "'''Jumpman''...|[Row(id=1394509, ...|
|4294832|Java collections ...|thumb|java.util.C...|[Row(id=15881, te...|
|4335905|       Talulah Riley|'''Talulah Jane R...|[Row(id=22635683,...|
|4378282|Secondary ferment...|right|thumb|400px...|[Row(id=1455948, ...|
|4387617|Grapefruit–drug i...|thumb|500px
Some ...|[Row(id=1099396, ...|
|4416646|           Don Lapre|'''Donald D. Lapr...|[Row(id=19356538,...|
|4478297|Timeline of Macin...|This '''timeline ...|

                                                                                

## Checking that writing and reading is working

In [17]:
relevant_docs_original_texts = relevant_docs.select("text").collect()

                                                                                

In [20]:
original_45_text = relevant_docs_original_texts[77]

In [22]:
copied_texts = copy_of_relevant_docs.select("text").collect()

22/12/22 21:30:15 WARN org.apache.spark.scheduler.TaskSetManager: Stage 15 contains a task of very large size (2021 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [23]:
copy_45_text = copied_texts[77]

In [24]:
print(copy_45_text == original_45_text)

True


# Repeating the process for random sample of documents

In [25]:
first_2000_docs = parquetFile.limit(2000)
docs_from_0_to_1000 = first_2000_docs.orderBy(first_2000_docs["id"].desc()).limit(1000)
docs_from_1000_to_2000 = first_2000_docs.orderBy(first_2000_docs["id"].asc()).limit(1000)

In [26]:
docs_from_0_to_1000.show()

                                                                                

+-------+--------------------+--------------------+--------------------+
|     id|               title|                text|         anchor_text|
+-------+--------------------+--------------------+--------------------+
|4059808|    Operation Protea|'''Operation Prot...|[{4000830, South ...|
|4059806|Green-billed malkoha|The '''green-bill...|[{171166, Nepal},...|
|4059798|Conrad I, Duke of...|'''Conrad I''' (a...|[{663122, Duke of...|
|4059788|List of operation...|This '''List of o...|[{666301, militar...|
|4059786|          Tubiflorae|thumb|Tubiflorae
...|[{1651329, botani...|
|4059771|          Barry Long|'''Barry Long''' ...|[{259082, The Syd...|
|4059769|Wing Commander: P...|'''''Wing Command...|[{380007, Origin ...|
|4059764|Hong Kong Mathema...|'''Hong Kong Math...|[{266210, Traditi...|
|4059761|        Ernie Vincze|'''Ernest Vincze'...|[{36787, Budapest...|
|4059752|   The Hive (studio)|'''The Hive''' is...|[{435035, North H...|
|4059749|Artsakh Defence Army|The '''Artsakh De...|

In [27]:
docs_from_1000_to_2000.show()



+-------+--------------------+--------------------+--------------------+
|     id|               title|                text|         anchor_text|
+-------+--------------------+--------------------+--------------------+
|5399373|     State of Change|'''''State of Cha...|[{3750950, Christ...|
|5399379|1954 FIFA World C...|Below are the '''...|[{60523, 1954 FIF...|
|5399386|Brandão (football...|'''Evaeverson Lem...|[{492068, Marseil...|
|5399388|      James E. Hayes|'''James E. Hayes...|[{5399031, Suprem...|
|5399402|      John J. Phelan|'''John Joseph Ph...|[{5399031, Suprem...|
|5399415|    Mikael Lindström|thumb|Lindström (...|[{82433, diplomat...|
|5399421|     James T. Mullen|'''James Terrance...|[{5399031, Suprem...|
|5399424|       Kiss and Tell|'''Kiss and Tell'...|[{60663435, unaut...|
|5399431|The Romance of Crime|'''''The Romance ...|[{2672639, Gareth...|
|5399441|George Wallace (A...|'''George Stephen...|[{718976, Aberdee...|
|5399447|List of Billboard...|thumb|200px|uprig...|

                                                                                

In [28]:
print(docs_from_0_to_1000.count() == docs_from_1000_to_2000.count())



True


                                                                                

In [29]:
first_2000_docs.show()



+--------+--------------------+--------------------+--------------------+
|      id|               title|                text|         anchor_text|
+--------+--------------------+--------------------+--------------------+
|10672809|       Dorsa Whiston|'''Dorsa Whiston'...|[{957903, wrinkle...|
|10672850|     Humor Monastery|thumb|Petru Rareş...|[{17549413, Mănăs...|
|10672900|   Tir na n-Og Award|The '''Tir na n-O...|[{52847, children...|
|10672928|Barrington Passag...|'''Barrington Pas...|[{639401, Country...|
|10672963|University of Wis...|The '''College of...|[{1640986, Public...|
|10672987|University of Wis...|The '''University...|[{1640986, Public...|
|10673017|              Loiner|'''Loiner''' is a...|[{277226, demonym...|
|10673030|Standin' on the C...|'''Standin' on th...|[{149564, Jackson...|
|10673063|The Adventurers (...|'''''The Adventur...|[{959038, Lewis G...|
|10673064|    Stéphane Haccoun|'''Stéphane Hacco...|[{787578, Feather...|
|10673144|Rendezvous Hotel ...|The '''

                                                                                

In [31]:
print(docs_from_0_to_1000.intersect(docs_from_1000_to_2000).count())

                                                                                

0


In [39]:
filtered_docs_from_0_to_1000 = docs_from_0_to_1000.where(~docs_from_0_to_1000["id"].isin(relevant_doc_ids))
filtered_docs_from_1000_to_2000 = docs_from_1000_to_2000.where(~docs_from_1000_to_2000["id"].isin(relevant_doc_ids))

In [41]:
print(filtered_docs_from_0_to_1000.count())
print(filtered_docs_from_1000_to_2000.count())

                                                                                

1000




1000


                                                                                

In [43]:
print(filtered_docs_from_0_to_1000.intersect(relevant_docs).count())

[Stage 94:>                                                        (0 + 4) / 13]

0


                                                                                

In [44]:
print(filtered_docs_from_1000_to_2000.intersect(relevant_docs).count())

                                                                                

0


# Writing and Reading the random documents DataFrames as a CSV files

In [45]:
pandas_filtered_random_1000_docs_A = filtered_docs_from_0_to_1000.toPandas()
pandas_filtered_random_1000_docs_A.to_csv("/home/dataproc/filtered_random_1000_docs_A.csv", index=False)

                                                                                

In [46]:
pandas_filtered_random_1000_docs_B = filtered_docs_from_1000_to_2000.toPandas()
pandas_filtered_random_1000_docs_B.to_csv("/home/dataproc/filtered_random_1000_docs_B.csv", index=False)

                                                                                

In [None]:
pandas_filtered_random_1000_docs_B = pd.read_csv("/home/dataproc/test.csv")
copy_of_relevant_docs = spark.createDataFrame(pandas_copy_of_relevant_docs)