In [1]:
from operator import add
import re
from collections import OrderedDict
from operator import itemgetter 
import itertools
from pyspark.sql import SparkSession

# (8 cores, 16gb per machine) x 5 = 40 cores

# New API
spark_session = SparkSession\
        .builder\
        .master("spark://192.168.2.87:7077") \
        .appName("common_crawl")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.shuffle.service.enabled", True)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.executor.cores",4)\
        .getOrCreate()
#        .config('spark.executor.cores', 2)\


# Old API (RDD)
spark_context = spark_session.sparkContext

# (*/*) - out of memory›
# ~6.4mins for 39496 files. (...00000/)  (takes 1 minute with 40 partitions)
# ~5 secs for 10 files (...00000/0*) 
# ~20 secs for 11110 files (...00000/1*) 


rdd = spark_context.newAPIHadoopFile(
    'hdfs://192.168.2.87:9000/crawl-wet/CC-MAIN-20200328074047-20200328104047-00000.warc.wet',
    'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
    'org.apache.hadoop.io.LongWritable',
    'org.apache.hadoop.io.Text',
    conf={'textinputformat.record.delimiter': 'WARC/1.0'}
)\
.cache() # Keep this RDD in memory!

rdd.count()
# Only one job (previous .cache() did not trigger a job)

52455

In [2]:
rdd.take(3)
# [(line_number, partition)]

[(0, ''),
 (8,
  '\r\nWARC-Type: warcinfo\r\nWARC-Date: 2020-04-10T16:44:20Z\r\nWARC-Filename: CC-MAIN-20200328074047-20200328104047-00000.warc.wet.gz\r\nWARC-Record-ID: <urn:uuid:3b06c614-57de-423a-aba2-2e8c39f02c0b>\r\nContent-Type: application/warc-fields\r\nContent-Length: 376\r\n\r\nSoftware-Info: ia-web-commons.1.1.10-SNAPSHOT-20200324091056\r\nExtracted-Date: Fri, 10 Apr 2020 16:44:20 GMT\r\nrobots: checked via crawler-commons 1.1-SNAPSHOT (https://github.com/crawler-commons/crawler-commons)\r\nisPartOf: CC-MAIN-2020-16\r\noperator: Common Crawl Admin (info@commoncrawl.org)\r\ndescription: Wide crawl of the web for March/April 2020\r\npublisher: Common Crawl\r\n\r\n\r\n\r\n'),
 (651,
  '\r\nWARC-Type: conversion\r\nWARC-Target-URI: http://000ojfb.wcomhost.com/ushwa/2018-dan-patch-banquet/ken-weingartner-chof/\r\nWARC-Date: 2020-03-28T09:27:14Z\r\nWARC-Record-ID: <urn:uuid:1ebf934c-633e-4362-ac23-b2b17993ec64>\r\nWARC-Refers-To: <urn:uuid:7ee10237-5635-4404-9a2c-b546e8aef370>\r\n

In [3]:
rdd.getNumPartitions()

4

In [4]:
print(spark_context.uiWebUrl)

http://host-192-168-2-131-ldsa:4043


In [7]:
## Example #1 - Filter by TLD and compute most common words ##

# Try .ac.uk, .ru, .se, .com
p = re.compile('WARC-Target-URI: \S+\.ac.uk', re.IGNORECASE)

rdd1 = rdd\
.filter(lambda doc: bool(p.search(doc[1])))\
.map(lambda web_text: web_text[1].partition('\r\n\r\n')[2])\
.flatMap(lambda t: t.split(' '))\
.flatMap(lambda w: w.split('\n'))\

rdd2 = rdd1.map(lambda w: w.strip())\
.map(lambda w: (w,1))\
.reduceByKey(add)

result = rdd2.takeOrdered(40, key=lambda x: -x[1])

print(result)

[('and', 2044), ('the', 1962), ('of', 1597), ('to', 1166), ('1', 994), ('in', 786), ('a', 717), ('for', 552), ('-', 521), ('The', 373), ('on', 339), ('&', 328), ('is', 324), ('Research', 306), ('Photographic', 287), ('collection:', 286), ('at', 278), ('with', 278), ('by', 272), ('', 261), ('University', 261), ('(←', 250), ('links)', 250), ('\u200e', 250), ('|', 235), ('that', 232), ('College', 229), ('view', 227), ('this', 225), ('as', 218), ('webpages', 211), ('are', 203), ('from', 175), ('be', 170), ('About', 168), ('hircus', 168), ('you', 162), ('Contact', 161), ('your', 161), ('or', 158)]


In [None]:
## Example #2 - Group by TLD and compute most common words for each ##

ex = "WARC-Type: conversion\
WARC-Target-URI: http://news.bbc.co.uk/2/hi/africa/3414345.stm\
WARC-Date: 2014-08-02T09:52:13Z"

p = re.compile('WARC-Target-URI: \S+\.([a-zA-Z]{2,3})/', re.IGNORECASE)
# print(p.search(ex).group(1))
# uk

def get_tld(content):
    match = p.search(content)
    if match is not None:
        return match.group(1)
    else:
        return None

# discard the line number
# partition() -- python function -- split on the first occurance, returns (before,split,after)
# filter out those with no TLD

    
words_by_tld_rdd = rdd\
    .map(lambda filename_content: filename_content[1])\
    .map(lambda content: (get_tld(content), content.partition('\r\n\r\n')[2]))\
    .filter(lambda tld_content: tld_content[0] is not None)\
    .flatMapValues(lambda words: words.split(' '))\
    .flatMapValues(lambda words: words.split('\n'))\
    .mapValues(lambda word: word.strip())
    #.take(10)

# print(words_by_tld_rdd.take(10))

tlds = words_by_tld_rdd.countByKey()
#print(tlds)

tlds = OrderedDict(sorted(tlds.items(), key = itemgetter(1), reverse = True))
# print(tlds)  

top_tlds = dict(itertools.islice(tlds.items(), 10))

# print(top_tlds)

print("Results:")

for tld in top_tlds:
    print(tld)
    top_words_for_tld = words_by_tld_rdd\
        .filter(lambda tld_word: tld_word[0] == tld)\
        .values()\
        .map(lambda w: (w,1))\
        .reduceByKey(add)\
        .takeOrdered(20, key=lambda x: -x[1])
    print(top_words_for_tld)

Results:
com


In [None]:
#file_content = rdd.take(1)[0][1]
#print(file_content.partition('\r\n\r\n')[2])
from operator import add
import re

ex = "WARC-Type: conversion\
WARC-Target-URI: http://news.bbc.co.uk/2/hi/africa/3414345.stm\
WARC-Date: 2014-08-02T09:52:13Z"

p = re.compile('WARC-Target-URI: \S+\.(([a-zA-Z]{2,3}}\.)?[a-zA-Z]{2,3}})/', re.IGNORECASE)

print(p.search(ex))

#print(bool(p.search('\nWARC-Target-URI:\n')))

#rdd\
#.filter(lambda doc: bool(p.search(doc[1])))\
#.map(lambda filename_content: filename_content[1].partition('\r\n\r\n')[2])\
#.flatMap(lambda t: t.split(' '))\
#.flatMap(lambda w: w.split('\n'))\
#.map(lambda w: w.strip())\
#.map(lambda w: (w,1))\
#.reduceByKey(add)\
#.takeOrdered(100, key=lambda x: -x[1])
#.take(100)
#.take(10)
#.flatMap(lambda text: text.split(' ')).take(100)

In [None]:
#spark_session.stop()