In [None]:
from operator import add
import re
from collections import OrderedDict
from operator import itemgetter 
import itertools
from pyspark.sql import SparkSession

# (8 cores, 16gb per machine) x 5

# New API
spark_session = SparkSession\
        .builder\
        .master("spark://ben-spark-master:7077") \
        .config('spark.executor.memory', '2g') \
        .config('spark.driver.maxResultSize', 0) \
        .appName("common_crawl")\
        .getOrCreate()
#        .config('spark.executor.cores', 2)\


# Old API (RDD)
spark_context = spark_session.sparkContext

# (*/*) - out of memory
# ~6.4mins for 39496 files. (...00000/)  (takes 1 minute with 40 partitions)
# ~5 secs for 10 files (...00000/0*) 
rdd = spark_context.wholeTextFiles('/mnt/nfs/ben-spark-master/teaching/crawl/CC-MAIN-2018-03/splits/CC-MAIN-20180317035630-20180317055630-00000.warc.wet/', minPartitions=40).cache()
rdd.count()


In [4]:
rdd.getNumPartitions()

7

In [5]:
print(spark_context.uiWebUrl)

http://ben-spark-master:4040


In [None]:
## Example #1 - Filter by TLD and compute most common words ##

p = re.compile('WARC-Target-URI: \S+\.com', re.IGNORECASE)

rdd\
.filter(lambda doc: bool(p.search(doc[1])))\
.map(lambda filename_content: filename_content[1].partition('\r\n\r\n')[2])\
.flatMap(lambda t: t.split(' '))\
.flatMap(lambda w: w.split('\n'))\
.map(lambda w: w.strip())\
.map(lambda w: (w,1))\
.reduceByKey(add)\
.takeOrdered(100, key=lambda x: -x[1])


In [18]:
## Example #2 - Group by TLD and compute most common words for each ##

ex = "WARC-Type: conversion\
WARC-Target-URI: http://news.bbc.co.uk/2/hi/africa/3414345.stm\
WARC-Date: 2014-08-02T09:52:13Z"

p = re.compile('WARC-Target-URI: \S+\.([a-zA-Z]{2,3})/', re.IGNORECASE)
# print(p.search(ex).group(1))
# uk

def get_tld(content):
    match = p.search(content)
    if match is not None:
        return match.group(1)
    else:
        return None

words_by_tld_rdd = rdd\
.map(lambda filename_content: filename_content[1])\
.map(lambda content: (get_tld(content), content.partition('\r\n\r\n')[2]))\
.filter(lambda tld_content: tld_content[0] is not None)\
.flatMapValues(lambda words: words.split(' '))\
.flatMapValues(lambda words: words.split('\n'))\
.mapValues(lambda word: word.strip())
#.take(10)

print(words_by_tld_rdd.take(10))

tlds = words_by_tld_rdd.countByKey()
#print(tlds)

tlds = OrderedDict(sorted(tlds.items(), key = itemgetter(1), reverse = True))
print(tlds)  

top_tlds = dict(itertools.islice(tlds.items(), 10))

print(top_tlds)

print("Results:")

for tld in top_tlds:
    print(tld)
    top_words_for_tld = words_by_tld_rdd\
        .filter(lambda tld_word: tld_word[0] == tld)\
        .values()\
        .map(lambda w: (w,1))\
        .reduceByKey(add)\
        .takeOrdered(20, key=lambda x: -x[1])
    print(top_words_for_tld)

[('com', 'mars,'), ('com', '2017'), ('com', '|'), ('com', 'Script'), ('com', 'Webmaster'), ('com', 'Scripts'), ('com', 'Clone'), ('com', 'CMS'), ('com', 'BoonEx'), ('com', '–')]
OrderedDict([('ru', 1580), ('com', 1560), ('rs', 393), ('net', 120), ('jp', 49)])
{'rs': 393, 'net': 120, 'ru': 1580, 'jp': 49, 'com': 1560}
Results:
rs
[('i', 15), ('Za', 5), ('011', 5), ('Majice', 5), ('sajta', 5), ('/', 4), ('shop', 4), ('sa', 4), ('|', 4), ('', 3), ('kupaći', 3), ('Uplate', 3), ('iz', 3), ('Preko', 3), ('za', 3), ('Isporuka', 3), ('-', 3), ('decu', 3), ('čirak', 3), ('kupce', 3)]
net
[('[フリー写真]', 4), ('', 3), ('壱', 3), ('・著作者のクレジットを表示すること。', 3), ('・改変して二次創作すること。', 3), ('・営利目的で作品を二次利用すること。', 3), ('(1)', 3), ('GATAG｜フリー素材集', 3), ('(0)', 2), ('-', 2), ('ライセンス', 2), ('体育座りして天を見上げるビジネスウーマン', 2), ('2015年6月', 1), ('←', 1), ('2', 1), ('フリー写真', 1), ('「クール」順に表示', 1), ('300', 1), ('表示', 1), ('1', 1)]
ru
[('в', 69), ('и', 36), ('казани', 25), ('с', 21), ('калипсо', 19), ('не', 17), ('что', 14), ('на', 

In [52]:
#file_content = rdd.take(1)[0][1]
#print(file_content.partition('\r\n\r\n')[2])
from operator import add
import re

ex = "WARC-Type: conversion\
WARC-Target-URI: http://news.bbc.co.uk/2/hi/africa/3414345.stm\
WARC-Date: 2014-08-02T09:52:13Z"

p = re.compile('WARC-Target-URI: \S+\.(([a-zA-Z]{2,3}}\.)?[a-zA-Z]{2,3}})/', re.IGNORECASE)

print(p.search(ex))

#print(bool(p.search('\nWARC-Target-URI:\n')))

#rdd\
#.filter(lambda doc: bool(p.search(doc[1])))\
#.map(lambda filename_content: filename_content[1].partition('\r\n\r\n')[2])\
#.flatMap(lambda t: t.split(' '))\
#.flatMap(lambda w: w.split('\n'))\
#.map(lambda w: w.strip())\
#.map(lambda w: (w,1))\
#.reduceByKey(add)\
#.takeOrdered(100, key=lambda x: -x[1])
#.take(100)
#.take(10)
#.flatMap(lambda text: text.split(' ')).take(100)

None


In [6]:
rdd.map(lambda filename_content: filename_content[1].split('WARC/1.0')).cache().take(1)

KeyboardInterrupt: 

In [None]:
rdd.take(1)[0]

In [1]:
spark_session.stop()

NameError: name 'spark_session' is not defined