In [1]:
from aut import *
from pyspark.sql.functions import col, desc, explode

In [2]:
# List the URL and HTTP status code for the first 10 rows returned by
# `.all()`, printing the cell values in full (no truncation).
(
    WebArchive(sc, sqlContext, "/home/nruest/Projects/au/sample-data/geocities")
    .all()
    .select("url", "http_status_code")
    .show(10, truncate=False)
)

+-----------------------------------------------------------------------------------+----------------+
|url                                                                                |http_status_code|
+-----------------------------------------------------------------------------------+----------------+
|http://geocities.com/babiekaos/Links.html                                          |200             |
|http://geocities.com/cloneaccount3/6490/                                           |200             |
|http://www.geocities.com/coledale28/hi-power-soldiers-music.html                   |200             |
|http://www.geocities.com/orvilleduncan811/12-day-of-christmas-sheet-music.html     |200             |
|http://geocities.com/jtbm71/fotos/2000/                                            |200             |
|http://geocities.com/cancmay/s/sunshine.html                                       |200             |
|http://www.talent-direct.com/cgi-bin/tal_pro.cgi?profile=ARZCdYbJU5KsMAR

In [3]:
# Same source as above, but pair each URL with the archive file it was read
# from; long values are truncated by `show`.
(
    WebArchive(sc, sqlContext, "/home/nruest/Projects/au/sample-data/geocities")
    .all()
    .select("url", "archive_filename")
    .show(10, truncate=True)
)

+--------------------+--------------------+
|                 url|    archive_filename|
+--------------------+--------------------+
|http://geocities....|file:/home/nruest...|
|http://geocities....|file:/home/nruest...|
|http://www.geocit...|file:/home/nruest...|
|http://www.geocit...|file:/home/nruest...|
|http://geocities....|file:/home/nruest...|
|http://geocities....|file:/home/nruest...|
|http://www.talent...|file:/home/nruest...|
|http://geocities....|file:/home/nruest...|
|http://geocities....|file:/home/nruest...|
|http://www.geocit...|file:/home/nruest...|
+--------------------+--------------------+
only showing top 10 rows



In [4]:
# SQL LIKE pattern: keep only pages whose URL contains this site prefix.
url_pattern = "%http://geocities.com/babiekaos/%"

# Explode the links extracted from each matching page into one row per link,
# reduce both ends of the link (`_1`, `_2`) to a domain without the "www"
# prefix, and count how often each (src, dest) pair occurs.
(
    WebArchive(sc, sqlContext, "/home/nruest/Projects/au/sample-data/geocities")
    .webpages()
    .filter(col("url").like(url_pattern))
    .select(
        explode(Udf.extract_links("url", "content")).alias("links")
    )
    .select(
        Udf.remove_prefix_www(Udf.extract_domain(col("links._1"))).alias("src"),
        Udf.remove_prefix_www(Udf.extract_domain(col("links._2"))).alias("dest"),
    )
    .groupBy("src", "dest")
    .count()
    .show(10, truncate=False)
)

+-------------+---------------------+-----+
|src          |dest                 |count|
+-------------+---------------------+-----+
|geocities.com|sushi.perfectdrug.net|1    |
|geocities.com|eatsushi.com         |1    |
|geocities.com|sushilinks.com       |1    |
|geocities.com|sushifaq.com         |1    |
+-------------+---------------------+-----+



In [5]:
# Aggregate the web graph per crawl date and (src, dest) domain pair, with the
# "www" prefix stripped from both domains, and keep pairs seen more than
# 5 times.
(
    WebArchive(sc, sqlContext, "/home/nruest/Projects/au/sample-data/geocities")
    .webgraph()
    .groupBy(
        "crawl_date",
        Udf.remove_prefix_www(Udf.extract_domain("src")).alias("src"),
        Udf.remove_prefix_www(Udf.extract_domain("dest")).alias("dest"),
    )
    .count()
    .filter(col("count") > 5)
    .show(10, truncate=True)
)

+----------+-------------+--------------------+-----+
|crawl_date|          src|                dest|count|
+----------+-------------+--------------------+-----+
|  20091027|geocities.com|             come.to|  142|
|  20091027|geocities.com|        hotfiles.com|    8|
|  20091027|geocities.com|           itv.co.th|   36|
|  20091027|geocities.com|          nhacso.net|   10|
|  20091027|geocities.com|   destinyschild.com|  134|
|  20091027|geocities.com|           sciam.com|    6|
|  20091027|geocities.com|            best.com|   26|
|  20091027|geocities.com|    techlearning.com|    6|
|  20091027|geocities.com|meltingpot.fortun...|   15|
|  20091027|geocities.com|           tv3.co.th|  104|
+----------+-------------+--------------------+-----+
only showing top 10 rows



In [6]:
# Domains to keep; compared against the un-stripped result of
# `Udf.extract_domain`, so the "www." prefix is part of the match.
domains = ["www.geocities.com"]

# Select crawl date, domain, URL and the page content passed through
# `Udf.remove_http_header` then `Udf.remove_html`, restricted to `domains`.
# The text column is aliased "content" (the previous alias contained stray
# spaces, "conten  t", which produced a mis-named output column).
(
    WebArchive(sc, sqlContext, "/home/nruest/Projects/au/sample-data/geocities")
    .webpages()
    .select(
        "crawl_date",
        Udf.extract_domain("url").alias("domain"),
        "url",
        Udf.remove_html(Udf.remove_http_header("content")).alias("content"),
    )
    .filter(col("domain").isin(domains))
    .show(20, truncate=True)
)

+----------+-----------------+--------------------+--------------------+
|crawl_date|           domain|                 url|           conten  t|
+----------+-----------------+--------------------+--------------------+
|  20091027|www.geocities.com|http://www.geocit...|Hi Power Soldiers...|
|  20091027|www.geocities.com|http://www.geocit...|12 Day Of Christm...|
|  20091027|www.geocities.com|http://www.geocit...|Child Youth Elbow...|
|  20091027|www.geocities.com|http://www.geocit...|Hiawatha Golf Cou...|
|  20091027|www.geocities.com|http://www.geocit...|Hewlett Packard 6...|
|  20091027|www.geocities.com|http://www.geocit...|Kelly Taphouse Ke...|
|  20091027|www.geocities.com|http://www.geocit...|profile Name: Joe...|
|  20091027|www.geocities.com|http://www.geocit...|Women Yeast Infec...|
|  20091027|www.geocities.com|http://www.geocit...|Hi 5 Recycling Hi...|
|  20091027|www.geocities.com|http://www.geocit...|Hazel Court: Auto...|
|  20091027|www.geocities.com|http://www.geocit...|

In [7]:
# SQL LIKE pattern: any URL containing this account's prefix.
url_pattern = "%http://geocities.com/cancmay%"

# Show crawl date, domain and full URL for the pages matching the pattern.
(
    WebArchive(sc, sqlContext, "/home/nruest/Projects/au/sample-data/geocities")
    .webpages()
    .select(
        "crawl_date",
        Udf.extract_domain("url").alias("domain"),
        "url",
    )
    .filter(col("url").like(url_pattern))
    .show(20, truncate=False)
)

+----------+-------------+-----------------------------------------------------------+
|crawl_date|domain       |url                                                        |
+----------+-------------+-----------------------------------------------------------+
|20091027  |geocities.com|http://geocities.com/cancmay/s/sunshine.html               |
|20091027  |geocities.com|http://geocities.com/cancmay/s/save-tonight.html           |
|20091027  |geocities.com|http://geocities.com/cancmay/s/so-far-away.html            |
|20091027  |geocities.com|http://geocities.com/cancmay/s/sweet-georgia-brown.html    |
|20091027  |geocities.com|http://geocities.com/cancmay/s/smoke-gets-in-your-eyes.html|
|20091027  |geocities.com|http://geocities.com/cancmay/s/smooth-operator.html        |
|20091027  |geocities.com|http://geocities.com/cancmay/s/stars.html                  |
|20091027  |geocities.com|http://geocities.com/cancmay/s/stardust.html               |
|20091027  |geocities.com|http://geocities.

In [8]:
# Regular expression applied to `crawl_date` via `rlike`.
# Written as a raw string so that `\d` reaches the regex engine verbatim;
# in a plain string literal `\d` is an invalid escape sequence that Python
# warns about. The pattern text itself is unchanged.
# NOTE(review): `[10]` and `[09]` are character classes (one character each),
# so the month part matches "10", "19", "00" or "09" -- presumably
# October/September were intended; confirm before relying on it.
dates = r"2009[10][09]\d\d"

# Select crawl date, domain, URL and the page content passed through
# `Udf.remove_http_header` then `Udf.remove_html`, for matching crawl dates.
(
    WebArchive(sc, sqlContext, "/home/nruest/Projects/au/sample-data/geocities")
    .webpages()
    .select(
        "crawl_date",
        Udf.extract_domain("url").alias("domain"),
        "url",
        Udf.remove_html(Udf.remove_http_header("content")).alias("content"),
    )
    .filter(col("crawl_date").rlike(dates))
    .show(20, truncate=True)
)

+----------+--------------------+--------------------+--------------------+
|crawl_date|              domain|                 url|             content|
+----------+--------------------+--------------------+--------------------+
|  20091027|       geocities.com|http://geocities....|Sushi Land Sushi ...|
|  20091027|       geocities.com|http://geocities....|Andrea Cruz Welco...|
|  20091027|   www.geocities.com|http://www.geocit...|Hi Power Soldiers...|
|  20091027|   www.geocities.com|http://www.geocit...|12 Day Of Christm...|
|  20091027|       geocities.com|http://geocities....|Index of /jtbm71/...|
|  20091027|       geocities.com|http://geocities....|sunshine CanCMay ...|
|  20091027|www.talent-direct...|http://www.talent...|talent direct voi...|
|  20091027|       geocities.com|http://geocities....|Index of /akimi91...|
|  20091027|       geocities.com|http://geocities....|stardust CanCMay ...|
|  20091027|   www.geocities.com|http://www.geocit...|Child Youth Elbow...|
|  20091027|

In [9]:
# SQL LIKE pattern for the keyword search (note: this variable shares its
# name with the "content" column it is matched against).
content = "%radio%"

# The filter comes after the select, so it is applied to the aliased
# "content" column produced by the two Udf calls.
(
    WebArchive(sc, sqlContext, "/home/nruest/Projects/au/sample-data/geocities")
    .webpages()
    .select(
        "crawl_date",
        Udf.extract_domain("url").alias("domain"),
        "url",
        Udf.remove_html(Udf.remove_http_header("content")).alias("content"),
    )
    .filter(col("content").like(content))
    .show(10, truncate=True)
)

+----------+-----------------+--------------------+--------------------+
|crawl_date|           domain|                 url|             content|
+----------+-----------------+--------------------+--------------------+
|  20091027|www.geocities.com|http://www.geocit...|Hewlettpackard Sc...|
|  20091027|    geocities.com|http://geocities....|Index of /the_mag...|
|  20091027|www.geocities.com|http://www.geocit...|Women S Petite Pl...|
|  20091027|    geocities.com|http://geocities....|dec2002 Nov 5th, ...|
|  20091027|    geocities.com|http://geocities....|titlepagereaserch...|
|  20091027|www.geocities.com|http://www.geocit...|Women S Sports Ap...|
|  20091027|    geocities.com|http://geocities....|Index of /the_mag...|
|  20091027|    geocities.com|http://geocities....|something beautif...|
|  20091027|www.geocities.com|http://www.geocit...|Hewlett Packard C...|
|  20091027|www.geocities.com|http://www.geocit...|Kelly Swain Kelly...|
+----------+-----------------+--------------------+

In [10]:
# Values to keep for the extracted domain and for the `language` column.
domains = ["www.geocities.com"]
languages = ["fr"]

# The `language` filter is applied before the projection because `language`
# is not one of the selected columns; filtering first does not depend on
# Spark resolving a column that the select has already dropped, and matches
# the ordering used by the other language-filter cells in this notebook.
# The `domain` filter stays after the select because "domain" is the alias
# created there.
(
    WebArchive(sc, sqlContext, "/home/nruest/Projects/au/sample-data/geocities")
    .webpages()
    .filter(col("language").isin(languages))
    .select(
        "crawl_date",
        Udf.extract_domain("url").alias("domain"),
        "url",
        Udf.remove_html(Udf.remove_http_header("content")).alias("content"),
    )
    .filter(col("domain").isin(domains))
    .show(20, truncate=True)
)

+----------+-----------------+--------------------+--------------------+
|crawl_date|           domain|                 url|             content|
+----------+-----------------+--------------------+--------------------+
|  20091027|www.geocities.com|http://www.geocit...|lillielonnie Lill...|
|  20091027|www.geocities.com|http://www.geocit...|lillielonnie2 Lil...|
|  20091027|www.geocities.com|http://www.geocit...|29 janvier 2004 -...|
|  20091027|www.geocities.com|http://www.geocit...|28 janvier 2004 -...|
|  20091027|www.geocities.com|http://www.geocit...|31 janvier 2004 -...|
|  20091027|www.geocities.com|http://www.geocit...|Lettre � mon amou...|
|  20091027|www.geocities.com|http://www.geocit...|26 janvier 2004 -...|
|  20091027|www.geocities.com|http://www.geocit...|19 janvier 2004 -...|
|  20091027|www.geocities.com|http://www.geocit...|Samedi, le 17 jan...|
|  20091027|www.geocities.com|http://www.geocit...|6 mai 2004 -- Jou...|
|  20091027|www.geocities.com|http://www.geocit...|

In [11]:
# SQL LIKE pattern; here it is matched against the "content" column as
# returned by `.webpages()`, since the filter precedes any select.
content = "%radio%"

# For the matching pages: one row per extracted link, both ends reduced to a
# domain without the "www" prefix, counted per (src, dest) pair, keeping only
# pairs that occur more than 5 times.
(
    WebArchive(sc, sqlContext, "/home/nruest/Projects/au/sample-data/geocities")
    .webpages()
    .filter(col("content").like(content))
    .select(
        explode(Udf.extract_links("url", "content")).alias("links")
    )
    .select(
        Udf.remove_prefix_www(Udf.extract_domain(col("links._1"))).alias("src"),
        Udf.remove_prefix_www(Udf.extract_domain(col("links._2"))).alias("dest"),
    )
    .groupBy("src", "dest")
    .count()
    .filter(col("count") > 5)
    .show(10, truncate=True)
)

+----------------+--------------------+-----+
|             src|                dest|count|
+----------------+--------------------+-----+
|   geocities.com|     worldscouts.net|    6|
|saibabalinks.org|    saibabalinks.org|   14|
|   geocities.com|   service.bfast.com|   15|
|   geocities.com|   idmg.blogspot.com|    9|
|   geocities.com|       terravista.pt|   15|
|   geocities.com| img.photobucket.com|   23|
|   geocities.com|   privacy.yahoo.com|    6|
|   geocities.com|free.hostdepartme...|   30|
|   geocities.com|provincia.venezia.it|    6|
|   geocities.com|dontkillspike.pro...|  573|
+----------------+--------------------+-----+
only showing top 10 rows



In [12]:
# Count web-graph edges per (src, dest) domain pair and keep pairs seen more
# than 5 times. The grouping expressions are aliased so the result columns
# are named "src" and "dest" instead of the auto-generated "UDF(src)" /
# "UDF(dest)", consistent with the dated web-graph cell earlier in the
# notebook. Unlike that cell, the "www" prefix is left in place here.
(
    WebArchive(sc, sqlContext, "/home/nruest/Projects/au/sample-data/geocities")
    .webgraph()
    .groupBy(
        Udf.extract_domain("src").alias("src"),
        Udf.extract_domain("dest").alias("dest"),
    )
    .count()
    .filter(col("count") > 5)
    .show(10, truncate=True)
)

+-----------------+--------------------+-----+
|         UDF(src)|           UDF(dest)|count|
+-----------------+--------------------+-----+
|    geocities.com|www.buyandselldb.com|16200|
|    geocities.com| www.metroactive.com|    6|
|www.geocities.com|     www.bigfoot.com|   10|
|    geocities.com|  pub16.bravenet.com|   24|
|    geocities.com|     worldscouts.net|    6|
|    geocities.com|      www.disney.com|   12|
|www.geocities.com|www.phenomenalwom...|   20|
|    geocities.com|   service.bfast.com|  139|
|www.geocities.com|         www.cdc.gov|    8|
|    geocities.com|  hc2.humanclick.com|   11|
+-----------------+--------------------+-----+
only showing top 10 rows



In [13]:
# Regular expression (used with `rlike`): an http URL with a host followed by
# at least one further path segment that ends in "/".
url_pattern = "http://[^/]+/[^/]+/"

# Print, untruncated, the first 10 page URLs that match the expression.
(
    WebArchive(sc, sqlContext, "/home/nruest/Projects/au/sample-data/geocities")
    .webpages()
    .select("url")
    .filter(col("url").rlike(url_pattern))
    .show(10, truncate=False)
)


+-----------------------------------------------------------------------------------+
|url                                                                                |
+-----------------------------------------------------------------------------------+
|http://geocities.com/babiekaos/Links.html                                          |
|http://geocities.com/cloneaccount3/6490/                                           |
|http://www.geocities.com/coledale28/hi-power-soldiers-music.html                   |
|http://www.geocities.com/orvilleduncan811/12-day-of-christmas-sheet-music.html     |
|http://geocities.com/jtbm71/fotos/2000/                                            |
|http://geocities.com/cancmay/s/sunshine.html                                       |
|http://www.talent-direct.com/cgi-bin/tal_pro.cgi?profile=ARZCdYbJU5KsMARKdUxiO4l3DY|
|http://geocities.com/akimi919/sp_ph/?M=A                                           |
|http://geocities.com/cancmay/s/save-tonight.html     

In [14]:
# Domain frequency table: number of pages per extracted domain, most frequent
# first, top 10 shown in full.
(
    WebArchive(sc, sqlContext, "/home/nruest/Projects/au/sample-data/geocities")
    .webpages()
    .select(
        Udf.extract_domain("url").alias("Domain")
    )
    .groupBy("Domain")
    .count()
    .sort(desc("count"))
    .show(10, truncate=False)
)


+---------------------------+-----+
|Domain                     |count|
+---------------------------+-----+
|geocities.com              |93886|
|www.geocities.com          |29223|
|www.infocastfn.com         |430  |
|rcm.amazon.com             |201  |
|www.bagus.com              |133  |
|www.globalimagegallery.com |130  |
|www.physforum.com          |124  |
|www.internetarchaeology.org|121  |
|us.geocities.com           |121  |
|www.spb.tvoe.tv            |108  |
+---------------------------+-----+
only showing top 10 rows



In [15]:
# Crawl date, domain, URL and the page content passed through
# `Udf.remove_html` only (no `Udf.remove_http_header` step in this cell).
(
    WebArchive(sc, sqlContext, "/home/nruest/Projects/au/sample-data/geocities")
    .webpages()
    .select(
        "crawl_date",
        Udf.extract_domain("url").alias("domain"),
        "url",
        Udf.remove_html("content").alias("content"),
    )
    .show(10, truncate=True)
)

+----------+--------------------+--------------------+--------------------+
|crawl_date|              domain|                 url|             content|
+----------+--------------------+--------------------+--------------------+
|  20091027|       geocities.com|http://geocities....|Sushi Land Sushi ...|
|  20091027|       geocities.com|http://geocities....|Andrea Cruz Welco...|
|  20091027|   www.geocities.com|http://www.geocit...|Hi Power Soldiers...|
|  20091027|   www.geocities.com|http://www.geocit...|12 Day Of Christm...|
|  20091027|       geocities.com|http://geocities....|Index of /jtbm71/...|
|  20091027|       geocities.com|http://geocities....|sunshine CanCMay ...|
|  20091027|www.talent-direct...|http://www.talent...|talent direct voi...|
|  20091027|       geocities.com|http://geocities....|Index of /akimi91...|
|  20091027|       geocities.com|http://geocities....|stardust CanCMay ...|
|  20091027|   www.geocities.com|http://www.geocit...|Child Youth Elbow...|
+----------+

In [16]:
# Only the text column: page content passed through `Udf.remove_http_header`
# and then `Udf.remove_html`.
(
    WebArchive(sc, sqlContext, "/home/nruest/Projects/au/sample-data/geocities")
    .webpages()
    .select(
        Udf.remove_html(Udf.remove_http_header("content")).alias("content")
    )
    .show(10, truncate=True)
)

+--------------------+
|             content|
+--------------------+
|Sushi Land Sushi ...|
|Andrea Cruz Welco...|
|Hi Power Soldiers...|
|12 Day Of Christm...|
|Index of /jtbm71/...|
|sunshine CanCMay ...|
|talent direct voi...|
|Index of /akimi91...|
|stardust CanCMay ...|
|Child Youth Elbow...|
+--------------------+
only showing top 10 rows



In [17]:
# Crawl date, domain and URL alongside the page content passed through
# `Udf.remove_http_header` and then `Udf.remove_html`.
(
    WebArchive(sc, sqlContext, "/home/nruest/Projects/au/sample-data/geocities")
    .webpages()
    .select(
        "crawl_date",
        Udf.extract_domain("url").alias("domain"),
        "url",
        Udf.remove_html(Udf.remove_http_header("content")).alias("content"),
    )
    .show(10, truncate=True)
)

+----------+--------------------+--------------------+--------------------+
|crawl_date|              domain|                 url|             content|
+----------+--------------------+--------------------+--------------------+
|  20091027|       geocities.com|http://geocities....|Sushi Land Sushi ...|
|  20091027|       geocities.com|http://geocities....|Andrea Cruz Welco...|
|  20091027|   www.geocities.com|http://www.geocit...|Hi Power Soldiers...|
|  20091027|   www.geocities.com|http://www.geocit...|12 Day Of Christm...|
|  20091027|       geocities.com|http://geocities....|Index of /jtbm71/...|
|  20091027|       geocities.com|http://geocities....|sunshine CanCMay ...|
|  20091027|www.talent-direct...|http://www.talent...|talent direct voi...|
|  20091027|       geocities.com|http://geocities....|Index of /akimi91...|
|  20091027|       geocities.com|http://geocities....|stardust CanCMay ...|
|  20091027|   www.geocities.com|http://www.geocit...|Child Youth Elbow...|
+----------+

In [18]:
# Crawl date, domain and URL alongside the result of `Udf.extract_boilerplate`
# applied to the header-stripped content. In the recorded output several rows
# have an empty "content" value.
(
    WebArchive(sc, sqlContext, "/home/nruest/Projects/au/sample-data/geocities")
    .webpages()
    .select(
        "crawl_date",
        Udf.extract_domain("url").alias("domain"),
        "url",
        Udf.extract_boilerplate(Udf.remove_http_header("content")).alias("content"),
    )
    .show(10, truncate=True)
)

+----------+--------------------+--------------------+--------------------+
|crawl_date|              domain|                 url|             content|
+----------+--------------------+--------------------+--------------------+
|  20091027|       geocities.com|http://geocities....|Nori (seaweed) wa...|
|  20091027|       geocities.com|http://geocities....| This site is about:|
|  20091027|   www.geocities.com|http://www.geocit...|Hi Power Soldiers...|
|  20091027|   www.geocities.com|http://www.geocit...|12 Day Of Christm...|
|  20091027|       geocities.com|http://geocities....|                    |
|  20091027|       geocities.com|http://geocities....|CanCMay Sunshine ...|
|  20091027|www.talent-direct...|http://www.talent...|                    |
|  20091027|       geocities.com|http://geocities....|                    |
|  20091027|       geocities.com|http://geocities....|Save Tonight Mind...|
|  20091027|   www.geocities.com|http://www.geocit...|Child Youth Elbow...|
+----------+

In [19]:
# Crawl date, domain and URL alongside the content passed through
# `Udf.remove_http_header` only; the recorded output still contains HTML
# markup, including embedded newlines that break the table layout.
(
    WebArchive(sc, sqlContext, "/home/nruest/Projects/au/sample-data/geocities")
    .webpages()
    .select(
        "crawl_date",
        Udf.extract_domain("url").alias("domain"),
        "url",
        Udf.remove_http_header("content").alias("content"),
    )
    .show(10, truncate=True)
)

+----------+--------------------+--------------------+--------------------+
|crawl_date|              domain|                 url|             content|
+----------+--------------------+--------------------+--------------------+
|  20091027|       geocities.com|http://geocities....|
<html>

<head...|
|  20091027|       geocities.com|http://geocities....|<html><head><titl...|
|  20091027|   www.geocities.com|http://www.geocit...|<html>

<head>...|
|  20091027|   www.geocities.com|http://www.geocit...|<html>

<head>...|
|  20091027|       geocities.com|http://geocities....|<!DOCTYPE HTML PU...|
|  20091027|       geocities.com|http://geocities....|<html>
<head><ti...|
|  20091027|www.talent-direct...|http://www.talent...|

<!DOCTYPE htm...|
|  20091027|       geocities.com|http://geocities....|<!DOCTYPE HTML PU...|
|  20091027|       geocities.com|http://geocities....|<html>
<head><ti...|
|  20091027|   www.geocities.com|http://www.geocit...|<html>

<head>...|
+----------+

In [20]:
# Side-by-side comparison of UDF-computed values with the columns already
# present on the images DataFrame: SHA1, MD5 and image dimensions.
# NOTE(review): in the recorded output the UDF hashes differ from the stored
# `sha1`/`md5` values and `udf_image_size` is [0, 0] while `height`/`width`
# are non-zero -- possibly "bytes" is not in the form the UDFs expect;
# confirm against the aut documentation.
(
    WebArchive(sc, sqlContext, "/home/nruest/Projects/au/sample-data/geocities")
    .images()
    .select(
        Udf.compute_sha1("bytes").alias("udf_sha1"),
        "sha1",
        Udf.compute_md5("bytes").alias("udf_md5"),
        "md5",
        Udf.compute_image_size("bytes").alias("udf_image_size"),
        "height",
        "width",
    )
    .show(10, truncate=True)
)

+--------------------+--------------------+--------------------+--------------------+--------------+------+-----+
|            udf_sha1|                sha1|             udf_md5|                 md5|udf_image_size|height|width|
+--------------------+--------------------+--------------------+--------------------+--------------+------+-----+
|99d74a3b4fbd6d7cd...|9ca2bc31550f9369e...|ce1b5ab3e51fd9f6b...|ce4c718e925105232...|        [0, 0]|   432|  288|
|245b94c90eac0dcd9...|ff0467d8d2cbc5d50...|9b8909a52d94b6d17...|f6b631a4db5f4c7a3...|        [0, 0]|   103| 1200|
|cd19e4e7e2dd9e090...|faa81452f0c19b304...|a97c139a3a31467ae...|4f59788bde58d15d5...|        [0, 0]|     1|    1|
|fd5eb52badba72a29...|0720946d3ced04976...|83ca84887072a62b9...|2677171223600bf34...|        [0, 0]|   480| 1050|
|9333370d1f79af66c...|f9aa611fc62b735c3...|586628aaae7e0076a...|0a089830419a5c0ed...|        [0, 0]|   315|  217|
|676b4a596a901024a...|5bb4bf5dfe39520a3...|dcec4d3ffac515f73...|a0210969ba9fac53a...|   

In [21]:
# Compare `Udf.detect_language` run on the "content" column with the
# pre-populated `language` column. The two can disagree (the recorded output
# has one row with "en" versus "ms").
(
    WebArchive(sc, sqlContext, "/home/nruest/Projects/au/sample-data/geocities")
    .webpages()
    .select(
        "crawl_date",
        Udf.detect_language("content").alias("udf_language"),
        "language",
    )
    .show(10, truncate=True)
)

+----------+------------+--------+
|crawl_date|udf_language|language|
+----------+------------+--------+
|  20091027|          en|      en|
|  20091027|          en|      en|
|  20091027|          en|      en|
|  20091027|          en|      en|
|  20091027|          en|      en|
|  20091027|          en|      en|
|  20091027|          en|      en|
|  20091027|          en|      ms|
|  20091027|          en|      en|
|  20091027|          en|      en|
+----------+------------+--------+
only showing top 10 rows



In [22]:
# Compare `Udf.detect_mime_type_tika` run on the "bytes" column with the
# pre-populated `mime_type_tika` column.
(
    WebArchive(sc, sqlContext, "/home/nruest/Projects/au/sample-data/geocities")
    .all()
    .select(
        "crawl_date",
        Udf.detect_mime_type_tika("bytes").alias("udf_tika"),
        "mime_type_tika",
    )
    .show(10, truncate=True)
)

+----------+--------------------+--------------------+
|crawl_date|            udf_tika|      mime_type_tika|
+----------+--------------------+--------------------+
|  20091027|           text/html|           text/html|
|  20091027|           text/html|           text/html|
|  20091027|           text/html|           text/html|
|  20091027|           text/html|           text/html|
|  20091027|           text/html|           text/html|
|  20091027|           text/html|           text/html|
|  20091027|application/xhtml...|application/xhtml...|
|  20091027|           text/html|           text/html|
|  20091027|           text/html|           text/html|
|  20091027|           text/html|           text/html|
+----------+--------------------+--------------------+
only showing top 10 rows



In [23]:
# One row per link extracted from each page, printed in full. Each recorded
# entry has three parts: the page URL, the link target and the anchor text.
(
    WebArchive(sc, sqlContext, "/home/nruest/Projects/au/sample-data/geocities")
    .webpages()
    .select(
        explode(Udf.extract_links("url", "content")).alias("links")
    )
    .show(10, truncate=False)
)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|links                                                                                                                                                                                                              |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[http://geocities.com/babiekaos/Links.html, http://www.sushilinks.com, Sushi Links]                                                                                                                                |
|[http://geocities.com/babiekaos/Links.html, http://www.eatsushi.com, Eat Sushi]                                                                

In [24]:
# One row per image link extracted from each page, printed in full. Each
# recorded entry starts with the page URL followed by the image URL.
(
    WebArchive(sc, sqlContext, "/home/nruest/Projects/au/sample-data/geocities")
    .webpages()
    .select(
        explode(Udf.extract_image_links("url", "content")).alias("image_links")
    )
    .show(10, truncate=False)
)

+---------------------------------------------------------------------------------------------------------------------------------------------------+
|image_links                                                                                                                                        |
+---------------------------------------------------------------------------------------------------------------------------------------------------+
|[http://geocities.com/babiekaos/Links.html, http://geocities.com/babiekaos/PiX/sushi-index.jpg, ]                                                  |
|[http://geocities.com/babiekaos/Links.html, file:/E|/WEB%20Stuff/templates/137/spaceline.gif, ]                                                    |
|[http://geocities.com/babiekaos/Links.html, http://visit.geocities.yahoo.com/visit.gif?us1256654020, setstats]                                     |
|[http://geocities.com/babiekaos/Links.html, http://geo.yahoo.com/serv?s=76001079&t=1256654020&f=us-

In [25]:
# Language codes to keep.
languages = ["es", "fr"]

# Pages whose `language` value is one of `languages`; the filter runs before
# the projection, and `language` is kept in the output for inspection.
(
    WebArchive(sc, sqlContext, "/home/nruest/Projects/au/sample-data/geocities")
    .webpages()
    .filter(col("language").isin(languages))
    .select(
        "crawl_date",
        Udf.extract_domain("url").alias("domain"),
        "url",
        "language",
    )
    .show(50, truncate=True)
)

+----------+-----------------+--------------------+--------+
|crawl_date|           domain|                 url|language|
+----------+-----------------+--------------------+--------+
|  20091027|    geocities.com|http://geocities....|      es|
|  20091027|    geocities.com|http://geocities....|      es|
|  20091027|    geocities.com|http://geocities....|      es|
|  20091027|www.geocities.com|http://www.geocit...|      es|
|  20091027|www.geocities.com|http://www.geocit...|      es|
|  20091027|www.geocities.com|http://www.geocit...|      es|
|  20091027|    geocities.com|http://geocities....|      fr|
|  20091027|    geocities.com|http://geocities....|      es|
|  20091027|www.geocities.com|http://www.geocit...|      es|
|  20091027|www.geocities.com|http://www.geocit...|      es|
|  20091027|    geocities.com|http://geocities....|      fr|
|  20091027|    geocities.com|http://geocities....|      es|
|  20091027|    geocities.com|http://geocities....|      fr|
|  20091027|    geocitie

In [26]:
# Language codes to exclude.
languages = ["es", "fr"]

# Inverse of the previous selection: `~` negates the `isin` test, so only
# pages whose `language` is NOT in `languages` are shown.
(
    WebArchive(sc, sqlContext, "/home/nruest/Projects/au/sample-data/geocities")
    .webpages()
    .filter(~col("language").isin(languages))
    .select(
        "crawl_date",
        Udf.extract_domain("url").alias("domain"),
        "url",
        "language",
    )
    .show(50, truncate=True)
)

+----------+--------------------+--------------------+--------+
|crawl_date|              domain|                 url|language|
+----------+--------------------+--------------------+--------+
|  20091027|       geocities.com|http://geocities....|      en|
|  20091027|       geocities.com|http://geocities....|      en|
|  20091027|   www.geocities.com|http://www.geocit...|      en|
|  20091027|   www.geocities.com|http://www.geocit...|      en|
|  20091027|       geocities.com|http://geocities....|      en|
|  20091027|       geocities.com|http://geocities....|      en|
|  20091027|www.talent-direct...|http://www.talent...|      en|
|  20091027|       geocities.com|http://geocities....|      ms|
|  20091027|       geocities.com|http://geocities....|      en|
|  20091027|   www.geocities.com|http://www.geocit...|      en|
|  20091027|   www.geocities.com|http://www.geocit...|      en|
|  20091027|       geocities.com|http://geocities....|      en|
|  20091027|       geocities.com|http://