In [1]:
from aut import *
from pyspark.sql.functions import col, desc

In [2]:
# Load the GeoCities sample collection and derive its web pages DataFrame.
webpages = WebArchive(
    sc,
    sqlContext,
    "/home/nruest/Projects/au/sample-data/geocities",
).webpages()

In [3]:
# Load the GeoCities sample collection and derive its web graph DataFrame
# (one row per link, with "crawl_date", "src" and "dest" used below).
webgraph = WebArchive(
    sc,
    sqlContext,
    "/home/nruest/Projects/au/sample-data/geocities",
).webgraph()

In [4]:
# Domains file: count web pages per domain (www prefix removed), order the
# domains from most to least frequent, and write the result as CSV.
# NOTE: Spark's default save mode raises an error if "all-domains" already
# exists, so remove that directory before re-running this cell.
(
    webpages
    .groupBy(remove_prefix_www(extract_domain("url")).alias("url"))
    .count()
    .orderBy(desc("count"))
    .write.csv("all-domains")
)

In [5]:
# Full-text: for every web page, write the crawl date, domain, URL and the
# page content with the HTTP header and HTML markup removed, as CSV.
# NOTE: Spark's default save mode raises an error if "full-text" already
# exists, so remove that directory before re-running this cell.
(
    webpages
    .select(
        "crawl_date",
        remove_prefix_www(extract_domain("url")).alias("domain"),
        "url",
        remove_html(remove_http_header("content")).alias("content"),
    )
    .write.csv("full-text")
)

In [6]:
# Build the domain-to-domain link graph used for the GraphML/GEXF output:
# count links per (crawl_date, src_domain, dest_domain), drop rows whose
# source or destination domain is null or empty, keep only edges seen more
# than 5 times, and order them from most to least frequent.
graph = (
    webgraph
    .groupBy(
        "crawl_date",
        remove_prefix_www(extract_domain("src")).alias("src_domain"),
        remove_prefix_www(extract_domain("dest")).alias("dest_domain"),
    )
    .count()
    .where(col("dest_domain").isNotNull() & (col("dest_domain") != ""))
    .where(col("src_domain").isNotNull() & (col("src_domain") != ""))
    .where(col("count") > 5)
    .sort(col("count").desc())
)

In [7]:
# Serialise the link graph as GraphML.
# collect() pulls every row of `graph` onto the driver before writing.
WriteGraphML(
    graph.collect(),
    "test.graphml",
)

In [8]:
# Write the GEXF out to a file.

# Not part of the auk standard derivative process,
# but part of testing https://github.com/archivesunleashed/aut/pull/466.

# collect() pulls every row of `graph` onto the driver before writing.
WriteGEXF(
    graph.collect(),
    "test.gexf",
)