From 2a1fcd7f02239d9e6bb742694ea059904f129a1d Mon Sep 17 00:00:00 2001
From: Nick Ruest
Date: Fri, 29 May 2020 14:23:25 -0400
Subject: [PATCH] Updates for
 https://github.com/archivesunleashed/aut/issues/467 (#66)

---
 current/collection-analysis.md  |  2 +-
 current/link-analysis.md        | 16 ++++++++--------
 current/standard-derivatives.md |  6 +++---
 current/text-analysis.md        | 18 +++++++++---------
 4 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/current/collection-analysis.md b/current/collection-analysis.md
index be68625..0f95269 100644
--- a/current/collection-analysis.md
+++ b/current/collection-analysis.md
@@ -95,7 +95,7 @@ from pyspark.sql.functions import desc
 
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webpages()\
-  .select(Udf.extract_domain("url").alias("Domain"))\
+  .select(extract_domain("url").alias("Domain"))\
   .groupBy("Domain")\
   .count()\
   .sort(desc("count"))\
diff --git a/current/link-analysis.md b/current/link-analysis.md
index 127f7b4..27b04df 100644
--- a/current/link-analysis.md
+++ b/current/link-analysis.md
@@ -106,8 +106,8 @@ content = "%radio%"
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webpages()\
   .filter(col("content").like(content))\
-  .select(explode(Udf.extract_links("url", "content")).alias("links"))\
-  .select(Udf.remove_prefix_www(Udf.extract_domain(col("links._1"))).alias("src"), Udf.remove_prefix_www(Udf.extract_domain(col("links._2"))).alias("dest"))\
+  .select(explode(extract_links("url", "content")).alias("links"))\
+  .select(remove_prefix_www(extract_domain(col("links._1"))).alias("src"), remove_prefix_www(extract_domain(col("links._2"))).alias("dest"))\
   .groupBy("src", "dest")\
   .count()\
   .filter(col("count") > 5)\
@@ -170,7 +170,7 @@ from pyspark.sql.functions import col
 
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webgraph()\
-  .groupBy(Udf.extract_domain("src"), Udf.extract_domain("dest"))\
+  .groupBy(extract_domain("src"), extract_domain("dest"))\
   .count()\
   .filter(col("count") > 5)\
   .write.csv("full-links-all-df/")
@@ -230,8 +230,8 @@ url_pattern = "%http://www.archive.org/details/%"
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webpages()\
   .filter(col("url").like(url_pattern))\
-  .select(explode(Udf.extract_links("url", "content").alias("links")))\
-  .select(Udf.remove_prefix_www(Udf.extract_domain(col("links._1"))).alias("src"), Udf.remove_prefix_www(Udf.extract_domain("links._2")).alias("dest"))\
+  .select(explode(extract_links("url", "content")).alias("links"))\
+  .select(remove_prefix_www(extract_domain(col("links._1"))).alias("src"), remove_prefix_www(extract_domain("links._2")).alias("dest"))\
   .groupBy("src", "dest")\
   .count()\
   .filter(col("count") > 5)\
@@ -311,7 +311,7 @@ from pyspark.sql.functions import col
 
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webgraph()\
-  .groupBy("crawl_date", Udf.remove_prefix_www(Udf.extract_domain("src")).alias("src"), Udf.remove_prefix_www(Udf.extract_domain("dest")).alias("dest"))\
+  .groupBy("crawl_date", remove_prefix_www(extract_domain("src")).alias("src"), remove_prefix_www(extract_domain("dest")).alias("dest"))\
   .count()\
   .filter(col("count") > 5)\
   .write.csv("sitelinks-by-date-df/")
@@ -369,8 +369,8 @@ url_pattern = "http://www.archive.org/details/.*"
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webpages()\
   .filter(col("url").rlike(url_pattern))\
-  .select(explode(Udf.extract_links("url", "content")).alias("links"))\
-  .select(Udf.remove_prefix_www(Udf.extract_domain(col("links._1"))).alias("src"), Udf.remove_prefix_www(Udf.extract_domain(col("links._2"))).alias("dest"))\
+  .select(explode(extract_links("url", "content")).alias("links"))\
+  .select(remove_prefix_www(extract_domain(col("links._1"))).alias("src"), remove_prefix_www(extract_domain(col("links._2"))).alias("dest"))\
   .groupBy("src", "dest")\
   .count()\
   .filter(col("count") > 5)\
diff --git a/current/standard-derivatives.md b/current/standard-derivatives.md
index c9415b1..a854330 100644
--- a/current/standard-derivatives.md
+++ b/current/standard-derivatives.md
@@ -118,17 +118,17 @@ webpages = WebArchive(sc, sqlContext, "/path/to/data").webpages()
 webgraph = WebArchive(sc, sqlContext, "/path/to/data").webgraph()
 
 # Domains file.
-webpages.groupBy(Udf.remove_prefix_www(Udf.extract_domain("url")).alias("url"))\
+webpages.groupBy(remove_prefix_www(extract_domain("url")).alias("url"))\
   .count()\
   .sort(col("count").desc())\
   .write.csv("/path/to/derivatives/auk/all-domains/output")
 
 # Full-text.
-webpages.select("crawl_date", Udf.remove_prefix_www(Udf.extract_domain("url")).alias("domain"), "url", Udf.remove_html(Udf.remove_http_header("content")).alias("content"))\
+webpages.select("crawl_date", remove_prefix_www(extract_domain("url")).alias("domain"), "url", remove_html(remove_http_header("content")).alias("content"))\
   .write.csv("/path/to/derivatives/auk/full-text/output")
 
 # Create DataFrame for GraphML output
-graph = webgraph.groupBy("crawl_date", Udf.remove_prefix_www(Udf.extract_domain("src")).alias("src_domain"), Udf.remove_prefix_www(Udf.extract_domain("dest")).alias("dest_domain"))\
+graph = webgraph.groupBy("crawl_date", remove_prefix_www(extract_domain("src")).alias("src_domain"), remove_prefix_www(extract_domain("dest")).alias("dest_domain"))\
   .count()\
   .filter((col("dest_domain").isNotNull()) & (col("dest_domain") !=""))\
   .filter((col("src_domain").isNotNull()) & (col("src_domain") !=""))\
diff --git a/current/text-analysis.md b/current/text-analysis.md
index 3c217fc..afe8d16 100644
--- a/current/text-analysis.md
+++ b/current/text-analysis.md
@@ -55,7 +55,7 @@ from aut import *
 
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webpages()\
-  .select("crawl_date", Udf.extract_domain("url").alias("domain"), "url", Udf.remove_html("content").alias("content"))\
+  .select("crawl_date", extract_domain("url").alias("domain"), "url", remove_html("content").alias("content"))\
   .write.csv("plain-text-df/")
 ```
 
@@ -98,7 +98,7 @@ from aut import *
 
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webpages()\
-  .select(Udf.remove_html(Udf.remove_http_header("content")).alias("content"))\
+  .select(remove_html(remove_http_header("content")).alias("content"))\
   .write.csv("plain-text-noheaders-df/")
 ```
 
@@ -147,7 +147,7 @@ domains = ["www.archive.org"]
 
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webpages()\
-  .select("crawl_date", Udf.extract_domain("url").alias("domain"), "url", Udf.remove_html(Udf.remove_http_header("content")).alias("content"))\
+  .select("crawl_date", extract_domain("url").alias("domain"), "url", remove_html(remove_http_header("content")).alias("content"))\
   .filter(col("domain").isin(domains))\
   .write.csv("plain-text-domain-df/")
 ```
@@ -199,7 +199,7 @@ url_pattern = "%http://www.archive.org/details/%"
 
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webpages()\
-  .select("crawl_date", Udf.extract_domain("url").alias("domain"), "url", Udf.remove_html(Udf.remove_http_header("content")).alias("content"))\
+  .select("crawl_date", extract_domain("url").alias("domain"), "url", remove_html(remove_http_header("content")).alias("content"))\
   .filter(col("url").like(url_pattern))\
   .write.csv("details-df/")
 ```
@@ -247,7 +247,7 @@ from aut import *
 
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webpages()\
-  .select("crawl_date", Udf.extract_domain("url").alias("domain"), "url", Udf.extract_boilerplate(Udf.remove_http_header("content")).alias("content"))\
+  .select("crawl_date", extract_domain("url").alias("domain"), "url", extract_boilerplate(remove_http_header("content")).alias("content"))\
   .write.csv("plain-text-no-boilerplate-df/")
 ```
 
@@ -338,7 +338,7 @@ dates = "2009[10][09]\d\d"
 
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webpages()\
-  .select("crawl_date", Udf.extract_domain("url").alias("domain"), "url", Udf.remove_html(Udf.remove_http_header("content")).alias("content"))\
+  .select("crawl_date", extract_domain("url").alias("domain"), "url", remove_html(remove_http_header("content")).alias("content"))\
   .filter(col("crawl_date").rlike(dates))\
   .write.csv("plain-text-date-filtered-2008-2015-df/")
 ```
@@ -406,7 +406,7 @@ languages = ["fr"]
 
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webpages()\
-  .select("crawl_date", Udf.extract_domain("url").alias("domain"), "url", Udf.remove_html(Udf.remove_http_header("content")).alias("content"))\
+  .select("crawl_date", extract_domain("url").alias("domain"), "url", remove_html(remove_http_header("content")).alias("content"))\
   .filter(col("domain").isin(domains))\
   .filter(col("language").isin(languages))\
   .write.csv("plain-text-fr-df/")
@@ -462,7 +462,7 @@ content = "%radio%"
 
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webpages()\
-  .select("crawl_date", Udf.extract_domain("url").alias("domain"), "url", Udf.remove_html(Udf.remove_http_header("content")).alias("content"))\
+  .select("crawl_date", extract_domain("url").alias("domain"), "url", remove_html(remove_http_header("content")).alias("content"))\
   .filter(col("content").like(content))\
   .write.csv("plain-text-radio-df/")
 ```
@@ -507,6 +507,6 @@ from aut import *
 
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webpages()\
-  .select("crawl_date", Udf.extract_domain("url").alias("domain"), "url", Udf.remove_http_header("content").alias("content"))\
+  .select("crawl_date", extract_domain("url").alias("domain"), "url", remove_http_header("content").alias("content"))\
   .write.csv("plain-html-df/")
 ```
\ No newline at end of file
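
Every hunk above makes the same mechanical change: aut's Python UDFs are no longer reached through a `Udf` object and are instead available directly in the namespace via the `from aut import *` line each script already carries, so `Udf.extract_domain(...)` becomes `extract_domain(...)`. A minimal sketch of the updated call style, assuming a PySpark shell where `sc` and `sqlContext` are already defined; `/path/to/warcs` and the output directory are placeholders:

```python
from aut import *
from pyspark.sql.functions import desc

# Tally pages per domain using the flat (post-#467) function names;
# extract_domain and remove_prefix_www come from the star import above.
WebArchive(sc, sqlContext, "/path/to/warcs")\
  .webpages()\
  .select(remove_prefix_www(extract_domain("url")).alias("domain"))\
  .groupBy("domain")\
  .count()\
  .sort(desc("count"))\
  .write.csv("domain-counts-df/")  # arbitrary output path for this sketch
```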