Commit 2a1fcd7

ruebot committed May 29, 2020
1 parent 8956541 commit 2a1fcd7
Showing 4 changed files with 21 additions and 21 deletions.
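
This commit drops the `Udf.` qualifier from the toolkit's Python DataFrame functions, so the documentation examples now call `extract_domain`, `extract_links`, `remove_prefix_www`, `remove_html`, `remove_http_header`, and `extract_boilerplate` directly. A minimal sketch of the updated call style, assuming a PySpark shell session where `sc` and `sqlContext` are already defined and where `from aut import *` exposes these functions at the top level (as the docs' own import lines imply); the output path is a placeholder:

```python
# Sketch only: assumes a PySpark shell where sc and sqlContext exist,
# and that `from aut import *` exposes the DataFrame functions directly.
from aut import *
from pyspark.sql.functions import desc

# Count pages per domain with the new unqualified call style,
# replacing the old Udf.extract_domain form.
WebArchive(sc, sqlContext, "/path/to/warcs")\
  .webpages()\
  .select(remove_prefix_www(extract_domain("url")).alias("domain"))\
  .groupBy("domain")\
  .count()\
  .sort(desc("count"))\
  .write.csv("domain-counts-df/")  # hypothetical output path
```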
2 changes: 1 addition & 1 deletion current/collection-analysis.md

@@ -95,7 +95,7 @@ from pyspark.sql.functions import desc
 
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webpages()\
-  .select(Udf.extract_domain("url").alias("Domain"))\
+  .select(extract_domain("url").alias("Domain"))\
   .groupBy("Domain")\
   .count()\
   .sort(desc("count"))\
16 changes: 8 additions & 8 deletions current/link-analysis.md

@@ -106,8 +106,8 @@ content = "%radio%"
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webpages()\
   .filter(col("content").like(content))\
-  .select(explode(Udf.extract_links("url", "content")).alias("links"))\
-  .select(Udf.remove_prefix_www(Udf.extract_domain(col("links._1"))).alias("src"), Udf.remove_prefix_www(Udf.extract_domain(col("links._2"))).alias("dest"))\
+  .select(explode(extract_links("url", "content")).alias("links"))\
+  .select(remove_prefix_www(extract_domain(col("links._1"))).alias("src"), remove_prefix_www(extract_domain(col("links._2"))).alias("dest"))\
   .groupBy("src", "dest")\
   .count()\
   .filter(col("count") > 5)\
@@ -170,7 +170,7 @@ from pyspark.sql.functions import col
 
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webgraph()\
-  .groupBy(Udf.extract_domain("src"), Udf.extract_domain("dest"))\
+  .groupBy(extract_domain("src"), extract_domain("dest"))\
   .count()\
   .filter(col("count") > 5)\
   .write.csv("full-links-all-df/")
@@ -230,8 +230,8 @@ url_pattern = "%http://www.archive.org/details/%"
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webpages()\
   .filter(col("url").like(url_pattern))\
-  .select(explode(Udf.extract_links("url", "content")).alias("links"))\
-  .select(Udf.remove_prefix_www(Udf.extract_domain(col("links._1"))).alias("src"), Udf.remove_prefix_www(Udf.extract_domain(col("links._2"))).alias("dest"))\
+  .select(explode(extract_links("url", "content")).alias("links"))\
+  .select(remove_prefix_www(extract_domain(col("links._1"))).alias("src"), remove_prefix_www(extract_domain(col("links._2"))).alias("dest"))\
   .groupBy("src", "dest")\
   .count()\
   .filter(col("count") > 5)\
@@ -311,7 +311,7 @@ from pyspark.sql.functions import col
 
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webgraph()\
-  .groupBy("crawl_date", Udf.remove_prefix_www(Udf.extract_domain("src")).alias("src"), Udf.remove_prefix_www(Udf.extract_domain("dest")).alias("dest"))\
+  .groupBy("crawl_date", remove_prefix_www(extract_domain("src")).alias("src"), remove_prefix_www(extract_domain("dest")).alias("dest"))\
   .count()\
   .filter(col("count") > 5)\
   .write.csv("sitelinks-by-date-df/")
@@ -369,8 +369,8 @@ url_pattern = "http://www.archive.org/details/.*"
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webpages()\
   .filter(col("url").rlike(url_pattern))\
-  .select(explode(Udf.extract_links("url", "content")).alias("links"))\
-  .select(Udf.remove_prefix_www(Udf.extract_domain(col("links._1"))).alias("src"), Udf.remove_prefix_www(Udf.extract_domain(col("links._2"))).alias("dest"))\
+  .select(explode(extract_links("url", "content")).alias("links"))\
+  .select(remove_prefix_www(extract_domain(col("links._1"))).alias("src"), remove_prefix_www(extract_domain(col("links._2"))).alias("dest"))\
   .groupBy("src", "dest")\
   .count()\
   .filter(col("count") > 5)\
6 changes: 3 additions & 3 deletions current/standard-derivatives.md

@@ -118,17 +118,17 @@ webpages = WebArchive(sc, sqlContext, "/path/to/data").webpages()
 webgraph = WebArchive(sc, sqlContext, "/path/to/data").webgraph()
 
 # Domains file.
-webpages.groupBy(Udf.remove_prefix_www(Udf.extract_domain("url")).alias("url"))\
+webpages.groupBy(remove_prefix_www(extract_domain("url")).alias("url"))\
   .count()\
   .sort(col("count").desc())\
   .write.csv("/path/to/derivatives/auk/all-domains/output")
 
 # Full-text.
-webpages.select("crawl_date", Udf.remove_prefix_www(Udf.extract_domain("url")).alias("domain"), "url", Udf.remove_html(Udf.remove_http_header("content")).alias("content"))\
+webpages.select("crawl_date", remove_prefix_www(extract_domain("url")).alias("domain"), "url", remove_html(remove_http_header("content")).alias("content"))\
   .write.csv("/path/to/derivatives/auk/full-text/output")
 
 # Create DataFrame for GraphML output
-graph = webgraph.groupBy("crawl_date", Udf.remove_prefix_www(Udf.extract_domain("src")).alias("src_domain"), Udf.remove_prefix_www(Udf.extract_domain("dest")).alias("dest_domain"))\
+graph = webgraph.groupBy("crawl_date", remove_prefix_www(extract_domain("src")).alias("src_domain"), remove_prefix_www(extract_domain("dest")).alias("dest_domain"))\
   .count()\
   .filter((col("dest_domain").isNotNull()) & (col("dest_domain") != ""))\
   .filter((col("src_domain").isNotNull()) & (col("src_domain") != ""))\
18 changes: 9 additions & 9 deletions current/text-analysis.md

@@ -55,7 +55,7 @@ from aut import *
 
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webpages()\
-  .select("crawl_date", Udf.extract_domain("url").alias("domain"), "url", Udf.remove_html("content").alias("content"))\
+  .select("crawl_date", extract_domain("url").alias("domain"), "url", remove_html("content").alias("content"))\
   .write.csv("plain-text-df/")
 ```

@@ -98,7 +98,7 @@ from aut import *
 
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webpages()\
-  .select(Udf.remove_html(Udf.remove_http_header("content")).alias("content"))\
+  .select(remove_html(remove_http_header("content")).alias("content"))\
   .write.csv("plain-text-noheaders-df/")
 ```

@@ -147,7 +147,7 @@ domains = ["www.archive.org"]
 
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webpages()\
-  .select("crawl_date", Udf.extract_domain("url").alias("domain"), "url", Udf.remove_html(Udf.remove_http_header("content")).alias("content"))\
+  .select("crawl_date", extract_domain("url").alias("domain"), "url", remove_html(remove_http_header("content")).alias("content"))\
   .filter(col("domain").isin(domains))\
   .write.csv("plain-text-domain-df/")
 ```
@@ -199,7 +199,7 @@ url_pattern = "%http://www.archive.org/details/%"
 
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webpages()\
-  .select("crawl_date", Udf.extract_domain("url").alias("domain"), "url", Udf.remove_html(Udf.remove_http_header("content")).alias("content"))\
+  .select("crawl_date", extract_domain("url").alias("domain"), "url", remove_html(remove_http_header("content")).alias("content"))\
   .filter(col("url").like(url_pattern))\
   .write.csv("details-df/")
 ```
@@ -247,7 +247,7 @@ from aut import *
 
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webpages()\
-  .select("crawl_date", Udf.extract_domain("url").alias("domain"), "url", Udf.extract_boilerplate(Udf.remove_http_header("content")).alias("content"))\
+  .select("crawl_date", extract_domain("url").alias("domain"), "url", extract_boilerplate(remove_http_header("content")).alias("content"))\
   .write.csv("plain-text-no-boilerplate-df/")
 ```

@@ -338,7 +338,7 @@ dates = "2009[10][09]\d\d"
 
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webpages()\
-  .select("crawl_date", Udf.extract_domain("url").alias("domain"), "url", Udf.remove_html(Udf.remove_http_header("content")).alias("content"))\
+  .select("crawl_date", extract_domain("url").alias("domain"), "url", remove_html(remove_http_header("content")).alias("content"))\
   .filter(col("crawl_date").rlike(dates))\
   .write.csv("plain-text-date-filtered-2008-2015-df/")
 ```
@@ -406,7 +406,7 @@ languages = ["fr"]
 
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webpages()\
-  .select("crawl_date", Udf.extract_domain("url").alias("domain"), "url", Udf.remove_html(Udf.remove_http_header("content")).alias("content"))\
+  .select("crawl_date", extract_domain("url").alias("domain"), "url", remove_html(remove_http_header("content")).alias("content"))\
   .filter(col("domain").isin(domains))\
   .filter(col("language").isin(languages))\
   .write.csv("plain-text-fr-df/")
@@ -462,7 +462,7 @@ content = "%radio%"
 
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webpages()\
-  .select("crawl_date", Udf.extract_domain("url").alias("domain"), "url", Udf.remove_html(Udf.remove_http_header("content")).alias("content"))\
+  .select("crawl_date", extract_domain("url").alias("domain"), "url", remove_html(remove_http_header("content")).alias("content"))\
   .filter(col("content").like(content))\
   .write.csv("plain-text-radio-df/")
 ```
@@ -507,6 +507,6 @@ from aut import *
 
 WebArchive(sc, sqlContext, "/path/to/warcs")\
   .webpages()\
-  .select("crawl_date", Udf.extract_domain("url").alias("domain"), "url", Udf.remove_http_header("content").alias("content"))\
+  .select("crawl_date", extract_domain("url").alias("domain"), "url", remove_http_header("content").alias("content"))\
   .write.csv("plain-html-df/")
 ```
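
For anyone updating scripts written against the older docs, the change is mechanical: drop the `Udf.` qualifier wherever one of these functions is called. A quick sanity-check sketch, assuming the same PySpark shell session the docs assume; the old import form is not shown in this diff, so it is only indicated in a comment, and the path is a placeholder:

```python
# Before (removed above): calls were qualified, e.g.
#   df.select(Udf.extract_domain("url").alias("domain"))
# After (added above): call the functions directly.
from aut import *  # per the docs, exposes extract_domain and friends

# sc and sqlContext come from the PySpark shell session.
df = WebArchive(sc, sqlContext, "/path/to/warcs").webpages()
df.select(extract_domain("url").alias("domain"))\
  .show(10, False)  # plain PySpark: print 10 rows without truncation
```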
