In [1]:
# NOTE(review): `!` runs the command in a throwaway subshell, so this `cd` does
# not change the kernel's working directory (the next cell still addresses the
# data as ../data). Use the `%cd` magic if a directory change is really intended.
! cd ..

In [2]:
# Show only the directory layout (-d) of the data folder, one level up from here.
! tree -d ../data

[01;36m../data[00m
├── [01;34mbzipped[00m
├── [01;34menwiki[00m
│   ├── [01;34mcategorylinks[00m
│   ├── [01;34mcategorypages[00m
│   ├── [01;34mpagelinks[00m
│   └── [01;34mpages[00m
├── [01;34mprocessed[00m
│   ├── [01;34mcategorylinks[00m
│   ├── [01;34mpage[00m
│   │   ├── [01;34mcategory_pages[00m
│   │   └── [01;34mnormal_pages[00m
│   └── [01;34mpagelinks[00m
└── [01;34mraw[00m

13 directories


# Read data with schema

In [3]:
from pyspark.sql import SparkSession

# Obtain the Spark session used by every cell below; getOrCreate() reuses an
# already-active session instead of starting a second one.
spark = SparkSession.builder.getOrCreate()

In [28]:
data_dir = "../data"


def _read_tsv(path, schema):
    """Load the tab-separated files under `path` as a DataFrame using the DDL `schema`."""
    return spark.read.csv(path, schema=schema, sep="\t")


# NOTE(review): in the sample output `dest` holds numeric ids, yet it is read
# as STRING here (pagelinks reads its `dest` as INT) — confirm this is intended.
catlinks = _read_tsv(
    f"{data_dir}/processed/categorylinks",
    "from INT, title STRING, dest STRING, type STRING",
)
pagelinks = _read_tsv(f"{data_dir}/processed/pagelinks", "from INT, dest INT")

# Category pages and normal pages share the same four-column layout.
catpages = _read_tsv(
    f"{data_dir}/processed/page/category_pages",
    "id INT, title STRING, is_redirect BOOLEAN, is_new BOOLEAN",
)
pages = _read_tsv(
    f"{data_dir}/processed/page/normal_pages",
    "id INT, title STRING, is_redirect BOOLEAN, is_new BOOLEAN",
)

# Preview the first five rows of each table, in the order they were loaded.
for frame in (catlinks, pagelinks, catpages, pages):
    frame.show(n=5)

+--------+--------------------+--------+------+
|    from|               title|    dest|  type|
+--------+--------------------+--------+------+
| 2137402|1000_V_DC_railway...|57839957|subcat|
|51991420|1000_V_DC_railway...|57839957|  page|
|25064564|1000_V_DC_railway...|57839957|  page|
|57839948|1000_V_DC_railway...|57839957|subcat|
|   60340|1000_V_DC_railway...|57839957|  page|
+--------+--------------------+--------+------+
only showing top 5 rows

+----+--------+
|from|    dest|
+----+--------+
| 877|  559437|
| 877|   32693|
| 877|51097219|
| 877|  167496|
| 877|  278728|
+----+--------+
only showing top 5 rows

+--------+--------------------+-----------+------+
|      id|               title|is_redirect|is_new|
+--------+--------------------+-----------+------+
|52356933|Christian_mission...|      false|  true|
|52356969|Roman_Catholic_mi...|      false|  true|
|52356978|Christian_mission...|      false| false|
|52356986|Women_scientists_...|      false| false|
|52356994|West_Be

## Counts

In [23]:
# Row count of the normal (non-category) pages table.
pages.count()

5913792

In [24]:
# Row count of the page-to-page link table (one row per from/dest pair).
pagelinks.count()

491945166

In [25]:
# Row count of the category pages table.
catpages.count()

1851072

In [26]:
# Row count of the category link table.
catlinks.count()

67380060

## Repartition data

The data is repartitioned into chunks of roughly 250 MB, a reasonable size for network transfer.

In [33]:
! du -h data/processed

 23M	data/processed/page/category_pages
 99M	data/processed/page/normal_pages
122M	data/processed/page
2.2G	data/processed/pagelinks
493M	data/processed/categorylinks
2.8G	data/processed


In [29]:
# Export pagelinks as parquet in 8 range partitions keyed on (from, dest),
# replacing any output left by a previous run.
(
    pagelinks.repartitionByRange(8, "from", "dest")
    .write
    .mode("overwrite")
    .parquet(f"{data_dir}/enwiki/pagelinks")
)

In [30]:
# Export categorylinks as parquet in 2 range partitions keyed on (from, dest);
# the TSV input is ~493M, so two files land near the ~250 MB target.
# mode="overwrite" matches the pagelinks export and lets this cell be re-run:
# the default mode raises once the output directory already exists.
(
    catlinks
    .repartitionByRange(2, "from", "dest")
    .write.parquet(f"{data_dir}/enwiki/categorylinks", mode="overwrite")
)

In [31]:
# Export the normal pages as a single parquet file; the TSV input is only
# ~99M, well under the ~250 MB target.
# mode="overwrite" matches the pagelinks export and lets this cell be re-run:
# the default mode raises once the output directory already exists.
(
    pages
    .repartitionByRange(1, "id")
    .write.parquet(f"{data_dir}/enwiki/pages", mode="overwrite")
)

In [32]:
# Export the category pages as a single parquet file; the TSV input is only
# ~23M, well under the ~250 MB target.
# mode="overwrite" matches the pagelinks export and lets this cell be re-run:
# the default mode raises once the output directory already exists.
(
    catpages
    .repartitionByRange(1, "id")
    .write.parquet(f"{data_dir}/enwiki/categorypages", mode="overwrite")
)

In [37]:
! tree -h data/enwiki

[01;34mdata/enwiki[00m
├── [ 256]  [01;34mcategorylinks[00m
│   ├── [   0]  _SUCCESS
│   ├── [222M]  part-00000-5c4ff747-a7d2-48b6-8e21-e82a4b8840dd-c000.snappy.parquet
│   └── [233M]  part-00001-5c4ff747-a7d2-48b6-8e21-e82a4b8840dd-c000.snappy.parquet
├── [ 192]  [01;34mcategorypages[00m
│   ├── [   0]  _SUCCESS
│   └── [ 32M]  part-00000-e856fb1e-f22c-4c36-9ed1-5621bab499a9-c000.snappy.parquet
├── [ 640]  [01;34mpagelinks[00m
│   ├── [   0]  _SUCCESS
│   ├── [248M]  part-00000-3255198c-5b01-46f9-9a29-0e7e6c3d24c1-c000.snappy.parquet
│   ├── [231M]  part-00001-3255198c-5b01-46f9-9a29-0e7e6c3d24c1-c000.snappy.parquet
│   ├── [245M]  part-00002-3255198c-5b01-46f9-9a29-0e7e6c3d24c1-c000.snappy.parquet
│   ├── [247M]  part-00003-3255198c-5b01-46f9-9a29-0e7e6c3d24c1-c000.snappy.parquet
│   ├── [248M]  part-00004-3255198c-5b01-46f9-9a29-0e7e6c3d24c1-c000.snappy.parquet
│   ├── [247M]  part-00005-3255198c-5b01-46f9-9a29-0e7e6c3d24c1-c000.snappy.parquet
│   ├── [251M] 