In [1]:
! pip install ..

Processing /Users/amiyaguchi/Research/wiki-forecast
Building wheels for collected packages: wiki-forecast
  Building wheel for wiki-forecast (setup.py) ... [?25ldone
[?25h  Created wheel for wiki-forecast: filename=wiki_forecast-0.3.3-cp36-none-any.whl size=27505 sha256=4704109d5996746410e950fc97dc24be551c4184a38c7a01e3de9f5c516f9a99
  Stored in directory: /Users/amiyaguchi/Library/Caches/pip/wheels/4f/a1/40/0ace60fd57f0bb1ff87e264f714aa5255c9d115906476a9f13
Successfully built wiki-forecast
Installing collected packages: wiki-forecast
  Found existing installation: wiki-forecast 0.3.3
    Uninstalling wiki-forecast-0.3.3:
      Successfully uninstalled wiki-forecast-0.3.3
Successfully installed wiki-forecast-0.3.3


In [2]:
# List the files (with human-readable sizes) that make up one clustered sample.
!tree -h ../data/clustered/sample_2_8_50

[01;34m../data/clustered/sample_2_8_50[00m
├── [   0]  _SUCCESS
├── [ 61M]  part-00000-da9ff087-c6ae-4dad-b9bd-d796a7ac0c15-c000.snappy.parquet
├── [ 61M]  part-00001-da9ff087-c6ae-4dad-b9bd-d796a7ac0c15-c000.snappy.parquet
├── [ 61M]  part-00002-da9ff087-c6ae-4dad-b9bd-d796a7ac0c15-c000.snappy.parquet
└── [ 61M]  part-00003-da9ff087-c6ae-4dad-b9bd-d796a7ac0c15-c000.snappy.parquet

0 directories, 5 files


In [3]:
from pyspark.sql import SparkSession, functions as F, Window
from graphframes import GraphFrame
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wikicast.bipartition import induce_graph
from datetime import datetime, timedelta


def sign_to_string(*signs):
    """Encode a sequence of flags as a compact string label.

    Each truthy flag contributes ``"x"`` and each falsy flag (including
    ``None``) contributes ``"o"``, in argument order. With no arguments the
    result is the empty string.
    """
    return "".join("x" if flag else "o" for flag in signs)

def get_name(t, start=None):
    """Return the date ``t`` days after a start date, formatted as YYYY-MM-DD.

    Args:
        t: Offset in days (passed straight to ``timedelta``).
        start: Optional ``datetime`` to offset from. When omitted, the
            notebook-level global ``start_date`` is used, so that global must
            be defined before the first call.

    Returns:
        The offset date rendered with the ``"%Y-%m-%d"`` format.
    """
    if start is None:
        # Existing callers pass only ``t`` and rely on the global.
        start = start_date
    return (start + timedelta(t)).strftime("%Y-%m-%d")


def transform(graph, pageviews, dates):
    """Build a per-page design matrix from graph features and daily pageviews.

    Args:
        graph: GraphFrame whose vertices carry ``id``, ``title``,
            ``sign_0``..``sign_7`` and ``fiedler_0``..``fiedler_7`` columns.
        pageviews: DataFrame with ``page_id``, ``date`` and ``count`` columns.
        dates: Iterable of date labels naming the time-series columns to emit,
            in output order. They must match the values of ``pageviews.date``
            as rendered by the pivot.

    Returns:
        DataFrame with one row per vertex that also appears in the filtered
        pageviews, with columns ``id``, ``title``, ``partition_id``,
        ``degree``, ``inDegree``, ``outDegree``, the sign and fiedler columns,
        and one column per entry of ``dates``.
    """
    sign = [f"sign_{x}" for x in range(8)]
    fiedler = [f"fiedler_{x}" for x in range(8)]
    # Materialise once so any iterable works with the list concatenation below.
    dates = list(dates)

    # Pages whose summed view count over the whole table is at least 10.
    active_pages = (
        pageviews.groupBy("page_id")
        .agg(F.sum("count").alias("total"))
        .where("total >= 10")
        .select("page_id")
    )

    # One row per page, one column per distinct date found in the data.
    time_series = (
        pageviews.join(active_pages, on="page_id", how="inner")
        .withColumn("id", F.col("page_id"))
        .groupBy("id")
        .pivot("date")
        .agg(F.min("count"))
    )

    # Requested dates that never occur in the data get a constant-0 column.
    # They are added in a single projection: calling withColumn once per date
    # in a loop grows the query plan with every call.
    missing = sorted(set(dates) - set(time_series.columns))
    if missing:
        time_series = time_series.select(
            "*", *[F.lit(0).alias(date) for date in missing]
        )

    return (
        graph.vertices.join(graph.degrees, on="id", how="left")
        .join(graph.inDegrees, on="id", how="left")
        .join(graph.outDegrees, on="id", how="left")
        # Inner join: vertices without a time series are dropped.
        .join(time_series, on="id", how="inner")
        # Collapse the eight sign flags into one "x"/"o" string label.
        .withColumn("partition_id", F.udf(sign_to_string, "string")(*sign))
        .select("id", "title", "partition_id", "degree", "inDegree", "outDegree", *sign+fiedler+dates)
    )

def plot_degree(graph):
    """Show the in-degree distribution of ``graph`` as a log-log scatter plot.

    Prints the five most frequent in-degree values, then scatters each
    in-degree against the number of vertices having it, using pyplot's
    current figure with both axes on a log scale.
    """
    in_degree_counts = (
        graph.inDegrees
        .groupBy("inDegree")
        .count()
        .orderBy(F.desc("count"))
    )
    in_degree_counts.show(n=5)

    counts_pdf = in_degree_counts.toPandas()
    plt.scatter(counts_pdf["inDegree"], counts_pdf["count"])
    for set_scale in (plt.xscale, plt.yscale):
        set_scale("log")



# Reuse the active Spark session, or create one if none exists yet.
spark = SparkSession.builder.getOrCreate()
# Inputs: the clustered page table for sample 1, the link table, and the
# daily page-count table.
pages = spark.read.parquet("../data/clustered/sample_1_8_50")
pagelinks = spark.read.parquet("../data/enwiki/pagelinks")
pageviews = spark.read.parquet("../data/enwiki/pagecount_daily_v2")

pages.printSchema()
pagelinks.printSchema()

# pagelinks stores edges as (from, dest); GraphFrame expects (src, dst).
# NOTE(review): induce_graph presumably restricts the link graph to the pages
# in this sample -- confirm against wikicast.bipartition.
graph = induce_graph(GraphFrame(pages, pagelinks.selectExpr("from as src", "dest as dst")), relabel=False)
graph.cache()

print(f"graph has {graph.vertices.count()} articles and {graph.edges.count()} hyperlinks")
graph.vertices.show(vertical=True, n=1)

# Degree plot is disabled; change the condition to True to draw it.
if False:
    plot_degree(graph)

# get_name() reads start_date as a global, so it must be set before use.
start_date = datetime.strptime("2018-01-01", "%Y-%m-%d")
end_date = datetime.strptime("2019-09-01", "%Y-%m-%d")
# One label per day in [start_date, end_date); end_date itself is excluded.
dates = [get_name(t) for t in range((end_date-start_date).days)]
data = transform(graph, pageviews, dates)

root
 |-- id: integer (nullable = true)
 |-- bias: boolean (nullable = true)
 |-- sign_0: boolean (nullable = true)
 |-- sign_1: boolean (nullable = true)
 |-- sign_2: boolean (nullable = true)
 |-- sign_3: boolean (nullable = true)
 |-- sign_4: boolean (nullable = true)
 |-- sign_5: boolean (nullable = true)
 |-- sign_6: boolean (nullable = true)
 |-- title: string (nullable = true)
 |-- is_redirect: boolean (nullable = true)
 |-- is_new: boolean (nullable = true)
 |-- fiedler_0: double (nullable = true)
 |-- fiedler_1: double (nullable = true)
 |-- fiedler_2: double (nullable = true)
 |-- fiedler_3: double (nullable = true)
 |-- fiedler_4: double (nullable = true)
 |-- fiedler_5: double (nullable = true)
 |-- fiedler_6: double (nullable = true)
 |-- fiedler_7: double (nullable = true)
 |-- sign_7: boolean (nullable = true)

root
 |-- from: integer (nullable = true)
 |-- dest: integer (nullable = true)

graph has 2936436 articles and 52893186 hyperlinks
-RECORD 0----------------------

In [4]:
# Write the sample-1 design matrix as Parquet from a single partition,
# replacing any existing output at that path; %time reports the duration.
%time data.repartitionByRange(1, "partition_id").write.parquet("../data/design_matrix/sample_1_8_50", mode="overwrite")

CPU times: user 25.4 ms, sys: 37.1 ms, total: 62.6 ms
Wall time: 3min 56s


In [5]:
# List the clustered samples available as input for the batch run below.
!ls ../data/clustered

[34msample_10_8_50[m[m [34msample_2_8_50[m[m  [34msample_4_8_50[m[m  [34msample_6_8_50[m[m  [34msample_8_8_50[m[m
[34msample_1_8_50[m[m  [34msample_3_8_50[m[m  [34msample_5_8_50[m[m  [34msample_7_8_50[m[m  [34msample_9_8_50[m[m


In [6]:
def transform_batch(spark, pagelinks, pageviews, dates):
    """Build and write the design matrix for clustered samples 2 through 10.

    For each sample ``i`` this reads ``../data/clustered/sample_{i}_8_50``,
    builds the graph and design matrix, and writes the result to
    ``../data/design_matrix/sample_{i}_8_50``, overwriting any existing output.

    Args:
        spark: SparkSession used to read the clustered page tables.
        pagelinks: DataFrame of links with ``from`` and ``dest`` columns.
        pageviews: DataFrame of daily page counts, passed to ``transform``.
        dates: Date labels for the time-series columns, passed to ``transform``.
    """
    # The edge list is the same for every sample, so project it once.
    edges = pagelinks.selectExpr("from as src", "dest as dst")
    for i in range(2, 11):
        print(f"iteration {i}")
        pages = spark.read.parquet(f"../data/clustered/sample_{i}_8_50")
        graph = induce_graph(GraphFrame(pages, edges), relabel=False)
        data = transform(graph, pageviews, dates)
        # The output index must match the input index. Writing to sample_{i+1}
        # would skip design_matrix/sample_2 and file sample 10 under sample_11.
        (
            data
            .repartitionByRange(1, "partition_id")
            .write
            .parquet(f"../data/design_matrix/sample_{i}_8_50", mode="overwrite")
        )

# Both tables are reused by every iteration of the batch, so mark them for
# caching before the loop starts.
pagelinks.cache()
pageviews.cache()
transform_batch(spark, pagelinks, pageviews, dates)

iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
