## Install python packages

In [1]:
%%bash

pip3 install pandas

Collecting pandas
  Downloading pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.3/12.3 MB 77.6 MB/s eta 0:00:00
Collecting numpy<2,>=1.22.4
  Downloading numpy-1.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.2/18.2 MB 95.2 MB/s eta 0:00:00
Collecting pytz>=2020.1
  Downloading pytz-2023.3.post1-py2.py3-none-any.whl (502 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 502.5/502.5 KB 40.0 MB/s eta 0:00:00
Collecting tzdata>=2022.1
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 341.8/341.8 KB 58.6 MB/s eta 0:00:00
Installing collected packages: pytz, tzdata, numpy, pandas
Successfully installed numpy-1.26.1 pandas-2.1.3 pytz-2023.3.post1 tzdata-2023.3




## Install the custom pyspark package

In [3]:
%%bash
./install-pyspark.sh

Processing ./pyspark-3.5.0.dev0.tar.gz
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py): started
  Building wheel for pyspark (setup.py): finished with status 'done'
  Created wheel for pyspark: filename=pyspark-3.5.0.dev0-py2.py3-none-any.whl size=258907270 sha256=0b4d799be4c5885bbc255d6d183aad212261753fc2040975ac39a64617314622
  Stored in directory: /root/.cache/pip/wheels/09/88/fa/a22881c4ebc798868daf7cf5183c18d7081e6dc2ac0d0befd1
Successfully built pyspark
Installing collected packages: pyspark
  Attempting uninstall: pyspark
    Found existing installation: pyspark 3.5.0.dev0
    Uninstalling pyspark-3.5.0.dev0:
      Successfully uninstalled pyspark-3.5.0.dev0
Successfully installed pyspark-3.5.0.dev0




## Create the DB users

In [3]:
%%bash

export PGPASSWORD=postgres

create_db() {
cat <<EOF
        create user $1 with password '$1';
        create database $1;
        grant all privileges on database $1 to $1;
        \c $1 postgres
        grant all on schema public to $1;
EOF
}

dbnames=(lsqb snap tpcds tpch)

psql --host=postgres --username postgres --dbname postgres <<-EOSQL
        $(create_db lsqb)
        $(create_db snap)
        $(create_db tpcds)
        $(create_db tpch)
EOSQL

CREATE ROLE
CREATE DATABASE
GRANT
You are now connected to database "lsqb" as user "postgres".
GRANT
CREATE ROLE
CREATE DATABASE
GRANT
You are now connected to database "snap" as user "postgres".
GRANT
CREATE ROLE
CREATE DATABASE
GRANT
You are now connected to database "tpcds" as user "postgres".
GRANT
CREATE ROLE
CREATE DATABASE
GRANT
You are now connected to database "tpch" as user "postgres".
GRANT


## Fetch LSQB data

In [7]:
%%bash
cd lsqb
export MAX_SF=300
./scripts/download-merged-fk-data-sets.sh
cd ..





Reading package lists...
Building dependency tree...
Reading state information...
curl is already the newest version (7.81.0-1ubuntu1.14).
0 upgraded, 0 newly installed, 0 to remove and 6 not upgraded.
Downloading scale factor 0.1
Downloading scale factor 0.3
Downloading scale factor 1
Downloading scale factor 3
Downloading scale factor 10
Downloading scale factor 30
Downloading scale factor 100
Downloading scale factor 300


## Import the LSQB benchmark data

In [11]:
%%bash
export SF=30
./import-lsqb.sh

DROP VIEW
DROP VIEW
DROP VIEW
DROP VIEW
DROP VIEW
DROP VIEW
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
COPY 1575
COPY 6380
COPY 6
COPY 111
COPY 1343
COPY 1831640
COPY 77110476
COPY 24025658
COPY 184000
COPY 16080
COPY 71
COPY 96662059
COPY 25336957
COPY 105337003
COPY 5937610
COPY 4289970
COPY 68979133
COPY 30729218
COPY 147243
COPY 400460
COPY 7273036
COPY 7273036
CREATE VIEW
CREATE VIEW
CREATE VIEW
CREATE VIEW
CREATE VIEW
CREATE VIEW


## Fetch the SNAP data

In [15]:
%%bash

datasets=("http://snap.stanford.edu/data/cit-Patents.txt.gz"
          "http://snap.stanford.edu/data/wiki-topcats.txt.gz"
          "http://snap.stanford.edu/data/web-Google.txt.gz"
          "http://snap.stanford.edu/data/bigdata/communities/com-dblp.ungraph.txt.gz")

cd snap

for d in ${datasets[*]}; do
    if [ ! -f $(basename $d) ]; then
        curl -O $d;
        gunzip $(basename $d)
    else
        echo "$(basename $d) exists"
    fi
done

./remove-header.sh

cd ..

cit-Patents.txt.gz exists


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 95.1M  100 95.1M    0     0  6205k      0  0:00:15  0:00:15 --:--:-- 14.2M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 20.1M  100 20.1M    0     0  1870k      0  0:00:11  0:00:11 --:--:-- 4079k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 4041k  100 4041k    0     0   622k      0  0:00:06  0:00:06 --:--:--  865k
gzip: com-dblp.ungraph.txt already exists;	not overwritten


## Import the SNAP data

In [18]:
%%writefile import-snap.sql

DROP TABLE IF EXISTS patents;
CREATE TABLE patents (fromNode integer, toNode integer);
\copy patents FROM 'snap/noheader/cit-Patents.txt' with (header false);

DROP TABLE IF EXISTS wiki;
CREATE TABLE wiki (fromNode integer, toNode integer);
\copy wiki FROM 'snap/noheader/wiki-topcats.txt' with (header false, delimiter ' ');

DROP TABLE IF EXISTS google;
CREATE TABLE google (fromNode integer, toNode integer);
\copy google FROM 'snap/noheader/web-Google.txt' with (header false);

DROP TABLE IF EXISTS dblp;
CREATE TABLE dblp (fromNode integer, toNode integer);
\copy dblp FROM 'snap/noheader/com-dblp.ungraph.txt' with (header false);

Overwriting import-snap.sql


In [19]:
%%bash

PGPASSWORD=snap psql -h postgres -U snap -d snap -f import-snap.sql

DROP TABLE
CREATE TABLE
COPY 16518948
DROP TABLE
CREATE TABLE
COPY 28511807
DROP TABLE
CREATE TABLE
COPY 5105039
DROP TABLE
CREATE TABLE
COPY 1049866


## Generate SNAP queries

In [24]:
import glob
from pathlib import Path

snap_tables = ['patents', 'wiki', 'google', 'dblp']

for tablename in snap_tables:
    Path(f'snap-queries/{tablename}').mkdir(parents=True, exist_ok=True)
    # We use the patent queries as the base and replace the references to the patent relation
    base_query_files = glob.glob('snap-queries/patents/*.sql')
    for file in base_query_files:
        query = Path(file).read_text()
        basename = Path(file).name
        new_query = query.replace('patents', tablename)
        with open(f'snap-queries/{tablename}/{basename}', 'w') as new_file:
            new_file.write(new_query)
        
    

## Generate the TPC-H data

In [None]:
%%bash

export SF=100

cd tpch-kit/dbgen

yes y | ./dbgen -s $SF

mkdir -p data-$SF

mv *.tbl data-$SF/

cd ../..

TPC-H Population Generator (Version 2.17.3)
Copyright Transaction Processing Performance Council 1994 - 2010


## Import the TPC-H data

In [4]:
%%bash

export SF=100

PGPASSWORD=tpch psql -h postgres -U tpch -f tpch-create.sql

tables=$( ls tpch-kit/dbgen/data-$SF/*.tbl | xargs -n 1 basename | sed 's/.tbl$//')

for t in $tables; do
        PGPASSWORD=tpch psql -h postgres -U tpch -c "\copy $t from 'tpch-kit/dbgen/data-$SF/${t}.tbl' DELIMITER '|';"
done


psql:tpch-create.sql:1: NOTICE:  table "part" does not exist, skipping


DROP TABLE
CREATE TABLE


psql:tpch-create.sql:15: NOTICE:  table "supplier" does not exist, skipping


DROP TABLE
CREATE TABLE


psql:tpch-create.sql:26: NOTICE:  table "partsupp" does not exist, skipping


DROP TABLE
CREATE TABLE


psql:tpch-create.sql:36: NOTICE:  table "customer" does not exist, skipping


DROP TABLE
CREATE TABLE


psql:tpch-create.sql:48: NOTICE:  table "orders" does not exist, skipping


DROP TABLE
CREATE TABLE


psql:tpch-create.sql:61: NOTICE:  table "lineitem" does not exist, skipping


DROP TABLE
CREATE TABLE


psql:tpch-create.sql:82: NOTICE:  table "nation" does not exist, skipping


DROP TABLE
CREATE TABLE


psql:tpch-create.sql:90: NOTICE:  table "region" does not exist, skipping


DROP TABLE
CREATE TABLE
COPY 15000000
COPY 600037902
COPY 25
COPY 150000000
COPY 20000000
COPY 80000000
COPY 5
COPY 1000000
