### Fetching The 7z archive

In [0]:
try:
    dbutils.fs.ls("file:/databricks/driver/dblp.v13.7z")
    print("Archive in filesystem (file:/databricks/driver/dblp.v13.7z)")
except:
    try:
        dbutils.fs.ls("dbfs:/FileStore/data/dblp.v13.7z")
        print("Archive located in FileStore. Copying into local store..")
        dbutils.fs.cp("dbfs:/FileStore/data/dblp.v13.7z", "file:/databricks/driver/dblp.v13.7z")
        print("Completed")
    except:
        print("7z archive not found. Fetching from URL...")
        !wget https://originalstatic.aminer.cn/misc/dblp.v13.7z
        print("7z archive Downloaded. Moving archive to FileStore..")
        dbutils.fs.mkdirs("dbfs:/FileStore/data")
        dbutils.fs.cp("file:/databricks/driver/dblp.v13.7z", "dbfs:/FileStore/data/dblp.v13.7z")
        print("Completed.")

7z archive not found. Fetching from URL...
--2022-04-15 20:01:54--  https://originalstatic.aminer.cn/misc/dblp.v13.7z
Resolving originalstatic.aminer.cn (originalstatic.aminer.cn)... 159.27.2.14
Connecting to originalstatic.aminer.cn (originalstatic.aminer.cn)|159.27.2.14|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2568255035 (2.4G) [application/x-7z-compressed]
Saving to: ‘dblp.v13.7z’

dblp.v13.7z           0%[                    ]       0  --.-KB/s               dblp.v13.7z           0%[                    ]  71.41K   212KB/s               dblp.v13.7z           0%[                    ] 151.41K   224KB/s               dblp.v13.7z           0%[                    ] 279.41K   276KB/s               dblp.v13.7z           0%[                    ] 823.41K   611KB/s               dblp.v13.7z           0%[                    ]   2.91M  1.73MB/s               dblp.v13.7z           0%[                    ]   5.90M  3.08MB/s               dblp.v13.7z

In [0]:
dbutils.fs.ls("file:/databricks/driver/dblp.v13.7z")

Out[42]: [FileInfo(path='file:/databricks/driver/dblp.v13.7z', name='dblp.v13.7z', size=2568255035, modificationTime=1631501339000)]

### Extracting Archive into json

In [0]:
!pip install py7zr -q

You should consider upgrading via the '/databricks/python3/bin/python -m pip install --upgrade pip' command.[0m


In [0]:
import py7zr

archive = py7zr.SevenZipFile('dblp.v13.7z', mode='r')
archive.extractall()
archive.close()

In [0]:
dbutils.fs.ls("file:/databricks/driver/dblpv13.json")

Out[45]: [FileInfo(path='file:/databricks/driver/dblpv13.json', name='dblpv13.json', size=17352640799, modificationTime=1631240655155)]

### Cleaning NumberInt(#) tags

In [0]:
import re

# Cleaning the `NumberInt` tag
fin = open(f"dblpv13.json")
fout = open(f"dblpv13_clean.json", "wt")
for line in fin:
    fout.write(re.sub(r"NumberInt\([\d]*\)", lambda x: "".join(re.findall(r"\d", x.group(0))), line))
fin.close()
fout.close()

### Partitioning Dataset into JSON files

In [0]:
!pip install ijson tqdm -q

You should consider upgrading via the '/databricks/python3/bin/python -m pip install --upgrade pip' command.[0m


In [0]:
%mkdir data

In [0]:
import ijson
import json
from tqdm import tqdm
import decimal

class DecimalEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, decimal.Decimal):
            return str(o)
        return super(DecimalEncoder, self).default(o)

data_dir = 'data/'
with open('dblpv13_clean.json', 'r') as f:
    counter, file_id = 0, 0
    file_buffer = []
    for obj_data in tqdm(ijson.items(f, 'item')):
        file_buffer.append(obj_data)
        counter += 1
        if counter % 300000 == 0:
            print(f" Saving, data_PART_{file_id}.json in {data_dir}")
            f = open(f'{data_dir}data_PART_{file_id}.json', 'w')
            dump = json.dumps(file_buffer, cls=DecimalEncoder)
            f.write(dump)
            f.close()
            file_id += 1
            file_buffer = []
f = open(f'{data_dir}data_PART_{file_id}.json', 'w')
dump = json.dumps(file_buffer, cls=DecimalEncoder)
print(f" Saving, data_PART_{file_id}.json in {data_dir}")
f.write(dump)
f.close()
file_id += 1
file_buffer = []

0it [00:00, ?it/s]2623it [00:00, 26208.16it/s]5720it [00:00, 28932.40it/s]8793it [00:00, 29723.00it/s]12228it [00:00, 31417.79it/s]15369it [00:00, 25581.80it/s]18654it [00:00, 27644.79it/s]21543it [00:00, 25953.22it/s]24233it [00:00, 24315.13it/s]26736it [00:01, 18668.80it/s]29109it [00:01, 19800.04it/s]31541it [00:01, 20903.29it/s]33956it [00:01, 21695.07it/s]36249it [00:01, 15771.06it/s]38517it [00:01, 17238.12it/s]40886it [00:01, 18726.41it/s]43236it [00:01, 19874.40it/s]45536it [00:02, 20647.19it/s]47738it [00:02, 13670.17it/s]50107it [00:02, 15688.50it/s]52307it [00:02, 17092.90it/s]54682it [00:02, 18706.62it/s]57037it [00:02, 19916.39it/s]59408it [00:02, 20916.00it/s]61650it [00:03, 11823.47it/s]63904it [00:03, 13731.21it/s]65931it [00:03, 15051.80it/s]67956it [00:03, 16190.86it/s]69964it [00:03, 17130.95it/s]71932it [00:03, 17770.40it/s]73975it [00:03, 18461.53it/s]75980it [00:03, 18860.15it/s]77967it [00:04, 9712.50it/s] 79884it [00:04, 11316.2

In [0]:
len(file_buffer)

Out[50]: 0

In [0]:
path = "file:/databricks/driver/data/"
[f"{path}data_PART_{num}.json" for num in range(file_id)]

Out[55]: ['file:/databricks/driver/data/data_PART_0.json',
 'file:/databricks/driver/data/data_PART_1.json',
 'file:/databricks/driver/data/data_PART_2.json',
 'file:/databricks/driver/data/data_PART_3.json',
 'file:/databricks/driver/data/data_PART_4.json',
 'file:/databricks/driver/data/data_PART_5.json',
 'file:/databricks/driver/data/data_PART_6.json',
 'file:/databricks/driver/data/data_PART_7.json',
 'file:/databricks/driver/data/data_PART_8.json',
 'file:/databricks/driver/data/data_PART_9.json',
 'file:/databricks/driver/data/data_PART_10.json',
 'file:/databricks/driver/data/data_PART_11.json',
 'file:/databricks/driver/data/data_PART_12.json',
 'file:/databricks/driver/data/data_PART_13.json',
 'file:/databricks/driver/data/data_PART_14.json',
 'file:/databricks/driver/data/data_PART_15.json',
 'file:/databricks/driver/data/data_PART_16.json',
 'file:/databricks/driver/data/data_PART_17.json']

In [0]:
# https://stackoverflow.com/a/42540517 (This seems more proper way to merge the frames..)

# from functools import reduce

# path = "file:/databricks/driver/data/"
# dataframes = map(lambda r: spark.read.json(r), [f"{path}data_PART_{num}.json" for num in range(file_id)])
# union = reduce(lambda df1, df2: df1.unionAll(df2), dataframes)

dataframes = []
for num in range(file_id):
    print(f"Reading file data_PART_{num}.json into spark")
    dataframes.append(spark.read.json(f"{path}data_PART_{num}.json"))
print(len(dataframes))

Reading file data_PART_0.json into spark
Reading file data_PART_1.json into spark
Reading file data_PART_2.json into spark
Reading file data_PART_3.json into spark
Reading file data_PART_4.json into spark
Reading file data_PART_5.json into spark
Reading file data_PART_6.json into spark
Reading file data_PART_7.json into spark
Reading file data_PART_8.json into spark
Reading file data_PART_9.json into spark
Reading file data_PART_10.json into spark
Reading file data_PART_11.json into spark
Reading file data_PART_12.json into spark
Reading file data_PART_13.json into spark
Reading file data_PART_14.json into spark
Reading file data_PART_15.json into spark
Reading file data_PART_16.json into spark
Reading file data_PART_17.json into spark
18


In [0]:
dataframes

Out[67]: [DataFrame[_id: string, abstract: string, authors: array<struct<_id:string,bio:string,email:string,gid:string,name:string,name_zh:string,oid:string,oid_zh:string,orcid:string,org:string,org_zh:string,orgid:string,orgs:array<string>,orgs_zh:array<string>,sid:string>>, doi: string, fos: array<string>, isbn: string, issn: string, issue: string, keywords: array<string>, lang: string, n_citation: bigint, page_end: string, page_start: string, pdf: string, references: array<string>, title: string, url: array<string>, venue: struct<_id:string,issn:string,name:string,name_d:string,name_s:string,online_issn:string,publisher:string,raw:string,raw_zh:string,sid:string,src:string,t:string,type:bigint>, volume: string, year: bigint],
 DataFrame[_id: string, abstract: string, authors: array<struct<_id:string,bio:string,email:string,gid:string,name:string,name_zh:string,oid:string,oid_zh:string,orcid:string,org:string,org_zh:string,orgid:string,orgs:array<string>,orgs_zh:array<string>,sid:str

In [0]:
dataframes[0].printSchema()

root
 |-- _id: string (nullable = true)
 |-- abstract: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _id: string (nullable = true)
 |    |    |-- bio: string (nullable = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- gid: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- name_zh: string (nullable = true)
 |    |    |-- oid: string (nullable = true)
 |    |    |-- oid_zh: string (nullable = true)
 |    |    |-- orcid: string (nullable = true)
 |    |    |-- org: string (nullable = true)
 |    |    |-- org_zh: string (nullable = true)
 |    |    |-- orgid: string (nullable = true)
 |    |    |-- orgs: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- orgs_zh: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- sid: string (nullable = true)
 |-- doi: string (nulla

In [0]:
dataframes[8].printSchema()

root
 |-- _id: string (nullable = true)
 |-- abstract: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _id: string (nullable = true)
 |    |    |-- bio: string (nullable = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- gid: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- name_zh: string (nullable = true)
 |    |    |-- oid: string (nullable = true)
 |    |    |-- oid_zh: string (nullable = true)
 |    |    |-- orcid: string (nullable = true)
 |    |    |-- org: string (nullable = true)
 |    |    |-- org_zh: string (nullable = true)
 |    |    |-- orgid: string (nullable = true)
 |    |    |-- orgs: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- orgs_zh: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- position: string (nullable = true)
 |    |    |-- sid

In [0]:
# union.printSchema()

Out[59]: <map at 0x7fb678157ca0>

In [0]:
mapFrames = map(lambda r: r, dataframes)
union = reduce(lambda df1, df2: df1.unionAll(df2), mapFrames)


[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
[0;32m<command-2656773693406771>[0m in [0;36m<module>[0;34m[0m
[1;32m      1[0m [0mmapFrames[0m [0;34m=[0m [0mmap[0m[0;34m([0m[0;32mlambda[0m [0mr[0m[0;34m:[0m [0mr[0m[0;34m,[0m [0mdataframes[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 2[0;31m [0munion[0m [0;34m=[0m [0mreduce[0m[0;34m([0m[0;32mlambda[0m [0mdf1[0m[0;34m,[0m [0mdf2[0m[0;34m:[0m [0mdf1[0m[0;34m.[0m[0munionAll[0m[0;34m([0m[0mdf2[0m[0;34m)[0m[0;34m,[0m [0mmapFrames[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
[0;32m<command-2656773693406771>[0m in [0;36m<lambda>[0;34m(df1, df2)[0m
[1;32m      1[0m [0mmapFrames[0m [0;34m=[0m [0mmap[0m[0;34m([0m[0;32mlambda[0m [0mr[0m[0;34m:[0m [0mr[0m[0;34m,[0m [0mdataframes[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;32m

In [0]:
# dbutils.fs.mkdirs("dbfs:/FileStore/data/split_data")

# dbutils.fs.cp("file:/databricks/driver/data/", "dbfs:/FileStore/data/split_data", recurse = True)