### Fetching the 7z archive

In [0]:
try:
    dbutils.fs.ls("file:/databricks/driver/dblp.v13.7z")
    print("Archive in filesystem (file:/databricks/driver/dblp.v13.7z)")
except:
    try:
        dbutils.fs.ls("dbfs:/FileStore/data/dblp.v13.7z")
        print("Archive located in FileStore. Copying into local store..")
        dbutils.fs.cp("dbfs:/FileStore/data/dblp.v13.7z", "file:/databricks/driver/dblp.v13.7z")
        print("Completed")
    except:
        print("7z archive not found. Fetching from URL...")
        !wget https://originalstatic.aminer.cn/misc/dblp.v13.7z
        print("7z archive Downloaded. Moving archive to FileStore..")
        dbutils.fs.mkdirs("dbfs:/FileStore/data")
        dbutils.fs.cp("file:/databricks/driver/dblp.v13.7z", "dbfs:/FileStore/data/dblp.v13.7z")
        print("Completed.")

In [0]:
# Sanity check: list the archive on the driver's local disk (the fetch cell
# relies on this call raising when the path is missing).
dbutils.fs.ls("file:/databricks/driver/dblp.v13.7z")

### Extracting the archive into JSON

In [0]:
# Install py7zr (needed to unpack the .7z archive); -q keeps pip output quiet.
# NOTE(review): the version is unpinned -- consider pinning for reproducibility.
!pip install py7zr -q

In [0]:
import py7zr

# Unpack the archive into the current working directory. The context manager
# closes the archive handle even if extraction raises part-way through.
with py7zr.SevenZipFile('dblp.v13.7z', mode='r') as archive:
    archive.extractall()

In [0]:
# Sanity check: list the extracted JSON file on the driver's local disk.
dbutils.fs.ls("file:/databricks/driver/dblpv13.json")

### Cleaning NumberInt(#) tags

In [0]:
import re

# Clean the `NumberInt` tag: rewrite every `NumberInt(<digits>)` occurrence as
# the bare digits, streaming line by line so the whole file is never held in
# memory.
number_int_tag = re.compile(r"NumberInt\((\d*)\)")

# `with` closes both files even if reading or writing fails midway.
with open("dblpv13.json") as fin, open("dblpv13_clean.json", "wt") as fout:
    for line in fin:
        # The capture group holds only the digits inside the parentheses.
        fout.write(number_int_tag.sub(r"\1", line))

### Partitioning Dataset into JSON files

In [0]:
# Install ijson (streaming JSON parser) and tqdm (progress bar); -q keeps pip quiet.
# NOTE(review): versions are unpinned -- consider pinning for reproducibility.
!pip install ijson tqdm -q

In [0]:
%mkdir data

In [0]:
import ijson
import json
from tqdm import tqdm
import decimal

class DecimalEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``decimal.Decimal`` values as strings."""

    def default(self, o):
        """Return ``str(o)`` for Decimals; defer any other type to the base encoder."""
        if not isinstance(o, decimal.Decimal):
            # The base implementation raises TypeError for unsupported types.
            return json.JSONEncoder.default(self, o)
        return str(o)

# Stream the cleaned JSON array and split it into partition files
# `data_PART_<n>.json`, each holding at most RECORDS_PER_PART records.
# After this cell, `file_id` is the number of partition files written and
# `counter` is the total number of records seen.
data_dir = 'data/'
RECORDS_PER_PART = 300000  # records per partition file


def _write_partition(records, part_id):
    """Write ``records`` as one JSON array to ``data_PART_<part_id>.json`` in ``data_dir``."""
    print(f" Saving, data_PART_{part_id}.json in {data_dir}")
    dump = json.dumps(records, cls=DecimalEncoder)
    # `with` closes the partition file even if the write fails.
    with open(f'{data_dir}data_PART_{part_id}.json', 'w') as part_file:
        part_file.write(dump)


counter, file_id = 0, 0
file_buffer = []
# The input handle has its own name so that opening partition files never
# rebinds the handle the streaming parser is reading from.
with open('dblpv13_clean.json', 'r') as source_file:
    for obj_data in tqdm(ijson.items(source_file, 'item')):
        file_buffer.append(obj_data)
        counter += 1
        if counter % RECORDS_PER_PART == 0:
            _write_partition(file_buffer, file_id)
            file_id += 1
            file_buffer = []

# Flush the remaining records. Skipped when the buffer is empty (record count
# an exact multiple of RECORDS_PER_PART) so no record-less `[]` partition is
# written for downstream readers.
if file_buffer:
    _write_partition(file_buffer, file_id)
    file_id += 1
file_buffer = []

In [0]:
# Always 0 after the partitioning cell, which resets `file_buffer` to [] at its end.
len(file_buffer)

In [0]:
# Preview the driver-local URI of every partition file counted by `file_id`.
path = "file:/databricks/driver/data/"
[path + f"data_PART_{part}.json" for part in range(file_id)]

In [0]:
# https://stackoverflow.com/a/42540517 (this seems to be a more proper way to merge the frames)

from functools import reduce

path = "file:/databricks/driver/data/"

# Read each partition file into its own DataFrame. This is a real list, not a
# lazy `map` object: a `map` iterator is exhausted by the first `reduce`, so
# any later cell that iterates or indexes `dataframes` would find it empty.
dataframes = [
    spark.read.option("inferSchema", True).json(f"{path}data_PART_{num}.json")
    for num in range(file_id)
]

# Fold the per-partition frames into a single DataFrame.
# NOTE(review): unionAll matches columns by position; presumably every
# partition infers the same schema -- confirm.
union = reduce(lambda df1, df2: df1.unionAll(df2), dataframes)

# dataframes = []
# for num in range(file_id):
#     print(f"Reading file data_PART_{num}.json into spark")
#     dataframes.append(spark.read.option("inferSchema", True).json(f"{path}data_PART_{num}.json"))
# print(len(dataframes))

In [0]:
# Display the `dataframes` object built by the cell that reads the partition files.
dataframes

In [0]:
# dataframes[0].printSchema()

In [0]:
# dataframes[9].printSchema()

In [0]:
# Print the schema of the combined DataFrame.
union.printSchema()

In [0]:
# Rebuild `union` from `dataframes` (the identity `map` only wraps it in a new iterator).
# NOTE(review): this needs `dataframes` to still yield items; if it is a one-shot
# iterator already consumed by an earlier `reduce`, this raises
# "TypeError: reduce() of empty iterable with no initial value" -- confirm.
mapFrames = map(lambda r: r, dataframes)
union = reduce(lambda df1, df2: df1.unionAll(df2), mapFrames)


In [0]:
# dbutils.fs.mkdirs("dbfs:/FileStore/data/split_data")

# dbutils.fs.cp("file:/databricks/driver/data/", "dbfs:/FileStore/data/split_data", recurse = True)