### Fetching The 7z archive

**Skip this section if you have already performed the extraction, and jump to the checkpoint that reads the data from the split JSON files.**

In [None]:
# Checking if archive is downloaded in memory.
try:
    dbutils.fs.ls("file:/databricks/driver/dblp.v13.7z")
    print("Archive in filesystem (file:/databricks/driver/dblp.v13.7z)")
except:
    # If archive is not in memory, Checking databricks store for cached version and pulling into memory.
    try:
        dbutils.fs.ls("dbfs:/FileStore/data/dblp.v13.7z")
        print("Archive located in FileStore. Copying into local store..")
        dbutils.fs.cp("dbfs:/FileStore/data/dblp.v13.7z", "file:/databricks/driver/dblp.v13.7z")
        print("Completed")
    except:
        # If archive is not cached, downloading and storing in databricks store.
        print("7z archive not found. Fetching from URL...")
        !wget https://originalstatic.aminer.cn/misc/dblp.v13.7z
        print("7z archive Downloaded. Moving archive to FileStore..")
        dbutils.fs.mkdirs("dbfs:/FileStore/data")
        dbutils.fs.cp("file:/databricks/driver/dblp.v13.7z", "dbfs:/FileStore/data/dblp.v13.7z")
        print("Completed.")

In [None]:
# The returned array should have one object of FileInfo with size =2568255035

dbutils.fs.ls("file:/databricks/driver/dblp.v13.7z")

### Extracting Archive into json

#### 1. Extracting 7zip file into 16 GB json.

In [None]:
!pip install py7zr -q

In [None]:
import py7zr

# The context manager closes the archive even when extraction fails part-way.
with py7zr.SevenZipFile('dblp.v13.7z', mode='r') as archive:
    archive.extractall()

In [None]:
dbutils.fs.ls("file:/databricks/driver/dblpv13.json")

#### 2. Cleaning NumberInt(#) tags

The JSON data contains non-conforming tags and so cannot be parsed as is. We read each line and replace every `NumberInt(n)` tag with the bare number `n`. (This should take about 25 minutes.)

In [None]:
import re

# Matches a `NumberInt(<digits>)` wrapper; group 1 captures the bare digits.
number_int = re.compile(r"NumberInt\((\d*)\)")

# Stream line by line so the whole dump never has to be held in memory.
# JSON text is UTF-8, so the encoding is stated explicitly instead of relying
# on the locale default. The `with` block closes both files even on error.
with open("dblpv13.json", encoding="utf-8") as fin, \
        open("dblpv13_clean.json", "wt", encoding="utf-8") as fout:
    for line in fin:
        fout.write(number_int.sub(r"\1", line))

#### 3. Partitioning Dataset into JSON files
Since the 16 GB of JSON data cannot be loaded into memory at once, we partition it into smaller chunks (300k objects per chunk) for processing.  
Values that are parsed as `Decimal` are serialised with `DecimalEncoder`.

In [None]:
%mkdir data

In [None]:
import ijson
import json
import decimal

class DecimalEncoder(json.JSONEncoder):
    """JSON encoder that writes ``decimal.Decimal`` values as strings."""

    def default(self, o):
        # Anything that is not a Decimal goes to the stock encoder, which
        # raises TypeError for types it cannot serialise.
        if not isinstance(o, decimal.Decimal):
            return super().default(o)
        return str(o)

data_dir = 'data/'
chunk_size = 300000  # objects per output file


def _write_chunk(records, file_id):
    """Serialise one chunk of records to ``<data_dir>data_PART_<file_id>.json``."""
    print(f" Saving, data_PART_{file_id}.json in {data_dir}")
    with open(f'{data_dir}data_PART_{file_id}.json', 'w') as out:
        out.write(json.dumps(records, cls=DecimalEncoder))


file_id = 0
file_buffer = []
# ijson streams the top-level array one element at a time ('item' prefix).
# The input is opened in binary mode, which is what ijson expects. The output
# handle has its own name, so the input handle `source` is never rebound.
with open('dblpv13_clean.json', 'rb') as source:
    for obj_data in ijson.items(source, 'item'):
        file_buffer.append(obj_data)
        if len(file_buffer) == chunk_size:
            _write_chunk(file_buffer, file_id)
            file_id += 1
            file_buffer = []

# Flush the remainder. Nothing is written when the record count is an exact
# multiple of chunk_size, which avoids an empty trailing part file.
if file_buffer:
    _write_chunk(file_buffer, file_id)
    file_id += 1
    file_buffer = []

#### 4. Moving files to dbfs FileStore from instance storage, to make it available for later.

In [None]:
# removing old json stored in filestore.
dbutils.fs.rm("dbfs:/FileStore/data/split_data/", recurse = True)
# Creating dir to store json in filestore..
dbutils.fs.mkdirs("dbfs:/FileStore/data/split_data")
# confirming dir is empty
dbutils.fs.ls("dbfs:/FileStore/data/split_data")

In [None]:
# Copying all json parts into filestore.
dbutils.fs.cp("file:/databricks/driver/data/", "dbfs:/FileStore/data/split_data", recurse = True)

### Transform

#### Reading data from databricks Filestore into dataframes (Checkpoint after data load)

In [None]:
from functools import reduce
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, ArrayType, StringType, LongType, StructField, IntegerType
from pyspark.sql.functions import udf

path = "dbfs:/FileStore/data/split_data/"
parquet_dir = "/tmp/out"

# There should be 18 files each with 300 k records. This would change if you change split value.
file_count = len(dbutils.fs.ls(path))
assert file_count == 18, "Data not found. You may want to check the path or run the notebook from start again. If you updated the split value, ignore this assertion error"

In [None]:
# Build map of spark dataframes by reading json partition chunk files
dataframes_map = map(lambda r: spark.read.option("inferSchema", True).json(r), [f"{path}data_PART_{num}.json" for num in range(file_count)])
# reduce the dataframes into single dataframe by performing union over the mapped frames.
union = reduce(lambda df1, df2: df1.unionByName(df2, allowMissingColumns=True), dataframes_map)
union = union.na.drop(subset=["authors"])
union = union.filter(union.lang == 'en')
union.printSchema()

In [None]:
# Dropping rows with a missing Title or author list, then keeping only the
# rows whose Title has more than 3 whitespace-separated words.
size_ = udf(lambda s: len(s.split()), IntegerType())  # word count of a string

union = union.na.drop(subset=["Title", "authors"])
union = union.filter(size_(F.col("Title")) > 3)
union.show(5)

In [None]:
def save_parquet_frame(frame, alias, clean = False):
    """Write ``frame`` to ``<parquet_dir><alias>.parquet``, overwriting previous output.

    Args:
        frame: Spark DataFrame to persist.
        alias: Table name; it is appended directly to ``parquet_dir`` (no
            path separator is inserted).
        clean: When true, duplicate rows are removed before writing.

    Returns:
        The frame that was written (de-duplicated when ``clean`` is set). It
        is not re-read from the parquet file.
    """
    target = f"{parquet_dir}{alias}.parquet"
    to_write = frame.distinct() if clean else frame
    to_write.write.mode('overwrite').parquet(target)
    return to_write

def distinct_frame_from_cols(frame, columns):
    """Return the distinct combinations of ``columns`` with a generated ``id`` column first.

    Args:
        frame: Source Spark DataFrame.
        columns: List of column names to project and de-duplicate.

    Returns:
        A DataFrame with columns ``id, *columns``; ``id`` comes from
        ``monotonically_increasing_id()``.
    """
    print(*columns)
    unique_rows = frame.select(*columns).distinct()
    indexed = unique_rows.withColumn("id", F.monotonically_increasing_id())
    return indexed.select("id", *columns)

#### Publication Table

- Counting number of citations.
- Building new table for Title, abstract, volume, Number of citations, references and more.
- Saving Table to parquet file

In [None]:
lang_frame = distinct_frame_from_cols(union, ['lang']).withColumnRenamed("lang", "Text")
lang_frame = save_parquet_frame(lang_frame, "Language")

lang_rdd = lang_frame.rdd.collectAsMap()
lang_rdd

In [None]:
def map_lang_id(rdd):
    """Build a UDF that maps a language string to its numeric id.

    Args:
        rdd: Mapping of ``{id: language_text}`` (accessed through ``.items()``).

    Returns:
        A ``LongType`` UDF that yields the id of the first entry whose text
        equals the column value, or ``None`` for null/empty/unknown values.
    """
    # Invert the mapping once instead of scanning every entry for each row.
    # setdefault keeps the first id when several ids share the same text.
    id_by_text = {}
    for lang_id, text in rdd.items():
        id_by_text.setdefault(text, lang_id)

    def map_lang_id_(col):
        if col == "null" or col == "" or not col:
            return None
        # A plain lookup yields None for unknown values instead of raising
        # IndexError from indexing an empty match list.
        return id_by_text.get(col)

    return udf(map_lang_id_, LongType())

union = union.select("*", map_lang_id(lang_rdd)("lang").alias("Lang_ID")).drop("lang")
union.show()

In [None]:
# UDF to get relevant publication's citation counts
def cite_count(countMapper):
    """Build a ``StringType`` UDF that looks a publication id up in ``countMapper``.

    The UDF returns ``"Unknown"`` for a null/empty id (or the literal string
    ``"null"``), otherwise ``countMapper.get(id)``.
    """
    def cite_count_(col):
        if col != "null" and col != "" and col:
            return countMapper.get(col)
        return "Unknown"
    return udf(cite_count_, StringType())

In [None]:
# Count how often each publication id occurs in any `references` list and
# collect the result into a plain dict {referenced_id: count}.
citation_frame = (
    union.select(F.explode_outer("references").alias("reference_countmap"))
    .groupBy("reference_countmap")
    .count()
    .rdd.map(lambda row: row.asDict(True))
)
citation_counts = citation_frame.collect()
citation_counter = {
    entry['reference_countmap']: entry['count'] for entry in citation_counts
}

In [None]:
# Building Publication Frame
publication_frame = union.select("_id", "Title", "volume", "issue", "abstract", "pdf", "isbn", "issn", "doi", "url", cite_count(citation_counter)("_id").alias("NumberOfCitations"))

# Removing extracted fields from the main schema.
# "issue" is extracted above as well, so it is dropped together with the rest.
union = union.drop("Title", "volume", "issue", "abstract", "pdf", "isbn", "issn", "doi", "url", "references", "page_start", "page_end", "n_citation")

# Saving the table into parquet file. The returned frame is kept so that the
# preview below shows the de-duplicated rows that were written.
# NOTE(review): `publication_parquet` is not read by save_parquet_frame(),
# which builds its own path from the alias.
publication_parquet = f"{parquet_dir}publication.parquet"
publication_frame = save_parquet_frame(publication_frame, "Publication", clean=True)

publication_frame.show(10)

In [None]:
### Future Steps:
# 1. API lookup to fill in missing data in issn, isbn, pdf, volume columns

In [None]:
union.select(union.columns).show(2)

#### Extracting Unique Field_of_Study values. 

First we build a mapper to generalize disciplines. Second, we count the `Field of study` topics to determine the significance of each. If a discipline is found in the generalized mapper, we use that entry to map the Field of Study list. Otherwise we use the number of occurrences of each item from the list in the whole database, and pick the most frequent one as the discipline.

Use [this](https://confluence.egi.eu/display/EGIG/Scientific+Disciplines) link to build a map that replaces a specific field with its generalized discipline.

In [None]:
# Building countmap structure: {field_of_study_label: number of occurrences}
countMapFos = union.select(F.explode("fos").alias("fos2")).groupBy("fos2").count()
countMapperRdd = countMapFos.rdd.map(lambda row: row.asDict(True))

countMapperList = countMapperRdd.collect()

count_mapper = {entry['fos2']: entry['count'] for entry in countMapperList}

decipline_mapper = {
	# 1 Natural Sciences
	"Mathematics": "Mathematics", "Applied mathematics": "Mathematics", "Pure mathematics": "Mathematics", "Statistics and probability": "Mathematics",
	"Computer Science": "Computer Sciences", "Computer Sciences": "Computer Sciences", "Algorithms": "Computer Sciences", "Artificial Intelligence (expert systems, machine learning, robotics)": "Computer Sciences", "Computer architecture": "Computer Sciences", "Computer communications": "Computer Sciences", "Computer graphics": "Computer Sciences", "Computer security and reliability": "Computer Sciences", "Data structures": "Computer Sciences", "Distributed computing": "Computer Sciences", "Human-computer interaction": "Computer Sciences", "Operating systems": "Computer Sciences", "Parallel computing": "Computer Sciences", "Programming languages": "Computer Sciences", "Quantum computing": "Computer Sciences", "Software engineering": "Computer Sciences", "Theory of computation": "Computer Sciences",
	"Information sciences": "Information sciences", "Information science": "Information sciences", "Data management": "Information sciences", "Data mining": "Information sciences", "Information retrieval": "Information sciences", "Information management": "Information sciences", "Knowledge management": "Information sciences", "Multimedia, hypermedia": "Information sciences",
	"Earth Sciences": "Earth Sciences", "Earth Science": "Earth Sciences", "Atmospheric science": "Earth Sciences", "Climate research": "Earth Sciences", "Geochemistry": "Earth Sciences", "Geology": "Earth Sciences", "Geophysics": "Earth Sciences", "Hydrology": "Earth Sciences", "Mineralogy": "Earth Sciences", "Oceanography": "Earth Sciences", "Palaeontology": "Earth Sciences", "Physical geography": "Earth Sciences", "Seismology": "Earth Sciences", "Volcanology": "Earth Sciences", 
	"Biology Science": "Biology Science", "Aerobiology": "Biology Science", "Bacteriology": "Biology Science", "Behavioural biology": "Biology Science", "Biochemistry and molecular biology": "Biology Science", "Biodiversity conservation": "Biology Science", "Bioinformatics": "Biology Science", "Biological rhythm": "Biology Science", "Biology": "Biology Science", "Biophysics": "Biology Science", "Botany": "Biology Science", "Cell biology": "Biology Science", "Computational biology": "Biology Science", "Cryobiology": "Biology Science", "Developmental biology": "Biology Science", "Ecology": "Biology Science", "Evolutionary biology": "Biology Science", "Genetics and heredity": "Biology Science", "Marine and Freshwater biology": "Biology Science", "Mathematical biology": "Biology Science", "Microbiology": "Biology Science", "Mycology": "Biology Science", "Plant science": "Biology Science", "Reproductive biology": "Biology Science", "Structural biology": "Biology Science", "Taxonomy": "Biology Science", "Theoretical biology": "Biology Science", "Thermal biology": "Biology Science", "Virology": "Biology Science", "Zoology": "Biology Science", 
	"Physical sciences": "Physical sciences", "Physical science": "Physical sciences", "Accelerator physics": "Physical sciences", "Acoustics": "Physical sciences", "Aerosol physics": "Physical sciences", "Astrobiology": "Physical sciences", "Astronomy": "Physical sciences", "Astroparticle physics": "Physical sciences", "Astrophysics": "Physical sciences", "Atomic": "Physical sciences", "Chemical physics": "Physical sciences", "Computational physics": "Physical sciences", "Condensed matter physics": "Physical sciences", "Cryogenics": "Physical sciences", "Fluid Mechanics": "Physical sciences", "Fusion": "Physical sciences", "High energy physics": "Physical sciences", "Mathematical physics": "Physical sciences", "Medical physics": "Physical sciences", "Molecular physics": "Physical sciences", "Nuclear physics": "Physical sciences", "Optics": "Physical sciences", "Particle physics": "Physical sciences", "Physics": "Physical sciences", "Planetary science": "Physical sciences", "Plasma physics": "Physical sciences", "Space science": "Physical sciences", "Quantum physics": "Physical sciences",
	"Chemical science": "Chemical sciences", "Chemical sciences": "Chemical sciences", "Analytical chemistry": "Chemical sciences", "Chemistry": "Chemical sciences", "Colloid chemistry": "Chemical sciences", "Computational chemistry": "Chemical sciences", "Electrochemistry": "Chemical sciences", "Inorganic and nuclear chemistry": "Chemical sciences", "Mathematical chemistry": "Chemical sciences", "Organic chemistry": "Chemical sciences", "Physical chemistry": "Chemical sciences", "Polymer science": "Chemical sciences", 

	# 2 Engineering and Technology
	"Civil engineering": "Civil engineering", "Architecture engineering": "Civil engineering", "Civil engineering": "Civil engineering", "Civil Protection": "Civil engineering", "Construction/Structural engineering": "Civil engineering", "Transport engineering": "Civil engineering",
	"Electrical, electronic and information engineering": "Electrical, electronic and information engineering", "Communication engineering and systems": "Electrical, electronic and information engineering", "Computer hardware and architecture": "Electrical, electronic and information engineering", "Electrical and electronic engineering": "Electrical, electronic and information engineering", "Robotics, Automation and Control Systems": "Electrical, electronic and information engineering",
	"Mechanical engineering": "Mechanical engineering", "Applied mechanics": "Mechanical engineering", "Audio engineering": "Mechanical engineering", "Nuclear related engineering": "Mechanical engineering", "Reliability analysis": "Mechanical engineering", "Thermodynamics": "Mechanical engineering",
	"Aerospace engineering": "Aerospace engineering", "Aeronautical engineering": "Aerospace engineering", "Astronautical engineering": "Aerospace engineering",
	"Chemical engineering": "Chemical engineering", "Chemical engineering (plants, products)": "Chemical engineering", "Chemical process engineering": "Chemical engineering",
	"Materials engineering": "Materials engineering", "Ceramics": "Materials engineering", "Coating and films": "Materials engineering", "Composites": "Materials engineering", "Paper and wood": "Materials engineering", "Textiles": "Materials engineering",
	"Bioengineering and Biomedical engineering": "Bioengineering and Biomedical engineering", "Bioengineering": "Bioengineering and Biomedical engineering", "Biomedical engineering": "Bioengineering and Biomedical engineering",
	"Environmental engineering": "Environmental engineering", "Energy and fuels": "Environmental engineering", "Geological engineering": "Environmental engineering", "Geotechnics": "Environmental engineering", "Ocean engineering": "Environmental engineering", "Mining and mineral processing": "Environmental engineering", "Petroleum engineering": "Environmental engineering", "Remote sensing": "Environmental engineering", "Sea vessels": "Environmental engineering",
	"Environmental biotechnology": "Environmental biotechnology", "Bioremediation": "Environmental biotechnology", "Diagnostic biotechnologies": "Environmental biotechnology",
	 "Industrial biotechnology":  "Industrial biotechnology", "Bio-derived novel materials": "Industrial biotechnology", "Biocatalysis": "Industrial biotechnology", "Bioderived bulk and fine chemicals": "Industrial biotechnology", "Biofuels": "Industrial biotechnology", "Biomaterials": "Industrial biotechnology", "Bioprocessing technologies": "Industrial biotechnology", "Bioproducts": "Industrial biotechnology", "Fermentation": "Industrial biotechnology",
	 "Nano-technology":  "Nano-technology", "Nano-materials": "Nano-technology", "Nano-processes": "Nano-technology",

	# 3 Medical and Health Sciences
	"Basic medicine": "Basic medicine", "Anatomy and morphology": "Basic medicine", "Human genetics": "Basic medicine", "Immunology": "Basic medicine", "Medicinal chemistry": "Basic medicine", "Neuroscience": "Basic medicine", "Pathology": "Basic medicine", "Pharmacology and pharmacy": "Basic medicine", "Physiology": "Basic medicine", "Toxicology": "Basic medicine",
	"Clinical medicine": "Clinical medicine", "Allergy": "Clinical medicine", "Anaesthesiology": "Clinical medicine", "Andrology": "Clinical medicine", "Cardiac and Cardiovascular systems": "Clinical medicine", "Critical care/Emergency medicine": "Clinical medicine", "Dentistry, oral surgery/medicine": "Clinical medicine", "Dermatology and venereal diseases": "Clinical medicine", "Gastroenterology and hepatology": "Clinical medicine", "General and internal medicine": "Clinical medicine", "Geriatrics and gerontology": "Clinical medicine", "Hematology": "Clinical medicine", "Integrative and Complementary medicine": "Clinical medicine", "Medical imaging": "Clinical medicine", "Nuclear medicine": "Clinical medicine", "Obstetrics and gynaecology": "Clinical medicine", "Oncology": "Clinical medicine", "Ophthalmology": "Clinical medicine", "Optometry": "Clinical medicine", "Orthopaedics": "Clinical medicine", "Otorhinolaryngolog": "Clinical medicine", "Paediatrics": "Clinical medicine", "Peripheral vascular disease": "Clinical medicine", "Psychiatry": "Clinical medicine", "Radiology": "Clinical medicine", "Respiratory systems": "Clinical medicine", "Rheumatology": "Clinical medicine", "Surgery": "Clinical medicine", "Transplantation": "Clinical medicine", "Urology and nephrology": "Clinical medicine",
	"Health science": "Health sciences", "Health sciences": "Health sciences", "Epidemiology": "Health sciences", "Health care science and services": "Health sciences", "Health policy and services": "Health sciences", "Infectious diseases": "Health sciences", "Medical ethics": "Health sciences", "Nursing": "Health sciences", "Nutrition and Dietetics": "Health sciences", "Occupational health": "Health sciences", "Parasitology": "Health sciences", "Public and environmental health": "Health sciences", "Social biomedical science": "Health sciences", "Sport and fitness science": "Health sciences", "Substance abuse": "Health sciences", "Tropical medicine": "Health sciences",
	"Medical biotechnology": "Medical biotechnology", "Biomedical devices": "Medical biotechnology", "Health-related biotechnology": "Medical biotechnology", "Pharmaceutical biotechnology": "Medical biotechnology", "Biotechnology and medical ethics": "Medical biotechnology", "Molecular diagnostics": "Medical biotechnology", "Biophysical manipulation": "Medical biotechnology", "Agricultural Sciences": "Medical biotechnology",

	# 4 Agricultural Sciences
	"Agriculture, forestry, and fisheries": "Agriculture, forestry, and fisheries", "Agriculture": "Agriculture, forestry, and fisheries", "Agronomy, plant breeding, plant protection": "Agriculture, forestry, and fisheries", "Fishery": "Agriculture, forestry, and fisheries", "Forestry": "Agriculture, forestry, and fisheries", "Horticulture and viticulture": "Agriculture, forestry, and fisheries", "Soil science": "Agriculture, forestry, and fisheries",
	"Animal and dairy sciences": "Animal and dairy sciences", "Animal science": "Animal and dairy sciences", "Dairy science": "Animal and dairy sciences", "Husbandry": "Animal and dairy sciences", "Pets": "Animal and dairy sciences",
	"Veterinary sciences": "Veterinary sciences", "Veterinary anaesthesiology": "Veterinary sciences", "Veterinary medicine": "Veterinary sciences", "Veterinary ophthalmology": "Veterinary sciences", "Veterinary pathobiology": "Veterinary sciences", "Veterinary radiology": "Veterinary sciences", "Veterinary reproduction": "Veterinary sciences", "Veterinary surgery": "Veterinary sciences",
	"Agricultural biotechnology": "Agricultural biotechnology", "Biomass feedstock production tech": "Agricultural biotechnology", "Biopharming": "Agricultural biotechnology", "Diagnostics": "Agricultural biotechnology", "Food biotechnology": "Agricultural biotechnology", "GM technology (crops, livestock)": "Agricultural biotechnology", "Livestock cloning": "Agricultural biotechnology", "Marker assisted selection": "Agricultural biotechnology",

	# 5 Social Sciences
	"Psychology": "Psychology", "Biological Psychology": "Psychology", "Clinical Psychology": "Psychology", "Cognitive Psychology": "Psychology", "Comparative Psychology": "Psychology", "Developmental Psychology": "Psychology", "Educational and School Psychology": "Psychology", "Evolutionary Psychology": "Psychology", "Industrial–organisational Psychology": "Psychology", "Personality Psychology": "Psychology", "Positive Psychology": "Psychology", "Social Psychology": "Psychology",
	"Economics, finance and business": "Economics, finance and business", "Business and Management": "Economics, finance and business", "Economics and Econometrics": "Economics, finance and business", "Finance": "Economics, finance and business", "Industrial relations": "Economics, finance and business",
	"Educational sciences": "Educational sciences", "Educational science": "Educational sciences", "General Education": "Educational sciences",  "Special Education (learning disabilities)": "Educational sciences",
	"Sociology": "Sociology", "Anthropology": "Sociology", "Demography": "Sociology", "Ethnology": "Sociology", "Family studies": "Sociology", "Social issues": "Sociology", "Social work": "Sociology", "Sociology": "Sociology", "Women's and gender studie": "Sociology",
	"Law": "Law", "Canon Law": "Law", "Civil Law": "Law", "Comparative Law": "Law", "Competition Law": "Law", "Constitutional Law": "Law", "Criminal Law": "Law", "Islamic Law": "Law", "Jewish Law": "Law", "Jurisprudence (Philosophy of Law)": "Law",
	"Political sciences": "Political sciences", "Political science": "Political sciences", "Comparative politics": "Political sciences", "Empirical pata analysis": "Political sciences", "International relations": "Political sciences", "Organisation theory": "Political sciences", "Political economy": "Political sciences", "Political philosophy": "Political sciences", "Public administration": "Political sciences", "Theories of the state": "Political sciences",
	"Social and economic geography": "Social and economic geography", "Cultural and economic geography": "Social and economic geography", "Transport planning": "Social and economic geography", "Urban studies": "Social and economic geography",
	"Media and communications": "Media and communications", "Information science - social": "Media and communications", "Journalism": "Media and communications", "Library science": "Media and communications", "Media and socio-cultural communication": "Media and communications",

	# 6 "Humanities",
	"History and Archaeology": "History and Archaeology", "Archaeology": "History and Archaeology", "History (Prehistory; Ancient; Modern world)": "History and Archaeology",
	"Languages and literature": "Languages and literature", "General language studies": "Languages and literature", "General literature studies": "Languages and literature", "Linguistics": "Languages and literature", "Literary theory": "Languages and literature", "Specific languages": "Languages and literature", "Specific literatures": "Languages and literature",
	"Philosophy, ethics and religion": "Philosophy, ethics and religion", "Ethics": "Philosophy, ethics and religion", "Philosophy of science/technology": "Philosophy, ethics and religion", "Philosophy": "Philosophy, ethics and religion", "Religious studies": "Philosophy, ethics and religion", "Theology": "Philosophy, ethics and religion",
	"Arts": "Arts", "Architectural design": "Arts", "Folklore studies": "Arts", "Media Studies (Film, Radio, TV)": "Arts", "Musicology": "Arts", "Performing arts studies": "Arts",

	# 7 "Support Activities"
	"Archives": "Support Activities", "Development": "Support Activities", "Urban planning": "Support Activities"
} 

def translate(mapper, count_mapper):
    """Build a UDF that reduces a list of field-of-study labels to one discipline.

    Args:
        mapper: Dict from specific label to generalized discipline.
        count_mapper: Dict from label to its occurrence count.

    Returns:
        A ``StringType`` UDF. For a null/empty list it yields ``None``;
        otherwise the mapped discipline of the first label found in
        ``mapper``, or, when no label is mapped, the label with the highest
        count (the first one on ties).
    """
    def translate_(col):
        if not col:
            return None
        for label in col:
            general = mapper.get(label)
            if general:
                return general
        # No label is known to the mapper: fall back to the most frequent one.
        return max(col, key=lambda label: count_mapper[label])
    return udf(translate_, StringType())

In [None]:
# Reduce each `fos` list to a single discipline with the `translate` udf and
# store it in a new "Text" column. withColumn is used so that `fos` is not
# selected a second time, which would leave two columns with the same name.
union = union.withColumn("Text", translate(decipline_mapper, count_mapper)("fos"))
# Dropping `fos` column
union = union.drop("fos")

# Building Frame of distinct disciplines out of the "Text" column.
FoS_frame = distinct_frame_from_cols(union, ["Text"])
save_parquet_frame(FoS_frame, "FieldOfStudy")
FoS_frame.show()

In [None]:
# Reading Mapping field of study to id, wuth RDD map for replacing "Field_of_Study" to relevant ID in the union table.
FoSrdd = FoS_frame.rdd.collectAsMap()

def map_fos_id(rdd):
    """Build a ``LongType`` UDF that maps a field-of-study text to its id.

    Args:
        rdd: Mapping of ``{id: text}`` (accessed through ``.items()``).

    Returns:
        A UDF yielding the id of the first entry whose text equals the column
        value, or ``None`` for null/empty values and values without a match.
    """
    def map_fos_id_(col):
        if col == "null" or col == "" or not col:
            return None
        try:
            for fos_id, text in rdd.items():
                if text == col:
                    return fos_id
            return None
        except ValueError:
            return None
    return udf(map_fos_id_, LongType())

In [None]:
union = union.withColumn("FOS_ID", map_fos_id(FoSrdd)("Text")).drop("Text")
union.show()

In [None]:
joined = union.join(FoS_frame, union.FOS_ID == FoS_frame.id, how="left")
joined.show(10)

### Author schema

In [None]:
# !pip install geograpy3 nltk -q

In [None]:
#import geograpy
#import nltk
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('maxent_ne_chunker')
#nltk.download('words')

In [None]:
#str(geograpy.locateCity("Michigan"))
#geograpy.get_place_context(text="University of Michigan, USA")

In [None]:
#print(geograpy.get_place_context(text="Pune, India"))

In [None]:
# Data path:
# path = "dbfs:/FileStore/data/split_data/"

# Reading first chunk for Testing

# union = spark.read.option("inferSchema", True).json(f"{path}data_PART_0.json")
# union.show(5)

In [None]:
# Extracting Authors from the dataset

# Exploding a column returns a new row for each element in the given array or map type. 
# For each item in the map/array of data it creates a copy of the row and with that element in new column.
# Here, We only select the exploded column, and so we only get row with author object in the generated frame.
authors_frame = union.select(F.explode_outer("authors").alias("authors"))


# selectExpr Projects a set of SQL expressions and returns a new DataFrame. e.g. (authors['name', 'email'] => [authors.name, authors.email])

authors_frame = authors_frame.selectExpr("authors.*")
authors_frame = authors_frame.drop("org_zh", "orgs_zh")
authors_frame.printSchema()

In [None]:
org_frame = authors_frame.select("oid", "orcid", "org", F.explode("orgs").alias("org_name"))
org_frame = org_frame.na.drop("all").distinct()
org_frame.show()

In [None]:
# TODO:
# 1. Extract Org, Country and city for each ORG
# 2. Save Org Frame

In [None]:
def author_name(name):
    """Split a full author name into ``(first, middle, last)``.

    Args:
        name: Full name string, or ``None``.

    Returns:
        A 3-tuple. A missing or blank name gives ``(None, None, None)``; a
        single-word name gives ``(word, None, None)``; otherwise the first
        word, the space-joined inner words (``''`` when there are none) and
        the last word.
    """
    parts = name.split() if name else []
    if not parts:
        return None, None, None
    if len(parts) == 1:
        # Mononym: keep the only token as the first name.
        return parts[0], None, None
    return parts[0], ' '.join(parts[1:-1]), parts[-1]

# author_name() returns None for the name parts it cannot determine, so every
# field has to be declared nullable.
author_name_schema = StructType([
    StructField("FirstName", StringType(), True),
    StructField("MiddleName", StringType(), True),
    StructField("LastName", StringType(), True),
])

author_name_udf = udf(author_name, author_name_schema)

# Split `name` into a struct, flatten the struct into top-level columns, then
# drop the raw author attributes that are not stored in the Author table.
authors_frame = authors_frame.select("*", author_name_udf("name").alias("author_name"))
authors_frame = authors_frame.select("*", "author_name.*")
authors_frame = authors_frame.drop("name", "author_name", "name_zh", "bio", "sid", "position", "avatar", "homepage", "oid", "orcid", "oid_zh", "orgs_zh", "orgs", "orgid", "org", "g_id")

save_parquet_frame(authors_frame, "Author")
authors_frame.show()

#### Extracting Venue (conference/workshop where the article was presented/cited) from the dataset

In [None]:
# No explode is performed here: the `venue` column is selected as-is, one row
# per publication.
# NOTE(review): presumably `venue` is a single struct per row; confirm against
# the printed schema.
 
venue_frame = union.select("venue")

# "venue.*" expands the venue struct so that each of its fields becomes a top-level column.
venue_frame = venue_frame.selectExpr("venue.*")
venue_frame.printSchema()

In [None]:
venue_frame.show()

In [None]:
import requests 

def venue_API(venue_string):
    """Look a venue up through the DBLP venue-search API.

    Only the first whitespace-separated token of ``venue_string`` is used as
    the search term.

    Args:
        venue_string: Raw venue text; may be ``None`` or blank.

    Returns:
        ``(venue, acronym, type, url)`` taken from the first hit, or
        ``(term, None, None, None)`` when there is no usable hit or the
        request fails. For a missing or blank input all four values are
        ``None`` and no request is made.
    """
    from urllib.parse import quote

    tokens = venue_string.split() if venue_string else []
    if not tokens:
        # Nothing to search for; skip the network round-trip.
        return None, None, None, None
    term = tokens[0]

    # Percent-encode the term so characters such as '&' or '#' cannot break
    # the query string.
    url = "http://dblp.org/search/venue/api?q=" + quote(term, safe="") + "%3A$&format=json"
    try:
        # The timeout keeps a stalled connection from blocking the caller forever.
        response = requests.get(url=url, timeout=30)
        hits = response.json()['result']['hits']
        if int(hits['@total']) > 0:
            info = hits['hit'][0]['info']
            return info['venue'], info['acronym'], info['type'], info['url']
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Best-effort lookup: on a failed request, invalid JSON or an
        # unexpected payload shape, fall back to the bare search term.
        pass
    return term, None, None, None

In [None]:
# venue_API() returns None for the fields it cannot resolve, so the fields of
# the result struct have to be declared nullable.
schema = StructType([
    StructField("name", StringType(), True),
    StructField("name_s", StringType(), True),
    StructField("type", StringType(), True),
    StructField("src", StringType(), True),
])

venue_query_udf = udf(venue_API, schema)

# Query the API per row from the `raw` venue text, drop the original venue
# attributes, then flatten the query result struct into top-level columns.
# NOTE(review): the lookup runs once per row before de-duplication; consider
# querying distinct `raw` values only if the request volume is a problem.
venue_frame = venue_frame.select("*", venue_query_udf(F.col("raw")).alias("query_results"))
venue_frame = venue_frame.drop('name_d', 'raw', 'name_s', 'name', 'sid', 'issn', 'online_issn', 'publisher', 'type', 'src', 'raw_zh', 't')
venue_frame = venue_frame.select("*", "query_results.*")
venue_frame = venue_frame.drop("query_results")

save_parquet_frame(venue_frame, "Venue", clean=True)
venue_frame.show()

In [None]:
## TODO: 
# 1. Pull more info before save

In [None]:
union.printSchema()

In [None]:
union.show(5)

#### Keyword Lookup

In [None]:
# Count how often each keyword occurs across all publications and collect the
# result into a plain dict {keyword: count}.
keyword_frame = union.select(F.explode_outer("keywords").alias("key_countmap"))
key_countmap = (
    keyword_frame.groupBy("key_countmap")
    .count()
    .rdd.map(lambda row: row.asDict(True))
)
keyword_counts = key_countmap.collect()
keyword_counter = {entry['key_countmap']: entry['count'] for entry in keyword_counts}

keyword_counter

### LOAD

#### Loading frames from Parquet files

In [None]:
# Same path scheme save_parquet_frame() writes to: <parquet_dir><alias>.parquet
language = spark.read.parquet(f'{parquet_dir}Language.parquet')
language.show()

In [None]:
# Same path scheme save_parquet_frame() writes to: <parquet_dir><alias>.parquet
field_of_study = spark.read.parquet(f'{parquet_dir}FieldOfStudy.parquet')
field_of_study.show(5)

In [None]:
# Same path scheme save_parquet_frame() writes to: <parquet_dir><alias>.parquet
publications = spark.read.parquet(f'{parquet_dir}Publication.parquet')
publications.show(5)

In [None]:
# Same path scheme save_parquet_frame() writes to: <parquet_dir><alias>.parquet
venues = spark.read.parquet(f'{parquet_dir}Venue.parquet')
venues.show(5)

In [None]:
# Same path scheme save_parquet_frame() writes to: <parquet_dir><alias>.parquet
authors = spark.read.parquet(f'{parquet_dir}Author.parquet')
authors.show(5)

In [None]:
#organizations = spark.read.parquet(f'{parquet_path}Organization.parquet')
#organizations.show(5)