<a href="https://colab.research.google.com/github/antonpolishko/A_colab_collection/blob/master/Dataverse_Colab_Connect.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Connecting Dataverse to Colab


The code below for fetching the dataset is courtesy of Alex Lofgran.



In [None]:
# Import & Settings
import functools
import os

import numpy as np
import pandas as pd
from google.colab import drive

!pip install pyDataverse 
#documentation here: https://pydataverse.readthedocs.io/en/latest/index.html#quickstart
from pyDataverse.api import Api
from pyDataverse.models import Dataverse

In [None]:
# Mount Google Drive under /content/gdrive; the cells below create and use a folder inside it.
# NOTE(review): force_remount=True presumably redoes the mount even if one already exists -- confirm against the Colab docs.
drive.mount('/content/gdrive', force_remount=True)

In [None]:
#Make /COVID-19 and /data directories if needed...
# The path is absolute (Drive is mounted at /content/gdrive) so the result does not
# depend on the current working directory, which a later cell changes with os.chdir.
# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs('/content/gdrive/My Drive/COVID-19/data', exist_ok=True)

In [None]:
#Set the download destination as the current working directory
import os
# Anchor on the fixed Colab root instead of os.getcwd(): after the first run the
# working directory is already the data folder, so joining the relative path onto
# it again would point at a non-existent nested directory and os.chdir would fail.
root = '/content'
filepath = 'gdrive/My Drive/COVID-19/data'
cwd = os.path.join(root, filepath)
os.chdir(cwd)
print(os.getcwd())

/content/gdrive/My Drive/COVID-19/data


In [None]:
# Establish connection to Dataverse
# `api` is the client object the following cells use for every request.
base_url = 'http://datasets.coronawhy.org'
api = Api(base_url)
# Print the client's reported status as a quick connectivity check.
print('Dataverse connection status: ', api.status)

Dataverse connection status:  OK


In [None]:
# Get dataverse info
# 'root' is the alias of the dataverse being queried.
dv = 'root' 
response = api.get_dataverse(dv)
# Last expression of the cell: display the parsed JSON body of the response.
response.json()

{'data': {'alias': 'root',
  'creationDate': '2020-04-15T21:32:36Z',
  'dataverseContacts': [{'contactEmail': 'dataverse-k8s-contact@mailinator.com',
    'displayOrder': 0}],
  'dataverseType': 'UNCATEGORIZED',
  'description': 'Information and Data hub produced by all <a href="https://coronawhy.org">CoronaWhy</a> research groups. Please <a href="https://www.coronawhy.org/join-the-fight">join us</a> if you want to help in the fight against COVID-19.\r\n<br/>Disclaimer: at the moment all materials published on this site are available for public for the demonstration purposes, without <a href="https://en.wikipedia.org/wiki/Persistent_identifier">DOI Persistent Identifiers</a>. ',
  'id': 1,
  'name': 'Root',
  'permissionRoot': True},
 'status': 'OK'}

In [None]:
# Look up the dataset by its persistent identifier (DOI) and pull out the IDs
# needed by the download cell below.
dv = 'doi:10.5072/FK2/DKZOAH'
response = api.get_dataset(dv)
# Parse the JSON body once and reuse it, instead of re-parsing it for every lookup.
dataset_info = response.json()

dataset_id = dataset_info['data']['id']
print('Dataset ID: ', dataset_id, '\n')

# Only the first file of the latest version is used; an empty 'files' list would
# raise IndexError here.
datafile_id = dataset_info['data']['latestVersion']['files'][0]['dataFile']['id']
print('Datafile ID: ', datafile_id, '\n')

print('Response results: ')
dataset_info

Dataset ID:  52 

Datafile ID:  53 

Response results: 


{'data': {'authority': '10.5072',
  'id': 52,
  'identifier': 'FK2/DKZOAH',
  'latestVersion': {'createTime': '2020-04-28T12:43:57Z',
   'datasetId': 52,
   'datasetPersistentId': 'doi:10.5072/FK2/DKZOAH',
   'fileAccessRequest': False,
   'files': [{'dataFile': {'checksum': {'type': 'MD5',
       'value': '6d2ba5df5b6ac2395e9bdd4f0402c6ce'},
      'contentType': 'application/gzip',
      'creationDate': '2020-04-28',
      'filename': 'v9text.json.gz',
      'filesize': 964007304,
      'id': 53,
      'md5': '6d2ba5df5b6ac2395e9bdd4f0402c6ce',
      'persistentId': '',
      'pidURL': '',
      'rootDataFileId': -1,
      'storageIdentifier': '171c0ccd63c-f1f6a3b39821'},
     'datasetVersionId': 29,
     'label': 'v9text.json.gz',
     'restricted': False,
     'version': 1}],
   'id': 29,
   'lastUpdateTime': '2020-04-28T12:45:02Z',
   'license': 'CC0',
   'metadataBlocks': {'citation': {'displayName': 'Citation Metadata',
     'fields': [{'multiple': False,
       'typeClass': 'pri

In [None]:
# Download datafile
# NOTE: response.content holds the whole file in memory.
response = api.get_datafile(datafile_id)
print('File size: ', len(response.content))

#Assuming that a length less than 4k results in an error --> show that error response
if len(response.content) < 4000:
    # Do not write the error body to disk: it is not a gzip archive, and writing it
    # would overwrite a previously downloaded good copy of the file.
    print(response.content)
else:
    #Write the response content to the specified filename
    with open('v9text.json.gz', 'wb') as f:
        f.write(response.content)

print('Files in current directory: ', '\n', os.listdir())


File size:  964007304
Files in current directory:  
 ['v9text.json.gz']


In [None]:
#Unzips the file & replaces the original (via CLI)
# NOTE(review): if v9text.json is already present from an earlier run, gunzip may
# decline to overwrite it -- consider `gunzip -f`; TODO confirm.
!gunzip v9text.json.gz
print('Files in current directory: ')
# Last expression of the cell: display the directory listing.
os.listdir()

Files in current directory: 


['v9text.json']

In [None]:
#Pythonic method of unzipping
import io
import gzip
#Unzipping the file this way may lead to a better result (no error with position '0x8b')
# Wrap the downloaded bytes (still held in `response` from the download cell) in an
# in-memory file object so GzipFile can read from them.
buf = io.BytesIO(response.content)
print('Converted with BytesIO')
# This only creates the GzipFile reader over the buffer; no read is performed in
# this cell (the read below is commented out), so 'Unzipped' is printed before any
# data has actually been read.
gzip_f = gzip.GzipFile(fileobj=buf)
print('Unzipped')
# cntnt = gzip_f.read()

We'll use ijson to read the JSON file as a stream, so the whole document never has to be held in memory at once. On top of that, I've added helpers that split the stream into chunks, so that you only need to write your main processing function, decorate it, and collect the results.

In [None]:
#Basic imports
import ijson
from itertools import islice,chain
#Creating my chunks generator
def chunks(iterable, size):
    """Lazily split ``iterable`` into consecutive groups of at most ``size`` items.

    Each yielded group is itself a lazy iterator that draws from the same
    underlying iterator as every other group, so a group has to be consumed
    before the next one is requested.
    """
    source = iter(iterable)
    while True:
        # Pull the leading element eagerly; running out here ends the generator.
        try:
            head = next(source)
        except StopIteration:
            return
        # The rest of the group (up to size - 1 more items) is taken on demand.
        yield chain((head,), islice(source, size - 1))

In [None]:
#Reading in the JSON and creating chunks
def read_big_json(f, chunk_size=10000):
  """Stream the items found under the 'item' prefix of ``f`` and group them into chunks.

  Args:
    f: The object handed to ``ijson.items`` (the notebook passes an open file).
    chunk_size: Maximum number of items per chunk (default 10000).

  Returns:
    The generator produced by ``chunks``; each element is one lazy chunk.
  """
  return chunks(ijson.items(f, 'item'), chunk_size)

In [None]:
def preprocess_chunk(original_fun):
  """Decorator that lets a DataFrame-processing function accept a raw chunk.

  The decorated function is called with a single chunk (any iterable of
  records). The chunk is materialised into a ``pandas.DataFrame`` and passed
  to ``original_fun``.

  Args:
    original_fun: Callable taking one argument, the chunk as a DataFrame.

  Returns:
    A wrapper that takes one chunk and returns whatever ``original_fun``
    returns for the resulting DataFrame.
  """
  @functools.wraps(original_fun)  # keep the wrapped function's __name__ and __doc__
  def wrapper(c):
    # list() drains the (possibly lazy) chunk in a single pass before building the frame.
    df = pd.DataFrame(list(c))
    return original_fun(df)
  return wrapper

Above were my utility functions. Guide to the end user:
1. Use read_big_json to read in a file. Pass an open file object (not a file path) and the chunk size (default is 10000). Store the result: it is a lazy generator of chunks, and the file must stay open while you iterate over it
2. Write your processing function which consists of all operations you want to perform on your dataframe. Pass this function a single argument - the dataframe (chunk). Decorate it with @preprocess_chunk
3. Now, iterate over all chunks in the file read in through read_big_json and call your function on these chunks
I have demonstrated these as below

PS: Currently this only works for a JSON file whose top level is an array of objects. Other layouts are not supported yet.

In [None]:
#My custom function is returndf which I've decorated with preprocess_chunk
@preprocess_chunk
def returndf(df):
  """Example processing function: return the received DataFrame unchanged."""
  return df

In [None]:
#Reading in my chunks
# Open in binary mode: ijson parses bytes, so a text-mode handle only adds a
# decode/re-encode round trip whose encoding depends on the platform default.
# The handle must stay open while the chunks are consumed; it is closed at the end.
f = open('v9text.json', 'rb')
myfile = read_big_json(f,10000)

In [None]:
#Build DataFrames for the first 5 chunks only. Note: The file needs to be open for this to work
l = []
# islice stops after five chunks without requesting a sixth from the generator
for chunk in islice(myfile, 5):
  l.append(returndf(chunk))
# c keeps the number of chunks that were processed
c = len(l)
  

In [None]:
# Display the DataFrame built from the first chunk.
l[0]

Unnamed: 0,cord_uid,language,sentence_id,section,subsection,lemma,UMLS,UMLS_IDS,translated,GGP,SO,TAXON,CHEBI,GO,CL,DNA,CELL_TYPE,CELL_LINE,RNA,PROTEIN,DISEASE,CHEMICAL,CANCER,ORGAN,TISSUE,ORGANISM,CELL,AMINO_ACID,GENE_OR_GENE_PRODUCT,SIMPLE_CHEMICAL,ANATOMICAL_SYSTEM,IMMATERIAL_ANATOMICAL_ENTITY,MULTI-TISSUE_STRUCTURE,DEVELOPING_ANATOMICAL_STRUCTURE,ORGANISM_SUBDIVISION,CELLULAR_COMPONENT,PATHOLOGICAL_FORMATION,ORGANISM_SUBSTANCE
0,xgwbl8em,ro,xgwbl8em140,Frameshift reporter constructs and 2′-O-Methyl...,14.0,[],[],[],False,[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[P2lucAZ1FS],[],[],[],[],[],[],[],[],[]
1,xgwbl8em,ro,xgwbl8em151,Frameshift reporter constructs and 2′-O-Methyl...,15.0,[],[],[],False,[],[],[],[],[],[],[],[],[],[],[P2lucAZ1FSUGG],[],"[P2lucAZ1FSUGG, TCGACGTGCTCCTGGTGCCCCTGGATC]",[],[],[],[],[],[],[P2lucAZ1FSUGG],[],[],[],[],[],[],[],[],[]
2,xgwbl8em,en,xgwbl8em162,Frameshift reporter constructs and 2′-O-Methyl...,16.0,"[2′-o-methyl, antisense, oligonucleotide, synt...","[Antisense Oligonucleotides, Synthesis, Techno...","[C0079925, C1883254, C0039421, C0020980]",False,[],"[antisense oligonucleotides, Integrated DNA]",[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[DNA],[],[]
3,xgwbl8em,de,xgwbl8em173,Frameshift reporter constructs and 2′-O-Methyl...,17.0,[],[],[],False,[],[],[],[],[],[],"[SL1, GGUGGGUGAGGG, SL2, GGAUCCGGGUGGGUGAGGG]",[],[],[],"[AGUUGAAGGAUCCAGGGGCA, AZ1B, GGAAGUUGAAGGAUCCA...",[],[],[],[],[],[],[],[],"[AGUUGAAGGAUCCAGGGGCA, AZ1B, GGAAGUUGAAGGAUCCA...",[],[],[],[],[],[],[],[],[]
4,xgwbl8em,en,xgwbl8em04,INTRODUCTION,0.0,"[standard, triplet, readout, genetic, code, re...","[Genetic Code, Reprogram, Signal Transduction,...","[C0017380, C3828338, C0037083, C0035696, C0205...",False,[ribosomal frameshifting],[genetic],[],[mRNA],[],[],[],[],[],[mRNA],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[ribosomal],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,bkfkmtld,en,bkfkmtld09995,abstract,0.0,"[finally, codon-based, substitution, model, si...","[Alignment, Quality, Phylogenetic Analysis, Co...","[C1706765, C0332306, C1519068, C0009221]",False,[],"[substitution, indels, codons]",[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]
9996,bkfkmtld,en,bkfkmtld09996,abstract,0.0,"[conclusion, result, indicate, indel]","[Conclusion, Test Result, Insertion and Deletion]","[C1707478, C0456984, C3845271]",False,[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]
9997,bkfkmtld,en,bkfkmtld789997,Collapsed Sampling as an MH Proposal Distribut...,78.0,"[method, sample, alignment, sample, distributi...","[Methods, Sampling - Surgical action, Alignmen...","[C0025663, C0441621, C1706765, C2347026, C0037...",False,[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]
9998,bkfkmtld,en,bkfkmtld789998,Collapsed Sampling as an MH Proposal Distribut...,78.0,"[define, mh, transition, kernel, use, collapse...","[Alignment, Spatial Distribution]","[C1706765, C0037775]",False,[],[],[],[],[],[],[],[],[],[],[],[],[ρ],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]


In [None]:
# Close the handle that was opened for v9text.json before the chunks were read.
# TODO: this manual close is easy to forget; look into a way of scoping the file's
# lifetime automatically.
f.close()