### load derived data (jsonl file) into dictionary
This notebook shows how to:
- Pull data from the derived data file (JSON lines file)
- Extract the information we want, organize it into a dictionary
- Save the dictionary into a JSON file

Once the dictionary has been stored in a JSON file, it can be loaded into pandas, cross-checked against Unpaywall, etc. (all to be done in other notebooks).

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import jsonlines
import json
import doi as doiLib
from unpywall.utils import UnpywallCredentials
from unpywall import Unpywall

In [2]:
# Directory that holds the derived data, relative to the notebook's working directory.
dataPath = "../data/derived2/"
# Name of the JSON-lines input file inside dataPath.
dataFile = "records.jsonl"

In [3]:
# Upper bound on how many records are pulled from the JSON-lines file.
maxRecords = 100000

# Column-oriented store: one list per extracted field. Every processed record
# appends exactly one entry to each list, so all lists keep the same length.
dataDict = {"WOSUID":[],
            "pubTitle":[],
            "pubYear":[],
            "pubType":[],
            "journalTitle":[],
            "publisher":[],
            "area":[],
            "areaCount":[],
            "identifier":[],
            "keywords":[],
            "abstract":[],
            "doi":[],
            "validDoi":[],
            "url":[],
            # columns not filled in yet:
            #"OA":[],
            #"OA-pdf_url":[],
            #"issn":[],
            }


def _getOptional(node, *keys):
    """Follow ``keys`` through nested containers, one subscript per key.

    Returns the value that is reached, or None as soon as one of the keys is
    missing (KeyError). Any other error, e.g. a TypeError because a level
    cannot be subscripted with that key, is not caught and propagates.
    """
    for key in keys:
        try:
            node = node[key]
        except KeyError:
            return None
    return node


def _findDoi(identifiers):
    """Return the DOI found in a record's ``identifier`` entry, or None.

    Accepts either a list of identifier dicts or a single identifier dict
    (presumably what the export contains when a record has only one
    identifier -- TODO confirm against the data). Entries whose "type" is
    "doi" or "xref_doi" qualify; when several qualify the last one wins.
    Entries that are not dicts are skipped.
    """
    if isinstance(identifiers, dict):
        # Iterating a bare dict would walk its keys, not its entries.
        identifiers = [identifiers]
    doi = None
    for item in identifiers:
        if isinstance(item, dict) and item.get("type") in ("doi", "xref_doi"):
            doi = item["value"]
    return doi


index = 0  # number of records processed so far
with jsonlines.open(dataPath+dataFile) as reader:
    for obj in reader:
        if index >= maxRecords:
            # Stop reading instead of parsing the rest of the file for nothing.
            break

        record = obj["record"]
        summary = record["static_data"]["summary"]
        titles = summary["titles"]["title"]
        identifiers = record["dynamic_data"]["cluster_related"]["identifiers"]["identifier"]

        try:
            # The first four characters of the UID are dropped
            # (presumably a "WOS:" prefix -- TODO confirm).
            wosID = record["UID"][4:]
        except KeyError:
            wosID = None

        # DOI handling starts from a clean slate for every record so that
        # nothing leaks over from the previous one.
        doi = _findDoi(identifiers)
        validDoi = None
        url = None
        if doi is not None:
            try:
                validDoi = doiLib.validate_doi(doi)
            except ValueError:
                validDoi = None
        if validDoi is not None:
            # Only DOIs that validated are resolved to a landing URL.
            try:
                url = doiLib.get_real_url_from_doi(doi)
            except ValueError:
                url = None

        # Collect the whole record first and append afterwards: if a required
        # field is missing, the exception is raised before any list has grown.
        row = {
            "WOSUID": wosID,
            # the last title entry is used as the publication title,
            # the first one as the journal title
            "pubTitle": titles[-1]["content"],
            "pubYear": summary["pub_info"]["pubyear"],
            "pubType": summary["doctypes"]["doctype"],
            "journalTitle": titles[0]["content"],
            "publisher": _getOptional(summary, "publishers", "publisher",
                                      "names", "name", "unified_name"),
            "area": _getOptional(record, "static_data", "fullrecord_metadata",
                                 "category_info", "subheadings", "subheading"),
            "areaCount": _getOptional(record, "static_data", "fullrecord_metadata",
                                      "category_info", "subheadings", "count"),
            "identifier": identifiers,
            "keywords": _getOptional(record, "static_data", "fullrecord_metadata",
                                     "keywords", "keyword"),
            "abstract": _getOptional(record, "static_data", "fullrecord_metadata",
                                     "abstracts", "abstract", "abstract_text", "p"),
            "doi": doi,
            "validDoi": validDoi,
            "url": url,
        }
        for column, value in row.items():
            dataDict[column].append(value)

        index = index+1

In [5]:
# Save the extracted columns as JSON so that other notebooks can reload them.
# json is already imported in the import cell at the top of the notebook.
with open(dataPath+'clean_dictionary.json', 'w') as fp:
    json.dump(dataDict, fp)

In [None]:

# Build a pandas DataFrame from the dictionary: each key becomes a column.
data = pd.DataFrame(dataDict)


In [None]:
# Unpaywall identifies API users by a contact e-mail address. Read it from the
# local "email" file rather than hard-coding somebody else's address here.
with open("email") as fid:
    unpaywallcred = fid.readline().strip()  # readline() keeps the trailing newline

if not unpaywallcred:
    raise ValueError('the "email" file is empty; put a contact e-mail address on its first line')

UnpywallCredentials(unpaywallcred)

In [64]:
# Query the Unpaywall API for every record that has a DOI, to find out which
# papers are open access. Rows without a DOI are left out of the request.
knownDois = data["doi"].dropna()
upArticles = Unpywall.doi(dois=list(knownDois), errors="ignore")





In [73]:
#merge information based on doi
##TODO
#dataMerge = pd.merge(data, upArticles, on='doi')


    
#'best_oa_location.license'
#df["OA"] = up["is_oa"]
#df["OA-pdf_url"] = up["best_oa_location.url_for_pdf"]

ValueError: Cannot mask with non-boolean array containing NA / NaN values

In [72]:
# IPython help lookup: shows the signature and docstring of pd.merge.
pd.merge?

[1;31mSignature:[0m
[0mpd[0m[1;33m.[0m[0mmerge[0m[1;33m([0m[1;33m
[0m    [0mleft[0m[1;33m:[0m [1;34m'DataFrame | Series'[0m[1;33m,[0m[1;33m
[0m    [0mright[0m[1;33m:[0m [1;34m'DataFrame | Series'[0m[1;33m,[0m[1;33m
[0m    [0mhow[0m[1;33m:[0m [1;34m'str'[0m [1;33m=[0m [1;34m'inner'[0m[1;33m,[0m[1;33m
[0m    [0mon[0m[1;33m:[0m [1;34m'IndexLabel | None'[0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mleft_on[0m[1;33m:[0m [1;34m'IndexLabel | None'[0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mright_on[0m[1;33m:[0m [1;34m'IndexLabel | None'[0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mleft_index[0m[1;33m:[0m [1;34m'bool'[0m [1;33m=[0m [1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mright_index[0m[1;33m:[0m [1;34m'bool'[0m [1;33m=[0m [1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0msort[0m[1;33m:[0m [1;34m'bool'[0m [1;33m=[0m [1;32mFalse[0m[1;33m,[0m[1;33m


In [1]:
# Number of extracted records.
print(len(data))

# Histogram of publication years, with bin edges at every year from 2004 through 2022.
yearBinEdges = list(range(2004, 2023))
fig, ax = plt.subplots()
sns.histplot(data["pubYear"], bins=yearBinEdges, ax=ax)
ax.set(title="Publications per year",
       xlabel="Publication year",
       ylabel="Number of records")
plt.show()

NameError: name 'data' is not defined

In [16]:
#identifier
#obj["record"]["dynamic_data"]["cluster_related"]["identifiers"]["identifier"]

#WOSID 
#obj["record"]["UID"][4:]

#find whether this is an article
#obj["record"]["static_data"]["summary"]["doctypes"]["doctype"]

#find publication title
#obj["record"]["static_data"]["summary"]["titles"]["title"][-1]["content"]

#find journal title
#obj["record"]["static_data"]["summary"]["titles"]["title"][0]["content"]

#find publisher (elsevier, nature, etc)
#obj["record"]["static_data"]["summary"]["publishers"]["publisher"]["names"]["name"]["unified_name"]

#get publication year
#obj["record"]["static_data"]["summary"]["pub_info"]["pubyear"]

#find area (still requires more code to systematically extract all categories)
#obj["record"]["static_data"]["fullrecord_metadata"]["category_info"]["subheadings"]

### Plots for the paper:
- number of papers in OSH over years
  - divide them per area (using research areas tags)
    - Engineering, life sciences, humanities?
- quality of the papers - how many fit the OSHWA OSH classification?
- Accessibility of the papers - how many are OA?