# Checking the Completeness of Scholars@TAMU Data  
**Filename:** completeness.ipynb  
**Path:** TAMIDS/Code/Scholars@TAMU Data/completeness.ipynb  
**Created Date:** 18 March 2022, 16:34 

In [21]:
from IPython.display import Markdown, display, HTML
from typing import Dict
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import json

# Notebook-wide display defaults: number formatting, Styler formatting and table size limits.
pd.options.display.float_format = '{:,.3f}'.format
pd.options.styler.format.precision = 2
pd.options.styler.format.thousands = ','
pd.options.display.max_columns = 50
pd.options.display.max_rows = 150
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so use whichever name this installation provides.
plt.style.use(
    'seaborn-v0_8-darkgrid' if 'seaborn-v0_8-darkgrid' in plt.style.available
    else 'seaborn-darkgrid'
)

# General Markdown Formatting Functions

def printmd(string, level=1):
    """Display ``string`` as a Markdown heading in the cell output.

    Args:
        string: Text of the heading.
        level: Number of ``#`` characters placed in front of the text.
    """
    heading = "".join(['#' * level, ' ', string])
    display(Markdown(heading))

## Scholars@TAMU

In [1]:
# Relative path to the root folder that holds the Scholars@TAMU data.
base_path = "../../Data/Scholars@TAMU"

# Print the directory tree below base_path (needs the `tree` shell command).
!tree "$base_path"

[01;34m../../Data/Scholars@TAMU[0m
├── [01;34mcourses[0m
│   ├── [00mcourse_overview.pickle[0m
│   └── [00mraw_course_overview.pickle[0m
├── [01;34mgrants_and_awards[0m
│   ├── [00mawards_overview.pickle[0m
│   ├── [00mgrants_abstract.pickle[0m
│   ├── [00mgrants_overview.pickle[0m
│   ├── [00mraw_awards_overview.pickle[0m
│   ├── [00mraw_grants_abstract.pickle[0m
│   └── [00mraw_grants_overview.pickle[0m
├── [01;34morganizations[0m
│   ├── [00morganizations_external.pickle[0m
│   ├── [00morganizations_local.pickle[0m
│   ├── [00mraw_organizations_external.pickle[0m
│   └── [00mraw_organizations_local.pickle[0m
├── [01;34mpeople[0m
│   ├── [00mpeople_affiliation.pickle[0m
│   ├── [00mpeople_education.pickle[0m
│   ├── [00mpeople_overview.pickle[0m
│   ├── [00mpeople_subject_areas.pickle[0m
│   ├── [00mraw_people_affiliation.pickle[0m
│   ├── [00mraw_people_education.pickle[0m
│   ├── [00mraw_people_overview.pickle[0m
│   └── [00mraw_peop

In [23]:
def count_types(df: pd.DataFrame, percentage=False, plot=False, show_table=False, color_map:str=False) -> pd.DataFrame:
    """Count, for every column of ``df``, how many values have each Python type.

    Values that ``pd.isna`` reports as missing are counted under the label
    ``"nan"``; every other value is counted under ``type(value).__name__``.

    Args:
        df: Frame whose cell types are tallied.
        percentage: If true, express each column's counts as a percentage of
            that column's total instead of absolute counts.
        plot: If true, draw a bar chart of the result with a log-scaled y axis.
        show_table: If true, display the resulting table.
        color_map: Color passed to ``sns.light_palette`` to shade the displayed
            table; only used when ``show_table`` is true. A falsy value shows
            the plain table.

    Returns:
        A frame indexed by type label with one column per column of ``df``.
        Type/column combinations that never occur are NaN.
    """
    def _type_label(value):
        return "nan" if pd.isna(value) else type(value).__name__

    # Build the label frame in one step rather than inserting column by column.
    type_labels = pd.DataFrame({column: df[column].apply(_type_label) for column in df})
    # Series.value_counts replaces the top-level pd.value_counts, which is
    # deprecated since pandas 2.1.
    new_df = type_labels.apply(lambda labels: labels.value_counts())

    if percentage:
        new_df = new_df / new_df.sum() * 100

    if plot:
        new_df.T.plot.bar()
        plt.yscale("log")
        plt.show()

    if show_table:
        if color_map:
            cm = sns.light_palette(color_map, as_cmap=True)
            color_df = new_df.style.background_gradient(cmap=cm, axis=None)
            # Styler.applymap was renamed to Styler.map in pandas 2.1; use
            # whichever this pandas version offers.
            style_cells = getattr(color_df, "map", None) or color_df.applymap
            # Hide both text and background of cells with no count.
            color_df = style_cells(
                lambda x: 'color: transparent; background-color: transparent' if pd.isnull(x) else ''
            )
            display(color_df)
        else:
            display(new_df)

    return new_df


In [24]:
def slice_unnamed_columns(df: pd.DataFrame):
    """Return ``df`` without the columns whose label contains ``'Unnamed:'``.

    Labels are converted to ``str`` before matching so that frames with
    non-string or mixed-type column labels are handled instead of raising.
    The match is a literal substring test, not a regular expression.

    Args:
        df: Frame to filter.

    Returns:
        The columns of ``df`` whose label does not contain ``'Unnamed:'``,
        in their original order.
    """
    is_unnamed = df.columns.astype(str).str.contains('Unnamed:', regex=False)
    return df.loc[:, ~is_unnamed]

def calculate_completeness(df: pd.DataFrame, show_table=False, color_map:str=False) -> pd.DataFrame:
    """Summarise how many values are present in each column of ``df``.

    The frame is first passed through ``slice_unnamed_columns``; the summary
    covers only the columns that remain.

    Args:
        df: Frame to analyse.
        show_table: If true, display the summary table.
        color_map: Palette name passed to ``sns.color_palette`` to color the
            displayed text by completeness (scaled 0-100); only used when
            ``show_table`` is true. A falsy value shows the plain table.

    Returns:
        A frame indexed by column name with the columns ``data_completeness``
        (percentage of non-missing values), ``total_rows``,
        ``availible_values`` and ``missing_values``.
    """
    named_cols = slice_unnamed_columns(df)
    row_count = named_cols.shape[0]
    n_missing = named_cols.isna().sum()
    n_present = named_cols.count()

    summary = pd.DataFrame({
        'data_completeness': (1 - n_missing / row_count ) *  100,
        'total_rows': row_count,
        'availible_values': n_present,
        'missing_values': n_missing
    })

    if show_table and color_map:
        palette = sns.color_palette(color_map, as_cmap=True)
        styled = summary.style.text_gradient(
            cmap=palette,
            axis=0,
            gmap=summary['data_completeness'],
            vmin=0,
            vmax=100,
        )
        display(styled)
    elif show_table:
        display(summary)

    return summary

In [25]:
def col_types_to_json(df_dict:Dict[str, pd.DataFrame], folder_name: str):
    """Write the most frequent value type of every column to a JSON file.

    For each frame in ``df_dict`` the 'Unnamed:' columns are dropped, the
    value types are counted with ``count_types``, and the type label with the
    highest count is recorded per column.

    Args:
        df_dict: Frames to analyse, keyed by the name used in the JSON output.
        folder_name: Prefix of the output file, which is written to
            ``dicts/column_types/<folder_name>_col_types.json``.
    """
    import os  # local import: only needed to make sure the output folder exists

    type_dict = {}
    for file_name, df in df_dict.items():
        counted_df = count_types(slice_unnamed_columns(df))
        # NOTE(review): count_types labels missing values "nan", so a column
        # that is mostly empty gets "nan" recorded as its type — confirm this
        # is what consumers of the JSON expect.
        type_dict[file_name] = {column: counted_df[column].idxmax() for column in counted_df}

    out_dir = 'dicts/column_types/'
    # Create the output folder on first use instead of failing on a fresh checkout.
    os.makedirs(out_dir, exist_ok=True)
    with open(out_dir + folder_name + '_col_types.json', 'w') as outfile:
        json.dump(type_dict, outfile, indent=4)

In [26]:
# Load the table names to read for each data folder. The file is opened
# explicitly as UTF-8 so the result does not depend on the system locale.
with open('dicts/raw_filenames.json', 'r', encoding='utf-8') as infile:
    raw_filenames = json.load(infile)

## ./people

In [27]:
# Read every raw people pickle listed in raw_filenames, keyed by table name.
people_path = f"{base_path}/people"

people_raw = {
    picklename: pd.read_pickle(f"{people_path}/raw_{picklename}.pickle")
    for picklename in raw_filenames['people']
}

In [28]:
# For each raw people table, show how many values of each type every column holds.
for name, df in people_raw.items():
    printmd(string=name, level=3)
    new_df = count_types(df=df, show_table=True, color_map='green')

### people_affiliation

Unnamed: 0,uid,uin,people_uri,people_api,position_dept_id
float,,,,,5912.0
int,,5944.0,,,
,,,,,32.0
str,5944.0,,5944.0,5944.0,


### people_education

Unnamed: 0,uid,uin,people_uri,people_api,external_org_id,granting_school_name,degreeuri_text,year,major
float,,9118.0,,,9120.0,,,,
int,,,,,,,,8521.0,
,3.0,3.0,3.0,3.0,1.0,,459.0,509.0,1398.0
str,9118.0,,9118.0,9118.0,,9121.0,8662.0,91.0,7723.0


### people_overview

Unnamed: 0,uid,uin,people_uri,people_api,status,dept_id,lastname,middle,firstname,preferred_title,email,overview,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29
float,,,,,,4859.0,,,,,,,,,,,,,,,,,,,,,,,,
int,,4859.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,,,1.0,1.0,1.0,1532.0,1.0,6.0,130.0,3064.0,4821.0,4829.0,4837.0,4842.0,4845.0,4847.0,4850.0,4854.0,4855.0,4855.0,4856.0,4858.0,4858.0,4858.0,4858.0,4858.0,4859.0,4859.0
str,4860.0,1.0,4860.0,4860.0,4859.0,,4859.0,3328.0,4859.0,4854.0,4730.0,1796.0,39.0,31.0,23.0,18.0,15.0,13.0,10.0,6.0,5.0,5.0,4.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0


### people_subject_areas

Unnamed: 0,uid,uin,people_uri,people_api,research_areas
int,,7328.0,,,
,,,,,4226.0
str,7328.0,,7328.0,7328.0,3102.0


In [29]:
# Completeness summary of each raw people table, keyed by table name.
people_stats = dict()

for name, df in people_raw.items():
    printmd(string=name, level=3)
    people_stats[name] = calculate_completeness(df=df, show_table=True, color_map='light:red_r')

### people_affiliation

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
uid,100.0,5944,5944,0
uin,100.0,5944,5944,0
people_uri,100.0,5944,5944,0
people_api,100.0,5944,5944,0
position_dept_id,99.46,5944,5912,32


### people_education

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
uid,99.97,9121,9118,3
uin,99.97,9121,9118,3
people_uri,99.97,9121,9118,3
people_api,99.97,9121,9118,3
external_org_id,99.99,9121,9120,1
granting_school_name,100.0,9121,9121,0
degreeuri_text,94.97,9121,8662,459
year,94.42,9121,8612,509
major,84.67,9121,7723,1398


### people_overview

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
uid,100.0,4860,4860,0
uin,100.0,4860,4860,0
people_uri,100.0,4860,4860,0
people_api,100.0,4860,4860,0
status,99.98,4860,4859,1
dept_id,99.98,4860,4859,1
lastname,99.98,4860,4859,1
middle,68.48,4860,3328,1532
firstname,99.98,4860,4859,1
preferred_title,99.88,4860,4854,6


### people_subject_areas

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
uid,100.0,7328,7328,0
uin,100.0,7328,7328,0
people_uri,100.0,7328,7328,0
people_api,100.0,7328,7328,0
research_areas,42.33,7328,3102,4226


In [30]:
# Save the most frequent value type of every people column as JSON.
col_types_to_json(df_dict=people_raw, folder_name='people')

## ./courses

In [31]:
# Read every raw courses pickle listed in raw_filenames, keyed by table name.
courses_path = f"{base_path}/courses"

courses_raw = {
    picklename: pd.read_pickle(f"{courses_path}/raw_{picklename}.pickle")
    for picklename in raw_filenames['courses']
}

In [32]:
# For each raw courses table, show how many values of each type every column holds.
for name, df in courses_raw.items():
    printmd(string=name, level=3)
    new_df = count_types(df=df, show_table=True, color_map='green')

### course_overview

Unnamed: 0,uid,uin,people_uri,people_api,course,course_name,acad_dept,year
int,,70297.0,,,,1.0,,70259
,,,,,,,,33
str,70297.0,,70297.0,70297.0,70297.0,70296.0,70297.0,5


In [33]:
# Completeness summary of each raw courses table, keyed by table name.
courses_stats = dict()

for name, df in courses_raw.items():
    printmd(string=name, level=3)
    courses_stats[name] = calculate_completeness(df=df, show_table=True, color_map='light:red_r')

### course_overview

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
uid,100.0,70297,70297,0
uin,100.0,70297,70297,0
people_uri,100.0,70297,70297,0
people_api,100.0,70297,70297,0
course,100.0,70297,70297,0
course_name,100.0,70297,70297,0
acad_dept,100.0,70297,70297,0
year,99.95,70297,70264,33


In [34]:
# Save the most frequent value type of every courses column as JSON.
col_types_to_json(df_dict=courses_raw, folder_name='courses')

## ./grants_and_awards

In [35]:
# Read every raw grants/awards pickle listed in raw_filenames, keyed by table name.
grants_path = f"{base_path}/grants_and_awards"

grants_raw = {
    picklename: pd.read_pickle(f"{grants_path}/raw_{picklename}.pickle")
    for picklename in raw_filenames['grants_and_awards']
}

In [36]:
# For each raw grants/awards table, show how many values of each type every column holds.
for name, df in grants_raw.items():
    printmd(string=name, level=3)
    new_df = count_types(df=df, show_table=True, color_map='green')

### awards_overview

Unnamed: 0,uid,uin,people_uri,people_api,award_id,year,award_name,authoritative_name_id
float,,,,,4199.0,4199.0,,4193.0
int,1.0,10053.0,,,,,1.0,
,,,,,5855.0,5855.0,5861.0,5861.0
str,10053.0,1.0,10054.0,10054.0,,,4192.0,


### grants_abstract

Unnamed: 0,grant_uri,grant_api,abstract,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41,Unnamed: 42
int,,,,17,3,,,,,,,,,,1,1,,,,,,,,,,,,,1,,,1,,,,,,,,,,,
,,,2970.0,5474,5491,5502.0,5512.0,5532.0,5548.0,5558.0,5579.0,5606.0,5611.0,5617.0,5633,5650,5673.0,5683.0,5700.0,5707.0,5719.0,5728.0,5739.0,5745.0,5751.0,5755.0,5763.0,5765.0,5766,5768.0,5768.0,5772,5773.0,5787.0,5789.0,5790.0,5794.0,5795.0,5798.0,5798.0,5798.0,5798.0,5798.0
str,5802.0,5802.0,2832.0,311,308,300.0,290.0,270.0,254.0,244.0,223.0,196.0,191.0,185.0,168,151,129.0,119.0,102.0,95.0,83.0,74.0,63.0,57.0,51.0,47.0,39.0,37.0,35,34.0,34.0,29,29.0,15.0,13.0,12.0,8.0,7.0,4.0,4.0,4.0,4.0,4.0


### grants_overview

Unnamed: 0,uid,uin,people_uri,people_api,grant_uid,grant_uri,grant_api,start_year,end_year,role,grant_awarded_by_external_org_id,title,Unnamed: 12,Unnamed: 13
float,,,,,1.0,,,5752.0,5729.0,,5800.0,,,
int,,5803.0,,,9.0,,,,,,,,,
,,,,,,,,51.0,74.0,,3.0,,5800.0,5800.0
str,5803.0,,5803.0,5803.0,5793.0,5803.0,5803.0,,,5803.0,,5803.0,3.0,3.0


In [37]:
# Completeness summary of each raw grants/awards table, keyed by table name.
grants_stats = dict()

for name, df in grants_raw.items():
    printmd(string=name, level=3)
    grants_stats[name] = calculate_completeness(df=df, show_table=True, color_map='light:red_r')

### awards_overview

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
uid,100.0,10054,10054,0
uin,100.0,10054,10054,0
people_uri,100.0,10054,10054,0
people_api,100.0,10054,10054,0
award_id,41.76,10054,4199,5855
year,41.76,10054,4199,5855
award_name,41.7,10054,4193,5861
authoritative_name_id,41.7,10054,4193,5861


### grants_abstract

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
grant_uri,100.0,5802,5802,0
grant_api,100.0,5802,5802,0
abstract,48.81,5802,2832,2970


### grants_overview

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
uid,100.0,5803,5803,0
uin,100.0,5803,5803,0
people_uri,100.0,5803,5803,0
people_api,100.0,5803,5803,0
grant_uid,100.0,5803,5803,0
grant_uri,100.0,5803,5803,0
grant_api,100.0,5803,5803,0
start_year,99.12,5803,5752,51
end_year,98.72,5803,5729,74
role,100.0,5803,5803,0


In [38]:
# Save the most frequent value type of every grants/awards column as JSON.
col_types_to_json(df_dict=grants_raw, folder_name='grants_and_awards')

## ./organizations

In [39]:
# Read every raw organizations pickle listed in raw_filenames, keyed by table name.
orgs_path = f"{base_path}/organizations"

orgs_raw = {
    picklename: pd.read_pickle(f"{orgs_path}/raw_{picklename}.pickle")
    for picklename in raw_filenames['organizations']
}

In [40]:
# For each raw organizations table, show how many values of each type every column holds.
for name, df in orgs_raw.items():
    printmd(string=name, level=3)
    new_df = count_types(df=df, show_table=True, color_map='green')

### organizations_external

Unnamed: 0,external_org_id,external_org_uri,external_org_api,authoritative_name,authoritative_name_city,authoritative_name_state,authoritative_name_country,grid_id,aa_id,Unnamed: 9
int,95674.0,,,,,,,,993,
,,,,1.0,473.0,60626.0,451.0,532.0,94680,95674.0
str,,95674.0,95674.0,95673.0,95201.0,35048.0,95223.0,95142.0,1,


### organizations_local

Unnamed: 0,[dept_id],[dept_name],[department_uri],[department_api]
int,586.0,,,
str,,586.0,586.0,586.0


In [41]:
# Completeness summary of each raw organizations table, keyed by table name.
orgs_stats = dict()

for name, df in orgs_raw.items():
    printmd(string=name, level=3)
    orgs_stats[name] = calculate_completeness(df=df, show_table=True, color_map='light:red_r')

### organizations_external

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
external_org_id,100.0,95674,95674,0
external_org_uri,100.0,95674,95674,0
external_org_api,100.0,95674,95674,0
authoritative_name,100.0,95674,95673,1
authoritative_name_city,99.51,95674,95201,473
authoritative_name_state,36.63,95674,35048,60626
authoritative_name_country,99.53,95674,95223,451
grid_id,99.44,95674,95142,532
aa_id,1.04,95674,994,94680


### organizations_local

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
[dept_id],100.0,586,586,0
[dept_name],100.0,586,586,0
[department_uri],100.0,586,586,0
[department_api],100.0,586,586,0


In [42]:
# Save the most frequent value type of every organizations column as JSON.
col_types_to_json(df_dict=orgs_raw, folder_name='organizations')

## ./publications

In [43]:
# Read every raw publications pickle listed in raw_filenames, keyed by table name.
pubs_path = f"{base_path}/publications"

pubs_raw = {
    picklename: pd.read_pickle(f"{pubs_path}/raw_{picklename}.pickle")
    for picklename in raw_filenames['publications']
}

In [44]:
# For each raw publications table, show how many values of each type every column holds.
for name, df in pubs_raw.items():
    printmd(string=name, level=3)
    new_df = count_types(df=df, show_table=True, color_map='green')

### publications_abstract

Unnamed: 0,publication_uri,publication_api,abstract,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43,Unnamed: 44,Unnamed: 45,Unnamed: 46,Unnamed: 47,Unnamed: 48,Unnamed: 49,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58,Unnamed: 59,Unnamed: 60,Unnamed: 61,Unnamed: 62,Unnamed: 63,Unnamed: 64,Unnamed: 65,Unnamed: 66,Unnamed: 67,Unnamed: 68,Unnamed: 69,Unnamed: 70,Unnamed: 71,Unnamed: 72,Unnamed: 73,Unnamed: 74,Unnamed: 75,Unnamed: 76,Unnamed: 77,Unnamed: 78,Unnamed: 79,Unnamed: 80
float,,,,6,6,10,3,8,10,12,4,5,5,5,4,2,5,3,,2,1,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
int,,,,57,44,38,27,24,17,20,18,12,12,17,13,9,10,2,3.0,5,2,1,2,2.0,2.0,3.0,5.0,,3.0,2.0,2.0,,1.0,1.0,,,,,2.0,1.0,1.0,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,55692.0,215846,217061,217939,218691,219398,220013,220544,220987,221358,221656,221926,222168,222346,222478,222596,222697.0,222760,222816,222893,222938,222968.0,223001.0,223025.0,223043.0,223060.0,223065.0,223079.0,223086.0,223096.0,223104.0,223112.0,223122.0,223127.0,223132.0,223133.0,223139.0,223140.0,223143.0,223147.0,223147.0,223150.0,223152.0,223156.0,223162.0,223163.0,223166.0,223171.0,223173.0,223173.0,223173.0,223173.0,223174.0,223174.0,223174.0,223176.0,223176.0,223176.0,223176.0,223177.0,223177.0,223177.0,223177.0,223177.0,223177.0,223177.0,223177.0,223177.0,223177.0,223177.0,223177.0,223177.0,223177.0,223177.0,223177.0,223177.0,223177.0,223177.0,223177.0
str,223178.0,223178.0,167486.0,7269,6067,5191,4457,3748,3138,2602,2169,1803,1505,1230,993,821,685,577,478.0,411,359,283,237,208.0,175.0,150.0,130.0,118.0,110.0,97.0,90.0,82.0,73.0,65.0,56.0,51.0,46.0,45.0,37.0,37.0,34.0,31.0,31.0,28.0,26.0,22.0,16.0,15.0,12.0,7.0,5.0,5.0,4.0,5.0,4.0,4.0,4.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### publications_author_institutions

Unnamed: 0,publication_uri,publication_api,organisation,city,country
int,,,,1.0,
,,,1.0,,
str,950084.0,950084.0,950083.0,950083.0,950084.0


### publications_overview

Unnamed: 0,people_uid,uin,people_uri,people_api,dept_id,publication_uid,publication_uri,publication_api,doi,issn,eissn,journal_title,month,year,begin_page,end_page,volume,issue,publisher,publication_type,publication_title,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29
datetime,,,,,,,,,,,,,,,14,17,85,6643,,,,,,,,,,,,
float,,,,,,,,,,,,,,,345,337,21,6,,,,,,,,,,,,
int,,223178.0,,,223178.0,,,,,,,1.0,211518.0,222640.0,190524,181694,185551,141722,1.0,,7.0,3.0,3.0,1.0,1.0,1.0,1.0,3.0,,
,,,,,,,,,43511.0,34804.0,81821.0,28574.0,11656.0,537.0,18454,29315,34059,68192,41309.0,2.0,15.0,223060.0,223116.0,223165.0,223172.0,223174.0,223174.0,223174.0,223174.0,223177.0
str,223178.0,,223178.0,223178.0,,223178.0,223178.0,223178.0,179667.0,188374.0,141357.0,194603.0,4.0,1.0,13833,11801,3419,6586,181868.0,223176.0,223156.0,115.0,59.0,12.0,5.0,3.0,3.0,1.0,4.0,1.0
time,,,,,,,,,,,,,,,8,14,43,29,,,,,,,,,,,,


### publications_subject_journal_wos

Unnamed: 0,publication_uri,publication_api,wos_id,wos_research_area_id,keyword
float,,,,194946.0,
,,,12475.0,12475.0,12475.0
str,207421.0,207421.0,194946.0,,194946.0


### publications_unsdg

Unnamed: 0,publication_uri,publication_api,category_sdg_id,name
int,,,14154.0,
str,14154.0,14154.0,,14154.0


In [45]:
# Completeness summary of each raw publications table, keyed by table name.
pubs_stats = dict()

for name, df in pubs_raw.items():
    printmd(string=name, level=3)
    pubs_stats[name] = calculate_completeness(df=df, show_table=True, color_map='light:red_r')

### publications_abstract

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
publication_uri,100.0,223178,223178,0
publication_api,100.0,223178,223178,0
abstract,75.05,223178,167486,55692


### publications_author_institutions

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
publication_uri,100.0,950084,950084,0
publication_api,100.0,950084,950084,0
organisation,100.0,950084,950083,1
city,100.0,950084,950084,0
country,100.0,950084,950084,0


### publications_overview

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
people_uid,100.0,223178,223178,0
uin,100.0,223178,223178,0
people_uri,100.0,223178,223178,0
people_api,100.0,223178,223178,0
dept_id,100.0,223178,223178,0
publication_uid,100.0,223178,223178,0
publication_uri,100.0,223178,223178,0
publication_api,100.0,223178,223178,0
doi,80.5,223178,179667,43511
issn,84.41,223178,188374,34804


### publications_subject_journal_wos

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
publication_uri,100.0,207421,207421,0
publication_api,100.0,207421,207421,0
wos_id,93.99,207421,194946,12475
wos_research_area_id,93.99,207421,194946,12475
keyword,93.99,207421,194946,12475


### publications_unsdg

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
publication_uri,100.0,14154,14154,0
publication_api,100.0,14154,14154,0
category_sdg_id,100.0,14154,14154,0
name,100.0,14154,14154,0


In [46]:
# Save the most frequent value type of every publications column as JSON.
col_types_to_json(df_dict=pubs_raw, folder_name='publications')

## Combined

In [47]:
# Stack the per-table completeness summaries of each folder into one frame per
# folder (outer index level = table name) ...
combined_dict = {
    folder: pd.concat(stats.values(), axis=0, keys=stats.keys())
    for folder, stats in {
        'people': people_stats,
        'courses': courses_stats,
        'grants_and_awards': grants_stats,
        'organizations': orgs_stats,
        'publications': pubs_stats,
    }.items()
}

# ... then stack the folders, adding the folder name as the outermost index level.
combined_df = pd.concat(combined_dict.values(), axis=0, keys=combined_dict.keys())

combined_df

Unnamed: 0,Unnamed: 1,Unnamed: 2,data_completeness,total_rows,availible_values,missing_values
people,people_affiliation,uid,100.0,5944,5944,0
people,people_affiliation,uin,100.0,5944,5944,0
people,people_affiliation,people_uri,100.0,5944,5944,0
people,people_affiliation,people_api,100.0,5944,5944,0
people,people_affiliation,position_dept_id,99.462,5944,5912,32
people,people_education,uid,99.967,9121,9118,3
people,people_education,uin,99.967,9121,9118,3
people,people_education,people_uri,99.967,9121,9118,3
people,people_education,people_api,99.967,9121,9118,3
people,people_education,external_org_id,99.989,9121,9120,1
