# Completeness of Scholars@TAMU Data After Cleaning  
**Filename:** completeness_after_clean.ipynb  
**Path:** TAMIDS/Code/Scholars@TAMU Data/completeness_after_clean.ipynb  
**Created Date:** 02 April 2022, 21:30 

This notebook is essentially the same as the earlier completeness notebook, but it runs on the data produced by `cleaning.ipynb`.

In [1]:
from IPython.display import Markdown, display, HTML
from typing import Dict
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import json

# Notebook-wide display settings: number formatting for plain and styled tables,
# and how many rows/columns pandas prints before truncating.
pd.options.display.float_format = '{:,.3f}'.format
pd.options.styler.format.precision = 2
pd.options.styler.format.thousands = ','
pd.options.display.max_columns = 50
pd.options.display.max_rows = 150
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and newer
# releases no longer accept the old names, so prefer the new name when present
# and fall back to the original one on older installations.
plt.style.use(
    'seaborn-v0_8-darkgrid'
    if 'seaborn-v0_8-darkgrid' in plt.style.available
    else 'seaborn-darkgrid'
)

# General Markdown Formatting Functions

def printmd(string, level=1):
    """Display `string` in the notebook as a Markdown heading.

    `level` is the number of '#' characters placed in front of the text
    (1 = top-level heading).
    """
    hashes = '#' * level
    display(Markdown(hashes + ' ' + string))

## Scholars@TAMU

In [2]:
# Root folder of the Scholars@TAMU data, given relative to the notebook's working directory.
base_path = "../../Data/Scholars@TAMU"

# IPython shell escape: list the contents of the data root ($base_path is expanded by IPython).
!ls "$base_path"

[34mcourses[m[m           [34mmy_api_calls[m[m      [34mpeople[m[m
[34mgrants_and_awards[m[m [34morganizations[m[m     [34mpublications[m[m


In [3]:
def count_types(df: pd.DataFrame, percentage=False, plot=False, show_table=False, color_map:str=False) -> pd.DataFrame:
    """Count, for every column of `df`, how many cells hold each Python type.

    Missing cells (scalars flagged by `pd.isna`) are counted under the label
    "nan"; every other cell is counted under `type(value).__name__`.

    Parameters
    ----------
    df : pd.DataFrame
        Table to inspect.
    percentage : bool, default False
        If True, express each count as a percentage of its column total.
    plot : bool, default False
        If True, draw a bar chart of the result with a log-scaled y axis.
    show_table : bool, default False
        If True, display the result in the notebook.
    color_map : str, default False
        Colour handed to `sns.light_palette` to shade the displayed table.
        Only used when `show_table` is True; a falsy value shows the plain table.

    Returns
    -------
    pd.DataFrame
        One row per type label and one column per column of `df`; NaN where a
        label never occurs in that column.
    """
    def type_label(value):
        # pd.isna on a container cell (list, array, ...) returns an array whose
        # truth value is ambiguous, so only scalars are tested for missingness.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return "nan"
        return type(value).__name__

    labels = pd.DataFrame({column: df[column].apply(type_label) for column in df})
    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    new_df = labels.apply(pd.Series.value_counts)

    if percentage:
        new_df = new_df / new_df.sum() * 100

    if plot:
        new_df.T.plot.bar()
        plt.yscale("log")
        plt.show()

    if show_table:
        if color_map:
            cm = sns.light_palette(color_map, as_cmap=True)
            styled = new_df.style.background_gradient(cmap=cm, axis=None)
            # Styler.applymap was superseded by Styler.map in pandas 2.1; use whichever exists.
            style_cells = getattr(styled, "map", None) or styled.applymap
            # Hide both the text and the background of cells with no count.
            styled = style_cells(
                lambda x: 'color: transparent; background-color: transparent' if pd.isnull(x) else ''
            )
            display(styled)
        else:
            display(new_df)

    return new_df


In [4]:
def slice_unnamed_columns(df: pd.DataFrame):
    """Return `df` without the columns whose label contains 'Unnamed:'.

    All rows are kept; only columns are filtered.
    """
    # Cast labels to str first: the .str accessor raises on integer or
    # mixed-type column labels. regex=False matches the text literally.
    is_unnamed = df.columns.astype(str).str.contains('Unnamed:', regex=False)
    return df.loc[:, ~is_unnamed]

def calculate_completeness(df: pd.DataFrame, show_table=False, color_map:str=False) -> pd.DataFrame:
    """Summarise how complete each column of `df` is.

    Columns removed by `slice_unnamed_columns` are ignored. The returned frame
    has one row per remaining column and four columns:
    'data_completeness' (percentage of non-missing cells), 'total_rows',
    'availible_values' (non-missing count) and 'missing_values'.

    When `show_table` is True the summary is displayed; if `color_map` is also
    truthy it is passed to `sns.color_palette` and the text is coloured by
    'data_completeness' on a fixed 0-100 scale.
    """
    named_df = slice_unnamed_columns(df)
    row_count = named_df.shape[0]
    n_missing = named_df.isna().sum()
    n_present = named_df.count()

    summary = pd.DataFrame({
        'data_completeness': (1 - n_missing / row_count) * 100,
        'total_rows': row_count,
        'availible_values': n_present,
        'missing_values': n_missing
    })

    if not show_table:
        return summary

    if color_map:
        palette = sns.color_palette(color_map, as_cmap=True)
        styled = summary.style.text_gradient(
            cmap=palette, axis=0, gmap=summary['data_completeness'], vmin=0, vmax=100
        )
        display(styled)
    else:
        display(summary)

    return summary

In [5]:
def col_types_to_json(df_dict:Dict[str, pd.DataFrame], folder_name: str):
    """Write the most frequent cell type of every column to a JSON file.

    For each (file_name, DataFrame) pair the columns dropped by
    `slice_unnamed_columns` are ignored, `count_types` tallies the cell types,
    and the label with the highest count is recorded per column. Because
    `count_types` counts missing cells under "nan", that label is recorded
    for a column whose cells are mostly missing.

    The result is written to 'dicts/column_types/<folder_name>_col_types.json'.
    """
    def dominant_types(frame):
        counts = count_types(slice_unnamed_columns(frame))
        return {column: counts[column].idxmax() for column in counts}

    type_dict = {file_name: dominant_types(frame) for file_name, frame in df_dict.items()}

    target = 'dicts/column_types/' + folder_name + '_col_types.json'
    with open(target, 'w') as outfile:
        json.dump(type_dict, outfile, indent=4)

In [6]:
# Load the JSON lookup that the loading cells index by data folder name
# to get the pickle base names to read.
with open('dicts/data_filenames.json', 'r') as infile:
    data_filenames = json.load(infile)

## ./people

In [7]:
people_path = base_path + "/people"

# One DataFrame per people table, keyed by the pickle's base name.
people_data = {
    stem: pd.read_pickle(people_path + "/" + stem + ".pickle")
    for stem in data_filenames['people']
}

In [8]:
# For every people table: print its name as a heading, then show the per-column type counts.
for name in people_data:
    df = people_data[name]
    printmd(name, 3)
    new_df = count_types(df, show_table=True, color_map='green')

### people_affiliation

Unnamed: 0,uid,uin,people_uri,people_api,position_dept_id
float,,,,,5912.0
int,,5944.0,,,
,,,,,32.0
str,5944.0,,5944.0,5944.0,


### people_education

Unnamed: 0,uid,uin,people_uri,people_api,external_org_id,granting_school_name,degreeuri_text,year,major
float,,9118.0,,,9120.0,,,,
int,,,,,,,,8612.0,
,3.0,3.0,3.0,3.0,1.0,,459.0,509.0,1398.0
str,9118.0,,9118.0,9118.0,,9121.0,8662.0,,7723.0


### people_overview

Unnamed: 0,uid,uin,people_uri,people_api,status,dept_id,lastname,middle,firstname,preferred_title,email,overview
float,,,,,,4859.0,,,,,,
int,,4859.0,,,,,,,,,,
,,,,,,,,1531.0,,5.0,129.0,3063.0
str,4859.0,,4859.0,4859.0,4859.0,,4859.0,3328.0,4859.0,4854.0,4730.0,1796.0


### people_subject_areas

Unnamed: 0,uid,uin,people_uri,people_api,research_areas
int,,7328.0,,,
,,,,,4226.0
str,7328.0,,7328.0,7328.0,3102.0


In [9]:
# Completeness summary of every people table, keyed by table name.
people_stats = {}

for name in people_data:
    df = people_data[name]
    printmd(name, 3)
    people_stats[name] = calculate_completeness(df, show_table=True, color_map='light:red_r')

### people_affiliation

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
uid,100.0,5944,5944,0
uin,100.0,5944,5944,0
people_uri,100.0,5944,5944,0
people_api,100.0,5944,5944,0
position_dept_id,99.46,5944,5912,32


### people_education

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
uid,99.97,9121,9118,3
uin,99.97,9121,9118,3
people_uri,99.97,9121,9118,3
people_api,99.97,9121,9118,3
external_org_id,99.99,9121,9120,1
granting_school_name,100.0,9121,9121,0
degreeuri_text,94.97,9121,8662,459
year,94.42,9121,8612,509
major,84.67,9121,7723,1398


### people_overview

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
uid,100.0,4859,4859,0
uin,100.0,4859,4859,0
people_uri,100.0,4859,4859,0
people_api,100.0,4859,4859,0
status,100.0,4859,4859,0
dept_id,100.0,4859,4859,0
lastname,100.0,4859,4859,0
middle,68.49,4859,3328,1531
firstname,100.0,4859,4859,0
preferred_title,99.9,4859,4854,5


### people_subject_areas

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
uid,100.0,7328,7328,0
uin,100.0,7328,7328,0
people_uri,100.0,7328,7328,0
people_api,100.0,7328,7328,0
research_areas,42.33,7328,3102,4226


## ./courses

In [10]:
courses_path = base_path + "/courses"

# One DataFrame per courses table, keyed by the pickle's base name.
courses_data = {
    stem: pd.read_pickle(courses_path + "/" + stem + ".pickle")
    for stem in data_filenames['courses']
}

In [11]:
# For every courses table: print its name as a heading, then show the per-column type counts.
for name in courses_data:
    df = courses_data[name]
    printmd(name, 3)
    new_df = count_types(df, show_table=True, color_map='green')

### course_overview

Unnamed: 0,uid,uin,people_uri,people_api,course,course_name,acad_dept,year
int,,70291.0,,,,,,70258.0
,,,,,,,,33.0
str,70291.0,,70291.0,70291.0,70291.0,70291.0,70291.0,


In [12]:
# Completeness summary of every courses table, keyed by table name.
courses_stats = {}

for name in courses_data:
    df = courses_data[name]
    printmd(name, 3)
    courses_stats[name] = calculate_completeness(df, show_table=True, color_map='light:red_r')

### course_overview

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
uid,100.0,70291,70291,0
uin,100.0,70291,70291,0
people_uri,100.0,70291,70291,0
people_api,100.0,70291,70291,0
course,100.0,70291,70291,0
course_name,100.0,70291,70291,0
acad_dept,100.0,70291,70291,0
year,99.95,70291,70258,33


## ./grants_and_awards

In [13]:
grants_path = base_path + "/grants_and_awards"

# One DataFrame per grants/awards table, keyed by the pickle's base name.
grants_data = {
    stem: pd.read_pickle(grants_path + "/" + stem + ".pickle")
    for stem in data_filenames['grants_and_awards']
}

In [14]:
# For every grants/awards table: print its name as a heading, then show the per-column type counts.
for name in grants_data:
    df = grants_data[name]
    printmd(name, 3)
    new_df = count_types(df, show_table=True, color_map='green')

### awards_overview

Unnamed: 0,uid,uin,people_uri,people_api,award_id,year,award_name,authoritative_name_id
float,,,,,4198.0,4198.0,,4192.0
int,,10052.0,,,,,,
,,,,,5854.0,5854.0,5860.0,5860.0
str,10052.0,,10052.0,10052.0,,,4192.0,


### grants_abstract

Unnamed: 0,grant_uri,grant_api,abstract
,,,2970
str,5802.0,5802.0,2832


### grants_overview

Unnamed: 0,uid,uin,people_uri,people_api,grant_uid,grant_uri,grant_api,start_year,end_year,role,grant_awarded_by_external_org_id,title
float,,,,,,,,5752.0,5729.0,,5800.0,
int,,5803.0,,,,,,,,,,
,,,,,,,,51.0,74.0,,3.0,
str,5803.0,,5803.0,5803.0,5803.0,5803.0,5803.0,,,5803.0,,5803.0


In [15]:
# Completeness summary of every grants/awards table, keyed by table name.
grants_stats = {}

for name in grants_data:
    df = grants_data[name]
    printmd(name, 3)
    grants_stats[name] = calculate_completeness(df, show_table=True, color_map='light:red_r')

### awards_overview

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
uid,100.0,10052,10052,0
uin,100.0,10052,10052,0
people_uri,100.0,10052,10052,0
people_api,100.0,10052,10052,0
award_id,41.76,10052,4198,5854
year,41.76,10052,4198,5854
award_name,41.7,10052,4192,5860
authoritative_name_id,41.7,10052,4192,5860


### grants_abstract

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
grant_uri,100.0,5802,5802,0
grant_api,100.0,5802,5802,0
abstract,48.81,5802,2832,2970


### grants_overview

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
uid,100.0,5803,5803,0
uin,100.0,5803,5803,0
people_uri,100.0,5803,5803,0
people_api,100.0,5803,5803,0
grant_uid,100.0,5803,5803,0
grant_uri,100.0,5803,5803,0
grant_api,100.0,5803,5803,0
start_year,99.12,5803,5752,51
end_year,98.72,5803,5729,74
role,100.0,5803,5803,0


## ./organizations

In [16]:
orgs_path = base_path + "/organizations"

# One DataFrame per organizations table, keyed by the pickle's base name.
orgs_data = {
    stem: pd.read_pickle(orgs_path + "/" + stem + ".pickle")
    for stem in data_filenames['organizations']
}

In [17]:
# For every organizations table: print its name as a heading, then show the per-column type counts.
for name in orgs_data:
    df = orgs_data[name]
    printmd(name, 3)
    new_df = count_types(df, show_table=True, color_map='green')

### organizations_external

Unnamed: 0,external_org_id,external_org_uri,external_org_api,authoritative_name,authoritative_name_city,authoritative_name_state,authoritative_name_country,grid_id,aa_id
int,95673.0,,,,,,,,993.0
,,,,1.0,473.0,60626.0,450.0,532.0,94680.0
str,,95673.0,95673.0,95672.0,95200.0,35047.0,95223.0,95141.0,


### organizations_local

Unnamed: 0,[dept_id],[dept_name],[department_uri],[department_api]
int,586.0,,,
str,,586.0,586.0,586.0


In [18]:
# Completeness summary of every organizations table, keyed by table name.
orgs_stats = {}

for name in orgs_data:
    df = orgs_data[name]
    printmd(name, 3)
    orgs_stats[name] = calculate_completeness(df, show_table=True, color_map='light:red_r')

### organizations_external

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
external_org_id,100.0,95673,95673,0
external_org_uri,100.0,95673,95673,0
external_org_api,100.0,95673,95673,0
authoritative_name,100.0,95673,95672,1
authoritative_name_city,99.51,95673,95200,473
authoritative_name_state,36.63,95673,35047,60626
authoritative_name_country,99.53,95673,95223,450
grid_id,99.44,95673,95141,532
aa_id,1.04,95673,993,94680


### organizations_local

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
[dept_id],100.0,586,586,0
[dept_name],100.0,586,586,0
[department_uri],100.0,586,586,0
[department_api],100.0,586,586,0


## ./publications

In [19]:
pubs_path = base_path + "/publications"

# One DataFrame per publications table, keyed by the pickle's base name.
pubs_data = {
    stem: pd.read_pickle(pubs_path + "/" + stem + ".pickle")
    for stem in data_filenames['publications']
}

In [20]:
# For every publications table: print its name as a heading, then show the per-column type counts.
for name in pubs_data:
    df = pubs_data[name]
    printmd(name, 3)
    new_df = count_types(df, show_table=True, color_map='green')

### publications_abstract

Unnamed: 0,publication_uri,publication_api,abstract
,,,55692
str,223178.0,223178.0,167486


### publications_author_institutions

Unnamed: 0,publication_uri,publication_api,organisation,city,country
,,,1,,
str,950084.0,950084.0,950083,950084.0,950084.0


### publications_overview

Unnamed: 0,people_uid,uin,people_uri,people_api,dept_id,publication_uid,publication_uri,publication_api,doi,issn,eissn,journal_title,month,year,begin_page,end_page,volume,issue,publisher,publication_type,publication_title
int,,223171.0,,,223171.0,,,,,,,,211516.0,222634.0,,,,,,,
,,,,,,,,,43510.0,34803.0,81816.0,28574.0,11655.0,537.0,,,,,41307.0,,15.0
str,223171.0,,223171.0,223171.0,,223171.0,223171.0,223171.0,179661.0,188368.0,141355.0,194597.0,,,223171.0,223171.0,223171.0,223171.0,181864.0,223171.0,223156.0


### publications_subject_journal_wos

Unnamed: 0,publication_uri,publication_api,wos_id,wos_research_area_id,keyword
float,,,,194946.0,
,,,12475.0,12475.0,12475.0
str,207421.0,207421.0,194946.0,,194946.0


### publications_unsdg

Unnamed: 0,publication_uri,publication_api,category_sdg_id,name
int,,,14154.0,
str,14154.0,14154.0,,14154.0


In [21]:
# Completeness summary of every publications table, keyed by table name.
pubs_stats = {}

for name in pubs_data:
    df = pubs_data[name]
    printmd(name, 3)
    pubs_stats[name] = calculate_completeness(df, show_table=True, color_map='light:red_r')

### publications_abstract

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
publication_uri,100.0,223178,223178,0
publication_api,100.0,223178,223178,0
abstract,75.05,223178,167486,55692


### publications_author_institutions

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
publication_uri,100.0,950084,950084,0
publication_api,100.0,950084,950084,0
organisation,100.0,950084,950083,1
city,100.0,950084,950084,0
country,100.0,950084,950084,0


### publications_overview

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
people_uid,100.0,223171,223171,0
uin,100.0,223171,223171,0
people_uri,100.0,223171,223171,0
people_api,100.0,223171,223171,0
dept_id,100.0,223171,223171,0
publication_uid,100.0,223171,223171,0
publication_uri,100.0,223171,223171,0
publication_api,100.0,223171,223171,0
doi,80.5,223171,179661,43510
issn,84.41,223171,188368,34803


### publications_subject_journal_wos

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
publication_uri,100.0,207421,207421,0
publication_api,100.0,207421,207421,0
wos_id,93.99,207421,194946,12475
wos_research_area_id,93.99,207421,194946,12475
keyword,93.99,207421,194946,12475


### publications_unsdg

Unnamed: 0,data_completeness,total_rows,availible_values,missing_values
publication_uri,100.0,14154,14154,0
publication_api,100.0,14154,14154,0
category_sdg_id,100.0,14154,14154,0
name,100.0,14154,14154,0


## Combined

In [22]:
# Per-folder completeness summaries, in the order the folders were analysed above.
stats_by_folder = {
    'people': people_stats,
    'courses': courses_stats,
    'grants_and_awards': grants_stats,
    'organizations': orgs_stats,
    'publications': pubs_stats,
}

# Stack the tables of each folder, using the table name as the outer index level.
combined_dict = {
    folder: pd.concat(stats.values(), axis=0, keys=stats.keys())
    for folder, stats in stats_by_folder.items()
}

# Stack the folders in turn, giving a (folder, table, column) index.
combined_df = pd.concat(combined_dict.values(), axis=0, keys=combined_dict.keys())

combined_df

Unnamed: 0,Unnamed: 1,Unnamed: 2,data_completeness,total_rows,availible_values,missing_values
people,people_affiliation,uid,100.0,5944,5944,0
people,people_affiliation,uin,100.0,5944,5944,0
people,people_affiliation,people_uri,100.0,5944,5944,0
people,people_affiliation,people_api,100.0,5944,5944,0
people,people_affiliation,position_dept_id,99.462,5944,5912,32
people,people_education,uid,99.967,9121,9118,3
people,people_education,uin,99.967,9121,9118,3
people,people_education,people_uri,99.967,9121,9118,3
people,people_education,people_api,99.967,9121,9118,3
people,people_education,external_org_id,99.989,9121,9120,1
