In [1]:
from IPython.display import Markdown, display, HTML
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Pandas display: three decimals with thousands separators for plain frames,
# two decimals with thousands separators for Styler output, and up to
# 50 columns shown before truncation.
pd.options.display.float_format = '{:,.3f}'.format
pd.options.styler.format.precision = 2
pd.options.styler.format.thousands = ','
pd.options.display.max_columns = 50
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old aliases, so prefer the new name and only
# fall back to the legacy one on older installations.
plt.style.use(
    'seaborn-v0_8-darkgrid'
    if 'seaborn-v0_8-darkgrid' in plt.style.available
    else 'seaborn-darkgrid'
)

# General Markdown Formatting Functions

def printmd(string, level=1):
    """Display ``string`` as a Markdown heading.

    Parameters
    ----------
    string : str
        Heading text.
    level : int, default 1
        Number of ``#`` characters placed in front of the text.
    """
    heading = f"{'#' * level} {string}"
    display(Markdown(heading))

In [14]:
# Location of the competition's Scholars@TAMU export, relative to this notebook.
base_path = "../../Provided Resources/2022 Student Data Science Competition_TAMIDS/Scholars@TAMU Data"
people_path = base_path + "/people"

# Read the first sheet of each "people" workbook into a dict keyed by table name.
people_raw = {
    table: pd.read_excel(people_path + workbook, sheet_name=0)
    for table, workbook in (
        ("affiliation", "/people_affiliation.xlsx"),
        ("education", "/People_education.xlsx"),
        ("overview", "/people_overview.xlsx"),
        ("subject_areas", "/people_subject_areas.xlsx"),
    )
}

In [9]:
def count_types(df: pd.DataFrame, percentage=False, plot=False, show_table=False, color_map:str=False) -> pd.DataFrame:
    """Tabulate which Python types occur in every column of ``df``.

    Each cell is labelled with ``type(x).__name__``; cells for which
    ``pd.isna`` is true are labelled ``"nan"`` instead. The labels are then
    counted per column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to profile.
    percentage : bool, default False
        If true, express each column's counts as a percentage of that
        column's total.
    plot : bool, default False
        If true, draw a bar chart (one group per column of ``df``) with a
        logarithmic y axis.
    show_table : bool, default False
        If true, display the resulting table.
    color_map : str or False, default False
        Colour passed to ``sns.light_palette`` to shade the displayed table.
        Only used when ``show_table`` is true; a falsy value shows the
        table unstyled.

    Returns
    -------
    pd.DataFrame
        One row per type label and one column per column of ``df``. A label
        that never occurs in a column is NaN there.
    """
    def type_label(value):
        return "nan" if pd.isna(value) else type(value).__name__

    labels = pd.DataFrame({column: df[column].apply(type_label) for column in df})
    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    new_df = labels.apply(lambda column_labels: column_labels.value_counts())

    if percentage:
        new_df = new_df / new_df.sum() * 100

    if plot:
        ax = new_df.T.plot.bar()
        # Counts can differ by orders of magnitude between type labels.
        ax.set_yscale("log")
        plt.show()

    if show_table:
        if color_map:
            cm = sns.light_palette(color_map, as_cmap=True)
            color_df = new_df.style.background_gradient(cmap=cm, axis=None)
            # Styler.applymap was renamed to Styler.map in newer pandas;
            # use whichever this installation provides.
            style_cells = getattr(color_df, "map", None) or color_df.applymap
            # Hide cells for labels that never occur in a column.
            color_df = style_cells(
                lambda x: 'color: transparent; background-color: transparent' if pd.isnull(x) else ''
            )
            display(color_df)
        else:
            display(new_df)

    return new_df

In [13]:
def slice_unnamed_columns(df: pd.DataFrame):
    """Return ``df`` without the columns whose label contains ``'Unnamed:'``.

    Labels are compared by their string form, so frames whose header mixes
    text with numbers or other non-string labels are handled as well; such
    labels are kept unless their string form contains ``'Unnamed:'``.
    """
    # regex=False: 'Unnamed:' is a literal marker, not a pattern.
    is_unnamed = df.columns.astype(str).str.contains('Unnamed:', regex=False)
    return df.loc[:, ~is_unnamed]

In [27]:
# Profile every "people" table: count the Python types found in each column
# (columns whose label contains "Unnamed:" are dropped first) and show the
# most frequent type label per column.
# NOTE(review): despite its name, people_sliced ends up holding the
# type-count tables returned by count_types, not the sliced frames.
people_sliced = {}

for name, df in people_raw.items():
    printmd(name, 3)
    people_sliced[name] = sliced_df = count_types(slice_unnamed_columns(df))
    # idxmax picks, for each column, the row label with the largest count.
    type_dict = sliced_df.idxmax().to_dict()
    display(type_dict)

### affiliation

{'uid': 'str',
 'uin': 'int',
 'people_uri': 'str',
 'people_api': 'str',
 'position_dept_id': 'float'}

### education

{'uid': 'str',
 'uin': 'float',
 'people_uri': 'str',
 'people_api': 'str',
 'external_org_id': 'float',
 'granting_school_name': 'str',
 'degreeuri_text': 'str',
 'year': 'int',
 'major': 'str'}

### overview

{'uid': 'str',
 'uin': 'int',
 'people_uri': 'str',
 'people_api': 'str',
 'status': 'str',
 'dept_id': 'float',
 'lastname': 'str',
 'middle': 'str',
 'firstname': 'str',
 'preferred_title': 'str',
 'email': 'str',
 'overview': 'nan'}

### subject_areas

{'uid': 'str',
 'uin': 'int',
 'people_uri': 'str',
 'people_api': 'str',
 'research_areas': 'nan'}