![TAP](https://avatars2.githubusercontent.com/u/13385739?v=3&s=200 "TAP")
# Module 2: Working with Data Frames
This module will give you an overview of how to work with and think about data frames.

In [None]:
import json
import random
import trustedanalytics as ia

# Connect to the analytics server (connect() is called with no arguments)
# before issuing any frame operations...
ia.connect()

# Seed the local random number generator so that runs are repeatable.
# The seed has to be passed by *calling* random.seed(); assigning to the name
# would replace the function and leave the generator unseeded.
# 0o1001000 is the explicit octal spelling of the original literal (same
# value under Python 2) and is also valid syntax on Python 3.
# NOTE(review): this seeds the generator in this interpreter only; whether it
# influences functions evaluated through add_columns is not visible here --
# confirm.
random.seed(0o1001000)

In [None]:
# List the names of the frames available to us...
ia.get_frame_names()

In [None]:
# Fetch the inpatient frame, then revert it to its original state by
# dropping every column other than 'data_lines'...
tutorial_inpat = ia.get_frame("tutorial_inpat_frame")
columns_to_drop = [name for name in tutorial_inpat.column_names if name != 'data_lines']
tutorial_inpat.drop_columns(columns_to_drop)

In [None]:
# Let's see what attributes and methods are available on the frame object...
dir(tutorial_inpat)

In [None]:
# List the frame's column names...
tutorial_inpat.column_names

In [None]:
# Let's add patient ages...
def add_age(row):
    """Return the 'AGE' value from the JSON document in the row's first column.

    The first column of ``row`` is parsed as JSON. When the parsed document
    has no 'AGE' entry, 0.0 is returned instead.
    """
    record = json.loads(row[0])
    if 'AGE' not in record:
        return 0.0
    return record['AGE']

def add_random_number(row):
    """Return a random integer between 0 and 1000 inclusive; ``row`` is not used."""
    return random.randint(0, 1000)

# Add the new columns using the add_columns function!
# We'll discuss this function in detail in the next module...
# The AGE column has to be created here: the summary-statistics,
# drop_duplicates, group_by and sort cells further down all reference 'AGE'.
tutorial_inpat.add_columns(add_age, ("AGE", ia.float64))
tutorial_inpat.add_columns(add_random_number, ("RANDOM_NUMBER", int))

In [None]:
# Let's look at our data! inspect() is called here with no arguments...
tutorial_inpat.inspect()

In [None]:
# Yikes! Here's a trick I use: inspect every column except 'data_lines'...
visible_columns = [name for name in tutorial_inpat.column_names if name != 'data_lines']
tutorial_inpat.inspect(columns=visible_columns)

In [None]:
# We can check how many patient encounters were in our data by reading the
# frame's row count...
tutorial_inpat.row_count

In [None]:
# Summary statistics for the 'AGE' column. This requires an 'AGE' column to
# be present in the frame...
tutorial_inpat.column_summary_statistics('AGE')

In [None]:
# Let's create a working copy of our data frame; the cleaning steps that
# follow are applied to `tmp` rather than to `tutorial_inpat`...
tmp = tutorial_inpat.copy()

#### Data Cleaning

In [None]:
# Let's get rid of that data_lines column in the working copy `tmp`, since
# this is just a working copy...
tmp.drop_columns(columns=['data_lines'])

In [None]:
# We can filter our data. This function modifies the frame in place, so it's
# good that we made a copy! The predicate is true for rows whose
# RANDOM_NUMBER value is even.
def has_even_random_number(row):
    """Return True when the row's RANDOM_NUMBER value is even."""
    return row.RANDOM_NUMBER % 2 == 0

tmp.filter(predicate=has_even_random_number)

In [None]:
# Check it: inspect the working copy `tmp`.
tmp.inspect()

In [None]:
# Row count of the working copy at this point (before dropping duplicates)...
tmp.row_count

In [None]:
# Drop duplicate rows from the working copy, passing 'AGE' as the only
# entry in unique_columns...
tmp.drop_duplicates(unique_columns=['AGE'])

In [None]:
# Row count of the working copy again, now that drop_duplicates has run...
tmp.row_count

#### Other Operations of Note

In [None]:
# Aggregate your data using group_by: group the full inpatient frame on
# 'AGE' with the ia.agg.count aggregation and keep the result in
# inpat_age_count...
inpat_age_count = tutorial_inpat.group_by('AGE', ia.agg.count)

In [None]:
# Check it: inspect the grouped result.
inpat_age_count.inspect()

In [None]:
# We can sort our data in place, here on 'count' and then 'AGE' with
# ascending=False.
# NOTE(review): presumably ascending=False applies to both columns -- confirm
# against the sort documentation.
inpat_age_count.sort(columns=['count', 'AGE'], ascending=False)

In [None]:
# Inspect the grouped frame again, now that sort has been applied...
inpat_age_count.inspect()

In [None]:
# We can extract our data for non-ATK work: take as many rows as the frame
# reports, starting at offset 0, with columns=None...
total_rows = inpat_age_count.row_count
local_data = inpat_age_count.take(n=total_rows, offset=0, columns=None)

In [None]:
# Check it: the number of items in the extracted data.
len(local_data)

In [None]:
# The type of the data returned by take()...
type(local_data)

In [None]:
# For comparison, the type of the frame object produced by group_by...
type(inpat_age_count)