In [1]:
import json_lines
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load data into list, and then load to dataframe
with json_lines.open('../data/ida_wrangling_exercise_data.2017-02-13.jsonl.gz') as f:
    data = [item for item in f]
    df = pd.DataFrame(data)

## Data exploration 

In [3]:
df.head()

Unnamed: 0,address,dob,email,id,name,phone,record_date,ssn
0,"{'street': '86314 David Pass Apt. 211', 'city'...",1971-06-30,opark@hotmail.com,01d68a4c598a45559c06f4df0b3d82cb,"{'firstname': 'Cynthia', 'lastname': 'Dawson',...",624-869-4610,2006-07-08T09:02:13,xxx-xx-2412
1,"20722 Coleman Villages\nEast Rose, SC 71064-5894",1965-09-09,sperez@armstrong.com,876ff718291d4397bb1e0477ceee6ad9,"{'firstname': 'Tamara', 'lastname': 'Myers'}",1-594-462-7759,2009-03-28T20:22:57,xxx-xx-8025
2,"{'street': '6676 Young Square', 'city': 'New J...",1993-04-12,uortiz@gmail.com,81753097bf7e4e2085982f422bdb9cda,"{'firstname': 'Jamie', 'lastname': 'Alexander'}",472.218.5065x389,2016-08-30T20:31:39,xxx-xx-0568
3,"0932 Gomez Drives\nLeefort, MD 46879-3166",1977-04-14,palmerdiane@yahoo.com,2c2f7154b80f40ca80d08c5adc54ea45,"{'firstname': 'Angela', 'lastname': 'Garcia', ...",1-663-109-4460x1080,2001-02-15T18:50:35,xxx-xx-9825
4,"{'street': '158 Smith Vista', 'city': 'East Sh...",1970-03-19,nancymaxwell@gmail.com,4f5263f339694d068e17ee7fdbb852b8,"{'firstname': 'Jennifer', 'lastname': 'Rodrigu...",233-423-3823,2014-06-21T14:36:01,xxx-xx-9104


In [4]:
'Number of rows {} '.format(len(df))

'Number of rows 150000 '

This initial dataframe has 150,000 entries. Some of the fields (e.g. address and name) are recorded as dictionaries and strings. Let's start working with a subset (first 100 entries ) of the data frame. Truncating the dataframe will speed up initial code experimentation. 

### Operating on subset of data 

In [5]:
# Subset wtih first 100 entries of dataframe
df_sub = df[0:100].copy()

### 1. Start by making a list of all of the nested named fields that appear in any record. Concatenate nested field names using a period '.' to defind named fields for nested records. Present the list in alphabetical order.

In [6]:
def expand_dict(dframe, dict_names):
    """This function will take a dataframe and dictionary list as inputs. 
    It will convert dictionary keys into seperate columns within the same dataframe by invoking the apply method.
    It returns the original dataframe with concatenated columns in alphabetical order"""

    # Iterate through list of dictionaries
    for item in dict_names:
        # Exand dictionary keys into columns
        dframe = pd.concat([dframe, dframe[item].apply(pd.Series).add_prefix(
            (item + '.'))], axis=1)

        # Drops extra generated column from apply method
        dframe.drop(item + '.0', axis=1, inplace=True)
    return dframe

In [7]:
#Invoke the function above 
df_sub = expand_dict(df_sub, ['address', 'name'])
df_sub.head()

Unnamed: 0,address,dob,email,id,name,phone,record_date,ssn,address.city,address.state,address.street,address.zip,name.firstname,name.lastname,name.middlename
0,"{'street': '86314 David Pass Apt. 211', 'city'...",1971-06-30,opark@hotmail.com,01d68a4c598a45559c06f4df0b3d82cb,"{'firstname': 'Cynthia', 'lastname': 'Dawson',...",624-869-4610,2006-07-08T09:02:13,xxx-xx-2412,Hoodburgh,RI,86314 David Pass Apt. 211,83973.0,Cynthia,Dawson,Claire
1,"20722 Coleman Villages\nEast Rose, SC 71064-5894",1965-09-09,sperez@armstrong.com,876ff718291d4397bb1e0477ceee6ad9,"{'firstname': 'Tamara', 'lastname': 'Myers'}",1-594-462-7759,2009-03-28T20:22:57,xxx-xx-8025,,,,,Tamara,Myers,
2,"{'street': '6676 Young Square', 'city': 'New J...",1993-04-12,uortiz@gmail.com,81753097bf7e4e2085982f422bdb9cda,"{'firstname': 'Jamie', 'lastname': 'Alexander'}",472.218.5065x389,2016-08-30T20:31:39,xxx-xx-0568,New Julie,UT,6676 Young Square,73125.0,Jamie,Alexander,
3,"0932 Gomez Drives\nLeefort, MD 46879-3166",1977-04-14,palmerdiane@yahoo.com,2c2f7154b80f40ca80d08c5adc54ea45,"{'firstname': 'Angela', 'lastname': 'Garcia', ...",1-663-109-4460x1080,2001-02-15T18:50:35,xxx-xx-9825,,,,,Angela,Garcia,Alexis
4,"{'street': '158 Smith Vista', 'city': 'East Sh...",1970-03-19,nancymaxwell@gmail.com,4f5263f339694d068e17ee7fdbb852b8,"{'firstname': 'Jennifer', 'lastname': 'Rodrigu...",233-423-3823,2014-06-21T14:36:01,xxx-xx-9104,East Sharonstad,ME,158 Smith Vista,42483.0,Jennifer,Rodriguez,


### 2. Answer the following questions for each field in your list from question 1.

- What percentage of the records contain the field?
- What are the five most common values of the field?

In [8]:

def col_percent(dframe):
    '''A funciton to loop through columns and return the percentage of populated items.'''
    for item in dframe:
        print('{} = {:.1f}%'.format(item, 100 *
                                    dframe[item].count() / len(dframe)))

In [9]:
# Invoke the function above 
col_percent(df_sub)

address = 93.0%
dob = 99.0%
email = 81.0%
id = 100.0%
name = 100.0%
phone = 93.0%
record_date = 100.0%
ssn = 96.0%
address.city = 41.0%
address.state = 41.0%
address.street = 41.0%
address.zip = 41.0%
name.firstname = 66.0%
name.lastname = 66.0%
name.middlename = 26.0%


In [29]:
#Five most common values in a column
df_sub['address.state'].value_counts().head(5)

UT    4
RI    4
WY    3
CT    2
LA    2
Name: address.state, dtype: int64

In [26]:
df_sub[['address.state', 'name.firstname']].mode()

Unnamed: 0,address.state,name.firstname
0,RI,Robert
1,UT,


In [57]:
# Create list comprehension to calculate top 5 of each column
#data = {item: df_sub[item].value_counts().head(5) for item in df_sub[['address.state', 'name.firstname']]}
pd.DataFrame((df_sub[item].value_counts().head(5) for item in df_sub[['address.state', 'name.firstname']])
#data

SyntaxError: unexpected EOF while parsing (<ipython-input-57-1695a81cecf8>, line 4)

In [63]:
type([df_sub[item].value_counts().head(5) for item in df_sub[['address.state', 'name.firstname']]])

list

In [89]:
pd.DataFrame(df_sub['address.state'].value_counts().head(5) ).transpose()

Unnamed: 0,UT,RI,WY,CT,LA
address.state,4,4,3,2,2


In [93]:
pd.DataFrame(df_sub['address.state'].value_counts().head(5) )

Unnamed: 0,address.state
UT,4
RI,4
WY,3
CT,2
LA,2


In [78]:
df_sub['address.state'].value_counts().head(5)

UT    4
RI    4
WY    3
CT    2
LA    2
Name: address.state, dtype: int64

In [70]:
pd.concat([df_sub['address.state'].value_counts().head(5) ,df_sub['address.city'].value_counts().head(5)  ], axis = 0)

Unnamed: 0,address.state,address.city
CT,2.0,
Courtneyport,,1.0
East Stephaniefurt,,1.0
LA,2.0,
Lake Laura,,1.0
RI,4.0,
Rodriguezland,,1.0
UT,4.0,
WY,3.0,
West Annaside,,1.0
