In [1]:
%run ttcchen.ipynb

# Apply (or, thinking and doing functionally)
- we've talked at length about the benefits of broadcasting and vectorization, but sometimes you need to do something that numpy just can't do
- we can still think and act functionally, and pandas provides two mechanisms for this `apply` and `applymap`

- `applymap` is a function on a `DataFrame` which takes a single parameter, some function to apply to all cells in the `DataFrame`
- the return value is likewise simple, the new `DataFrame`
- let's take a look

In [2]:
%%fakedata
person
------
ssn
random_number(5) as cash_on_hand

In [3]:
person

Unnamed: 0,ssn,cash_on_hand
0,395-97-1843,30223
1,716-83-9743,68043
2,496-63-6342,33561
3,039-26-0496,32450
4,212-88-7817,73929
5,777-22-3715,41296
6,890-15-1622,83494
7,363-05-3389,91562
8,449-53-6927,38910
9,875-54-5391,3790


In [4]:
person.applymap(lambda x: print(x))

395-97-1843
716-83-9743
496-63-6342
039-26-0496
212-88-7817
777-22-3715
890-15-1622
363-05-3389
449-53-6927
875-54-5391
395-97-1843
716-83-9743
496-63-6342
039-26-0496
212-88-7817
777-22-3715
890-15-1622
363-05-3389
449-53-6927
875-54-5391
30223
68043
33561
32450
73929
41296
83494
91562
38910
3790


Unnamed: 0,ssn,cash_on_hand
0,,
1,,
2,,
3,,
4,,
5,,
6,,
7,,
8,,
9,,


In [5]:
def remove_dash(x):
    return x.replace("-","")
person.applymap(remove_dash)

AttributeError: ("'int' object has no attribute 'replace'", 'occurred at index cash_on_hand')

In [6]:
person.dtypes

ssn             object
cash_on_hand     int64
dtype: object

In [7]:
def remove_dash(x):
    if type(x)==str:
        return x.replace("-","")
    return x
person.applymap(remove_dash)

Unnamed: 0,ssn,cash_on_hand
0,395971843,30223
1,716839743,68043
2,496636342,33561
3,39260496,32450
4,212887817,73929
5,777223715,41296
6,890151622,83494
7,363053389,91562
8,449536927,38910
9,875545391,3790


- In the end, I almost never use `applymap`, but `apply` I use on **every** data cleaning approach
- `apply` looks almost the same, but acts across a row or column
- **very very powerful**
- the result of an apply is a `Series` or `DataFrame` object

In [8]:
%%fakedata
people
------
name
address

In [9]:
people

Unnamed: 0,name,address
0,Hannah Thompson,"8745 Mike Terrace\nCostatown, IA 63607"
1,Theresa Wheeler,"96746 Bobby Park\nEast Melinda, WA 89186"
2,William Pruitt,258 James Isle Suite 973\nPort Christophervill...
3,Jorge Wheeler II,"9299 Jason Valley\nAdamburgh, NY 43096"
4,Michael Hood,"5630 Joseph Light\nCynthiamouth, WI 06236"
5,Rebecca Romero,"14108 Smith Stream\nStevenstad, MD 13142"
6,Travis Williams,"141 Hartman Route Apt. 757\nWest Patrickmouth,..."
7,Matthew Davis,"301 Wendy Cape Suite 739\nEricville, TX 82238"
8,Stephanie Phillips,"PSC 1218, Box 0557\nAPO AP 05617"
9,Melissa Morgan,579 Roberson Junctions Apt. 424\nSouth Kimberl...


In [11]:
# so, how should we clean this?
people.head()

Unnamed: 0,name,address
0,Hannah Thompson,"8745 Mike Terrace\nCostatown, IA 63607"
1,Theresa Wheeler,"96746 Bobby Park\nEast Melinda, WA 89186"
2,William Pruitt,258 James Isle Suite 973\nPort Christophervill...
3,Jorge Wheeler II,"9299 Jason Valley\nAdamburgh, NY 43096"
4,Michael Hood,"5630 Joseph Light\nCynthiamouth, WI 06236"


In [15]:

def get_zip(row):
    print(type(row))
    return

people.apply(get_zip, axis=1)

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


0    None
1    None
2    None
3    None
4    None
5    None
6    None
7    None
8    None
9    None
dtype: object

In [22]:
def get_zip(row):
    '''This function expectes a parameter called row which is of type Series.
    The expectation is that the Series object has two fields in it, one for
    the name of a person and one for the address of the person. This function
    will pull out and return the zip code for the address'''
    # little tests to make sure our row is formatted properly
    assert("name" in row)
    assert("address" in row)
    
    # For fun, an alternative is
    # return row["address"][-5] ? TODO test if this is reasonable
    
    # I <3 regex, lets use it to find the zip code    
    import re
    pattern="\d\d\d\d\d"
    zips=re.findall(pattern,row["address"])
    
    if len(zips) >0:
        return zips[-1]
    return None

people["zip codes"]=people.apply(get_zip, axis=1)
people

Unnamed: 0,name,address,zip codes
0,Hannah Thompson,"8745 Mike Terrace\nCostatown, IA 63607",63607
1,Theresa Wheeler,"96746 Bobby Park\nEast Melinda, WA 89186",89186
2,William Pruitt,258 James Isle Suite 973\nPort Christophervill...,34980
3,Jorge Wheeler II,"9299 Jason Valley\nAdamburgh, NY 43096",43096
4,Michael Hood,"5630 Joseph Light\nCynthiamouth, WI 06236",6236
5,Rebecca Romero,"14108 Smith Stream\nStevenstad, MD 13142",13142
6,Travis Williams,"141 Hartman Route Apt. 757\nWest Patrickmouth,...",38307
7,Matthew Davis,"301 Wendy Cape Suite 739\nEricville, TX 82238",82238
8,Stephanie Phillips,"PSC 1218, Box 0557\nAPO AP 05617",5617
9,Melissa Morgan,579 Roberson Junctions Apt. 424\nSouth Kimberl...,24779


In [26]:
def names(row):
    row["first_name"]=row["name"].split(" ")[0]
    row["last_name"]=row["name"].split(" ")[1]
    return row
people=people.apply(names,axis=1)

In [28]:
people

Unnamed: 0,name,address,zip codes,first_name,last_name
0,Hannah Thompson,"8745 Mike Terrace\nCostatown, IA 63607",63607,Hannah,Thompson
1,Theresa Wheeler,"96746 Bobby Park\nEast Melinda, WA 89186",89186,Theresa,Wheeler
2,William Pruitt,258 James Isle Suite 973\nPort Christophervill...,34980,William,Pruitt
3,Jorge Wheeler II,"9299 Jason Valley\nAdamburgh, NY 43096",43096,Jorge,Wheeler
4,Michael Hood,"5630 Joseph Light\nCynthiamouth, WI 06236",6236,Michael,Hood
5,Rebecca Romero,"14108 Smith Stream\nStevenstad, MD 13142",13142,Rebecca,Romero
6,Travis Williams,"141 Hartman Route Apt. 757\nWest Patrickmouth,...",38307,Travis,Williams
7,Matthew Davis,"301 Wendy Cape Suite 739\nEricville, TX 82238",82238,Matthew,Davis
8,Stephanie Phillips,"PSC 1218, Box 0557\nAPO AP 05617",5617,Stephanie,Phillips
9,Melissa Morgan,579 Roberson Junctions Apt. 424\nSouth Kimberl...,24779,Melissa,Morgan


In [29]:
%%fakedata
person_info
-----------
address as home
address as work
address as school
ssn

In [38]:
person_info=person_info.set_index("ssn")

In [57]:
def get_state(cell):
    import re
    results=re.findall(r"([A-Z]{2}) (?:\d\d\d\d\d)$",cell)
    return results[0]

person_info.applymap(get_state)

Unnamed: 0_level_0,home,work,school
ssn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
107-75-7841,TX,CA,SD
622-98-3196,UT,AE,CA
580-74-1264,ME,NY,WA
644-99-1686,AP,KY,CA
055-51-4765,HI,VT,MD
128-02-1128,AE,ME,CO
391-94-6266,AK,MD,OR
143-61-7753,MN,AK,NJ
101-65-8759,AL,ND,SD
786-91-1224,ME,AR,AA


In [52]:
person_info.loc["107-75-7841"]["home"]='91255 Aaron Village Apt. ZZ 81718\nWest Nicole, TX 81719'

In [49]:
person_info

Unnamed: 0_level_0,home,work,school
ssn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
107-75-7841,91255 Aaron Village Apt. TX 81719\nWest Nicole...,"24020 Tina Cliff\nSouth Traceyberg, CA 73353","8631 Julie Road\nDerekport, SD 73915"
622-98-3196,"5669 John Motorway\nWalkerchester, UT 95751",Unit 4457 Box 7085\nDPO AE 26053,"53006 Johnson Skyway\nLittleburgh, CA 34596"
580-74-1264,"32660 Moses Plaza\nPort Timothy, ME 81842","426 Michael Brook Apt. 394\nOsbornmouth, NY 34485","4408 Duarte Oval Apt. 087\nEast Carl, WA 05392"
644-99-1686,Unit 7098 Box 4563\nDPO AP 33062,"339 Cindy Gateway Suite 774\nNew Michelleside,...","930 Myers Station\nLevifort, CA 64655"
055-51-4765,"9165 Douglas Turnpike Suite 884\nSouth Mike, H...","631 Sarah Fields Suite 148\nJacobland, VT 53635","4450 Angela Passage\nHoustonborough, MD 96898"
128-02-1128,USS Lester\nFPO AE 25769,"105 Elizabeth Plains\nEast Ethan, ME 47701","20997 Keith Court\nPort Michael, CO 16085"
391-94-6266,"90506 Woods Rapids\nLake Samantha, AK 92366","549 Hancock Estates\nSouth Louis, MD 26054","5901 Angela Skyway Apt. 842\nPetersenchester, ..."
143-61-7753,"32317 Stephens Manors\nJohnshire, MN 66642","65667 Rodriguez Mills Suite 050\nReedview, AK ...","95185 Pennington Cove\nWest Ashleeland, NJ 69190"
101-65-8759,"105 Glenn Freeway\nCarlosbury, AL 25785","23139 Yesenia Plains Suite 080\nPort Eric, ND ...","23546 Karen Spur\nPort Cynthia, SD 69892"
786-91-1224,"0924 Carrie Brooks\nGuerreroberg, ME 29281","068 Michael Vista Suite 011\nNew Leslieside, A...",Unit 3986 Box 9374\nDPO AA 14577


In [62]:
%%fakedata
person
------
name
address
random_number(5) as num

In [65]:
def clean_name(some_string):
    lst=some_string.split(" ")[0:2]
    return "{} {}".format(lst[0],lst[1])

person["cleaned_name"]=person["name"].apply(clean_name)

In [72]:
#TODO: Chris to figure out and show?

def first_last(some_string):
    lst=some_string.split(" ")[0:2]
    return lst[0],lst[1]
person["first"],person["last"]=person["name"].apply(first_last)

ValueError: too many values to unpack (expected 2)

In [70]:
person

Unnamed: 0,name,address,num,cleaned_name,"(first, last)"
0,Matthew Wade PhD,"7498 Wright View\nShariville, KS 61265",69426,Matthew Wade,"(Matthew, Wade)"
1,Richard Mcclain,"809 Tami Way\nNorth Jamesside, NH 33598",61644,Richard Mcclain,"(Richard, Mcclain)"
2,Erika Brandt,"PSC 8022, Box 5200\nAPO AP 49782",62341,Erika Brandt,"(Erika, Brandt)"
3,Kenneth Myers,"2265 Dawson Lake Suite 627\nJacksonton, WV 22724",14733,Kenneth Myers,"(Kenneth, Myers)"
4,Kayla Hendrix,"61303 James Drives Suite 218\nSandersburgh, OK...",20761,Kayla Hendrix,"(Kayla, Hendrix)"
5,Debbie Logan,"2696 Melissa Trail Suite 637\nLaurenmouth, MI ...",95781,Debbie Logan,"(Debbie, Logan)"
6,Megan Miller,"113 Moore Spurs\nMackstad, OK 03901",2802,Megan Miller,"(Megan, Miller)"
7,Robert Fischer,"49121 Medina Green\nStacystad, CT 27914",23054,Robert Fischer,"(Robert, Fischer)"
8,Katie Beltran,"91134 Jason Extension\nNew Austin, UT 90128",45423,Katie Beltran,"(Katie, Beltran)"
9,Daniel Hoffman,"309 Harmon Squares Apt. 823\nJoshualand, AK 65117",60330,Daniel Hoffman,"(Daniel, Hoffman)"
