In [239]:
import pandas as pd

In [240]:
# read the presidents table from the CSV file and preview the first rows
csv_path = 'datasets/presidents.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,#,President,Born,Age atstart of presidency,Age atend of presidency,Post-presidencytimespan,Died,Age
0,1,George Washington,"Feb 22, 1732[a]","57 years, 67 daysApr 30, 1789","65 years, 10 daysMar 4, 1797","2 years, 285 days","Dec 14, 1799","67 years, 295 days"
1,2,John Adams,"Oct 30, 1735[a]","61 years, 125 daysMar 4, 1797","65 years, 125 daysMar 4, 1801","25 years, 122 days","Jul 4, 1826","90 years, 247 days"
2,3,Thomas Jefferson,"Apr 13, 1743[a]","57 years, 325 daysMar 4, 1801","65 years, 325 daysMar 4, 1809","17 years, 122 days","Jul 4, 1826","83 years, 82 days"
3,4,James Madison,"Mar 16, 1751[a]","57 years, 353 daysMar 4, 1809","65 years, 353 daysMar 4, 1817","19 years, 116 days","Jun 28, 1836","85 years, 104 days"
4,5,James Monroe,"Apr 28, 1758","58 years, 310 daysMar 4, 1817","66 years, 310 daysMar 4, 1825","6 years, 122 days","Jul 4, 1831","73 years, 67 days"


In [241]:
# derive the name parts with regex substitution on the 'President' column:
# dropping everything from the first space onward leaves the first name,
# dropping everything up to the last space leaves the last name
full_names = df['President']
df['First Name'] = full_names.replace(to_replace = '[ ].*', value = '', regex = True)
df['Last Name'] = full_names.replace(to_replace = '.*[ ]', value = '', regex = True)
df.head()

Unnamed: 0,#,President,Born,Age atstart of presidency,Age atend of presidency,Post-presidencytimespan,Died,Age,First Name,Last Name
0,1,George Washington,"Feb 22, 1732[a]","57 years, 67 daysApr 30, 1789","65 years, 10 daysMar 4, 1797","2 years, 285 days","Dec 14, 1799","67 years, 295 days",George,Washington
1,2,John Adams,"Oct 30, 1735[a]","61 years, 125 daysMar 4, 1797","65 years, 125 daysMar 4, 1801","25 years, 122 days","Jul 4, 1826","90 years, 247 days",John,Adams
2,3,Thomas Jefferson,"Apr 13, 1743[a]","57 years, 325 daysMar 4, 1801","65 years, 325 daysMar 4, 1809","17 years, 122 days","Jul 4, 1826","83 years, 82 days",Thomas,Jefferson
3,4,James Madison,"Mar 16, 1751[a]","57 years, 353 daysMar 4, 1809","65 years, 353 daysMar 4, 1817","19 years, 116 days","Jun 28, 1836","85 years, 104 days",James,Madison
4,5,James Monroe,"Apr 28, 1758","58 years, 310 daysMar 4, 1817","66 years, 310 daysMar 4, 1825","6 years, 122 days","Jul 4, 1831","73 years, 67 days",James,Monroe


In [242]:
# there is a better way to do this, using the apply() function,
# so first remove the two name columns from the dataframe
for column in ('First Name', 'Last Name'):
    del df[column]

In [243]:
# the apply function takes a user-written function and applies it across a single column or an entire dataframe
def splitname(row):
    """Add 'First Name' and 'Last Name' entries to a row and return it.

    The value under row['President'] is split on whitespace; the first token
    is stored as 'First Name' and the last token as 'Last Name'. The row is
    modified in place and the same object is returned.
    """
    tokens = row['President'].split()
    row['First Name'], row['Last Name'] = tokens[0], tokens[-1]
    return row
# run splitname over the dataframe; axis 1 hands the function one row at a time
df = df.apply(splitname, axis = 1)
df.head()

Unnamed: 0,#,President,Born,Age atstart of presidency,Age atend of presidency,Post-presidencytimespan,Died,Age,First Name,Last Name
0,1,George Washington,"Feb 22, 1732[a]","57 years, 67 daysApr 30, 1789","65 years, 10 daysMar 4, 1797","2 years, 285 days","Dec 14, 1799","67 years, 295 days",George,Washington
1,2,John Adams,"Oct 30, 1735[a]","61 years, 125 daysMar 4, 1797","65 years, 125 daysMar 4, 1801","25 years, 122 days","Jul 4, 1826","90 years, 247 days",John,Adams
2,3,Thomas Jefferson,"Apr 13, 1743[a]","57 years, 325 daysMar 4, 1801","65 years, 325 daysMar 4, 1809","17 years, 122 days","Jul 4, 1826","83 years, 82 days",Thomas,Jefferson
3,4,James Madison,"Mar 16, 1751[a]","57 years, 353 daysMar 4, 1809","65 years, 353 daysMar 4, 1817","19 years, 116 days","Jun 28, 1836","85 years, 104 days",James,Madison
4,5,James Monroe,"Apr 28, 1758","58 years, 310 daysMar 4, 1817","66 years, 310 daysMar 4, 1825","6 years, 122 days","Jul 4, 1831","73 years, 67 days",James,Monroe


In [244]:
# still not a very ideal way to go about this; remove the name columns once more
for column in ['First Name', 'Last Name']:
    del df[column]

In [245]:
# Series.str.extract() takes a regex and returns one column per capture group,
# so the groups decide which parts of each input string are kept
# (?: ... ) marks a non-capturing group: it must match but produces no column
# the pattern is a raw string so that \w reaches the regex engine untouched
# and Python does not warn about an invalid escape sequence
pattern = r'(^[\w]*)(?:.* )([\w]*$)'
# now we perform the syntax Series.str.extract(pattern) [a dataframe column is a series]
# keep the extraction for every row and truncate only what is displayed
names = df['President'].str.extract(pattern)
names.head()

  pattern = '(^[\w]*)(?:.* )([\w]*$)' # ?: used to identify non capturing group


Unnamed: 0,0,1
0,George,Washington
1,John,Adams
2,Thomas,Jefferson
3,James,Madison
4,James,Monroe


In [246]:
# we can name the groups in a regex; the group names become the column labels
# raw string: \w is passed to the regex engine as-is, with no invalid-escape warning
pattern = r'(?P<First>^[\w]*)(?:.* )(?P<Last>[\w]*$)'
# keep the extraction for every row: `names` is assigned back into df in a later
# cell, and a frame truncated with head() would leave all remaining rows as NaN
names = df['President'].str.extract(pattern)
names.head()

  pattern = '(?P<First>^[\w]*)(?:.* )(?P<Last>[\w]*$)'


Unnamed: 0,First,Last
0,George,Washington
1,John,Adams
2,Thomas,Jefferson
3,James,Madison
4,James,Monroe


In [247]:
# copy the extracted columns into the original dataframe
# (column assignment aligns on the index, so rows absent from `names` come out as NaN)
for column in ('First', 'Last'):
    df[column] = names[column]
df.head()

Unnamed: 0,#,President,Born,Age atstart of presidency,Age atend of presidency,Post-presidencytimespan,Died,Age,First,Last
0,1,George Washington,"Feb 22, 1732[a]","57 years, 67 daysApr 30, 1789","65 years, 10 daysMar 4, 1797","2 years, 285 days","Dec 14, 1799","67 years, 295 days",George,Washington
1,2,John Adams,"Oct 30, 1735[a]","61 years, 125 daysMar 4, 1797","65 years, 125 daysMar 4, 1801","25 years, 122 days","Jul 4, 1826","90 years, 247 days",John,Adams
2,3,Thomas Jefferson,"Apr 13, 1743[a]","57 years, 325 daysMar 4, 1801","65 years, 325 daysMar 4, 1809","17 years, 122 days","Jul 4, 1826","83 years, 82 days",Thomas,Jefferson
3,4,James Madison,"Mar 16, 1751[a]","57 years, 353 daysMar 4, 1809","65 years, 353 daysMar 4, 1817","19 years, 116 days","Jun 28, 1836","85 years, 104 days",James,Madison
4,5,James Monroe,"Apr 28, 1758","58 years, 310 daysMar 4, 1817","66 years, 310 daysMar 4, 1825","6 years, 122 days","Jul 4, 1831","73 years, 67 days",James,Monroe


In [248]:
# now, we clean up the born column: keep only the leading "Mon D, YYYY" date
# and drop whatever follows it (e.g. the "[a]" footnote markers)
# raw string avoids the invalid-escape warning; expand=False makes the single
# capture group come back as a Series, which is what a column assignment expects
df['Born'] = df['Born'].str.extract(r'(?P<Born>^[\w]{3} [\d]{1,2}, [\d]{4})', expand = False)
df.head()
# we have cleaned up the date format using regexes and extract() function

  df['Born'] = df["Born"].str.extract('(?P<Born>^[\w]{3} [\d]{1,2}, [\d]{4})')


Unnamed: 0,#,President,Born,Age atstart of presidency,Age atend of presidency,Post-presidencytimespan,Died,Age,First,Last
0,1,George Washington,"Feb 22, 1732","57 years, 67 daysApr 30, 1789","65 years, 10 daysMar 4, 1797","2 years, 285 days","Dec 14, 1799","67 years, 295 days",George,Washington
1,2,John Adams,"Oct 30, 1735","61 years, 125 daysMar 4, 1797","65 years, 125 daysMar 4, 1801","25 years, 122 days","Jul 4, 1826","90 years, 247 days",John,Adams
2,3,Thomas Jefferson,"Apr 13, 1743","57 years, 325 daysMar 4, 1801","65 years, 325 daysMar 4, 1809","17 years, 122 days","Jul 4, 1826","83 years, 82 days",Thomas,Jefferson
3,4,James Madison,"Mar 16, 1751","57 years, 353 daysMar 4, 1809","65 years, 353 daysMar 4, 1817","19 years, 116 days","Jun 28, 1836","85 years, 104 days",James,Madison
4,5,James Monroe,"Apr 28, 1758","58 years, 310 daysMar 4, 1817","66 years, 310 daysMar 4, 1825","6 years, 122 days","Jul 4, 1831","73 years, 67 days",James,Monroe


In [249]:
# pandas has dedicated datetime support, which suits a column such as 'Born'
born_dates = pd.to_datetime(df['Born'])
df['Born'] = born_dates
df.head()
# the column now holds datetime values instead of text, shown as YYYY-MM-DD

Unnamed: 0,#,President,Born,Age atstart of presidency,Age atend of presidency,Post-presidencytimespan,Died,Age,First,Last
0,1,George Washington,1732-02-22,"57 years, 67 daysApr 30, 1789","65 years, 10 daysMar 4, 1797","2 years, 285 days","Dec 14, 1799","67 years, 295 days",George,Washington
1,2,John Adams,1735-10-30,"61 years, 125 daysMar 4, 1797","65 years, 125 daysMar 4, 1801","25 years, 122 days","Jul 4, 1826","90 years, 247 days",John,Adams
2,3,Thomas Jefferson,1743-04-13,"57 years, 325 daysMar 4, 1801","65 years, 325 daysMar 4, 1809","17 years, 122 days","Jul 4, 1826","83 years, 82 days",Thomas,Jefferson
3,4,James Madison,1751-03-16,"57 years, 353 daysMar 4, 1809","65 years, 353 daysMar 4, 1817","19 years, 116 days","Jun 28, 1836","85 years, 104 days",James,Madison
4,5,James Monroe,1758-04-28,"58 years, 310 daysMar 4, 1817","66 years, 310 daysMar 4, 1825","6 years, 122 days","Jul 4, 1831","73 years, 67 days",James,Monroe


In [250]:
# further cleaning: the source column fuses an age ("57 years, 67 days") with a
# date ("Apr 30, 1789"), so split it into two named groups
# raw string: no invalid-escape warning; the inner unnamed groups were dropped
# because they only added unused numbered columns to the extract() result
pattern = r'(?P<StartAge>\d+\s*years,\s*\d+\s*days)(?P<StartDate>[\w]{3} [\d]{1,2}, [\d]{4})'
start = df['Age atstart of presidency'].str.extract(pattern)
df['Age at Start of Presidency'] = start['StartAge']
df['Start Date of Presidency'] = start['StartDate']
df.head()

  pattern = '(?P<StartAge>(\d+)\s*years,\s*(\d+)\s*days)(?P<StartDate>[\w]{3} [\d]{1,2}, [\d]{4})'


Unnamed: 0,#,President,Born,Age atstart of presidency,Age atend of presidency,Post-presidencytimespan,Died,Age,First,Last,Age at Start of Presidency,Start Date of Presidency
0,1,George Washington,1732-02-22,"57 years, 67 daysApr 30, 1789","65 years, 10 daysMar 4, 1797","2 years, 285 days","Dec 14, 1799","67 years, 295 days",George,Washington,"57 years, 67 days","Apr 30, 1789"
1,2,John Adams,1735-10-30,"61 years, 125 daysMar 4, 1797","65 years, 125 daysMar 4, 1801","25 years, 122 days","Jul 4, 1826","90 years, 247 days",John,Adams,"61 years, 125 days","Mar 4, 1797"
2,3,Thomas Jefferson,1743-04-13,"57 years, 325 daysMar 4, 1801","65 years, 325 daysMar 4, 1809","17 years, 122 days","Jul 4, 1826","83 years, 82 days",Thomas,Jefferson,"57 years, 325 days","Mar 4, 1801"
3,4,James Madison,1751-03-16,"57 years, 353 daysMar 4, 1809","65 years, 353 daysMar 4, 1817","19 years, 116 days","Jun 28, 1836","85 years, 104 days",James,Madison,"57 years, 353 days","Mar 4, 1809"
4,5,James Monroe,1758-04-28,"58 years, 310 daysMar 4, 1817","66 years, 310 daysMar 4, 1825","6 years, 122 days","Jul 4, 1831","73 years, 67 days",James,Monroe,"58 years, 310 days","Mar 4, 1817"


In [251]:
# same split for the end of the presidency: age text first, then the date
# raw string: no invalid-escape warning; the inner unnamed groups were dropped
# because they only added unused numbered columns to the extract() result
pattern = r'(?P<EndAge>\d+\s*years,\s*\d+\s*days)(?P<EndDate>[\w]{3} [\d]{1,2}, [\d]{4})'
end = df['Age atend of presidency'].str.extract(pattern)
df['Age at End of Presidency'] = end['EndAge']
df['End Date of Presidency'] = end['EndDate']
df.head()

  pattern = '(?P<EndAge>(\d+)\s*years,\s*(\d+)\s*days)(?P<EndDate>[\w]{3} [\d]{1,2}, [\d]{4})'


Unnamed: 0,#,President,Born,Age atstart of presidency,Age atend of presidency,Post-presidencytimespan,Died,Age,First,Last,Age at Start of Presidency,Start Date of Presidency,Age at End of Presidency,End Date of Presidency
0,1,George Washington,1732-02-22,"57 years, 67 daysApr 30, 1789","65 years, 10 daysMar 4, 1797","2 years, 285 days","Dec 14, 1799","67 years, 295 days",George,Washington,"57 years, 67 days","Apr 30, 1789","65 years, 10 days","Mar 4, 1797"
1,2,John Adams,1735-10-30,"61 years, 125 daysMar 4, 1797","65 years, 125 daysMar 4, 1801","25 years, 122 days","Jul 4, 1826","90 years, 247 days",John,Adams,"61 years, 125 days","Mar 4, 1797","65 years, 125 days","Mar 4, 1801"
2,3,Thomas Jefferson,1743-04-13,"57 years, 325 daysMar 4, 1801","65 years, 325 daysMar 4, 1809","17 years, 122 days","Jul 4, 1826","83 years, 82 days",Thomas,Jefferson,"57 years, 325 days","Mar 4, 1801","65 years, 325 days","Mar 4, 1809"
3,4,James Madison,1751-03-16,"57 years, 353 daysMar 4, 1809","65 years, 353 daysMar 4, 1817","19 years, 116 days","Jun 28, 1836","85 years, 104 days",James,Madison,"57 years, 353 days","Mar 4, 1809","65 years, 353 days","Mar 4, 1817"
4,5,James Monroe,1758-04-28,"58 years, 310 daysMar 4, 1817","66 years, 310 daysMar 4, 1825","6 years, 122 days","Jul 4, 1831","73 years, 67 days",James,Monroe,"58 years, 310 days","Mar 4, 1817","66 years, 310 days","Mar 4, 1825"


In [252]:
# final tidy-up: give the columns readable names, keep only the cleaned fields,
# and parse the presidency start/end dates
# renaming First/Last directly replaces the copy-then-discard of those columns
df = df.rename(columns = {
    'Post-presidencytimespan' : 'Post Presidency Life Span',
    'First' : 'First Name',
    'Last' : 'Last Name',
})
# the selection below already drops every column not listed, so no explicit
# deletes are needed; .copy() makes the result an independent frame so the
# assignments that follow cannot trigger SettingWithCopyWarning
# NOTE(review): 'Born' and 'Age at End of Presidency' are not in this list and
# are therefore dropped here - confirm that is intended
df = df[['First Name', 'Last Name', 'Start Date of Presidency', 'Age at Start of Presidency', 'End Date of Presidency', 'Died', 'Age', 'Post Presidency Life Span']].copy()
for date_column in ('Start Date of Presidency', 'End Date of Presidency'):
    df[date_column] = pd.to_datetime(df[date_column])
df.head() # data cleaning performed


Unnamed: 0,First Name,Last Name,Start Date of Presidency,Age at Start of Presidency,End Date of Presidency,Died,Age,Post Presidency Life Span
0,George,Washington,1789-04-30,"57 years, 67 days",1797-03-04,"Dec 14, 1799","67 years, 295 days","2 years, 285 days"
1,John,Adams,1797-03-04,"61 years, 125 days",1801-03-04,"Jul 4, 1826","90 years, 247 days","25 years, 122 days"
2,Thomas,Jefferson,1801-03-04,"57 years, 325 days",1809-03-04,"Jul 4, 1826","83 years, 82 days","17 years, 122 days"
3,James,Madison,1809-03-04,"57 years, 353 days",1817-03-04,"Jun 28, 1836","85 years, 104 days","19 years, 116 days"
4,James,Monroe,1817-03-04,"58 years, 310 days",1825-03-04,"Jul 4, 1831","73 years, 67 days","6 years, 122 days"


In [264]:
import pandas as pd
# two small Series that mirror each other: s1 maps numbers to names, s2 maps names to numbers
people = ['Alice', 'Jack', 'Molly']
numbers = [1, 2, 3]
s1 = pd.Series(people, index = numbers)
s2 = pd.Series(numbers, index = people)
# iloc is purely positional, so this picks the second entry regardless of its label
s2.iloc[1]

2