In [1]:
import pandas as pd

In [2]:
# Raw CSV file (titanic_train.csv) hosted in the pandas-videos GitHub repository
url = 'https://raw.githubusercontent.com/justmarkham/pandas-videos/master/data/titanic_train.csv'

# Read the CSV straight from the URL into a DataFrame
train = pd.read_csv(url)
# Preview the first rows as the cell output
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Series.map() replaces each value of a Series using a lookup (here a dict)
# It is mostly used to turn categorical labels into numeric codes
# The encoded values are stored in a new 'Sex_num' column

train['Sex_num'] = train['Sex'].map({'female': 0, 'male': 1})

In [5]:
# let's compare the Sex and Sex_num columns
# here we can see that male was mapped to 1 and female to 0
# (.loc slices by label and includes the end label, so this shows rows 0 through 4)

train.loc[:4, ['Sex', 'Sex_num']]

Unnamed: 0,Sex,Sex_num
0,male,1
1,female,0
2,female,0
3,female,0
4,male,1


In [6]:
# apply() function as a Series method
# Applies a function to each element in the Series

# say we want to calculate the length of each string in the "Name" column

# create a new column
# we are applying Python's built-in len function to every name
# (train['Name'].str.len() is the vectorized pandas way to compute the same lengths)

train['Name_length'] = train['Name'].apply(len)

In [7]:
# apply() called the function once per element, so every row has its own length
# (label-based slice: rows 0 through 3, end label included)

train.loc[:3, ['Name', 'Name_length']]

Unnamed: 0,Name,Name_length
0,"Braund, Mr. Owen Harris",23
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",51
2,"Heikkinen, Miss. Laina",22
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",44


In [8]:
import numpy as np

In [9]:
# look at the "Fare" column and round every value up to the next whole number
# numpy's ceil is a ufunc, so calling it on the Series directly gives the same
# result as train['Fare'].apply(np.ceil)

train['Fare_ceil'] = np.ceil(train['Fare'])

In [11]:
# rows labelled 0 through 5, columns from 'Fare' through 'Fare_ceil'
# (.loc label slices include both endpoints), to compare Fare with Fare_ceil
train.loc[0:5, 'Fare':'Fare_ceil']

Unnamed: 0,Fare,Cabin,Embarked,Sex_num,Name_length,Fare_ceil
0,7.25,,S,1,23,8.0
1,71.2833,C85,C,0,51,72.0
2,7.925,,S,0,22,8.0
3,53.1,C123,S,0,44,54.0
4,8.05,,S,1,24,9.0
5,8.4583,,Q,1,16,9.0


In [12]:
# let's extract the last name of each person

# use the .str accessor to split every name at the comma
# each element of the resulting Series is now a list of strings
# each list shown below holds 2 strings

train['Name'].str.split(',')

0                             [Braund,  Mr. Owen Harris]
1      [Cumings,  Mrs. John Bradley (Florence Briggs ...
2                              [Heikkinen,  Miss. Laina]
3        [Futrelle,  Mrs. Jacques Heath (Lily May Peel)]
4                            [Allen,  Mr. William Henry]
                             ...                        
886                             [Montvila,  Rev. Juozas]
887                      [Graham,  Miss. Margaret Edith]
888          [Johnston,  Miss. Catherine Helen "Carrie"]
889                             [Behr,  Mr. Karl Howell]
890                               [Dooley,  Mr. Patrick]
Name: Name, Length: 891, dtype: object

In [13]:
# we just want the first string from each list
# so we create a function that retrieves the element at a given position

def get_element(my_list, position):
    """Return the item of ``my_list`` found at index ``position``.

    ``position`` is passed straight to ``my_list[...]``, so any index the
    container accepts (including a negative one) works.
    """
    item = my_list[position]
    return item

In [17]:
# call our get_element helper on every list produced by the split
# keyword arguments given to apply() are forwarded to the function,
# so position=0 picks the first string of each list

(
    train['Name']
    .str.split(',')
    .apply(get_element, position=0)
    .head()
)

0       Braund
1      Cumings
2    Heikkinen
3     Futrelle
4        Allen
Name: Name, dtype: object

In [19]:
# the same result without a named helper: pass a lambda to apply()
# input: parts (the list of strings for one name)
# output: parts[0] (the first string of that list)

(
    train['Name']
    .str.split(',')
    .apply(lambda parts: parts[0])
    .head()
)

0       Braund
1      Cumings
2    Heikkinen
3     Futrelle
4        Allen
Name: Name, dtype: object

In [20]:
# getting the second string, i.e. everything after the comma (title and given names)
# the last name is the first string, x[0], extracted in the cells above
# note: each value keeps the space that followed the comma in the original name

train.Name.str.split(',').apply(lambda x: x[1]).head()

0                                Mr. Owen Harris
1     Mrs. John Bradley (Florence Briggs Thayer)
2                                    Miss. Laina
3             Mrs. Jacques Heath (Lily May Peel)
4                              Mr. William Henry
Name: Name, dtype: object