In [1]:
import pandas as pd


In [3]:
# Read the Titanic training data from its short URL, then preview the first five rows.
train = pd.read_csv(filepath_or_buffer='http://bit.ly/kaggletrain')
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


###### Map

This is a series method. 

Scenario - We want to create a dummy variable for the Sex column, i.e. turn it into ones and zeros.
Map allows us to map an existing value of a series to a different set of values

In [4]:
# Build a 0/1 dummy column from Sex: map looks each value up in the dict.
train['Sex_num'] = train.Sex.map({'female':0, 'male':1})

train['Sex_num'] - We are creating a new column in the df

train.Sex - the column we are working on in the df

.map - the map function

{'female':0, 'male':1} - We pass the map function a dictionary where female maps to zero and male maps to one

In [5]:
# Show the original labels next to their numeric encoding for rows 0 to 4.
train[['Sex', 'Sex_num']].loc[0:4]

Unnamed: 0,Sex,Sex_num
0,male,1
1,female,0
2,female,0
3,female,0
4,male,1


Here we can compare the two columns to see that, in Sex_num, the values are indeed zeros and ones rather than male and female
We also see that male has been translated to one and female has been translated to zero

###### Apply

This is both a series and a df method. It applies a function to each element in a series
Scenario - Let's pretend that we want to calculate the length of each string in the Name column and create a new column called Name_length that contains that integer value

In [6]:
# apply calls len once per passenger name; the resulting lengths become a new column.
train['Name_length'] = train.Name.apply(func=len)

In [7]:
# To compare just the Name and Name_length columns for rows 0 to 4
train[['Name', 'Name_length']].loc[0:4]

Unnamed: 0,Name,Name_length
0,"Braund, Mr. Owen Harris",23
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",51
2,"Heikkinen, Miss. Laina",22
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",44
4,"Allen, Mr. William Henry",24


When used as a series method, apply calls whatever function is named inside the parentheses on each element of the series (in our case train.Name) and returns the results
N.B. You pass apply just the name of the function and leave off the function's own parentheses

###### Scenario - We are going to round up all the values in the Fare column using the numpy ceil function

In [8]:
import numpy as np

In [9]:
# Round every fare up to the next whole number by applying numpy's ceil.
train['Fare_ceil'] = train['Fare'].apply(np.ceil)

In [11]:
# Preview the first ten rows (now including the derived columns)
# rather than rendering the entire frame.
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_num,Name_length,Fare_ceil
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,1,23,8.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0,51,72.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,0,22,8.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,0,44,54.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,1,24,9.0
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,1,16,9.0
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,1,23,52.0
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S,1,30,22.0
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,0,49,12.0
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,0,35,31.0


In [12]:
# These are the columns that we want to compare, for rows 0 to 4
train[['Fare', 'Fare_ceil']].loc[0:4]

Unnamed: 0,Fare,Fare_ceil
0,7.25,8.0
1,71.2833,72.0
2,7.925,8.0
3,53.1,54.0
4,8.05,9.0


As we can see, all the numbers in the Fare column have been rounded up to the next whole number, in some places quite drastically (e.g. 8.05 becomes 9.0)

###### Scenario - We are going to extract the last name of each person into its own column

In [13]:
# This gets us some of the way to where we want to get:
# every name is split at the comma into a list of pieces.
train['Name'].str.split(pat=',').head()

0                           [Braund,  Mr. Owen Harris]
1    [Cumings,  Mrs. John Bradley (Florence Briggs ...
2                            [Heikkinen,  Miss. Laina]
3      [Futrelle,  Mrs. Jacques Heath (Lily May Peel)]
4                          [Allen,  Mr. William Henry]
Name: Name, dtype: object

Although this output looks very similar to the Name column in our df, it is very different. In the df each name was a single string, but here each element is a list of strings produced by splitting the name at the comma.

We need Pandas to pull out the first list element from each series element & output it to a new column

In [14]:
def get_element(my_list, position):
    """Return the item stored at ``position`` in ``my_list``.

    Defined as a named function so it can be passed to ``apply`` with
    ``position`` supplied as a keyword argument.
    """
    item = my_list[position]
    return item

In [15]:
# Split each name at the comma, then keep only the first piece of every list.
(
    train.Name
    .str.split(',')
    .apply(get_element, position=0)
    .head()
)

0       Braund
1      Cumings
2    Heikkinen
3     Futrelle
4        Allen
Name: Name, dtype: object

Take the series produced by train.Name.str.split(','), apply the get_element function to every element, and pass it the keyword argument position=0

###### Alternatively, use a lambda function. Lambda functions are used a lot with apply methods

In [16]:
# Same extraction, with an inline lambda standing in for the named helper.
train['Name'].str.split(',').apply(lambda parts: parts[0]).head()

0       Braund
1      Cumings
2    Heikkinen
3     Futrelle
4        Allen
Name: Name, dtype: object

This gives us the same result

###### Apply as a DataFrame method
It applies a function along either axis of a df

In [17]:
# Read the drinks-by-country data from its short URL, then preview the first five rows.
drinks = pd.read_csv(filepath_or_buffer='http://bit.ly/drinksbycountry')
drinks.head(n=5)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [19]:
# We only want to use a subset of this df: the columns from beer_servings
# through wine_servings. Preview the first ten rows instead of rendering all 193.
drinks.loc[:, 'beer_servings':'wine_servings'].head(10)

Unnamed: 0,beer_servings,spirit_servings,wine_servings
0,0,0,0
1,89,132,54
2,25,0,14
3,245,138,312
4,217,57,45
5,102,128,45
6,193,25,221
7,21,179,11
8,261,72,212
9,279,75,191


In [20]:
# axis='index' (the same as axis=0) hands each column to max: one result per column.
drinks.loc[:, 'beer_servings':'wine_servings'].apply(max, axis='index')

beer_servings      376
spirit_servings    438
wine_servings      370
dtype: int64

In [21]:
# axis='columns' (the same as axis=1) hands each row to max: one result per row.
drinks.loc[:, 'beer_servings':'wine_servings'].apply(max, axis='columns')

0        0
1      132
2       25
3      312
4      217
5      128
6      221
7      179
8      261
9      279
10      46
11     176
12      63
13       0
14     173
15     373
16     295
17     263
18      34
19      23
20     167
21     173
22     173
23     245
24      31
25     252
26      25
27      88
28      37
29     144
      ... 
163    178
164     90
165    186
166    280
167     35
168     15
169    258
170    106
171      4
172     36
173     36
174    197
175     51
176     51
177     71
178     41
179     45
180    237
181    135
182    219
183     36
184    249
185    220
186    101
187     21
188    333
189    111
190      6
191     32
192     64
Length: 193, dtype: int64

In [22]:
# For each row, report WHICH column holds the largest value.
# Series.idxmax returns the column label. np.argmax on a Series used to do the
# same, but that behaviour was deprecated (hence the warning it printed) and it
# now returns the integer position instead of the label.
# Equivalent without apply: .idxmax(axis=1)
drinks.loc[:, 'beer_servings':'wine_servings'].apply(pd.Series.idxmax, axis=1)

  return getattr(obj, method)(*args, **kwds)


0        beer_servings
1      spirit_servings
2        beer_servings
3        wine_servings
4        beer_servings
5      spirit_servings
6        wine_servings
7      spirit_servings
8        beer_servings
9        beer_servings
10     spirit_servings
11     spirit_servings
12     spirit_servings
13       beer_servings
14     spirit_servings
15     spirit_servings
16       beer_servings
17       beer_servings
18       beer_servings
19       beer_servings
20       beer_servings
21     spirit_servings
22       beer_servings
23       beer_servings
24       beer_servings
25     spirit_servings
26       beer_servings
27       beer_servings
28       beer_servings
29       beer_servings
            ...       
163    spirit_servings
164      beer_servings
165      wine_servings
166      wine_servings
167    spirit_servings
168    spirit_servings
169    spirit_servings
170      beer_servings
171      wine_servings
172      beer_servings
173      beer_servings
174      beer_servings
175      be

###### Applymap
This is a df method
It applies a function to every element of the df

In [23]:
# applymap calls float on every individual cell of the selected columns.
# NOTE: pandas 2.1 deprecated applymap in favour of the equivalent DataFrame.map.
# Only the first ten rows are displayed; drinks itself is not modified here.
drinks.loc[:, 'beer_servings':'wine_servings'].applymap(float).head(10)

Unnamed: 0,beer_servings,spirit_servings,wine_servings
0,0.0,0.0,0.0
1,89.0,132.0,54.0
2,25.0,0.0,14.0
3,245.0,138.0,312.0
4,217.0,57.0,45.0
5,102.0,128.0,45.0
6,193.0,25.0,221.0
7,21.0,179.0,11.0
8,261.0,72.0,212.0
9,279.0,75.0,191.0


This changes absolutely every element in the df to a float. Anything that starts as an integer will become a float.

In [25]:
# Persist the float conversion back into drinks.
# Assigning through drinks[columns] replaces the columns outright, so the float
# dtype is kept. Assigning through .loc[:, ...] writes into the existing integer
# columns where it can (pandas >= 2.0), which leaves them as integers.
serving_cols = list(drinks.loc[:, 'beer_servings':'wine_servings'].columns)
drinks[serving_cols] = drinks[serving_cols].applymap(float)

This overwrites the df so that all the named columns have now been converted from integers to floats