In [1]:
import numpy as np
import pandas as pd
import time

In [2]:
# Lambda Function
# Anonymous Function
# Simpler to understand and cleaner
# Generally used with Apply and map functions 

# Steps to define a lambda function 
# Variable to assign the function value (similar to a function name)
# Lambda Keyword
# Input Parameters
# Function body

In [150]:
# Example 1 - Square of a number

# Bind the anonymous function to a name so it can be called like a regular function
lambda_function = lambda x: x ** 2

# Call it once with an int and once with a float
print(lambda_function(2), lambda_function(2.5), sep="\n")

4
6.25


In [4]:
# Example 2 - Multiple parameters
# Compute x raised to the power y

# A lambda may take several comma-separated parameters
lambda_function = lambda x, y: x ** y

# Call it with an int base and with a float base
print(lambda_function(2, 3), lambda_function(2.5, 3), sep="\n")

8
15.625


In [5]:
#Example 3
# Use another function inside a lambda
def func(x,y):
    """Return ``x`` raised to the power ``y``."""
    return pow(x, y)

# The lambda simply forwards both arguments to the named function
lambda_function = lambda x, y: func(x, y)
print(lambda_function(2, 3), lambda_function(2.5, 3), sep="\n")

8
15.625


In [12]:
# Map functions
# Used in many cases. The most basic use case is iterating over simple containers such as lists.

# Consider a list, list1, of length 4 whose elements we want to square
list1 = [1,2,3,4]
try:
    sqd_list1 = list1**2
except TypeError as err:
    # Expected failure: a plain list does not support vectorised arithmetic.
    # The error is caught and displayed so that "Restart & Run All" does not stop here.
    print(f"TypeError: {err}")

TypeError: can't multiply sequence by non-int of type 'list'

In [35]:
# Alternative approach
# For loop - visit every element and collect its square in a second list
list1 = [1, 2, 3, 4]
list2 = []
for l in list1:
    list2 += [l ** 2]
print(list2)
# Works, but it is slower and more verbose than it needs to be

[1, 4, 9, 16]


In [126]:
# Using the map function
# Example 1

# map(function, data structure) calls the function on every element of the
# data structure (a list here).
# A lambda is used as the mapped function throughout this notebook.

list_val = [1, 2, 3, 4]

# The lambda takes one element of list_val at a time and returns its square
sqd_list_val = map(lambda x: x ** 2, list_val)

# map returns a map object (first line printed); converting it to a list
# reveals the computed values (second line printed)
print(sqd_list_val, [*sqd_list_val], sep="\n")

<map object at 0x000002CEBDA54148>
[1, 4, 9, 16]


In [7]:
# Map Function Example 2
# Build a third list from the element-wise combination of two lists.
# Two lists are combined, so the lambda takes two parameters, one per list:
#   x - element from list1
#   y - element from list2

list1 = [1,2,3,4]
list2 = [5,6,7,8]

# Index-wise addition
list3 = list(map(lambda x, y: x + y, list1, list2))
print(list3)

# Index-wise multiplication
list3 = list(map(lambda x, y: x * y, list1, list2))
print(list3)

# For comparison: the plain operators do not work element-wise on lists
print(list1 + list2)  # concatenates the two lists instead of adding index-wise
try:
    print(list1 * list2)
except TypeError as err:
    # Expected failure, caught and displayed so the rest of the notebook keeps running
    print(f"TypeError: {err}")

[6, 8, 10, 12]
[5, 12, 21, 32]
[1, 2, 3, 4, 5, 6, 7, 8]


TypeError: can't multiply sequence by non-int of type 'list'

In [9]:
# Using DataFrames for map and apply
# Example 1

# Single-column frame holding the raw answers
df = pd.DataFrame({'Target': ['Yes', 'No', 'Not Sure']})
print(df, df.shape, sep="\n")

     Target
0       Yes
1        No
2  Not Sure
(3, 1)


In [10]:
# Map to convert Yes to 1 and No to 0
# Series.map also accepts a dictionary: every value is looked up in it
# map works on one column at a time, so the column has to be selected first

df['converted'] = df['Target'].map(dict(Yes=1, No=0))
print(df)

     Target  converted
0       Yes        1.0
1        No        0.0
2  Not Sure        NaN


In [11]:
# Alternative: replace, which can also be called on an entire DataFrame
df['converted2'] = df['Target'].replace(to_replace={'Yes': 1, 'No': 0})
print(df)

     Target  converted converted2
0       Yes        1.0          1
1        No        0.0          0
2  Not Sure        NaN   Not Sure


In [12]:
# Apply functions in pandas

# Applicable to a single column or to an entire DataFrame

# Vectorised operations already work on a DataFrame, so why iterate over elements with apply?

# Example: a column of numbers stored as strings
df = pd.DataFrame({'num': ['2', '3', '4']})
print(df)

  num
0   2
1   3
2   4


In [13]:
# Square of the column
# Method 1 - vectorised: cast the whole column to int, then square it in one go
df['num_sqd'] = df['num'].astype(int).pow(2)
print(df)

# Method 2 - apply
# Like map, apply calls the function on each element of the column
df['num_sqd2'] = df['num'].apply(lambda value: int(value) ** 2)
print(df)

  num  num_sqd
0   2        4
1   3        9
2   4       16
  num  num_sqd  num_sqd2
0   2        4         4
1   3        9         9
2   4       16        16


In [14]:
# If apply is called on the DataFrame itself (no column selected), the whole frame is passed in.
# With axis=1 the lambda receives one row at a time as a Series labelled by column name,
# so the needed columns are picked by label.
# (Integer keys such as x[2] on a label-indexed row rely on a positional fallback
# that pandas has deprecated.)

# Access one column
df['num_sqd3'] = df.apply(lambda x: int(x['num_sqd2'])**3, axis=1)
print(df)

# Access multiple columns
df['num_sqd4'] = df.apply(lambda x: int(x['num_sqd2']) + x['num_sqd3'], axis=1)
print(df)

# All of this can also be done directly in vectorised form - so when is apply really needed?

  num  num_sqd  num_sqd2  num_sqd3
0   2        4         4        64
1   3        9         9       729
2   4       16        16      4096
  num  num_sqd  num_sqd2  num_sqd3  num_sqd4
0   2        4         4        64        68
1   3        9         9       729       738
2   4       16        16      4096      4112


In [15]:
# The same two columns as above, computed in vectorised form
df['num_sqd3'] = df['num_sqd2'].pow(3)
df['num_sqd4'] = df['num_sqd2'].add(df['num_sqd3'])
print(df)

  num  num_sqd  num_sqd2  num_sqd3  num_sqd4
0   2        4         4        64        68
1   3        9         9       729       738
2   4       16        16      4096      4112


In [16]:
# Special need for apply

# When a column holds iterable values such as strings or lists, direct vectorisation does not work

# Example dataset
df = pd.DataFrame([['Amit Soni'],['Shrey Pachisia'],['Prakarsh Chauhan']],columns  = ['full_name'])
print(df)

# Task: extract the first name and the surname into separate columns

# On a single string this is straightforward
str1 = 'Amit Soni'
first_name = str1.split(" ")[0]
last_name = str1.split(" ")[1]
print(first_name)
print(last_name)


# Can we do the same directly on the column?
try:
    df['name'] = df['full_name'].split(" ")[0]
except AttributeError as err:
    # Expected failure: a Series has no split method - split belongs to str.
    # Caught and displayed so the rest of the notebook keeps running.
    print(f"AttributeError: {err}")

          full_name
0         Amit Soni
1    Shrey Pachisia
2  Prakarsh Chauhan
Amit
Soni


AttributeError: 'Series' object has no attribute 'split'

In [17]:
# This is where apply is needed: split has to be called on every entry of the column
df = pd.DataFrame({'full_name': ['Amit Soni', 'Shrey Pachisia', 'Prakarsh Chauhan']})

# Piece 0 of the space-separated name is the first name, piece 1 the surname
df['first_name'] = df['full_name'].apply(lambda full: full.split(" ")[0])
df['last_name'] = df['full_name'].apply(lambda full: full.split(" ")[1])
print(df)

          full_name first_name last_name
0         Amit Soni       Amit      Soni
1    Shrey Pachisia      Shrey  Pachisia
2  Prakarsh Chauhan   Prakarsh   Chauhan


In [18]:
# Example 2 - With lists

df = pd.DataFrame([[[1,2,3,4],[5,6,7,8]],[[2,3,4,5],[6,7,8,9]]],columns  = ['list1','list2'])
print(df)

# Goal: add a column holding the sum of the 2nd element of the lists in list1 and list2.
# Two columns are needed, so apply is called on the whole frame with axis=1 (one row at a time).
# Each row arrives as a Series labelled by column name, so the columns are selected by label;
# integer keys (x[0], x[1]) on such a row rely on a positional fallback that pandas has deprecated.

df['sum_list'] = df.apply(lambda x: x['list1'][1] + x['list2'][1], axis=1)
print(df)

          list1         list2
0  [1, 2, 3, 4]  [5, 6, 7, 8]
1  [2, 3, 4, 5]  [6, 7, 8, 9]
          list1         list2  sum_list
0  [1, 2, 3, 4]  [5, 6, 7, 8]         8
1  [2, 3, 4, 5]  [6, 7, 8, 9]        10


In [33]:
# Example - 3
df = pd.read_csv('demo_dataset.csv')
print(df.head())
print(df.shape)

# Keep only the first 10,000 rows.
# .copy() turns the subset into an independent frame, so later cell-by-cell
# assignments do not write into a slice of the full frame (SettingWithCopyWarning).
df = df.iloc[:10000].copy()

   Num1  Num2
0   993   900
1   689   575
2   355   968
3   549   599
4   245   475
(670464, 2)


In [34]:
# Demo to check the run time of for loop and apply functions
# Baseline: an explicit Python loop that handles one row per iteration.
start_time = time.time()
for i in range(len(df)):
    # Each pass reads two cells and writes one cell through label-based .loc
    df.loc[i,'Num3'] = df.loc[i,'Num1'] - df.loc[i,'Num2']
end_time = time.time()-start_time  # despite the name, this holds the elapsed time in seconds
print(end_time)
print(df.head())

9.971112489700317
   Num1  Num2   Num3
0   993   900   93.0
1   689   575  114.0
2   355   968 -613.0
3   549   599  -50.0
4   245   475 -230.0


In [35]:
# Same subtraction done with apply: axis=1 hands each row to the lambda
start_time = time.time()
df['Num3'] = df.apply(lambda x: x['Num1']-x['Num2'],axis = 1)
end_time = time.time()-start_time  # despite the name, this holds the elapsed time in seconds
print(end_time)
print(df.head())

0.49347829818725586
   Num1  Num2   Num3
0   993   900   93.0
1   689   575  114.0
2   355   968 -613.0
3   549   599  -50.0
4   245   475 -230.0


In [139]:
# Environics Example

# Loading Raw Data
# 'en_df.csv' is a relative path, so it is resolved against the current working directory.

df = pd.read_csv('en_df.csv')
# Print the whole frame as plain text
print(df)

def calculate_on_group(x):
    """Broadcast the total of a group back onto every row of that group.

    Parameters
    ----------
    x : pd.Series
        The values of one group, as handed over by ``groupby(...).apply``.

    Returns
    -------
    pd.Series
        A Series with the same index as ``x`` in which every entry is the sum of ``x``.
    """
    # Sum on the Series' own array instead of going Series -> list -> new array:
    # the round trip copies the data twice and lets NumPy re-infer the dtype
    # (which is how int64 input ended up as a platform-sized int32 result).
    return pd.Series(x.to_numpy().sum(), index=x.index)

  Postcode Variable  Variable Population in Postcode   CA Population
0       P1    20-24                                30            436
1       P1    25-29                                34            436
2       P1    30-34                                44            436
3       P2    20-24                                61            436
4       P2    25-29                                75            436
5       P2    30-34                                30            436
6       P3    20-24                                39            436
7       P3    25-29                                54            436
8       P3    30-34                                69            436


In [142]:
# Creating additional columns required for indexing later.
# transform('sum') returns each group's total aligned to the original row index, so the
# result can be assigned straight back as a column. (groupby(...).apply with a function that
# returns a same-index Series prepends the group key to the index in pandas >= 2.0, and the
# result then no longer aligns with df.)
df['Postcode_Population'] = df.groupby('Postcode')['Variable Population in Postcode '].transform('sum')
df['Variable_Population_Canada'] = df.groupby('Variable')['Variable Population in Postcode '].transform('sum')

# Calculating ratios (rounded to 2 decimals)
# Share of the postcode's population that falls into this variable band
df['Postcode_Ratio'] = (df['Variable Population in Postcode '] / df['Postcode_Population']).round(2)
# Share of the CA population that falls into this variable band
df['CA_Ratio'] = (df['Variable_Population_Canada'] / df['CA Population']).round(2)
# Index = postcode share relative to the CA share
df['Index'] = (df['Postcode_Ratio'] / df['CA_Ratio']).round(2)

# Ranking the index within each postcode (rank 1 = highest index)
df['Rank'] = df.groupby('Postcode')['Index'].rank(method='max', ascending=False)
df

Unnamed: 0,Postcode,Variable,Variable Population in Postcode,CA Population,Postcode_Population,Variable_Population_Canada,Postcode_Ratio,CA_Ratio,Index,Rank
0,P1,20-24,30,436,108,130,0.28,0.3,0.93,2.0
1,P1,25-29,34,436,108,163,0.31,0.37,0.84,3.0
2,P1,30-34,44,436,108,143,0.41,0.33,1.24,1.0
3,P2,20-24,61,436,166,130,0.37,0.3,1.23,1.0
4,P2,25-29,75,436,166,163,0.45,0.37,1.22,2.0
5,P2,30-34,30,436,166,143,0.18,0.33,0.55,3.0
6,P3,20-24,39,436,162,130,0.24,0.3,0.8,3.0
7,P3,25-29,54,436,162,163,0.33,0.37,0.89,2.0
8,P3,30-34,69,436,162,143,0.43,0.33,1.3,1.0


In [144]:
# Everything in lambda is passed as a group(as specified) rather than a single element and then the final results are aggregated
def calculate_on_group1(x):
    """Print the incoming group, then return its total repeated on the group's index."""
    print(x)
    group_total = np.array(x.tolist()).sum()
    return pd.Series(group_total, index=x.index)

# The function is handed to apply directly; it is called once per Postcode group
df.groupby('Postcode')['Variable Population in Postcode '].apply(calculate_on_group1)

0    30
1    34
2    44
Name: P1, dtype: int64
3    61
4    75
5    30
Name: P2, dtype: int64
6    39
7    54
8    69
Name: P3, dtype: int64


0    108
1    108
2    108
3    166
4    166
5    166
6    162
7    162
8    162
Name: Variable Population in Postcode , dtype: int32

In [147]:
# Everything in lambda is passed as a single element and then the final results are aggregated
def calculate_on_group2(x):
    """Print the received value and return it wrapped in a pandas Series."""
    print(x)
    wrapped = pd.Series(x)
    return wrapped
# The function is handed to apply directly; it is called once per element of the column
df['Variable Population in Postcode '].apply(calculate_on_group2)

30
34
44
61
75
30
39
54
69


Unnamed: 0,0
0,30
1,34
2,44
3,61
4,75
5,30
6,39
7,54
8,69


In [None]:
#https://www.analyticsvidhya.com/blog/2020/03/what-are-lambda-functions-in-python/#:~:text=We%20can%20use%20the%20apply,gets%20applied%20to%20each%20row
#https://towardsdatascience.com/apply-and-lambda-usage-in-pandas-b13a1ea037f7
#https://www.journaldev.com/33478/pandas-dataframe-apply-examples
#https://towardsdatascience.com/how-to-make-your-pandas-loop-71-803-times-faster-805030df4f06