# **Data preparation and feature creation**

# **Arithmetic operations**

### **Add/Sub/Mul/Div between Columns**

In [1]:
import pandas as pd

In [2]:
# Load the Titanic table from a CSV file resolved relative to the working directory.
titanic = pd.read_csv('titanic.csv')

In [3]:
# Summarise dtypes and non-null counts per column to spot missing values.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    object 
 3   age       714 non-null    float64
 4   sibsp     891 non-null    int64  
 5   parch     891 non-null    int64  
 6   fare      891 non-null    float64
 7   embarked  889 non-null    object 
 8   deck      203 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [4]:
# Preview the first 10 rows.
titanic.head(10)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,
5,0,3,male,,0,0,8.4583,Q,
6,0,1,male,54.0,0,0,51.8625,S,E
7,0,3,male,2.0,3,1,21.075,S,
8,1,3,female,27.0,0,2,11.1333,S,
9,1,2,female,14.0,1,0,30.0708,C,


In [5]:
# Impute missing ages with the column mean.
# The result is assigned back to the frame instead of calling
# fillna(inplace=True) on `titanic.age`: that form mutates an intermediate
# Series (chained assignment), which pandas 2.x warns about and which does not
# update `titanic` when Copy-on-Write is enabled.
titanic['age'] = titanic['age'].fillna(titanic['age'].mean())

In [6]:
# Element-wise sum of the sibsp and parch columns using the + operator.
titanic['sibsp'] + titanic['parch']

0      1
1      1
2      0
3      1
4      0
      ..
886    0
887    0
888    3
889    0
890    0
Length: 891, dtype: int64

In [7]:
# Same sum as the + operator, written with the Series.add method.
titanic['sibsp'].add(titanic['parch'])

0      1
1      1
2      0
3      1
4      0
      ..
886    0
887    0
888    3
889    0
890    0
Length: 891, dtype: int64

In [8]:
# Store the sibsp + parch total as a new column.
titanic['num_relat'] = titanic['sibsp'].add(titanic['parch'])

In [9]:
# Preview the first rows to confirm the new num_relat column.
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck,num_relat
0,0,3,male,22.0,1,0,7.25,S,,1
1,1,1,female,38.0,1,0,71.2833,C,C,1
2,1,3,female,26.0,0,0,7.925,S,,0
3,1,1,female,35.0,1,0,53.1,S,C,1
4,0,3,male,35.0,0,0,8.05,S,,0


In [10]:
# Load the sales table from a CSV file resolved relative to the working directory.
sales = pd.read_csv('sales.csv')

In [11]:
# Summarise dtypes and non-null counts per column to spot missing values.
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  4 non-null      object 
 1   Mon         4 non-null      int64  
 2   Tue         4 non-null      int64  
 3   Wed         4 non-null      int64  
 4   Thu         3 non-null      float64
 5   Fri         4 non-null      int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 324.0+ bytes


In [12]:
# Preview the first rows of the sales table.
sales.head()

Unnamed: 0.1,Unnamed: 0,Mon,Tue,Wed,Thu,Fri
0,Steven,34,27,15,,33
1,Mike,45,9,74,87.0,12
2,Andi,17,33,54,8.0,29
3,Paul,87,67,27,45.0,7


In [13]:
# Operator form: a missing value in either column yields NaN in the result.
sales['Mon'] + sales['Thu']

0      NaN
1    132.0
2     25.0
3    132.0
dtype: float64

In [14]:
# Method form: fill_value=0 substitutes 0 for a value missing on one side
# before adding, so the row keeps the other column's figure instead of NaN.
sales['Mon'].add(sales['Thu'], fill_value=0)

0     34.0
1    132.0
2     25.0
3    132.0
dtype: float64

In [15]:
# One bonus rate per row, assigned positionally in the frame's current row
# order — the list length must match the number of rows.
sales['perc_bonus'] = [0.12, 0.15, 0.10, 0.20]

In [16]:
# Operator form: rows with a missing Thu value produce NaN.
sales['Thu'] * sales['perc_bonus']

0      NaN
1    13.05
2     0.80
3     9.00
dtype: float64

In [17]:
# Method form: a missing Thu value is treated as 0, so its product is 0.
sales['Thu'].mul(sales['perc_bonus'], fill_value=0)

0     0.00
1    13.05
2     0.80
3     9.00
dtype: float64

In [18]:
# Thu figure plus its bonus share; missing values count as 0 in both steps.
sales['Thu'].add(
    sales['Thu'].mul(sales['perc_bonus'], fill_value=0),
    fill_value=0,
)

0      0.00
1    100.05
2      8.80
3     54.00
dtype: float64

In [19]:
# Display the frame to check the new perc_bonus column.
sales

Unnamed: 0.1,Unnamed: 0,Mon,Tue,Wed,Thu,Fri,perc_bonus
0,Steven,34,27,15,,33,0.12
1,Mike,45,9,74,87.0,12,0.15
2,Andi,17,33,54,8.0,29,0.1
3,Paul,87,67,27,45.0,7,0.2


In [20]:
# Row-wise total over the weekday columns (sum skips missing values).
# The columns are selected by name rather than with iloc[:, :-1], so the
# result does not depend on perc_bonus being the last column and stays the
# same when the cell is re-run after more columns have been added.
sales[['Mon', 'Tue', 'Wed', 'Thu', 'Fri']].sum(axis=1)

0    109.0
1    227.0
2    141.0
3    233.0
dtype: float64

In [21]:
# Bonus = weekly total (missing days skipped) times the per-row bonus rate.
# The weekday columns are selected by name: with iloc[:, :-1] a re-run of this
# cell would find Bonus as the last column and add perc_bonus into the total.
sales['Bonus'] = (
    sales[['Mon', 'Tue', 'Wed', 'Thu', 'Fri']]
    .sum(axis=1)
    .mul(sales['perc_bonus'])
)

In [22]:
# Display the frame to check the new Bonus column.
sales

Unnamed: 0.1,Unnamed: 0,Mon,Tue,Wed,Thu,Fri,perc_bonus,Bonus
0,Steven,34,27,15,,33,0.12,13.08
1,Mike,45,9,74,87.0,12,0.15,34.05
2,Andi,17,33,54,8.0,29,0.1,14.1
3,Paul,87,67,27,45.0,7,0.2,46.6


### **Add/Sub/Mul/Div between Columns and Scalar**

In [23]:
# 1912 minus age, truncated to a whole number by the int32 cast; first 20 rows.
titanic['age'].rsub(1912).astype('int32').head(20)

0     1890
1     1874
2     1886
3     1877
4     1877
5     1882
6     1858
7     1910
8     1885
9     1898
10    1908
11    1854
12    1892
13    1873
14    1898
15    1857
16    1910
17    1882
18    1881
19    1882
Name: age, dtype: int32

In [24]:
# Reversed subtraction: 1912 - age in a single call.
# fill_value=0 only matters if any age is still missing; such a row would
# evaluate to 1912 - 0.
titanic['age'].rsub(1912, fill_value=0)

0      1890.000000
1      1874.000000
2      1886.000000
3      1877.000000
4      1877.000000
          ...     
886    1885.000000
887    1893.000000
888    1882.300882
889    1886.000000
890    1880.000000
Name: age, Length: 891, dtype: float64

In [25]:
# Store 1912 - age as a whole-number column (the int32 cast truncates).
# fill_value=0 only matters if any age is still missing; such a row would
# evaluate to 1912 - 0.
titanic['YoB'] = titanic['age'].rsub(1912, fill_value=0).astype('int32')

In [26]:
# Preview the first 10 rows to confirm the new YoB column.
titanic.head(10)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck,num_relat,YoB
0,0,3,male,22.0,1,0,7.25,S,,1,1890
1,1,1,female,38.0,1,0,71.2833,C,C,1,1874
2,1,3,female,26.0,0,0,7.925,S,,0,1886
3,1,1,female,35.0,1,0,53.1,S,C,1,1877
4,0,3,male,35.0,0,0,8.05,S,,0,1877
5,0,3,male,29.699118,0,0,8.4583,Q,,0,1882
6,0,1,male,54.0,0,0,51.8625,S,E,0,1858
7,0,3,male,2.0,3,1,21.075,S,,4,1910
8,1,3,female,27.0,0,2,11.1333,S,,2,1885
9,1,2,female,14.0,1,0,30.0708,C,,1,1898


In [27]:
# Scalar divisor used below to derive EUR_fare from fare.
# NOTE(review): the fare's source currency is not stated in this notebook — confirm the rate.
fx_rate = 1.1

In [28]:
# Divide every fare by the scalar rate and store the result as a new column.
titanic['EUR_fare'] = titanic['fare'] / fx_rate

In [29]:
# Preview the first rows to confirm the new EUR_fare column.
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck,num_relat,YoB,EUR_fare
0,0,3,male,22.0,1,0,7.25,S,,1,1890,6.590909
1,1,1,female,38.0,1,0,71.2833,C,C,1,1874,64.803
2,1,3,female,26.0,0,0,7.925,S,,0,1886,7.204545
3,1,1,female,35.0,1,0,53.1,S,C,1,1877,48.272727
4,0,3,male,35.0,0,0,8.05,S,,0,1877,7.318182


In [30]:
# Remove the source and demo columns that are no longer needed.
# Rebinding the name (instead of inplace=True) keeps the step explicit, and
# errors='ignore' lets the cell be re-run after the columns are already gone
# instead of raising KeyError.
titanic = titanic.drop(
    columns=['sibsp', 'parch', 'deck', 'YoB', 'EUR_fare'],
    errors='ignore',
)

In [31]:
# Preview the first rows to confirm the columns were dropped.
titanic.head()

Unnamed: 0,survived,pclass,sex,age,fare,embarked,num_relat
0,0,3,male,22.0,7.25,S,1
1,1,1,female,38.0,71.2833,C,1
2,1,3,female,26.0,7.925,S,0
3,1,1,female,35.0,53.1,S,1
4,0,3,male,35.0,8.05,S,0


In [32]:
# Round the bonus to two decimals; this returns a new Series and does not
# modify the stored column.
sales['Bonus'].round(decimals=2)

0    13.08
1    34.05
2    14.10
3    46.60
Name: Bonus, dtype: float64

In [33]:
# Show the frame again: round() above returned a new Series, so the stored
# Bonus column is unchanged.
sales

Unnamed: 0.1,Unnamed: 0,Mon,Tue,Wed,Thu,Fri,perc_bonus,Bonus
0,Steven,34,27,15,,33,0.12,13.08
1,Mike,45,9,74,87.0,12,0.15,34.05
2,Andi,17,33,54,8.0,29,0.1,14.1
3,Paul,87,67,27,45.0,7,0.2,46.6


In [34]:
# Scalar subtracted from every daily sales figure in a later cell.
fixed_cost = 5

In [39]:
# Give the unnamed first column (the person names) a proper label.
# Rebinding instead of inplace=True avoids mutating the frame in place.
sales = sales.rename(columns={'Unnamed: 0': 'Name'})

In [43]:
# Use the Name column as the row index so that the remaining columns are all
# numeric. Rebinding instead of inplace=True avoids mutating the frame in place.
sales = sales.set_index('Name')

In [44]:
# Subtract the fixed cost from every daily figure; a missing figure is
# treated as 0 before subtracting.
# The weekday columns are selected by name rather than with iloc[:, :-2], so
# the result does not depend on how many helper columns trail the frame.
sales[['Mon', 'Tue', 'Wed', 'Thu', 'Fri']].sub(fixed_cost, fill_value=0)

Unnamed: 0_level_0,Mon,Tue,Wed,Thu,Fri
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Steven,29,22,10,-5.0,28
Mike,40,4,69,82.0,7
Andi,12,28,49,3.0,24
Paul,82,62,22,40.0,2


In [46]:
# Single flat rate multiplied into every daily figure in the next cell
# (not to be confused with the per-row `perc_bonus` column of `sales`).
per_bonus = 0.1

In [47]:
# Multiply every daily figure by the flat rate; a missing figure is treated
# as 0, so its product is 0.
# The weekday columns are selected by name rather than with iloc[:, :-2], so
# the result does not depend on how many helper columns trail the frame.
sales[['Mon', 'Tue', 'Wed', 'Thu', 'Fri']].mul(per_bonus, fill_value=0)

Unnamed: 0_level_0,Mon,Tue,Wed,Thu,Fri
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Steven,3.4,2.7,1.5,0.0,3.3
Mike,4.5,0.9,7.4,8.7,1.2
Andi,1.7,3.3,5.4,0.8,2.9
Paul,8.7,6.7,2.7,4.5,0.7


In [53]:
# Show the frame: the scalar operations above returned new objects and did
# not modify `sales`.
sales

Unnamed: 0_level_0,Mon,Tue,Wed,Thu,Fri,perc_bonus,Bonus
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Steven,34,27,15,,33,0.12,13.08
Mike,45,9,74,87.0,12,0.15,34.05
Andi,17,33,54,8.0,29,0.1,14.1
Paul,87,67,27,45.0,7,0.2,46.6


In [49]:
# Divisor for the floor division below: daily figures are counted in whole lots of this size.
lot_size = 10
# Amount multiplied into each whole-lot count below.
bonus_per_lot = 1.25

In [59]:
# Per-person lot bonus: count whole lots per day (a missing figure counts as
# 0), multiply by the per-lot bonus, then total across the week.
# The weekday columns are selected by name rather than with iloc[:, :-2], so
# the result does not depend on how many helper columns trail the frame.
(
    sales[['Mon', 'Tue', 'Wed', 'Thu', 'Fri']]
    .floordiv(lot_size, fill_value=0)
    .mul(bonus_per_lot)
    .sum(axis=1)
)

Name
Steven    11.25
Mike      25.00
Andi      13.75
Paul      25.00
dtype: float64

## **Using map()**