# Learning Pandas from Start (Part IV, Data Manipulation)

In [1]:
import pandas as pd
import numpy as np

## Itemwise data change/transform

for ufunc, can directly apply to DataFrame object.

In [2]:
df = pd.DataFrame(np.arange(1,21).reshape(4,5), index=['red','blue','green','purple'],columns=['apple','banana','mango','orange','grape'])
df

Unnamed: 0,apple,banana,mango,orange,grape
red,1,2,3,4,5
blue,6,7,8,9,10
green,11,12,13,14,15
purple,16,17,18,19,20


In [3]:
np.sqrt(df)

Unnamed: 0,apple,banana,mango,orange,grape
red,1.0,1.414214,1.732051,2.0,2.236068
blue,2.44949,2.645751,2.828427,3.0,3.162278
green,3.316625,3.464102,3.605551,3.741657,3.872983
purple,4.0,4.123106,4.242641,4.358899,4.472136


In [4]:
dfx=df.replace(to_replace=[2,4,6], value=[20,40,60])
dfx

Unnamed: 0,apple,banana,mango,orange,grape
red,1,20,3,40,5
blue,60,7,8,9,10
green,11,12,13,14,15
purple,16,17,18,19,20


In [5]:
map_value = {2:20,4:40,6:60}
del dfx
dfx=df.replace(map_value)
dfx

Unnamed: 0,apple,banana,mango,orange,grape
red,1,20,3,40,5
blue,60,7,8,9,10
green,11,12,13,14,15
purple,16,17,18,19,20


In [6]:
df.loc['red', 'apple':'mango']=np.nan
df

Unnamed: 0,apple,banana,mango,orange,grape
red,,,,4,5
blue,6.0,7.0,8.0,9,10
green,11.0,12.0,13.0,14,15
purple,16.0,17.0,18.0,19,20


In [7]:
del dfx
dfx = df.replace(np.nan,0)
dfx

Unnamed: 0,apple,banana,mango,orange,grape
red,0.0,0.0,0.0,4,5
blue,6.0,7.0,8.0,9,10
green,11.0,12.0,13.0,14,15
purple,16.0,17.0,18.0,19,20


In [8]:
def foo(x):
    print(x)
    print('*'*50)
    return x+1000

df = df.applymap(foo)   # applymap is itemwise changing the dataframe.
df

nan
**************************************************
6.0
**************************************************
11.0
**************************************************
16.0
**************************************************
nan
**************************************************
7.0
**************************************************
12.0
**************************************************
17.0
**************************************************
nan
**************************************************
8.0
**************************************************
13.0
**************************************************
18.0
**************************************************
4
**************************************************
9
**************************************************
14
**************************************************
19
**************************************************
5
**************************************************
10
**************************************************
15
**********

Unnamed: 0,apple,banana,mango,orange,grape
red,,,,1004,1005
blue,1006.0,1007.0,1008.0,1009,1010
green,1011.0,1012.0,1013.0,1014,1015
purple,1016.0,1017.0,1018.0,1019,1020


In [10]:
one_col = df['orange']
one_col

red       1004
blue      1009
green     1014
purple    1019
Name: orange, dtype: int64

In [11]:
one_col.map(foo)   # map is doing itemwise change data for series.

1004
**************************************************
1009
**************************************************
1014
**************************************************
1019
**************************************************


red       2004
blue      2009
green     2014
purple    2019
Name: orange, dtype: int64

## Series data manipulation

In [14]:
def my_dec(func):
    def wrapper(*args,**kwargs):
        print(args)
        print(kwargs)
        print('*'*50)
        result = func(*args,**kwargs)
        return pd.Series([result,result,result])
    return wrapper

np.log = my_dec(np.log)
one_col.apply(np.log)     # the apply() of series send itemwise data for series to func. if func return series, result will be a dataframe.

(1004,)
{}
**************************************************
(1004,)
{}
**************************************************
(1009,)
{}
**************************************************
(1009,)
{}
**************************************************
(1014,)
{}
**************************************************
(1014,)
{}
**************************************************
(1019,)
{}
**************************************************
(1019,)
{}
**************************************************


Unnamed: 0,0,1,2
red,6.911747,6.911747,6.911747
blue,6.916715,6.916715,6.916715
green,6.921658,6.921658,6.921658
purple,6.926577,6.926577,6.926577


In [13]:
one_col.transform(np.log) # series transform will give one item by item to the func. and func can only return same size data. transform keeps shape of series

(1004,)
{}
**************************************************
(1009,)
{}
**************************************************
(1014,)
{}
**************************************************
(1019,)
{}
**************************************************


red       6.911747
blue      6.916715
green     6.921658
purple    6.926577
Name: orange, dtype: float64

## Row or column data change/transform for dataframe

The transform(), or apply(), will apply certain defined data change on rows or colunns.

In [33]:
def f(x):
    print(x)
    print('*'*30)
    return x**2
df.transform(f)    # we can use lambda if we do not need to print the passed argument value x

red        NaN
blue       6.0
green     11.0
purple    16.0
Name: apple, dtype: float64
******************************
red        NaN
blue       7.0
green     12.0
purple    17.0
Name: banana, dtype: float64
******************************
red        NaN
blue       8.0
green     13.0
purple    18.0
Name: mango, dtype: float64
******************************
red        4
blue       9
green     14
purple    19
Name: orange, dtype: int32
******************************
red        5
blue      10
green     15
purple    20
Name: grape, dtype: int32
******************************


Unnamed: 0,apple,banana,mango,orange,grape
red,,,,16,25
blue,36.0,49.0,64.0,81,100
green,121.0,144.0,169.0,196,225
purple,256.0,289.0,324.0,361,400


Data are passed to function f() column by column by transform(). Also can specify `axis=...` parameter to process data according to rows.

In [34]:
df.transform(f, axis=1)  # give axis = 1, or 'columns', means all columns data together to process, that is one row by one row.

apple     NaN
banana    NaN
mango     NaN
orange    4.0
grape     5.0
Name: red, dtype: float64
******************************
apple      6.0
banana     7.0
mango      8.0
orange     9.0
grape     10.0
Name: blue, dtype: float64
******************************
apple     11.0
banana    12.0
mango     13.0
orange    14.0
grape     15.0
Name: green, dtype: float64
******************************
apple     16.0
banana    17.0
mango     18.0
orange    19.0
grape     20.0
Name: purple, dtype: float64
******************************


Unnamed: 0,apple,banana,mango,orange,grape
red,,,,16.0,25.0
blue,36.0,49.0,64.0,81.0,100.0
green,121.0,144.0,169.0,196.0,225.0
purple,256.0,289.0,324.0,361.0,400.0


The difference between transform() and apply():
-  transform() works with function, a string name of function, a list of functions, and a dict. However, apply() is only allowed with function.
-  transform() cannot produce aggregated results. apply() can return less value thus reduce the shape. **Transform always result in same shape**.
-  **apply() works with multiple Series** at a time. But, transform() is only allowed to work with a single Series at a time. (so most of the case after done groupby we need use apply, cannot use transform)

In [35]:
def f2(x):
    print(x)
    print('*'*30)
    return (x.max())
df_a = df.apply(f2)   # the apply() give to function a column and it returns one scalar value. thus 2 dimensional dataframe result in one dimensional series.
df_a

red        NaN
blue       6.0
green     11.0
purple    16.0
Name: apple, dtype: float64
******************************
red        NaN
blue       7.0
green     12.0
purple    17.0
Name: banana, dtype: float64
******************************
red        NaN
blue       8.0
green     13.0
purple    18.0
Name: mango, dtype: float64
******************************
red        4
blue       9
green     14
purple    19
Name: orange, dtype: int32
******************************
red        5
blue      10
green     15
purple    20
Name: grape, dtype: int32
******************************


apple     16.0
banana    17.0
mango     18.0
orange    19.0
grape     20.0
dtype: float64

In [52]:
# df_b = df.transform(f2)  # transform will give one row to the function and take one scalar value. but as finally all value compose together cannot make the same shape as original, error raised.

In [37]:
df_auc = pd.DataFrame({'Auction_ID':[123,123,123,123,124,124,124,125],
                   'Bid_Price':[9,7,6,2,3,2,1,1]})
dfx = df_auc.groupby('Auction_ID')['Bid_Price'].apply(max)   # apply will take multiple row data which generated by gourpby().
dfx

Auction_ID
123    9
124    3
125    1
Name: Bid_Price, dtype: int64

In [38]:
def f_max(x):
    print(x)
    print('*'*30)
    return x.max()
dfy = df_auc.groupby('Auction_ID')['Bid_Price'].transform(f_max)  # transform scalar value will get broadcasted to all rows. makes final result the same length.
dfy

0    9
1    7
2    6
3    2
Name: 123, dtype: int64
******************************
4    3
5    2
6    1
Name: 124, dtype: int64
******************************
7    1
Name: 125, dtype: int64
******************************


0    9
1    9
2    9
3    9
4    3
5    3
6    3
7    1
Name: Bid_Price, dtype: int64

In [39]:
df_auc['Auction_Rank'] = df_auc.groupby('Auction_ID')['Bid_Price'].rank(ascending=False)
df_auc

Unnamed: 0,Auction_ID,Bid_Price,Auction_Rank
0,123,9,1.0
1,123,7,2.0
2,123,6,3.0
3,123,2,4.0
4,124,3,1.0
5,124,2,2.0
6,124,1,3.0
7,125,1,1.0


In [40]:
def f3(x):
    print(x)
    print('*'*30)
    return pd.Series([x['Bid_Price'].mean(),x['Bid_Price'].max(), x['Auction_Rank'].mean()],index=['mean','max','rank']) 
df_a = df_auc.groupby('Auction_ID').apply(f3)  # apply send dataframe of each group to the function to make the change, and return series. so finally series composed according Auction_ID as a dataframe.
df_a

   Auction_ID  Bid_Price  Auction_Rank
0         123          9           1.0
1         123          7           2.0
2         123          6           3.0
3         123          2           4.0
******************************
   Auction_ID  Bid_Price  Auction_Rank
4         124          3           1.0
5         124          2           2.0
6         124          1           3.0
******************************
   Auction_ID  Bid_Price  Auction_Rank
7         125          1           1.0
******************************


Unnamed: 0_level_0,mean,max,rank
Auction_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
123,6.0,9.0,2.5
124,2.0,3.0,2.0
125,1.0,1.0,1.0


This is giving rank according to the bid_price, within each auction group.

In [41]:
# df_b = df.groupby('Auction_ID').transform(f3)  # transform only send only column to the function for process, it does not support sending dataframe for process.

## Categorize data

In [42]:
df1 = pd.DataFrame(np.random.randint(1,100,(20,)))
df1

Unnamed: 0,0
0,96
1,48
2,16
3,72
4,89
5,53
6,46
7,24
8,24
9,31


In [43]:
bins = [0,25,50,75,100]
cat = pd.cut(df1[0],bins)
cat

0     (75, 100]
1      (25, 50]
2       (0, 25]
3      (50, 75]
4     (75, 100]
5      (50, 75]
6      (25, 50]
7       (0, 25]
8       (0, 25]
9      (25, 50]
10      (0, 25]
11    (75, 100]
12      (0, 25]
13     (50, 75]
14     (25, 50]
15      (0, 25]
16      (0, 25]
17    (75, 100]
18     (25, 50]
19     (50, 75]
Name: 0, dtype: category
Categories (4, interval[int64]): [(0, 25] < (25, 50] < (50, 75] < (75, 100]]

In [44]:
pd.value_counts(cat)

(0, 25]      7
(25, 50]     5
(50, 75]     4
(75, 100]    4
Name: 0, dtype: int64

In [45]:
cat = pd.qcut(df1[0],4)
cat

0     (65.25, 96.0]
1     (43.0, 65.25]
2     (3.999, 24.0]
3     (65.25, 96.0]
4     (65.25, 96.0]
5     (43.0, 65.25]
6     (43.0, 65.25]
7     (3.999, 24.0]
8     (3.999, 24.0]
9      (24.0, 43.0]
10     (24.0, 43.0]
11    (65.25, 96.0]
12    (3.999, 24.0]
13    (43.0, 65.25]
14     (24.0, 43.0]
15    (3.999, 24.0]
16    (3.999, 24.0]
17    (65.25, 96.0]
18     (24.0, 43.0]
19    (43.0, 65.25]
Name: 0, dtype: category
Categories (4, interval[float64]): [(3.999, 24.0] < (24.0, 43.0] < (43.0, 65.25] < (65.25, 96.0]]

In [46]:
pd.value_counts(cat)

(3.999, 24.0]    6
(43.0, 65.25]    5
(65.25, 96.0]    5
(24.0, 43.0]     4
Name: 0, dtype: int64

## groupby()

In [47]:
df1.groupby(cat).aggregate(['sum','median', 'std'])

Unnamed: 0_level_0,0,0,0
Unnamed: 0_level_1,sum,median,std
0,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
"(3.999, 24.0]",80,11.5,9.287985
"(24.0, 43.0]",124,29.5,6.480741
"(43.0, 65.25]",272,53.0,7.829432
"(65.25, 96.0]",421,84.0,9.066422


In [48]:
g = df1.groupby(cat)
g.groups

{(3.999, 24.0]: [2, 7, 8, 12, 15, 16], (24.0, 43.0]: [9, 10, 14, 18], (43.0, 65.25]: [1, 5, 6, 13, 19], (65.25, 96.0]: [0, 3, 4, 11, 17]}

In [49]:
from functools import reduce
a = g.apply(lambda x: [reduce(lambda x1,x2: x1+x2**2, x[0], 0), reduce(lambda x1,x2: x1+x2, x[0])])
a

0
(3.999, 24.0]      [1498, 80]
(24.0, 43.0]      [3970, 124]
(43.0, 65.25]    [15042, 272]
(65.25, 96.0]    [35777, 421]
dtype: object

Using apply(), we can do any data manipulation we want to do which we can define a function to achieve. We can define named function not just lambda function.

In [50]:
def b(x):
    print(x[0])
    for item in x[0]:
        print(item)
    print('*'*30)
    return np.nan
b = g.apply(b)
b

2     16
7     24
8     24
12     7
15     4
16     5
Name: 0, dtype: int32
16
24
24
7
4
5
******************************
9     31
10    25
14    28
18    40
Name: 0, dtype: int32
31
25
28
40
******************************
1     48
5     53
6     46
13    62
19    63
Name: 0, dtype: int32
48
53
46
62
63
******************************
0     96
3     72
4     89
11    80
17    84
Name: 0, dtype: int32
96
72
89
80
84
******************************


0
(3.999, 24.0]   NaN
(24.0, 43.0]    NaN
(43.0, 65.25]   NaN
(65.25, 96.0]   NaN
dtype: float64