## Manipulation Methods

### `where`, `select`, `apply`, and if/then/else logic

The `.apply` method calls a Python function once for every value in the Series, which is slow, so avoid it when a vectorized alternative exists.

In [14]:
import pandas as pd

url = 'https://github.com/arunadas/effective-pandas/raw/main/data/vehicles.csv.zip'

# dtype='unicode' requests string data for every column, so numeric columns
# must be cast explicitly before doing arithmetic or comparisons on them.
df = pd.read_csv(url, dtype='unicode')

# The two Series used throughout this section.
city_mpg = df['city08'].astype(int)
make = df['make']

In [6]:
def gt20(val):
    """Return whether ``val`` is strictly greater than 20."""
    is_above = val > 20
    return is_above

In [13]:
%%timeit
# Baseline: .apply invokes the Python function gt20 once per element.
city_mpg.apply(gt20)

2.51 ms ± 17.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [12]:
# Broadcast method: .gt(20) compares the whole Series to 20 in a single call.
%timeit city_mpg.gt(20)

18 µs ± 109 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [15]:
# Total number of values in the make Series.
make.size

41144

In [17]:
# Peek at three randomly chosen rows; no random_state is passed, so the rows
# shown will differ from run to run.
make.sample(n=3)

5483       Toyota
13671         GMC
11298    Chrysler
Name: make, dtype: object

In [18]:
# Frequency of each make, most common first.
make.value_counts()

make
Chevrolet                      4003
Ford                           3371
Dodge                          2583
GMC                            2494
Toyota                         2071
                               ... 
Volga Associated Automobile       1
Panos                             1
Mahindra                          1
Excalibur Autos                   1
London Coach Co Inc               1
Name: count, Length: 136, dtype: int64

In [20]:
# The five most frequent makes (value_counts lists the most common first).
top5 = make.value_counts().index[:5]


def generalize_top5(val):
    """Return ``val`` if it is one of the top-five makes, otherwise ``'Other'``.

    The fallback label is capitalised as ``'Other'`` so that applying this
    function yields the same values as the vectorized
    ``make.where(make.isin(top5), other='Other')`` it is benchmarked against.
    """
    if val in top5:
        return val
    return 'Other'

In [24]:
# .apply calls generalize_top5 once for every value in the Series.
%timeit make.apply(generalize_top5)

12.2 ms ± 133 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [25]:
# Vectorized alternative: keep values whose make is in top5 and replace the
# rest with 'Other'.
%timeit make.where(make.isin(top5), other='Other')

1.17 ms ± 10.6 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [26]:
# Same expression without the timing magic, to inspect the result.
make.where(make.isin(top5), other='Other')

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: object

In [27]:
# mask is the complement of where: values are replaced wherever the condition
# is True, and the original value is kept wherever it is False.
make.mask(make.isin(top5), other='Other')

0        Alfa Romeo
1           Ferrari
2             Other
3             Other
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: object

In [29]:
# ~ inverts a boolean array (True becomes False and vice versa), so masking
# with the negated condition gives the same result as the earlier where call.
make.mask(~make.isin(top5), other='Other')

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: object

In [31]:
# Count each make once, then slice both tiers of most frequent makes from it.
vc = make.value_counts()
top5, top10 = vc.index[:5], vc.index[:10]


def generalize(val):
    """Bucket a make: itself if in ``top5``, 'Top10' if in ``top10``, else 'Other'."""
    if val in top5:
        return val
    if val in top10:
        return 'Top10'
    return 'Other'

In [32]:
# Element-wise version: generalize runs once per value.
make.apply(generalize)

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: object

In [34]:
# Pandas-only version using chained where calls: the first keeps top-5 makes
# and labels everything else 'Top10'; the second keeps rows whose original
# make is in the top 10 and relabels the remainder 'Other'.
(make
.where(make.isin(top5), 'Top10')
.where(make.isin(top10), 'Other'))

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: object

In [35]:
# Another option is numpy's select: top-5 makes keep their own name, other
# top-10 makes become 'Top10', and everything else falls back to 'Other'.
import numpy as np
np.select([make.isin(top5), make.isin(top10)],
          [make,'Top10'], 'Other')


array(['Other', 'Other', 'Dodge', ..., 'Other', 'Other', 'Other'],
      dtype=object)

np.select(condlist, choicelist, default):

condlist: A list of boolean conditions.
choicelist: A list of values corresponding to each condition in condlist.
default: A fallback value used when none of the conditions are True

The output of `np.select` is a NumPy array; wrap it in `pd.Series` if you want a Series.


In [36]:
# np.select returns a bare ndarray, so pass make's index explicitly: the
# resulting Series then stays aligned with make instead of receiving a fresh
# default index.
pd.Series(np.select([make.isin(top5), make.isin(top10)],
                    [make, 'Top10'], 'Other'),
          index=make.index)

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Length: 41144, dtype: object

### Missing Data

In [37]:
# Cylinders column, used below to explore missing values.
cyl = df.cylinders

In [39]:
# Number of missing entries in cyl.
cyl.isna().sum()

np.int64(206)

In [40]:
# Boolean mask: True where the cylinders value is missing.
missing = cyl.isna()

In [41]:
# Makes of the vehicles that have no cylinders value.
make.loc[missing]

7138     Nissan
7139     Toyota
8143     Toyota
8144       Ford
8146       Ford
          ...  
34563     Tesla
34564     Tesla
34565     Tesla
34566     Tesla
34567     Tesla
Name: make, Length: 206, dtype: object

### Filling in missing Data 

In [43]:
# Select only the missing entries of cyl.
cyl[cyl.isna()]

7138     NaN
7139     NaN
8143     NaN
8144     NaN
8146     NaN
        ... 
34563    NaN
34564    NaN
34565    NaN
34566    NaN
34567    NaN
Name: cylinders, Length: 206, dtype: object

In [45]:
# Replace missing values with 0 and inspect a slice that contains two of them.
cyl.fillna(0).loc[7136:7141]
# The result is not assigned back, so cyl itself is left unchanged: these
# operations return new objects rather than mutating the data.

7136    6
7137    6
7138    0
7139    0
7140    6
7141    6
Name: cylinders, dtype: object

### Interpolating Data

In [46]:
# Small example Series with a single missing value to interpolate.
temp = pd.Series([32, 40, None, 42, 39, 32])

In [47]:
# Fill the gap from its neighbours: the missing value becomes 41.0, midway
# between the surrounding 40 and 42.
temp.interpolate()

0    32.0
1    40.0
2    41.0
3    42.0
4    39.0
5    32.0
dtype: float64

### Clipping Data