In [1]:
import pandas as pd 
# Download the zipped vehicles CSV, asking pandas for dtype='unicode' on every column.
url = 'https://github.com/arunadas/effective-pandas/raw/main/data/vehicles.csv.zip'
df = pd.read_csv(url, dtype='unicode')

# Columns used throughout the notebook; the two mpg columns are cast to int.
make = df['make']
city_mpg = df['city08'].astype(int)
highway_mpg = df['highway08'].astype(int)

### Renaming the index

In [2]:
# Preview the first rows; the index is still the default integer labels.
city_mpg.head()

0    19
1     9
2    23
3    10
4    17
Name: city08, dtype: int64

In [3]:
# Preview the make values, which share city_mpg's integer index.
make.head()

0    Alfa Romeo
1       Ferrari
2         Dodge
3         Dodge
4        Subaru
Name: make, dtype: object

In [14]:
# Build the {integer label: make} mapping once and reuse it for both the
# preview and the rename (the original converted the Series to a dict twice).
a = make.to_dict()
for k, v in list(a.items())[:5]:
    print(k, v)
# A dict passed to .rename replaces each index label with its mapped value,
# so city2 holds the city mpg values labelled by make.
city2 = city_mpg.rename(a)
list(a.items())[:5]

0 Alfa Romeo
1 Ferrari
2 Dodge
3 Dodge
4 Subaru


[(0, 'Alfa Romeo'), (1, 'Ferrari'), (2, 'Dodge'), (3, 'Dodge'), (4, 'Subaru')]

In [15]:
# Series.rename accepts a dict, a function, or a Series to relabel the index;
# a scalar instead sets the Series' .name. The cells below try each form.
# (pandas is already imported in the first cell, so it is not re-imported here.)
test = pd.Series(['Dave', 'Suzy', 'Adam', 'Liv'])
test

0    Dave
1    Suzy
2    Adam
3     Liv
dtype: object

In [16]:
# A dict relabels only the keys it contains; the other labels are left as they are.
test.rename(index={0:'first'})

first    Dave
1        Suzy
2        Adam
3         Liv
dtype: object

In [18]:
def to_str(val):
    """Return ``val`` rendered as a string with the prefix ``idx-``."""
    label = f"idx-{val}"
    return label
    
# A function passed to .rename is applied to every index label.
test.rename(to_str)

idx-0    Dave
idx-1    Suzy
idx-2    Adam
idx-3     Liv
dtype: object

In [20]:
# A Series can serve as the mapping: here s2 maps the labels 0-3 to 'a'-'d'.
s2 = pd.Series(['a','b','c','d'])
test.rename(index=s2)

a    Dave
b    Suzy
c    Adam
d     Liv
dtype: object

In [21]:
# Passing a scalar changes only the Series' .name attribute; the index stays the same.
test.rename(index='first')

0    Dave
1    Suzy
2    Adam
3     Liv
Name: first, dtype: object

### Resetting the index

In [23]:
# reset_index() installs a fresh, monotonically increasing integer index.
# By default the current index becomes a new column, so the result is a DataFrame.
city2.reset_index()

pandas.core.frame.DataFrame

In [24]:
# drop=True discards the old index instead of keeping it as a column, so the result stays a Series.
city2.reset_index(drop=True)

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int64

In [25]:
# Name the index first so the column produced by reset_index() is called 'first'.
test.rename_axis("first").reset_index()

Unnamed: 0,first,0
0,0,Dave
1,1,Suzy
2,2,Adam
3,3,Liv


### loc attribute

In [26]:
# A label that occurs many times in the index returns every matching row as a Series.
city2.loc['Subaru']

Subaru    17
Subaru    21
Subaru    22
Subaru    19
Subaru    20
          ..
Subaru    19
Subaru    20
Subaru    18
Subaru    18
Subaru    16
Name: city08, Length: 885, dtype: int64

In [30]:
# test still has its default integer index, so .loc[0] looks up the label 0 and returns a scalar.
test.loc[0]

'Dave'

In [32]:
# Small Series whose index deliberately contains the label 'George' twice.
s = pd.Series(
    data=[145, 142, 38, 13, 2],
    index=['paul', 'john', 'George', 'Ringo', 'George'],
)

In [33]:
# A label that appears once returns a scalar.
s.loc['paul']

np.int64(145)

In [34]:
# A label that appears more than once returns a Series with all matches.
s.loc['George']

George    38
George     2
dtype: int64

In [35]:
# To get a Series back even for a single unique label, pass a list of labels.
s.loc[['paul']]

paul    145
dtype: int64

In [36]:
# A label slice needs unambiguous bounds: 'George' appears twice in the unsorted
# index, so pandas raises KeyError. The error is the point of this cell, so it is
# caught and shown rather than left to abort a "Run All".
try:
    s.loc['George':'paul']
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: "Cannot get left slice bound for non-unique label: 'George'"

In [37]:
# Label slices are closed: the last label ('paul') is included.
# Sorting the index first lets the slice work despite the duplicate 'George'.
s.sort_index().loc["George":"paul"]

George     38
George      2
Ringo      13
john      142
paul      145
dtype: int64

In [38]:
# On a sorted index the bounds need not be existing labels; 'paul' sorts after 'p', so it is left out.
s.sort_index().loc["G":"p"]

George     38
George      2
Ringo      13
john      142
dtype: int64

In [40]:
# An Index object can be used as the selector for .loc.
idx = pd.Index(['Dodge'])
city2.loc[idx]

Dodge    23
Dodge    10
Dodge    12
Dodge    11
Dodge    11
         ..
Dodge    18
Dodge    17
Dodge    14
Dodge    14
Dodge    11
Name: city08, Length: 2583, dtype: int64

In [41]:
# Repeating a label in the selector repeats its rows: the result is twice as long (2 x 2583 = 5166).
idx = pd.Index(['Dodge','Dodge'])
city2.loc[idx]

Dodge    23
Dodge    10
Dodge    12
Dodge    11
Dodge    11
         ..
Dodge    18
Dodge    17
Dodge    14
Dodge    14
Dodge    11
Name: city08, Length: 5166, dtype: int64

In [42]:
# A boolean Series built from city2 works as a mask: only rows where it is True are kept.
mask = city2 > 50
city2.loc[mask]

Nissan     81
Toyota     81
Toyota     81
Ford       74
Nissan     84
         ... 
Tesla     140
Tesla     115
Tesla     104
Tesla      98
Toyota     55
Name: city08, Length: 236, dtype: int64

In [43]:
# Prices used to show why .loc accepts a function: in a chain, the function
# receives the intermediate Series.
cost = pd.Series({
    'Gum': 1.00,
    'Cookie': 2.25,
    'Melon': 3.99,
    'Roll': .99,
    'Carrots': 2.79,
})
cost

Gum        1.00
Cookie     2.25
Melon      3.99
Roll       0.99
Carrots    2.79
dtype: float64

In [44]:
inflation = 1.10
# The lambda is called with the already-inflated Series, so the > 3 filter is
# evaluated on the new prices (Carrots only passes after inflation).
(cost
   .mul(inflation)
   .loc[lambda s_:s_>3]
)

Melon      4.389
Carrots    3.069
dtype: float64

In [45]:
# Counter-example: a boolean mask computed from the original prices, before .mul(),
# gives the wrong result -- Carrots exceeds 3 only after inflation and is missed.
mask = cost > 3
(cost
    .mul(inflation)
    .loc[mask])

Melon    4.389
dtype: float64

In [46]:
def gt3(ser):
    """Return the result of comparing ``ser`` against 3 with ``>``."""
    exceeds_three = ser > 3
    return exceeds_three

# A named function works like the lambda: .loc calls it with the inflated Series.
(cost
   .mul(inflation)
   .loc[gt3]
)
   

Melon      4.389
Carrots    3.069
dtype: float64

### iloc attribute
- A scalar position
- A list of positions
- A slice of positions (half-open: the stop position is excluded)
- A NumPy array (of positions or booleans)
- A function that takes the Series and returns any of the above

In [48]:
# .iloc does not accept a boolean Series as a mask, so this raises ValueError.
# The error is the point of this cell, so it is caught and shown rather than
# left to abort a "Run All". The NumPy-array version follows in a later cell.
mask = city2 > 50
try:
    city2.iloc[mask]
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: iLocation based boolean indexing cannot use an indexable as a mask

In [51]:
# The mask is a pandas Series, which is what .iloc rejected above.
type(mask)

pandas.core.series.Series

In [52]:
# .to_numpy() returns the same booleans as a plain NumPy array.
type(mask.to_numpy())

numpy.ndarray

In [54]:
# With the mask converted to a NumPy boolean array, .iloc accepts it and keeps the True rows.
city2.iloc[mask.to_numpy()]

Nissan     81
Toyota     81
Toyota     81
Ford       74
Nissan     84
         ... 
Tesla     140
Tesla     115
Tesla     104
Tesla      98
Toyota     55
Name: city08, Length: 236, dtype: int64

### Heads and Tails

In [55]:
# First 5 rows.
city2.head(5)

Alfa Romeo    19
Ferrari        9
Dodge         23
Dodge         10
Subaru        17
Name: city08, dtype: int64

In [56]:
# Last 5 rows.
city2.tail(5)

Subaru    19
Subaru    20
Subaru    18
Subaru    18
Subaru    16
Name: city08, dtype: int64

### Sampling

In [57]:
# Draw 6 random rows; random_state pins the seed.
city2.sample(6, random_state=42)

Volvo         16
Mitsubishi    19
Buick         27
Jeep          15
Land Rover    13
Saab          17
Name: city08, dtype: int64

In [59]:
city2.sample(6, random_state=42)  # reproducible: the same random_state returns the same rows

Volvo         16
Mitsubishi    19
Buick         27
Jeep          15
Land Rover    13
Saab          17
Name: city08, dtype: int64

In [60]:
# No random_state: the rows drawn are not fixed between calls.
city2.sample(6)

BMW          17
Nissan       15
Chevrolet    13
Dodge        15
Dodge        12
Honda        17
Name: city08, dtype: int64

In [61]:
city2.sample(6)  # not reproducible: a second unseeded call returns different rows

Mitsubishi    20
GMC           15
Audi          18
Mitsubishi    18
Porsche       15
GMC           15
Name: city08, dtype: int64