In [3]:
import numpy as np
import pandas as pd
import seaborn as sns

### P1 - Pandas in Python

In [28]:
tips = sns.load_dataset("tips")

In [8]:
# to_numpy

x = tips.to_numpy()
x

array([[16.99, 1.01, 'Female', ..., 'Sun', 'Dinner', 2],
       [10.34, 1.66, 'Male', ..., 'Sun', 'Dinner', 3],
       [21.01, 3.5, 'Male', ..., 'Sun', 'Dinner', 3],
       ...,
       [22.67, 2.0, 'Male', ..., 'Sat', 'Dinner', 2],
       [17.82, 1.75, 'Male', ..., 'Sat', 'Dinner', 2],
       [18.78, 3.0, 'Female', ..., 'Thur', 'Dinner', 2]], dtype=object)

In [11]:
# sort_index

tips.sort_index(axis=0, ascending=False)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
243,18.78,3.00,Female,No,Thur,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
240,27.18,2.00,Female,Yes,Sat,Dinner,2
239,29.03,5.92,Male,No,Sat,Dinner,3
...,...,...,...,...,...,...,...
4,24.59,3.61,Female,No,Sun,Dinner,4
3,23.68,3.31,Male,No,Sun,Dinner,2
2,21.01,3.50,Male,No,Sun,Dinner,3
1,10.34,1.66,Male,No,Sun,Dinner,3


In [18]:
# sort_values 

tips.sort_values(by='tip', ascending=False).head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
170,50.81,10.0,Male,Yes,Sat,Dinner,3
212,48.33,9.0,Male,No,Sat,Dinner,4
23,39.42,7.58,Male,No,Sat,Dinner,4
59,48.27,6.73,Male,No,Sat,Dinner,4
141,34.3,6.7,Male,No,Thur,Lunch,6


### P2 - Pandas in Python

In [24]:
# iat

tips.iat[0, 0]

16.99

### P3 - Sampling

In [32]:
# sample with n

tips.tip.sample(n=4)

165    3.48
4      3.61
194    4.00
142    5.00
Name: tip, dtype: float64

In [38]:
# sample with frac

tips.tip.sample(frac=0.02)

171    3.16
51     2.60
33     2.45
236    1.00
24     3.18
Name: tip, dtype: float64

In [39]:
# replace, if true a number can be sampled twice

tips.tip.sample(frac=0.02, replace=True)

161    2.50
81     3.40
226    2.00
70     1.97
42     3.06
Name: tip, dtype: float64

In [67]:
# weights: sample series

from sklearn.preprocessing import scale
weights = abs(scale(tips.tip))  # just to simulate some weights

tips.tip.sample(n=4, weights=weights)

211     5.16
170    10.00
222     1.92
179     3.55
Name: tip, dtype: float64

In [69]:
# weights: sample df

tips.tip.sample(n=4, weights=weights)

197    5.00
50     2.50
191    4.19
31     2.50
Name: tip, dtype: float64

### P4 - Accessing Data

In [72]:
# append

tips.tip.append(pd.Series([99])).reset_index(drop=True)

0       1.01
1       1.66
2       3.50
3       3.31
4       3.61
       ...  
240     2.00
241     2.00
242     1.75
243     3.00
244    99.00
Length: 245, dtype: float64

### P5 - isin, where, mask

In [144]:
# isin: can take dict, but looks like we have to provide all columns
# all vs any

d = {
    "sex": ["Female"],
    "smoker": ["No"],
    "day": ["Sun"],
    "time": ["Dinner"]
}

tips.isin(d).all(axis=1).head()

0    False
1    False
2    False
3    False
4    False
dtype: bool

In [149]:
# where, returns all rows. non-matches will be NaN

tips.where(tips.tip > 2).head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,,,,,,,
1,,,,,,,
2,21.01,3.5,Male,No,Sun,Dinner,3.0
3,23.68,3.31,Male,No,Sun,Dinner,2.0
4,24.59,3.61,Female,No,Sun,Dinner,4.0


In [148]:
# will only return matches

tips[tips.tip > 2].head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
7,26.88,3.12,Male,No,Sun,Dinner,4


In [155]:
# other, replaces nan

tips[['tip']].where(tips.tip > 2, other=999).head()

Unnamed: 0,tip
0,999.0
1,999.0
2,3.5
3,3.31
4,3.61


In [158]:
# mask, exact opposite of where

tips[['tip']].mask(tips.tip > 2, other=999).head()

Unnamed: 0,tip
0,1.01
1,1.66
2,999.0
3,999.0
4,999.0


In [165]:
# where match, replace with another column

tips[['tip']].where(tips[['tip']] > 3, tips['total_bill'], axis=0).head()

Unnamed: 0,tip
0,16.99
1,10.34
2,3.5
3,3.31
4,3.61


### P6 - duplicates

In [180]:
# duplicated, keep (first, last, False)

tips.duplicated(['sex', 'time', 'smoker', 'day', 'size'], keep='first')

0      False
1      False
2       True
3      False
4      False
       ...  
239     True
240     True
241     True
242     True
243    False
Length: 244, dtype: bool

In [192]:
# get by index

tips['sex'].get(1)

'Male'

### P7 - Index