In [1]:
import numpy as np
import pandas as pd

In [2]:
# Ten consecutive timestamps beginning on 1 January 2023 (the recorded output shows freq='D').
dates = pd.date_range(start='1/1/2023', periods=10)
dates

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
               '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08',
               '2023-01-09', '2023-01-10'],
              dtype='datetime64[ns]', freq='D')

In [3]:
# Create a dataframe of standard-normal samples with dates as the index.
# Seed NumPy's global generator first so that the random values (and every
# output derived from them) are identical on each Restart & Run All.
# NOTE: outputs recorded before the seed was added will differ until re-run.
np.random.seed(42)
df = pd.DataFrame(np.random.randn(10, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
2023-01-01,-0.275716,1.281337,-1.626059,-0.342007
2023-01-02,0.538574,0.062321,-1.142963,0.687714
2023-01-03,3.252529,-1.243151,0.576522,0.752074
2023-01-04,0.200502,0.066754,-0.194084,0.936919
2023-01-05,-0.05174,1.539505,1.063805,0.11325
2023-01-06,1.26683,0.57736,-0.820588,-0.850736
2023-01-07,-1.014024,-0.262142,0.959032,0.692985
2023-01-08,0.549139,-1.02033,-0.185078,0.131827
2023-01-09,0.795074,0.583225,-0.412308,-0.529318
2023-01-10,0.159926,-0.938423,-0.164951,0.943514


In [4]:
# The most basic indexing using []: on a DataFrame a column label selects that
# column as a Series; on the Series a label from its index selects a single value.
s = df['A']
s[dates[3]]

0.20050231053755718

In [5]:
# Passing a list of column labels to [] selects those columns, in the order given.
# If a column is not contained in the DataFrame, an exception will be raised.

df[['B', 'C']]

Unnamed: 0,B,C
2023-01-01,1.281337,-1.626059
2023-01-02,0.062321,-1.142963
2023-01-03,-1.243151,0.576522
2023-01-04,0.066754,-0.194084
2023-01-05,1.539505,1.063805
2023-01-06,0.57736,-0.820588
2023-01-07,-0.262142,0.959032
2023-01-08,-1.02033,-0.185078
2023-01-09,0.583225,-0.412308
2023-01-10,-0.938423,-0.164951


## Attribute access

In [6]:
sa = pd.Series([1, 3, 5], index=['a', 'b', 'c'])
# Work on a copy so that assignments to dfa leave df untouched.
dfa = df.copy()

# An index label on a Series (or a column on a DataFrame) can also be accessed directly as an attribute:
sa.a

1

In [7]:
# Accessing a column on a DataFrame directly as an attribute; the result is that column as a Series.
dfa.A

2023-01-01   -0.275716
2023-01-02    0.538574
2023-01-03    3.252529
2023-01-04    0.200502
2023-01-05   -0.051740
2023-01-06    1.266830
2023-01-07   -1.014024
2023-01-08    0.549139
2023-01-09    0.795074
2023-01-10    0.159926
Freq: D, Name: A, dtype: float64

In [8]:
# Overwrite column A of the copy with the integers 0 .. len(dfa) - 1.
dfa['A'] = list(range(len(dfa.index)))
dfa

Unnamed: 0,A,B,C,D
2023-01-01,0,1.281337,-1.626059,-0.342007
2023-01-02,1,0.062321,-1.142963,0.687714
2023-01-03,2,-1.243151,0.576522,0.752074
2023-01-04,3,0.066754,-0.194084,0.936919
2023-01-05,4,1.539505,1.063805,0.11325
2023-01-06,5,0.57736,-0.820588,-0.850736
2023-01-07,6,-0.262142,0.959032,0.692985
2023-01-08,7,-1.02033,-0.185078,0.131827
2023-01-09,8,0.583225,-0.412308,-0.529318
2023-01-10,9,-0.938423,-0.164951,0.943514


In [9]:
# Column A of the original df still holds the random values: only the copy dfa was modified.
df.A

2023-01-01   -0.275716
2023-01-02    0.538574
2023-01-03    3.252529
2023-01-04    0.200502
2023-01-05   -0.051740
2023-01-06    1.266830
2023-01-07   -1.014024
2023-01-08    0.549139
2023-01-09    0.795074
2023-01-10    0.159926
Freq: D, Name: A, dtype: float64

In [10]:
# The full original DataFrame, for comparison with the modified copy dfa.
df

Unnamed: 0,A,B,C,D
2023-01-01,-0.275716,1.281337,-1.626059,-0.342007
2023-01-02,0.538574,0.062321,-1.142963,0.687714
2023-01-03,3.252529,-1.243151,0.576522,0.752074
2023-01-04,0.200502,0.066754,-0.194084,0.936919
2023-01-05,-0.05174,1.539505,1.063805,0.11325
2023-01-06,1.26683,0.57736,-0.820588,-0.850736
2023-01-07,-1.014024,-0.262142,0.959032,0.692985
2023-01-08,0.549139,-1.02033,-0.185078,0.131827
2023-01-09,0.795074,0.583225,-0.412308,-0.529318
2023-01-10,0.159926,-0.938423,-0.164951,0.943514


## Slicing ranges

In [11]:
# For Series, slicing returns a slice of the values and the corresponding labels.
# Integer slice bounds are positional: this takes the first three entries.
s[:3]

2023-01-01   -0.275716
2023-01-02    0.538574
2023-01-03    3.252529
Freq: D, Name: A, dtype: float64

In [12]:
# A step of 2 keeps every second entry, starting with the first.
s[::2]

2023-01-01   -0.275716
2023-01-03    3.252529
2023-01-05   -0.051740
2023-01-07   -1.014024
2023-01-09    0.795074
Freq: 2D, Name: A, dtype: float64

In [13]:
# A negative step returns the entries in reverse order.
s[::-1]

2023-01-10    0.159926
2023-01-09    0.795074
2023-01-08    0.549139
2023-01-07   -1.014024
2023-01-06    1.266830
2023-01-05   -0.051740
2023-01-04    0.200502
2023-01-03    3.252529
2023-01-02    0.538574
2023-01-01   -0.275716
Freq: -1D, Name: A, dtype: float64

In [15]:
# Setting (or assignment) works with slicing as well.
# The assignment is done on a copy so that s is left unchanged.
s2 = s.copy()

s2[:3] = 0
s2

2023-01-01    0.000000
2023-01-02    0.000000
2023-01-03    0.000000
2023-01-04    0.200502
2023-01-05   -0.051740
2023-01-06    1.266830
2023-01-07   -1.014024
2023-01-08    0.549139
2023-01-09    0.795074
2023-01-10    0.159926
Freq: D, Name: A, dtype: float64

### Slicing DataFrames

For DataFrames, slicing inside of [] slices the rows.

In [16]:
# The first four rows (integer slice bounds are positional).
df[:4]

Unnamed: 0,A,B,C,D
2023-01-01,-0.275716,1.281337,-1.626059,-0.342007
2023-01-02,0.538574,0.062321,-1.142963,0.687714
2023-01-03,3.252529,-1.243151,0.576522,0.752074
2023-01-04,0.200502,0.066754,-0.194084,0.936919


In [17]:
# All rows in reverse order.
df[::-1]

Unnamed: 0,A,B,C,D
2023-01-10,0.159926,-0.938423,-0.164951,0.943514
2023-01-09,0.795074,0.583225,-0.412308,-0.529318
2023-01-08,0.549139,-1.02033,-0.185078,0.131827
2023-01-07,-1.014024,-0.262142,0.959032,0.692985
2023-01-06,1.26683,0.57736,-0.820588,-0.850736
2023-01-05,-0.05174,1.539505,1.063805,0.11325
2023-01-04,0.200502,0.066754,-0.194084,0.936919
2023-01-03,3.252529,-1.243151,0.576522,0.752074
2023-01-02,0.538574,0.062321,-1.142963,0.687714
2023-01-01,-0.275716,1.281337,-1.626059,-0.342007


## Selection by label

In [18]:
# Six rows of standard-normal samples in five columns (A-E), indexed by
# consecutive dates starting 3 January 2015.
df1 = pd.DataFrame(
    data=np.random.randn(6, 5),
    index=pd.date_range(start='1/3/2015', periods=6),
    columns=list('ABCDE'),
)
df1

Unnamed: 0,A,B,C,D,E
2015-01-03,-0.410053,-1.315199,1.672055,0.68055,0.347426
2015-01-04,0.422156,-0.739529,-0.530206,-0.541658,0.385715
2015-01-05,0.874088,-0.828446,0.118881,0.232361,-1.107106
2015-01-06,-0.518065,0.82395,1.213931,-1.580868,1.657634
2015-01-07,1.002888,-0.253666,0.677227,0.069732,1.35073
2015-01-08,0.585646,1.080428,-1.797891,-0.468049,1.770426


### Warning:
`.loc` is strict when you present slicers that are not compatible (or convertible) with the index type,
for example when using integers with a DatetimeIndex.
Such slicers raise a TypeError.

In [27]:
# Slicing with integers through .loc on a DatetimeIndex raises TypeError.
# Catch it and print pandas' own message rather than a hard-coded copy,
# which could drift from the text produced by the installed pandas version.
try:
    df1.loc[2:3]
except TypeError as err:
    print('The error reads: ', err)

The error reads:  cannot do slice indexing on DatetimeIndex with these indexers [2] of type int


In [30]:
# String likes in slicing can be convertible to the type of the index and lead to natural slicing.
# With .loc both endpoints of the slice are included in the result.

df1.loc['20150106': '20150107']

Unnamed: 0,A,B,C,D,E
2015-01-06,-0.518065,0.82395,1.213931,-1.580868,1.657634
2015-01-07,1.002888,-0.253666,0.677227,0.069732,1.35073


In [38]:
# Select a single value by row label and column label in one .loc call.
df1.loc['20150106', 'A']

-0.5180650967417435

In [39]:
# Same value: select the row first, then use attribute access for column A.
df1.loc['20150106'].A

-0.5180650967417435

In [40]:
# Same value again, using a second .loc on the selected row instead of attribute access.
df1.loc['20150106'].loc['A']

-0.5180650967417435

In [41]:
# s is column A of df (assigned earlier); shown here as the input to the where() examples.
s

2023-01-01   -0.275716
2023-01-02    0.538574
2023-01-03    3.252529
2023-01-04    0.200502
2023-01-05   -0.051740
2023-01-06    1.266830
2023-01-07   -1.014024
2023-01-08    0.549139
2023-01-09    0.795074
2023-01-10    0.159926
Freq: D, Name: A, dtype: float64

In [42]:
# Independent copy of s used for the where() examples.
sw = s.copy()

In [43]:
# where() keeps every label: entries where the condition is False become NaN.
sw.where(s>0)

2023-01-01         NaN
2023-01-02    0.538574
2023-01-03    3.252529
2023-01-04    0.200502
2023-01-05         NaN
2023-01-06    1.266830
2023-01-07         NaN
2023-01-08    0.549139
2023-01-09    0.795074
2023-01-10    0.159926
Freq: D, Name: A, dtype: float64

In [44]:
# The second argument supplies the replacement where the condition is False
# (here -s, so the negative entries are flipped to positive).
sw.where(s>0, -s)

2023-01-01    0.275716
2023-01-02    0.538574
2023-01-03    3.252529
2023-01-04    0.200502
2023-01-05    0.051740
2023-01-06    1.266830
2023-01-07    1.014024
2023-01-08    0.549139
2023-01-09    0.795074
2023-01-10    0.159926
Freq: D, Name: A, dtype: float64

In [45]:
# Boolean indexing, by contrast, drops the entries where the condition is False.
s[s>0]

2023-01-02    0.538574
2023-01-03    3.252529
2023-01-04    0.200502
2023-01-06    1.266830
2023-01-08    0.549139
2023-01-09    0.795074
2023-01-10    0.159926
Name: A, dtype: float64

In [46]:
# s itself is unchanged by the where() calls and the boolean selection.
s

2023-01-01   -0.275716
2023-01-02    0.538574
2023-01-03    3.252529
2023-01-04    0.200502
2023-01-05   -0.051740
2023-01-06    1.266830
2023-01-07   -1.014024
2023-01-08    0.549139
2023-01-09    0.795074
2023-01-10    0.159926
Freq: D, Name: A, dtype: float64

In [48]:
# iat gives scalar access by integer position (here the third entry).
s.iat[2]

3.2525289532440373

In [51]:
# Adding a scalar to an array applies it to every element: the 2x2 samples are shifted by 7.
np.random.randn(2,2) + 7

array([[7.26082671, 6.55215829],
       [6.96165418, 4.9640456 ]])

In [70]:
# Build n rows of randomly chosen sales attributes plus a numeric price column.
n = 20
variables = {
    'product': np.random.choice(['laptop', 'hair_gel', 'bread', 'fuel', 'printing', 'camera'], size=n),
    'company': np.random.choice(['bayley', 'paul', 'brilliant', 'tatenda', 'skyleh'], size=n),
    'industry': np.random.choice(['electronics', 'beauty', 'retail', 'service'], size=n),
    'salesperson': np.random.choice(['Anna', 'Charles', 'Edmore', 'Idah'], size=n),
    'city': np.random.choice(['Harare', 'Bulawayo', 'Gweru'], size=n),
    # Sized by n (not a hard-coded 20) so every column keeps the same length when n changes.
    'price': np.random.randn(n) + 3,
}

In [71]:
# The `dates` index defined at the top of the notebook has 10 entries, while
# `variables` holds 20 rows, so using `dates` here fails with a length mismatch
# on a fresh kernel. Build an index of matching length inside this cell instead.
# NOTE(review): the recorded output shows repeated, unordered dates in May 2022,
# so random days in that month are assumed here - confirm the intended range.
n_rows = len(variables['price'])
sale_dates = pd.Timestamp('2022-05-01') + pd.to_timedelta(np.random.randint(0, 31, size=n_rows), unit='D')
dff = pd.DataFrame(
    variables,
    index=sale_dates,
    columns=['product', 'company', 'industry', 'salesperson', 'city', 'price'],
)
dff

Unnamed: 0,product,company,industry,salesperson,city,price
2022-05-25,bread,paul,service,Idah,Gweru,3.362275
2022-05-09,printing,bayley,beauty,Charles,Gweru,2.476113
2022-05-06,laptop,brilliant,electronics,Anna,Bulawayo,3.648064
2022-05-23,fuel,bayley,electronics,Charles,Gweru,3.40627
2022-05-23,bread,bayley,electronics,Anna,Gweru,4.132297
2022-05-16,camera,brilliant,electronics,Idah,Harare,2.630581
2022-05-16,hair_gel,tatenda,electronics,Charles,Harare,2.122755
2022-05-08,bread,tatenda,beauty,Edmore,Bulawayo,3.172209
2022-05-23,camera,tatenda,retail,Charles,Bulawayo,3.286928
2022-05-16,bread,brilliant,beauty,Anna,Gweru,4.142248


In [73]:
# Display the rows ordered by their index (the dates), earliest first; dff is not reassigned.
dff.sort_index()

Unnamed: 0,product,company,industry,salesperson,city,price
2022-05-06,laptop,brilliant,electronics,Anna,Bulawayo,3.648064
2022-05-08,bread,tatenda,beauty,Edmore,Bulawayo,3.172209
2022-05-09,printing,bayley,beauty,Charles,Gweru,2.476113
2022-05-09,laptop,brilliant,retail,Anna,Bulawayo,2.039125
2022-05-10,printing,brilliant,beauty,Edmore,Bulawayo,2.008434
2022-05-12,printing,skyleh,service,Charles,Gweru,4.090701
2022-05-15,hair_gel,bayley,service,Idah,Harare,2.741518
2022-05-16,laptop,bayley,retail,Anna,Gweru,2.858181
2022-05-16,bread,brilliant,beauty,Anna,Gweru,4.142248
2022-05-16,hair_gel,tatenda,electronics,Charles,Harare,2.122755


In [75]:
# Total price for every (product, salesperson) combination.
dff.groupby(['product', 'salesperson'])['price'].sum()

product   salesperson
bread     Anna           8.274545
          Edmore         3.172209
          Idah           3.362275
camera    Charles        3.286928
          Idah           2.630581
fuel      Anna           6.342887
          Charles        3.406270
hair_gel  Charles        4.198773
          Idah           2.741518
laptop    Anna           8.545370
printing  Anna           3.386387
          Charles        6.566814
          Edmore         2.008434
          Idah           4.088928
Name: price, dtype: float64

In [76]:
# Show df again as the input to the query() example.
df

Unnamed: 0,A,B,C,D
2023-01-01,-0.275716,1.281337,-1.626059,-0.342007
2023-01-02,0.538574,0.062321,-1.142963,0.687714
2023-01-03,3.252529,-1.243151,0.576522,0.752074
2023-01-04,0.200502,0.066754,-0.194084,0.936919
2023-01-05,-0.05174,1.539505,1.063805,0.11325
2023-01-06,1.26683,0.57736,-0.820588,-0.850736
2023-01-07,-1.014024,-0.262142,0.959032,0.692985
2023-01-08,0.549139,-1.02033,-0.185078,0.131827
2023-01-09,0.795074,0.583225,-0.412308,-0.529318
2023-01-10,0.159926,-0.938423,-0.164951,0.943514


In [82]:
# IPython help lookup for DataFrame.query (interactive aid; the trailing ? is IPython syntax, not plain Python).
df.query?

In [84]:
# Select the rows where column A is greater than column B, written as an expression string.
df.query('A > B')

Unnamed: 0,A,B,C,D
2023-01-02,0.538574,0.062321,-1.142963,0.687714
2023-01-03,3.252529,-1.243151,0.576522,0.752074
2023-01-04,0.200502,0.066754,-0.194084,0.936919
2023-01-06,1.26683,0.57736,-0.820588,-0.850736
2023-01-08,0.549139,-1.02033,-0.185078,0.131827
2023-01-09,0.795074,0.583225,-0.412308,-0.529318
2023-01-10,0.159926,-0.938423,-0.164951,0.943514


In [85]:
# Small frame for the lookup example: 'col' names, for each row, which of the
# value columns A or B to read. Column A contains one missing value.
dffg = pd.DataFrame(
    {
        'col': list('AABB'),
        'A': [80, 23, np.nan, 22],
        'B': [80, 55, 76, 67],
    }
)
dffg

Unnamed: 0,col,A,B
0,A,80.0,80
1,A,23.0,55
2,B,,76
3,B,22.0,67


In [86]:
# Encode the labels in 'col' as integer codes (idx) plus the unique labels (cols).
idx, cols = dffg['col'].factorize()

In [89]:
# idx: the integer code for each row of dffg['col']; cols: the unique labels those codes refer to.
idx, cols

(array([0, 0, 1, 1]), Index(['A', 'B'], dtype='object'))

In [88]:
# The unique labels on their own.
cols

Index(['A', 'B'], dtype='object')

In [90]:
# For each row, pick the value from the column named in that row's 'col' entry:
# reindex arranges the columns in the order of the labels in cols, and the pair
# (row position i, column position idx[i]) then selects one value per row.
dffg.reindex(cols, axis=1).to_numpy()[np.arange(len(dffg)), idx]

array([80., 23., 76., 67.])

In [91]:
# dffg itself is unchanged by the lookup.
dffg

Unnamed: 0,col,A,B
0,A,80.0,80
1,A,23.0,55
2,B,,76
3,B,22.0,67
