# Chapter 2 
# User Guide
## 2.1 10 minutes to pandas
### 2.1.1 Object creation

In [1]:
import numpy as np
import pandas as pd

**Creating Series**

In [4]:
# A list containing NaN is stored as float64; the index defaults to 0..n-1.
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

**Creating a DataFrame**

In [6]:
# Six consecutive calendar days beginning 2020-01-01, used below as a row index.
dates = pd.date_range("2020-01-01", periods=6, freq="D")
dates

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06'],
              dtype='datetime64[ns]', freq='D')

In [7]:
# 6x4 frame of standard-normal draws: one row per entry of `dates`, columns A-D.
# A locally seeded generator makes the values identical on every
# Restart & Run All instead of changing with each execution.
# NOTE: outputs saved in this notebook came from an unseeded run;
# re-execute the notebook to refresh them.
df = pd.DataFrame(
    np.random.default_rng(seed=0).standard_normal((6, 4)),
    index=dates,
    columns=list("ABCD"),
)

df

Unnamed: 0,A,B,C,D
2020-01-01,1.663687,0.848234,-0.633912,-0.386969
2020-01-02,-1.11205,0.721042,0.674549,0.205275
2020-01-03,0.603508,-0.557389,1.29362,-1.444347
2020-01-04,-1.240802,-0.534342,0.09321,-1.080956
2020-01-05,1.367237,1.004106,1.373293,0.033081
2020-01-06,0.340592,-0.863473,-0.549296,-1.099534


In [9]:
# Each dict entry becomes one column with its own dtype; the scalar entries
# ("A", "B", "F") are repeated to match the four-row array-like columns.
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.full(4, 3, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [10]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [11]:
df2.A

0    1.0
1    1.0
2    1.0
3    1.0
Name: A, dtype: float64

### 2.1.2 Viewing data

In [12]:
df.head()

Unnamed: 0,A,B,C,D
2020-01-01,1.663687,0.848234,-0.633912,-0.386969
2020-01-02,-1.11205,0.721042,0.674549,0.205275
2020-01-03,0.603508,-0.557389,1.29362,-1.444347
2020-01-04,-1.240802,-0.534342,0.09321,-1.080956
2020-01-05,1.367237,1.004106,1.373293,0.033081


In [15]:
df.tail(n = 3)

Unnamed: 0,A,B,C,D
2020-01-04,-1.240802,-0.534342,0.09321,-1.080956
2020-01-05,1.367237,1.004106,1.373293,0.033081
2020-01-06,0.340592,-0.863473,-0.549296,-1.099534


In [16]:
df.index

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06'],
              dtype='datetime64[ns]', freq='D')

In [17]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [18]:
# Convert to a 2-D float array; the index and column labels are not included.
df.to_numpy()

array([[ 1.6636872 ,  0.84823351, -0.63391221, -0.38696875],
       [-1.11204962,  0.72104247,  0.67454901,  0.20527496],
       [ 0.60350799, -0.55738866,  1.29362034, -1.4443473 ],
       [-1.240802  , -0.53434174,  0.09320963, -1.08095597],
       [ 1.36723728,  1.00410563,  1.37329256,  0.03308121],
       [ 0.34059244, -0.86347342, -0.54929633, -1.0995336 ]])

> Note that this can be an expensive operation when your DataFrame has columns with different data types, which comes down to a fundamental difference between pandas and NumPy: **NumPy arrays have one dtype for the entire array, while pandas DataFrames have one dtype per column**. When you call `DataFrame.to_numpy()`, pandas will find the NumPy dtype that can hold all of the dtypes in the DataFrame. This may end up being `object`, which requires casting every value to a Python object.

In [19]:
# df2 mixes float, datetime, int, categorical and string columns, so the
# only dtype able to hold every value is `object`.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [22]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.270362,0.10303,0.375244,-0.628908
std,1.221072,0.83973,0.881115,0.675781
min,-1.240802,-0.863473,-0.633912,-1.444347
25%,-0.748889,-0.551627,-0.38867,-1.094889
50%,0.47205,0.09335,0.383879,-0.733962
75%,1.176305,0.816436,1.138853,-0.071931
max,1.663687,1.004106,1.373293,0.205275


In [23]:
df.T

Unnamed: 0,2020-01-01,2020-01-02,2020-01-03,2020-01-04,2020-01-05,2020-01-06
A,1.663687,-1.11205,0.603508,-1.240802,1.367237,0.340592
B,0.848234,0.721042,-0.557389,-0.534342,1.004106,-0.863473
C,-0.633912,0.674549,1.29362,0.09321,1.373293,-0.549296
D,-0.386969,0.205275,-1.444347,-1.080956,0.033081,-1.099534


**Sorting by axis**

In [26]:
# Reverse the order of the column labels (D, C, B, A); row order is untouched.
df.sort_index(axis="columns", ascending=False)

Unnamed: 0,D,C,B,A
2020-01-01,-0.386969,-0.633912,0.848234,1.663687
2020-01-02,0.205275,0.674549,0.721042,-1.11205
2020-01-03,-1.444347,1.29362,-0.557389,0.603508
2020-01-04,-1.080956,0.09321,-0.534342,-1.240802
2020-01-05,0.033081,1.373293,1.004106,1.367237
2020-01-06,-1.099534,-0.549296,-0.863473,0.340592


In [27]:
# Sort the rows by their date label, latest date first.
df.sort_index(axis="index", ascending=False)

Unnamed: 0,A,B,C,D
2020-01-06,0.340592,-0.863473,-0.549296,-1.099534
2020-01-05,1.367237,1.004106,1.373293,0.033081
2020-01-04,-1.240802,-0.534342,0.09321,-1.080956
2020-01-03,0.603508,-0.557389,1.29362,-1.444347
2020-01-02,-1.11205,0.721042,0.674549,0.205275
2020-01-01,1.663687,0.848234,-0.633912,-0.386969


**Sorting by values**

In [39]:
# Order rows from largest to smallest A, using B, C and D to break ties.
df.sort_values(["A", "B", "C", "D"], axis="index", ascending=False)

Unnamed: 0,A,B,C,D
2020-01-01,1.663687,0.848234,-0.633912,-0.386969
2020-01-05,1.367237,1.004106,1.373293,0.033081
2020-01-03,0.603508,-0.557389,1.29362,-1.444347
2020-01-06,0.340592,-0.863473,-0.549296,-1.099534
2020-01-02,-1.11205,0.721042,0.674549,0.205275
2020-01-04,-1.240802,-0.534342,0.09321,-1.080956


### 2.1.3 Selection

**Getting**

Selecting a single column, which yields a `Series`.

In [40]:
df['A']

2020-01-01    1.663687
2020-01-02   -1.112050
2020-01-03    0.603508
2020-01-04   -1.240802
2020-01-05    1.367237
2020-01-06    0.340592
Freq: D, Name: A, dtype: float64

In [41]:
# An integer slice inside [] selects rows by position; the end (3) is excluded.
df[0:3]

Unnamed: 0,A,B,C,D
2020-01-01,1.663687,0.848234,-0.633912,-0.386969
2020-01-02,-1.11205,0.721042,0.674549,0.205275
2020-01-03,0.603508,-0.557389,1.29362,-1.444347


In [47]:
# A slice of date strings inside [] selects rows by label; both endpoints are included.
df['20200101':'20200103']

Unnamed: 0,A,B,C,D
2020-01-01,1.663687,0.848234,-0.633912,-0.386969
2020-01-02,-1.11205,0.721042,0.674549,0.205275
2020-01-03,0.603508,-0.557389,1.29362,-1.444347


**Selection by label**

In [48]:
df.loc[dates[0]]

A    1.663687
B    0.848234
C   -0.633912
D   -0.386969
Name: 2020-01-01 00:00:00, dtype: float64

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06'],
              dtype='datetime64[ns]', freq='D')

In [50]:
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2020-01-01,1.663687,0.848234
2020-01-02,-1.11205,0.721042
2020-01-03,0.603508,-0.557389
2020-01-04,-1.240802,-0.534342
2020-01-05,1.367237,1.004106
2020-01-06,0.340592,-0.863473


In [51]:
df.loc['20200101':'20200103', ["A", "B"]]

Unnamed: 0,A,B
2020-01-01,1.663687,0.848234
2020-01-02,-1.11205,0.721042
2020-01-03,0.603508,-0.557389


In [53]:
df.loc['20200101', ["A", "B"]]

A    1.663687
B    0.848234
Name: 2020-01-01 00:00:00, dtype: float64

In [55]:
df.loc[dates[0], "A"] # a scalar value

1.6636872047782487

In [57]:
# .at looks up a single cell by label; same value as df.loc[dates[0], "A"].
df.at[dates[0], 'A']

1.6636872047782487

**Selection by position**

In [58]:
df.iloc[3]

A   -1.240802
B   -0.534342
C    0.093210
D   -1.080956
Name: 2020-01-04 00:00:00, dtype: float64

In [59]:
# Positional slices exclude the end: rows 3-4 and columns 0-1 (A, B).
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2020-01-04,-1.240802,-0.534342
2020-01-05,1.367237,1.004106


In [60]:
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2020-01-02,-1.11205,0.674549
2020-01-03,0.603508,1.29362
2020-01-05,1.367237,1.373293


In [61]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2020-01-02,-1.11205,0.721042,0.674549,0.205275
2020-01-03,0.603508,-0.557389,1.29362,-1.444347


In [62]:
df.iloc[:, 1:3]

Unnamed: 0,B,C
2020-01-01,0.848234,-0.633912
2020-01-02,0.721042,0.674549
2020-01-03,-0.557389,1.29362
2020-01-04,-0.534342,0.09321
2020-01-05,1.004106,1.373293
2020-01-06,-0.863473,-0.549296


In [63]:
df.iloc[1, 1]

0.7210424695766117

In [65]:
df.iat[1, 1] # iat: fast scalar access; same value as df.iloc[1, 1]

0.7210424695766117

**Boolean indexing**

In [75]:
df[df["A"] > 0] # select the rows where column A is positive

Unnamed: 0,A,B,C,D
2020-01-01,1.663687,0.848234,-0.633912,-0.386969
2020-01-03,0.603508,-0.557389,1.29362,-1.444347
2020-01-05,1.367237,1.004106,1.373293,0.033081
2020-01-06,0.340592,-0.863473,-0.549296,-1.099534


In [68]:
# Masking with a boolean frame keeps the shape; cells that fail the test become NaN.
df[df > 0]

Unnamed: 0,A,B,C,D
2020-01-01,1.663687,0.848234,,
2020-01-02,,0.721042,0.674549,0.205275
2020-01-03,0.603508,,1.29362,
2020-01-04,,,0.09321,
2020-01-05,1.367237,1.004106,1.373293,0.033081
2020-01-06,0.340592,,,


In [70]:
# Rebind df2 to a copy of df extended with a label column "E";
# assign() returns a new frame, so df itself is left unchanged.
df2 = df.assign(E=["one", "one", "two", "three", "four", "three"])
df2

Unnamed: 0,A,B,C,D,E
2020-01-01,1.663687,0.848234,-0.633912,-0.386969,one
2020-01-02,-1.11205,0.721042,0.674549,0.205275,one
2020-01-03,0.603508,-0.557389,1.29362,-1.444347,two
2020-01-04,-1.240802,-0.534342,0.09321,-1.080956,three
2020-01-05,1.367237,1.004106,1.373293,0.033081,four
2020-01-06,0.340592,-0.863473,-0.549296,-1.099534,three


**Using the `isin()` method for filtering**

In [76]:
# Keep only the rows whose "E" label is one of the listed values.
df2.loc[df2["E"].isin(["two", "four"])]

Unnamed: 0,A,B,C,D,E
2020-01-03,0.603508,-0.557389,1.29362,-1.444347,two
2020-01-05,1.367237,1.004106,1.373293,0.033081,four


**Setting**

Setting a new column automatically aligns the data by the indexes.