In [73]:
import pandas as pd
import numpy as np

## 1. Object Creation

In [74]:
# A Series built from a plain list gets a default integer index (0..5);
# the NaN entry makes the whole Series float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [75]:
# Six consecutive calendar days starting 2013-01-01 (daily frequency).
dates = pd.date_range(start='20130101', periods=6, freq='D')
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [76]:
# Build a 6x4 DataFrame of standard-normal samples, indexed by `dates`, with columns A-D.
# NOTE(review): the random generator is not seeded, so these values change on every fresh run.
df = pd.DataFrame(np.random.randn(6,4), index = dates, columns = list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.883957,-0.591698,-1.873865,-1.014039
2013-01-02,-0.953374,2.724529,-0.374995,0.788453
2013-01-03,0.2037,0.846582,0.256878,0.636555
2013-01-04,1.364283,0.872162,-1.160863,0.795858
2013-01-05,-1.642106,-2.319068,-0.239224,-1.747027
2013-01-06,0.09011,-0.637476,0.004852,-1.916671


In [77]:
# Build a frame from a dict: scalar entries are broadcast to the length of
# the array-like entries (four rows), and each column keeps its own dtype.
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
        'D': np.full(4, 3, dtype='int32'),
        'E': pd.Categorical(["test", "train", "test", "train"]),
        'F': 'Foo',
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,Foo
1,1.0,2013-01-02,1.0,3,train,Foo
2,1.0,2013-01-02,1.0,3,test,Foo
3,1.0,2013-01-02,1.0,3,train,Foo


In [78]:
# Each column of a DataFrame carries its own dtype.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## 2. Viewing Data

In [79]:
# First five rows (head() shows five when called without an argument).
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-1.883957,-0.591698,-1.873865,-1.014039
2013-01-02,-0.953374,2.724529,-0.374995,0.788453
2013-01-03,0.2037,0.846582,0.256878,0.636555
2013-01-04,1.364283,0.872162,-1.160863,0.795858
2013-01-05,-1.642106,-2.319068,-0.239224,-1.747027


In [80]:
# Last five rows (tail() shows five when called without an argument).
df.tail()

Unnamed: 0,A,B,C,D
2013-01-02,-0.953374,2.724529,-0.374995,0.788453
2013-01-03,0.2037,0.846582,0.256878,0.636555
2013-01-04,1.364283,0.872162,-1.160863,0.795858
2013-01-05,-1.642106,-2.319068,-0.239224,-1.747027
2013-01-06,0.09011,-0.637476,0.004852,-1.916671


In [81]:
# Row labels of the frame.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [82]:
# Column labels of the frame.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [83]:
# Every column of df is float64 here, so the result is a plain float ndarray.
df.to_numpy()

array([[-1.88395687, -0.59169814, -1.8738653 , -1.01403916],
       [-0.95337445,  2.7245291 , -0.37499524,  0.78845329],
       [ 0.20369961,  0.84658184,  0.25687808,  0.63655451],
       [ 1.36428283,  0.87216156, -1.16086258,  0.79585838],
       [-1.64210562, -2.31906825, -0.23922425, -1.74702659],
       [ 0.09011044, -0.6374759 ,  0.00485196, -1.91667068]])

In [84]:
# df2 mixes dtypes, so to_numpy() produces a single object-dtype array.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'Foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'Foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'Foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'Foo']],
      dtype=object)

In [85]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.470224,0.149172,-0.564536,-0.409478
std,1.244102,1.726353,0.801327,1.296782
min,-1.883957,-2.319068,-1.873865,-1.916671
25%,-1.469923,-0.626031,-0.964396,-1.56378
50%,-0.431632,0.127442,-0.30711,-0.188742
75%,0.175302,0.865767,-0.056167,0.750479
max,1.364283,2.724529,0.256878,0.795858


In [86]:
# Transpose: swap rows and columns.
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-1.883957,-0.953374,0.2037,1.364283,-1.642106,0.09011
B,-0.591698,2.724529,0.846582,0.872162,-2.319068,-0.637476
C,-1.873865,-0.374995,0.256878,-1.160863,-0.239224,0.004852
D,-1.014039,0.788453,0.636555,0.795858,-1.747027,-1.916671


In [87]:
# Order the columns by their labels, descending (D, C, B, A).
df.sort_index(axis='columns', ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-1.014039,-1.873865,-0.591698,-1.883957
2013-01-02,0.788453,-0.374995,2.724529,-0.953374
2013-01-03,0.636555,0.256878,0.846582,0.2037
2013-01-04,0.795858,-1.160863,0.872162,1.364283
2013-01-05,-1.747027,-0.239224,-2.319068,-1.642106
2013-01-06,-1.916671,0.004852,-0.637476,0.09011


In [88]:
# Order the rows by the values in column B, smallest first.
df.sort_values('B', ascending=True)

Unnamed: 0,A,B,C,D
2013-01-05,-1.642106,-2.319068,-0.239224,-1.747027
2013-01-06,0.09011,-0.637476,0.004852,-1.916671
2013-01-01,-1.883957,-0.591698,-1.873865,-1.014039
2013-01-03,0.2037,0.846582,0.256878,0.636555
2013-01-04,1.364283,0.872162,-1.160863,0.795858
2013-01-02,-0.953374,2.724529,-0.374995,0.788453


## 3. Selection

In [89]:
# Getting
# Selecting a single column by name returns it as a Series.
# (A bare `df` line that used to precede this was dropped: only the last
# expression of a cell is displayed, so it had no effect.)
df['A']

2013-01-01   -1.883957
2013-01-02   -0.953374
2013-01-03    0.203700
2013-01-04    1.364283
2013-01-05   -1.642106
2013-01-06    0.090110
Freq: D, Name: A, dtype: float64

In [90]:
# Display the full frame for reference.
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.883957,-0.591698,-1.873865,-1.014039
2013-01-02,-0.953374,2.724529,-0.374995,0.788453
2013-01-03,0.2037,0.846582,0.256878,0.636555
2013-01-04,1.364283,0.872162,-1.160863,0.795858
2013-01-05,-1.642106,-2.319068,-0.239224,-1.747027
2013-01-06,0.09011,-0.637476,0.004852,-1.916671


In [91]:
# Slicing with [] and integers selects rows by position (here the first three).
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-1.883957,-0.591698,-1.873865,-1.014039
2013-01-02,-0.953374,2.724529,-0.374995,0.788453
2013-01-03,0.2037,0.846582,0.256878,0.636555


In [92]:
# Slicing with [] and date strings selects rows by label; both endpoints are included.
df['20130103':'20130104']

Unnamed: 0,A,B,C,D
2013-01-03,0.2037,0.846582,0.256878,0.636555
2013-01-04,1.364283,0.872162,-1.160863,0.795858


In [93]:
# Selection by label
# A single row label returns that row as a Series indexed by column name.
df.loc[dates[0]]

A   -1.883957
B   -0.591698
C   -1.873865
D   -1.014039
Name: 2013-01-01 00:00:00, dtype: float64

In [94]:
# All rows, columns A and B only.
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,-1.883957,-0.591698
2013-01-02,-0.953374,2.724529
2013-01-03,0.2037,0.846582
2013-01-04,1.364283,0.872162
2013-01-05,-1.642106,-2.319068
2013-01-06,0.09011,-0.637476


In [95]:
# Label slices in .loc include both endpoints (2013-01-02 through 2013-01-04).
df.loc['20130102':'20130104', ['A','B']]

Unnamed: 0,A,B
2013-01-02,-0.953374,2.724529
2013-01-03,0.2037,0.846582
2013-01-04,1.364283,0.872162


In [96]:
# One row label plus a list of columns returns a Series.
df.loc[dates[0],['A','B']]

A   -1.883957
B   -0.591698
Name: 2013-01-01 00:00:00, dtype: float64

In [97]:
# One row label and one column label returns a scalar.
df.loc[dates[0],'A']

-1.8839568714514485

In [98]:
# Selection by position
# Integer position 3 is the fourth row (positions are 0-based).
df.iloc[3]

A    1.364283
B    0.872162
C   -1.160863
D    0.795858
Name: 2013-01-04 00:00:00, dtype: float64

In [99]:
# Positional slices exclude the stop index: first two rows, first two columns.
df.iloc[0:2,0:2]

Unnamed: 0,A,B
2013-01-01,-1.883957,-0.591698
2013-01-02,-0.953374,2.724529


In [100]:
# Lists of integer positions: rows 1, 3, 5 and columns 0, 2.
df.iloc[[1,3,5],[0,2]]

Unnamed: 0,A,C
2013-01-02,-0.953374,-0.374995
2013-01-04,1.364283,-1.160863
2013-01-06,0.09011,0.004852


In [101]:
# Rows at positions 1 and 2, every column.
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2013-01-02,-0.953374,2.724529,-0.374995,0.788453
2013-01-03,0.2037,0.846582,0.256878,0.636555


In [102]:
# Every row, columns at positions 1 and 2 (B and C).
df.iloc[:,1:3]

Unnamed: 0,B,C
2013-01-01,-0.591698,-1.873865
2013-01-02,2.724529,-0.374995
2013-01-03,0.846582,0.256878
2013-01-04,0.872162,-1.160863
2013-01-05,-2.319068,-0.239224
2013-01-06,-0.637476,0.004852


In [103]:
# Scalar at row position 1, column position 1.
df.iloc[1,1]

2.7245290983464936

In [104]:
# For fast access to a single scalar by position (same result as df.iloc[1, 1]).
df.iat[1,1]

2.7245290983464936

In [105]:
# Boolean indexing
# Keep only the rows where column A is positive.
df[df['A']>0]

Unnamed: 0,A,B,C,D
2013-01-03,0.2037,0.846582,0.256878,0.636555
2013-01-04,1.364283,0.872162,-1.160863,0.795858
2013-01-06,0.09011,-0.637476,0.004852,-1.916671


In [106]:
# The boolean mask itself: one True/False per row.
df['A']>0

2013-01-01    False
2013-01-02    False
2013-01-03     True
2013-01-04     True
2013-01-05    False
2013-01-06     True
Freq: D, Name: A, dtype: bool

In [107]:
# A frame-wide mask keeps the matching cells and puts NaN everywhere else.
df[df>0]

Unnamed: 0,A,B,C,D
2013-01-01,,,,
2013-01-02,,2.724529,,0.788453
2013-01-03,0.2037,0.846582,0.256878,0.636555
2013-01-04,1.364283,0.872162,,0.795858
2013-01-05,,,,
2013-01-06,0.09011,,0.004852,


In [108]:
# Rebind df2 to an independent copy of df so later edits to df2 leave df untouched.
df2 = df.copy()
df2

Unnamed: 0,A,B,C,D
2013-01-01,-1.883957,-0.591698,-1.873865,-1.014039
2013-01-02,-0.953374,2.724529,-0.374995,0.788453
2013-01-03,0.2037,0.846582,0.256878,0.636555
2013-01-04,1.364283,0.872162,-1.160863,0.795858
2013-01-05,-1.642106,-2.319068,-0.239224,-1.747027
2013-01-06,0.09011,-0.637476,0.004852,-1.916671


In [109]:
# Add a string column E, one value per row.
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-1.883957,-0.591698,-1.873865,-1.014039,one
2013-01-02,-0.953374,2.724529,-0.374995,0.788453,one
2013-01-03,0.2037,0.846582,0.256878,0.636555,two
2013-01-04,1.364283,0.872162,-1.160863,0.795858,three
2013-01-05,-1.642106,-2.319068,-0.239224,-1.747027,four
2013-01-06,0.09011,-0.637476,0.004852,-1.916671,three


In [110]:
# Using the isin() method for filtering:
# keep the rows whose E value is 'two' or 'four'.
df2[df2['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,0.2037,0.846582,0.256878,0.636555,two
2013-01-05,-1.642106,-2.319068,-0.239224,-1.747027,four


In [111]:
# Setting
# A Series of 1..6 indexed by six consecutive days starting 2013-01-02.
s1 = pd.Series(list(range(1, 7)), index=pd.date_range(start='20130102', periods=6))
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [112]:
# Column assignment aligns s1 to df's index: 2013-01-01 has no match in s1 and
# becomes NaN, while s1's 2013-01-07 entry has no row in df and is dropped.
df['f'] = s1
df

Unnamed: 0,A,B,C,D,f
2013-01-01,-1.883957,-0.591698,-1.873865,-1.014039,
2013-01-02,-0.953374,2.724529,-0.374995,0.788453,1.0
2013-01-03,0.2037,0.846582,0.256878,0.636555,2.0
2013-01-04,1.364283,0.872162,-1.160863,0.795858,3.0
2013-01-05,-1.642106,-2.319068,-0.239224,-1.747027,4.0
2013-01-06,0.09011,-0.637476,0.004852,-1.916671,5.0


In [113]:
# Set a single value by label (first date, column A).
df.at[dates[0],'A'] = 0
df

Unnamed: 0,A,B,C,D,f
2013-01-01,0.0,-0.591698,-1.873865,-1.014039,
2013-01-02,-0.953374,2.724529,-0.374995,0.788453,1.0
2013-01-03,0.2037,0.846582,0.256878,0.636555,2.0
2013-01-04,1.364283,0.872162,-1.160863,0.795858,3.0
2013-01-05,-1.642106,-2.319068,-0.239224,-1.747027,4.0
2013-01-06,0.09011,-0.637476,0.004852,-1.916671,5.0


In [114]:
# Set a single value by position (row 0, column 1, i.e. column B).
df.iat[0,1] = 0
df

Unnamed: 0,A,B,C,D,f
2013-01-01,0.0,0.0,-1.873865,-1.014039,
2013-01-02,-0.953374,2.724529,-0.374995,0.788453,1.0
2013-01-03,0.2037,0.846582,0.256878,0.636555,2.0
2013-01-04,1.364283,0.872162,-1.160863,0.795858,3.0
2013-01-05,-1.642106,-2.319068,-0.239224,-1.747027,4.0
2013-01-06,0.09011,-0.637476,0.004852,-1.916671,5.0


In [115]:
# Replace column D wholesale with an integer array of 5s.
# Assigning through df['D'] swaps in the new column together with its integer
# dtype on every pandas version. `df.loc[:, 'D'] = ...` instead writes into the
# existing float column in place on pandas >= 2.0, which would leave D as 5.0.
df['D'] = np.array([5]*len(df))
df

Unnamed: 0,A,B,C,D,f
2013-01-01,0.0,0.0,-1.873865,5,
2013-01-02,-0.953374,2.724529,-0.374995,5,1.0
2013-01-03,0.2037,0.846582,0.256878,5,2.0
2013-01-04,1.364283,0.872162,-1.160863,5,3.0
2013-01-05,-1.642106,-2.319068,-0.239224,5,4.0
2013-01-06,0.09011,-0.637476,0.004852,5,5.0


In [116]:
# Replace every NaN with 0 and rebind df to the filled result.
df = df.fillna(0)
df

Unnamed: 0,A,B,C,D,f
2013-01-01,0.0,0.0,-1.873865,5,0.0
2013-01-02,-0.953374,2.724529,-0.374995,5,1.0
2013-01-03,0.2037,0.846582,0.256878,5,2.0
2013-01-04,1.364283,0.872162,-1.160863,5,3.0
2013-01-05,-1.642106,-2.319068,-0.239224,5,4.0
2013-01-06,0.09011,-0.637476,0.004852,5,5.0


In [117]:
# Rebind df2 to a fresh, independent copy of the current df.
df2 = df.copy()

In [118]:
# Wherever a value is positive, replace it with its negation; non-positive
# values are left as they are, so no entry ends up positive.
df2[df2>0] = -df2
df2

Unnamed: 0,A,B,C,D,f
2013-01-01,0.0,0.0,-1.873865,-5,0.0
2013-01-02,-0.953374,-2.724529,-0.374995,-5,-1.0
2013-01-03,-0.2037,-0.846582,-0.256878,-5,-2.0
2013-01-04,-1.364283,-0.872162,-1.160863,-5,-3.0
2013-01-05,-1.642106,-2.319068,-0.239224,-5,-4.0
2013-01-06,-0.09011,-0.637476,-0.004852,-5,-5.0


## 4. Missing Data

In [119]:
# Reindex to the first four dates and append a new column E;
# E has no existing data, so it comes out as all NaN.
df1 = df.reindex(index=dates[:4], columns=[*df.columns, 'E'])
df1

Unnamed: 0,A,B,C,D,f,E
2013-01-01,0.0,0.0,-1.873865,5,0.0,
2013-01-02,-0.953374,2.724529,-0.374995,5,1.0,
2013-01-03,0.2037,0.846582,0.256878,5,2.0,
2013-01-04,1.364283,0.872162,-1.160863,5,3.0,


In [120]:
# Label-based slice assignment (both endpoints included) sets E for the first two dates.
df1.loc[dates[0]:dates[1], 'E'] = 1
df1

Unnamed: 0,A,B,C,D,f,E
2013-01-01,0.0,0.0,-1.873865,5,0.0,1.0
2013-01-02,-0.953374,2.724529,-0.374995,5,1.0,1.0
2013-01-03,0.2037,0.846582,0.256878,5,2.0,
2013-01-04,1.364283,0.872162,-1.160863,5,3.0,


In [121]:
# Drop every row that contains at least one missing value.
df1.dropna(axis='index', how='any')

Unnamed: 0,A,B,C,D,f,E
2013-01-01,0.0,0.0,-1.873865,5,0.0,1.0
2013-01-02,-0.953374,2.724529,-0.374995,5,1.0,1.0


In [122]:
# Fill missing values with 5; the result is a new frame and df1 itself is not modified.
df1.fillna(value = 5)

Unnamed: 0,A,B,C,D,f,E
2013-01-01,0.0,0.0,-1.873865,5,0.0,1.0
2013-01-02,-0.953374,2.724529,-0.374995,5,1.0,1.0
2013-01-03,0.2037,0.846582,0.256878,5,2.0,5.0
2013-01-04,1.364283,0.872162,-1.160863,5,3.0,5.0


In [123]:
# Boolean mask marking which cells are missing (True = NaN).
df1.isna()

Unnamed: 0,A,B,C,D,f,E
2013-01-01,False,False,False,False,False,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


## 5. Operations 
* Stats

In [124]:
# Mean of each column.
df.mean()

A   -0.156231
B    0.247788
C   -0.564536
D    5.000000
f    2.500000
dtype: float64

In [125]:
# Mean of each row (axis=1 reduces across the columns).
df.mean(axis=1)

2013-01-01    0.625227
2013-01-02    1.479232
2013-01-03    1.661432
2013-01-04    1.815116
2013-01-05    0.959920
2013-01-06    1.891497
Freq: D, dtype: float64

In [132]:
# Operating with objects that have different dimensionality and need alignment.
# In addition, pandas automatically broadcasts along the specified dimension.
# Start from a Series that shares df's date index (one value per row of df).
s = pd.Series([1,3,5,np.nan,6,8], index = dates)
s

2013-01-01    1.0
2013-01-02    3.0
2013-01-03    5.0
2013-01-04    NaN
2013-01-05    6.0
2013-01-06    8.0
Freq: D, dtype: float64

In [134]:
# shift(2) moves the values down two rows while the index stays put: the first
# two entries become NaN and the last two values (6 and 8) fall off the end.
s = pd.Series([1,3,5,np.nan,6,8], index = dates).shift(2)
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [135]:
# Show the current df for reference.
df

Unnamed: 0,A,B,C,D,f
2013-01-01,0.0,0.0,-1.873865,5,0.0
2013-01-02,-0.953374,2.724529,-0.374995,5,1.0
2013-01-03,0.2037,0.846582,0.256878,5,2.0
2013-01-04,1.364283,0.872162,-1.160863,5,3.0
2013-01-05,-1.642106,-2.319068,-0.239224,5,4.0
2013-01-06,0.09011,-0.637476,0.004852,5,5.0


In [138]:
# (Originally marked as "not understood".) sub(..., axis='index') lines s up
# with df's row labels and subtracts s[date] from every column in that row,
# e.g. on 2013-01-03 each value has 1.0 subtracted. Rows where s is NaN come
# out as NaN in every column.
df.sub(s, axis = 'index')

Unnamed: 0,A,B,C,D,f
2013-01-01,,,,,
2013-01-02,,,,,
2013-01-03,-0.7963,-0.153418,-0.743122,4.0,1.0
2013-01-04,-1.635717,-2.127838,-4.160863,2.0,0.0
2013-01-05,-6.642106,-7.319068,-5.239224,0.0,-1.0
2013-01-06,,,,,


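* Apply - `DataFrame.apply` calls the given function once per column (or once per row with `axis=1`) and collects the results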

In [139]:
# apply() hands each column to the function in turn; np.cumsum returns the
# running total down that column, so the result has the same shape as df.
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,f
2013-01-01,0.0,0.0,-1.873865,5,0.0
2013-01-02,-0.953374,2.724529,-2.248861,10,1.0
2013-01-03,-0.749675,3.571111,-1.991982,15,3.0
2013-01-04,0.614608,4.443273,-3.152845,20,6.0
2013-01-05,-1.027498,2.124204,-3.392069,25,10.0
2013-01-06,-0.937387,1.486728,-3.387217,30,15.0


In [141]:
# Per-column range: the lambda receives each column as a Series and returns a
# single number (max minus min), so the result is one value per column.
df.apply(lambda x : x.max() - x.min())

A    3.006388
B    5.043597
C    2.130743
D    0.000000
f    5.000000
dtype: float64

* Histogramming

In [144]:
# Ten random integers drawn from 0..6 (the upper bound 7 is exclusive).
# NOTE(review): the random generator is not seeded, so these values change on every fresh run.
s = pd.Series(np.random.randint(0,7,size = 10))
s

0    0
1    1
2    6
3    5
4    2
5    3
6    5
7    0
8    4
9    3
dtype: int64

In [146]:
# Count how often each distinct value occurs, most frequent first.
s.value_counts()

5    2
3    2
0    2
6    1
4    1
2    1
1    1
dtype: int64

* String Methods

In [148]:
# A Series of strings with one missing value, used to demonstrate the .str accessor.
s = pd.Series(
    data=['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
)
s

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

In [150]:
# Convert every string to lowercase; the missing value stays NaN.
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object