In [1]:
from pandas import Series, DataFrame
import pandas as pd

from __future__ import division
from numpy.random import randn
import numpy as np
import os
import matplotlib.pyplot as plt

In [5]:
obj = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [6]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [7]:
obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [9]:
obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [10]:
frame = DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'],
                  columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [11]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [12]:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [15]:
frame.loc[['a', 'b', 'c', 'd'], states]

Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


## Indexing, selection, and filtering

In [16]:
obj = Series([np.arange(4.)], index=['a', 'b', 'c', 'd'])
obj['b']

array([ 0.,  1.,  2.,  3.])

In [17]:
obj[1]

array([ 0.,  1.,  2.,  3.])

In [18]:
obj[2:4] # 숫자로 호출시 마지막 값은 제외하고 반환

c    [0.0, 1.0, 2.0, 3.0]
d    [0.0, 1.0, 2.0, 3.0]
dtype: object

In [19]:
obj[['b', 'a', 'd']]

b    [0.0, 1.0, 2.0, 3.0]
a    [0.0, 1.0, 2.0, 3.0]
d    [0.0, 1.0, 2.0, 3.0]
dtype: object

In [20]:
obj[[1, 3]]

b    [0.0, 1.0, 2.0, 3.0]
d    [0.0, 1.0, 2.0, 3.0]
dtype: object

In [23]:
obj['b':'c'] # 레이블로 호출 시 마지막 값까지 포함하여 반환

b    [0.0, 1.0, 2.0, 3.0]
c    [0.0, 1.0, 2.0, 3.0]
dtype: object

## Arithmetic and data alignment

In [24]:
s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
print(s1)
print(s2)

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64
a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64


In [25]:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [30]:
from IPython.core.interactiveshell import InteractiveShell

In [31]:
InteractiveShell.ast_node_interactivity = 'all'

# 쉘 안에 있는 요청된 모든 결과값을 출력 ('last' 옵션은 마지막 값만 출력)

In [32]:
df1 = DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                index=['Ohio', 'Texas', 'Colorado'])
df2 = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df1
df2

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [33]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


## Arithmetic methods with fill values

In [39]:
df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))
df1
df2

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [35]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [36]:
df1.add(df2, fill_value=0) # 둘 중 값이 없는 영역을 0으로 채워서 add 수행

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [38]:
df1.reindex(columns=df2.columns, fill_value=0)
# df1의 columns 형태를 df2와 일치시킴

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


In [41]:
df1.reindex(index= df2.index, columns=df2.columns)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,
1,4.0,5.0,6.0,7.0,
2,8.0,9.0,10.0,11.0,
3,,,,,


In [42]:
arr = np.arange(12.).reshape((3, 4))
arr

array([[  0.,   1.,   2.,   3.],
       [  4.,   5.,   6.,   7.],
       [  8.,   9.,  10.,  11.]])

In [43]:
arr[0]

array([ 0.,  1.,  2.,  3.])

In [44]:
arr - arr[0]

array([[ 0.,  0.,  0.,  0.],
       [ 4.,  4.,  4.,  4.],
       [ 8.,  8.,  8.,  8.]])

In [46]:
frame = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.iloc[0]
frame
series

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [47]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [48]:
series2= Series(range(3), index=['b', 'e', 'f'])
series2
frame + series2

b    0
e    1
f    2
dtype: int32

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [53]:
series3 = frame['d']
series3
frame.sub(series3, axis=0)

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


In [54]:
frame - series3

Unnamed: 0,Ohio,Oregon,Texas,Utah,b,d,e
Utah,,,,,,,
Ohio,,,,,,,
Texas,,,,,,,
Oregon,,,,,,,


## Function application and mapping

In [57]:
frame = DataFrame(np.random.randn(4, 3), columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,1.532256,1.231623,-0.431919
Ohio,-0.252291,-0.758108,-0.017489
Texas,0.489973,-0.658772,-0.803285
Oregon,0.640425,-2.748455,1.175428


In [58]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,1.532256,1.231623,0.431919
Ohio,0.252291,0.758108,0.017489
Texas,0.489973,0.658772,0.803285
Oregon,0.640425,2.748455,1.175428


In [60]:
f = lambda x: x.max() - x.min()

In [61]:
frame.apply(f)

b    1.784547
d    3.980078
e    1.978714
dtype: float64

In [62]:
frame.apply(f, axis=1)

Utah      1.964175
Ohio      0.740618
Texas     1.293258
Oregon    3.923883
dtype: float64

In [63]:
def f(x):
    return Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f)

Unnamed: 0,b,d,e
min,-0.252291,-2.748455,-0.803285
max,1.532256,1.231623,1.175428


In [65]:
format = lambda x: '%.2f' %x

In [71]:
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,1.53,1.23,-0.43
Ohio,-0.25,-0.76,-0.02
Texas,0.49,-0.66,-0.8
Oregon,0.64,-2.75,1.18


In [73]:
frame['e'].map(format)

Utah      -0.43
Ohio      -0.02
Texas     -0.80
Oregon     1.18
Name: e, dtype: object

## Sorting and ranking

In [74]:
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int32

In [75]:
frame = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],
                  columns=['d', 'a', 'b', 'c'])
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [76]:
frame.sort_index(axis=1) # Sort를 수행하여도 값이 실제로 바뀌는 건 아님.

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [78]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [79]:
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame

Unnamed: 0,a,b
0,0,4
1,1,7
2,0,-3
3,1,2


In [81]:
frame.sort_values(by='b')

Unnamed: 0,a,b
2,0,-3
3,1,2
0,0,4
1,1,7


In [82]:
frame.sort_values(by=['a', 'b'])

Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


In [83]:
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [84]:
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [85]:
obj.rank(ascending=False, method='max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [86]:
frame = DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                   'c': [-2, 5, 8, -2.5]})
frame

Unnamed: 0,a,b,c
0,0,4.3,-2.0
1,1,7.0,5.0
2,0,-3.0,8.0
3,1,2.0,-2.5


In [87]:
frame.rank(axis=1) # row기준 값들을 비교하여 순위를 매김

Unnamed: 0,a,b,c
0,2.0,3.0,1.0
1,1.0,3.0,2.0
2,2.0,1.0,3.0
3,2.0,3.0,1.0


In [89]:
d1 = DataFrame(np.arange(15).reshape(3, 5))
d1

Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14


In [90]:
d1.rank(axis=0)

Unnamed: 0,0,1,2,3,4
0,1.0,1.0,1.0,1.0,1.0
1,2.0,2.0,2.0,2.0,2.0
2,3.0,3.0,3.0,3.0,3.0


In [91]:
d1.rank(axis=1)

Unnamed: 0,0,1,2,3,4
0,1.0,2.0,3.0,4.0,5.0
1,1.0,2.0,3.0,4.0,5.0
2,1.0,2.0,3.0,4.0,5.0


## Axis indexes with duplicate values

In [93]:
obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj             

a    0
a    1
b    2
b    3
c    4
dtype: int32

In [94]:
obj.index.is_unique

False

In [95]:
obj['a']

a    0
a    1
dtype: int32

In [96]:
obj['c']

4

In [97]:
df = DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df

Unnamed: 0,0,1,2
a,1.136194,0.076893,0.207515
a,0.589691,0.572011,-0.415713
b,-0.300044,-1.998121,-1.038338
b,-1.208146,-0.183324,-0.224324


In [98]:
df.loc['b']

Unnamed: 0,0,1,2
b,-0.300044,-1.998121,-1.038338
b,-1.208146,-0.183324,-0.224324


## Summarizing and computing descriptive statistics

In [100]:
df = DataFrame([[1.4, np.nan], [7.1, -4.5],
              [np.nan, np.nan], [0.75, -1.3]],
              index=['a', 'b', 'c', 'd'],
              columns=['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [102]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [103]:
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [104]:
df.mean(axis=1, skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [106]:
df.idxmax()

one    b
two    d
dtype: object

In [107]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [109]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [110]:
obj = Series(['a', 'a', 'b', 'c'] * 4);obj
obj.describe() # top은 가장 많이 나타난 값

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

count     16
unique     3
top        a
freq       8
dtype: object

## Correlation and coveriance

In [117]:
import pandas_datareader as web

all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker)

all_data

{'AAPL':                   Open        High         Low       Close   Adj Close  \
 Date                                                                     
 2010-01-04   30.490000   30.642857   30.340000   30.572857   27.505054   
 2010-01-05   30.657143   30.798571   30.464285   30.625713   27.552608   
 2010-01-06   30.625713   30.747143   30.107143   30.138571   27.114347   
 2010-01-07   30.250000   30.285715   29.864286   30.082857   27.064222   
 2010-01-08   30.042856   30.285715   29.865715   30.282858   27.244156   
 2010-01-11   30.400000   30.428572   29.778572   30.015715   27.003820   
 2010-01-12   29.884285   29.967142   29.488571   29.674286   26.696650   
 2010-01-13   29.695715   30.132856   29.157143   30.092857   27.073221   
 2010-01-14   30.015715   30.065714   29.860001   29.918571   26.916422   
 2010-01-15   30.132856   30.228571   29.410000   29.418571   26.466595   
 2010-01-19   29.761429   30.741428   29.605715   30.719999   27.637430   
 2010-01-20   30.

In [120]:
price = DataFrame({tic: data['Adj Close']
                   for tic, data in all_data.items()})
volume = DataFrame({tic: data['Volume']
                    for tic, data in all_data.items()})

In [123]:
returns = price.pct_change()
returns.tail(10)

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-08-21,-0.001841,-0.004403,0.00451,-0.00469
2017-08-22,0.016348,0.019886,0.004846,0.013999
2017-08-23,0.001252,0.002498,0.008014,-0.006014
2017-08-24,-0.004438,-0.00617,0.005628,-0.000413
2017-08-25,0.003704,-0.005851,0.005597,0.001788
2017-08-28,0.010071,-0.002271,-0.008557,0.000137
2017-08-29,0.008918,0.008185,0.004421,0.003021
2017-08-30,0.002701,0.008987,-0.004052,0.013142
2017-08-31,0.003979,0.010499,0.003297,0.010269
2017-09-01,0.000305,-0.002119,0.007341,-0.011101


In [124]:
returns.MSFT.corr(returns.IBM)

0.48300904568365077

In [125]:
returns.MSFT.cov(returns.IBM)

8.0518452845229066e-05

In [126]:
returns.corr()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.41409,0.36799,0.394345
GOOG,0.41409,1.0,0.391414,0.476315
IBM,0.36799,0.391414,1.0,0.483009
MSFT,0.394345,0.476315,0.483009,1.0


In [127]:
returns.cov()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,0.000259,0.000102,7e-05,9e-05
GOOG,0.000102,0.000234,7.1e-05,0.000103
IBM,7e-05,7.1e-05,0.000139,8.1e-05
MSFT,9e-05,0.000103,8.1e-05,0.0002


In [128]:
returns.corrwith(returns.IBM)

AAPL    0.367990
GOOG    0.391414
IBM     1.000000
MSFT    0.483009
dtype: float64

In [129]:
returns.corrwith(volume)

AAPL   -0.072689
GOOG   -0.014020
IBM    -0.210648
MSFT   -0.090433
dtype: float64

## Unique values, value counts, and membership

In [130]:
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [131]:
uniques = obj.unique(); uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [132]:
obj.value_counts()

a    3
c    3
b    2
d    1
dtype: int64

In [134]:
pd.value_counts(obj.values, sort=False)

c    3
b    2
a    3
d    1
dtype: int64

In [135]:
mask = obj.isin(['b', 'c']); mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [136]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [137]:
data = DataFrame({'Qu1': [1, 3, 4, 3, 4],
                  'Qu2': [2, 3, 1, 2, 3],
                  'Qu3': [1, 5, 2, 4, 4]})
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [138]:
result = data.apply(pd.value_counts).fillna(0)
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


## Handling missing data

In [139]:
string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [140]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [141]:
string_data[0] = None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

## Filtering out missing data

In [142]:
from numpy import nan as NA
data = Series([1, NA, 3.5, NA, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [143]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [147]:
data = DataFrame([[1., 6.5, 3.], [1., NA, NA],
                 [NA, NA, NA], [NA, 6.5, 3.]])
cleaned = data.dropna()
data
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [148]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [149]:
data[4] = NA; data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [150]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [152]:
df = DataFrame(np.random.randn(7, 3))
df.loc[:4, 1] = NA; df.loc[:2, 2] = NA
df

Unnamed: 0,0,1,2
0,0.829802,,
1,-1.043467,,
2,0.748768,,
3,-0.653465,,-0.108493
4,1.249164,,0.085936
5,-1.654352,0.37747,0.521522
6,1.081079,-0.627849,1.824992


In [158]:
df.dropna(thresh=3)

Unnamed: 0,0,1,2
5,-1.654352,0.37747,0.521522
6,1.081079,-0.627849,1.824992


## Filling in missing data

In [159]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.829802,0.0,0.0
1,-1.043467,0.0,0.0
2,0.748768,0.0,0.0
3,-0.653465,0.0,-0.108493
4,1.249164,0.0,0.085936
5,-1.654352,0.37747,0.521522
6,1.081079,-0.627849,1.824992


In [160]:
df.fillna({1: 0.5, 3: -1})

Unnamed: 0,0,1,2
0,0.829802,0.5,
1,-1.043467,0.5,
2,0.748768,0.5,
3,-0.653465,0.5,-0.108493
4,1.249164,0.5,0.085936
5,-1.654352,0.37747,0.521522
6,1.081079,-0.627849,1.824992


In [161]:
_ = df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,0.829802,0.0,0.0
1,-1.043467,0.0,0.0
2,0.748768,0.0,0.0
3,-0.653465,0.0,-0.108493
4,1.249164,0.0,0.085936
5,-1.654352,0.37747,0.521522
6,1.081079,-0.627849,1.824992


In [162]:
df = DataFrame(np.random.randn(6, 3))
df.loc[2:, 1] = NA; df.loc[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,-0.575744,-0.713005,-0.433134
1,0.208134,0.415482,0.700486
2,0.647905,,-0.394148
3,-3.271371,,0.50207
4,0.619805,,
5,-0.072034,,


In [163]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,-0.575744,-0.713005,-0.433134
1,0.208134,0.415482,0.700486
2,0.647905,0.415482,-0.394148
3,-3.271371,0.415482,0.50207
4,0.619805,0.415482,0.50207
5,-0.072034,0.415482,0.50207


In [164]:
df.fillna(method='ffill', limit=2)

Unnamed: 0,0,1,2
0,-0.575744,-0.713005,-0.433134
1,0.208134,0.415482,0.700486
2,0.647905,0.415482,-0.394148
3,-3.271371,0.415482,0.50207
4,0.619805,,0.50207
5,-0.072034,,0.50207


In [166]:
data = Series([1., NA, 3.5, NA, 7])
data.mean()

3.8333333333333335

In [167]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

## Hierarchical indexing

In [168]:
data = Series(np.random.randn(10),
              index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
                     [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
data

a  1    1.040921
   2   -0.952738
   3   -0.491666
b  1    1.404710
   2    0.888648
   3    0.816714
c  1    1.067306
   2    0.364526
d  2   -0.385297
   3    1.234713
dtype: float64

In [169]:
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [170]:
data['b']

1    1.404710
2    0.888648
3    0.816714
dtype: float64

In [171]:
data['b':'c']

b  1    1.404710
   2    0.888648
   3    0.816714
c  1    1.067306
   2    0.364526
dtype: float64

In [172]:
data.loc[['b', 'd']]

b  1    1.404710
   2    0.888648
   3    0.816714
d  2   -0.385297
   3    1.234713
dtype: float64

In [173]:
data[:,2]

a   -0.952738
b    0.888648
c    0.364526
d   -0.385297
dtype: float64

In [174]:
data.unstack()

Unnamed: 0,1,2,3
a,1.040921,-0.952738,-0.491666
b,1.40471,0.888648,0.816714
c,1.067306,0.364526,
d,,-0.385297,1.234713


In [177]:
data.unstack().stack()

a  1    1.040921
   2   -0.952738
   3   -0.491666
b  1    1.404710
   2    0.888648
   3    0.816714
c  1    1.067306
   2    0.364526
d  2   -0.385297
   3    1.234713
dtype: float64

In [178]:
s = Series([3, 8, 2, 0, 3, 5], index=list('abcdef'));s

a    3
b    8
c    2
d    0
e    3
f    5
dtype: int64

In [180]:
data.unstack(level=0)

Unnamed: 0,a,b,c,d
1,1.040921,1.40471,1.067306,
2,-0.952738,0.888648,0.364526,-0.385297
3,-0.491666,0.816714,,1.234713


In [181]:
data.unstack(level=-1)

Unnamed: 0,1,2,3
a,1.040921,-0.952738,-0.491666
b,1.40471,0.888648,0.816714
c,1.067306,0.364526,
d,,-0.385297,1.234713


In [182]:
frame = DataFrame(np.arange(12).reshape((4, 3)),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=[['Ohio', 'Ohio', 'Colorado'],
                           ['Green', 'Red', 'Green']])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [183]:
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [184]:
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


## Reordering and sorting levels

In [186]:
frame.swaplevel('key1','key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [190]:
frame.sort_index(level=1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [192]:
frame.swaplevel(0, 1).sort_index(level=0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


## Summary statistics by level

In [193]:
frame.sum(level='key2')

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [194]:
frame.sum(level='color', axis=1)

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


## Using a DataFrame's columns

In [195]:
frame = DataFrame({'a': range(7), 'b': range(7, 0, -1),
                   'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                   'd': [0, 1, 2, 0, 1, 2, 3]})
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [196]:
frame2 = frame.set_index(['c', 'd'])
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [197]:
frame.set_index(['c', 'd'], drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [198]:
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


---

# Other pandas topic

## integer indexing

In [202]:
ser = Series(np.arange(3.))
ser.iloc[-1]

2.0

In [203]:
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [204]:
ser2 = Series(np.arange(3.), index=['a', 'b', 'c'])
ser2[-1]

2.0

In [206]:
ser.loc[:1]

0    0.0
1    1.0
dtype: float64

In [209]:
ser3 = Series(range(3), index=[-5, 1, 3])
ser3
ser3.iloc[2]

-5    0
 1    1
 3    2
dtype: int32

2

In [210]:
frame = DataFrame(np.arange(6).reshape((3,2)), index=[2, 0, 1])
frame
frame.iloc[0]

Unnamed: 0,0,1
2,0,1
0,2,3
1,4,5


0    0
1    1
Name: 2, dtype: int32

## Panel data

In [213]:
import pandas_datareader as web

pdata = pd.Panel(dict((stk, web.get_data_yahoo(stk))
                     for stk in ['AAPL', 'GOOG', 'MSFT', 'IBM']))

In [214]:
pdata

<class 'pandas.core.panel.Panel'>
Dimensions: 4 (items) x 1931 (major_axis) x 6 (minor_axis)
Items axis: AAPL to MSFT
Major_axis axis: 2010-01-04 00:00:00 to 2017-09-01 00:00:00
Minor_axis axis: Open to Volume

In [217]:
pdata = pdata.swapaxes('items', 'minor')
pdata['Adj Close']

Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Pandas provides a `.to_xarray()` method to help automate this conversion.

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,27.505054,312.204773,109.173752,25.275177
2010-01-05,27.552608,310.829926,107.854950,25.283342
2010-01-06,27.114347,302.994293,107.154320,25.128178
2010-01-07,27.064222,295.940735,106.783417,24.866852
2010-01-08,27.244156,299.885956,107.854950,25.038349
2010-01-11,27.003820,299.432648,106.725700,24.719858
2010-01-12,26.696650,294.137512,107.574692,24.556530
2010-01-13,27.073221,292.448822,107.343887,24.785191
2010-01-14,26.916422,293.823669,109.058350,25.283342
2010-01-15,26.466595,288.917053,108.621490,25.201679


In [218]:
pdata.loc[:, '6/1/2012', :]

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
AAPL,81.308571,81.807144,80.074287,80.141426,72.099739,130246900.0
GOOG,284.827393,285.255798,283.113831,284.42392,284.42392,6138700.0
IBM,190.119995,191.720001,188.600006,189.080002,162.828171,5206400.0
MSFT,28.76,28.959999,28.440001,28.450001,24.668653,56634300.0


In [219]:
pdata.loc['Adj Close', '5/22/2012':, :]

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012-05-22,71.583061,299.278229,169.493561,25.804539
2012-05-23,73.329674,303.592072,168.890732,25.240936
2012-05-24,72.656212,300.702881,168.864868,25.206251
2012-05-25,72.266800,294.660553,167.323425,25.197577
2012-05-29,73.549454,296.060303,169.183533,25.631123
2012-05-30,74.436249,293.016693,167.521484,25.440367
2012-05-31,74.251175,289.345459,166.117752,25.310303
2012-06-01,72.099739,284.423920,162.828171,24.668653
2012-06-04,72.523842,288.214691,162.363113,24.755363
2012-06-05,72.336205,284.139984,162.931534,24.720682


In [220]:
stacked = pdata.loc[:, '5/30/2012':, :].to_frame()
stacked

Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Adj Close,Volume
Date,minor,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2012-05-30,AAPL,81.314285,82.855713,80.937141,82.738571,74.436249,132357400.0
2012-05-30,GOOG,292.981842,294.844849,290.675476,293.016693,293.016693,3827600.0
2012-05-30,IBM,194.800003,195.470001,193.770004,194.529999,167.521484,3602500.0
2012-05-30,MSFT,29.350000,29.480000,29.120001,29.340000,25.440367,41585500.0
2012-05-31,AAPL,82.962860,83.071426,81.637146,82.532860,74.251175,122918600.0
2012-05-31,GOOG,293.260773,293.898407,288.418945,289.345459,289.345459,5958800.0
2012-05-31,IBM,194.100006,194.929993,192.000000,192.899994,166.117752,9287500.0
2012-05-31,MSFT,29.299999,29.420000,28.940001,29.190001,25.310303,39134000.0
2012-06-01,AAPL,81.308571,81.807144,80.074287,80.141426,72.099739,130246900.0
2012-06-01,GOOG,284.827393,285.255798,283.113831,284.423920,284.423920,6138700.0


In [221]:
stacked.to_panel()

Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Pandas provides a `.to_xarray()` method to help automate this conversion.

  """Entry point for launching an IPython kernel.


<class 'pandas.core.panel.Panel'>
Dimensions: 6 (items) x 1325 (major_axis) x 4 (minor_axis)
Items axis: Open to Volume
Major_axis axis: 2012-05-30 00:00:00 to 2017-09-01 00:00:00
Minor_axis axis: AAPL to MSFT

# Data loading, storage,and file formats

In [224]:
from __future__ import division
from numpy.random import randn
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
np.random.seed(12345)
plt.rc('figure', figsize=(10,6))
from pandas import Series, DataFrame
import pandas as pd
np.set_printoptions(precision=4)

In [225]:
%pwd

'c:\\pydev\\pydata-book-master'

## Reading and Writind Data in Text Format

In [228]:
!type ch06\ex1.csv

a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


In [230]:
df = pd.read_csv('ch06/ex1.csv'); df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [231]:
pd.read_table('ch06/ex1.csv', sep=',')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [232]:
!type ch06\ex2.csv

1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


In [235]:
from IPython.core.interactiveshell import InteractiveShell

In [236]:
InteractiveShell.ast_node_interactivity = 'all'

In [237]:
pd.read_csv('ch06/ex2.csv', header=None)
pd.read_csv('ch06/ex2.csv', names=['a', 'b', 'c', 'd', 'message'])

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [238]:
names = ['a', 'b', 'c', 'd', 'message']
pd.read_csv('ch06/ex2.csv', names=names, index_col='message')

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [240]:
!type ch06\csv_mindex.csv
parsed = pd.read_csv('ch06/csv_mindex.csv', index_col=['key1', 'key2'])
parsed

key1,key2,value1,value2
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [242]:
list(open('ch06/ex3.txt'))

['            A         B         C\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 'bbb  0.927272  0.302904 -0.032399\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 'ddd -0.871858 -0.348382  1.100491\n']

In [243]:
result = pd.read_table('ch06/ex3.txt', sep='\s+')
result

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


In [None]:
정규 표현식
ㅇ? : ? 앞에 문자가 0 or 1 포함된 경우
ㅇ+ : + 앞에 문자가 1번이상 반복
ㅇ* : * 앞에 문자가 0번이상 반복



In [245]:
!type ch06\ex4.csv
pd.read_csv('ch06/ex4.csv', skiprows=[0, 2, 3])

# hey!
a,b,c,d,message
# just wanted to make things more difficult for you
# who reads CSV files with computers, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [246]:
!type ch06\ex5.csv
result = pd.read_csv('ch06/ex5.csv')
result
pd.isnull(result)

something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo


Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


In [247]:
result = pd.read_csv('ch06/ex5.csv', na_values=['NULL'])
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [248]:
result = pd.read_csv('ch06/ex5.csv', na_values=['NULL'])
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [249]:
sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
pd.read_csv('ch06/ex5.csv', na_values=sentinels)

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,


## Reading text files in pieces

In [251]:
result = pd.read_csv('ch06/ex6.csv')
result

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.501840,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q
5,1.817480,0.742273,0.419395,-2.251035,Q
6,-0.776764,0.935518,-0.332872,-1.875641,U
7,-0.913135,1.530624,-0.572657,0.477252,K
8,0.358480,-0.497572,-0.367016,0.507702,S
9,-1.740877,-1.160417,-1.637830,2.172201,G


In [252]:
pd.read_csv('ch06/ex6.csv', nrows=5)

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q


In [253]:
chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)
chunker

<pandas.io.parsers.TextFileReader at 0xbcccc18>

In [265]:
chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)

tot = Series([])
for piece in chunker:
    tot =tot.add(piece['key'].value_counts(), fill_value=0)
tot = tot.sort_values(ascending=False)


In [266]:
tot[:10] # 가장 많이 나온 10개
tot[-10:] # 가장 적게 나온 10개

E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
dtype: float64

4    171.0
6    166.0
7    164.0
8    162.0
3    162.0
5    157.0
2    152.0
0    151.0
9    150.0
1    146.0
dtype: float64

In [267]:
tot.sum()

10000.0

## Writing data out to text format

In [268]:
data = pd.read_csv('ch06/ex5.csv')
data

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [269]:
data.to_csv('ch06/out.csv')
!type ch06\out.csv

,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [270]:
data.to_csv(sys.stdout, sep='|')

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo


In [272]:
data.to_csv(sys.stdout, na_rep='NULL')

,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo


In [273]:
data.to_csv(sys.stdout, index=False, header=False)

one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


In [274]:
data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])

a,b,c
1,2,3.0
5,6,
9,10,11.0


In [276]:
dates = pd.date_range('1/1/2000', periods=7)
dates

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07'],
              dtype='datetime64[ns]', freq='D')

In [279]:
ts = Series(np.arange(7), index=dates)
ts.to_csv('ch06/tseries.csv')
!type ch06\tseries.csv

2000-01-01,0
2000-01-02,1
2000-01-03,2
2000-01-04,3
2000-01-05,4
2000-01-06,5
2000-01-07,6


In [280]:
Series.from_csv('ch06/tseries.csv', parse_dates=True)

2000-01-01    0
2000-01-02    1
2000-01-03    2
2000-01-04    3
2000-01-05    4
2000-01-06    5
2000-01-07    6
dtype: int64

## Manually working with delimited formats

In [281]:
!type ch06\ex7.csv

"a","b","c"
"1","2","3"
"1","2","3","4"


In [282]:
import csv
f = open('ch06/ex7.csv')

reader = csv.reader(f)

In [283]:
for line in reader:
    print(line)

['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3', '4']


In [284]:
lines = list(csv.reader(open('ch06/ex7.csv')))
header, values = lines[0], lines[1:]
header
values
data_dict = {h: v for h, v in zip(header, zip(*values))}
data_dict

['a', 'b', 'c']

[['1', '2', '3'], ['1', '2', '3', '4']]

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

In [287]:
class my_dialect(csv.Dialect):
    lineterminator = '\n'
    delimiter = ';'
    quotechar = '"'
    quoting = csv.QUOTE_MINIMAL    

In [288]:
with open('mydata.csv', 'w') as f:
    writer = csv.writer(f, dialect=my_dialect)
    writer.writerow(('one', 'two', 'three'))
    writer.writerow(('1', '2', '3'))
    writer.writerow(('4', '5', '6'))
    writer.writerow(('7', '8', '9'))

14

6

6

6

In [289]:
!type mydata.csv

one;two;three
1;2;3
4;5;6
7;8;9


## JSON data

In [293]:
obj = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
              {"name": "Katie", "age": 33, "pet": "Cisco"}]
}
"""

In [295]:
import json
result = json.loads(obj)
result
type(result)

{'name': 'Wes',
 'pet': None,
 'places_lived': ['United States', 'Spain', 'Germany'],
 'siblings': [{'age': 25, 'name': 'Scott', 'pet': 'Zuko'},
  {'age': 33, 'name': 'Katie', 'pet': 'Cisco'}]}

dict

In [297]:
asjson = json.dumps(result); asjson

'{"name": "Wes", "places_lived": ["United States", "Spain", "Germany"], "pet": null, "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"}, {"name": "Katie", "age": 33, "pet": "Cisco"}]}'

In [298]:
siblings = DataFrame(result['siblings'], columns=['name', 'age']); siblings

Unnamed: 0,name,age
0,Scott,25
1,Katie,33


##  Parsing XML with lxml.objectify

In [301]:
!type ch06\mta_perf\Performance_MNR.xml

<?xml  version="1.0" encoding="ISO-8859-1"?>
<PERFORMANCE>
<INDICATOR>
  <INDICATOR_SEQ>28445</INDICATOR_SEQ>
  <PARENT_SEQ></PARENT_SEQ>
  <AGENCY_NAME>Metro-North Railroad</AGENCY_NAME>
  <INDICATOR_NAME>On-Time Performance (West of Hudson)</INDICATOR_NAME>
  <DESCRIPTION>Percent of commuter trains that arrive at their destinations within 5 minutes and 59 seconds of the scheduled time. West of Hudson services include the Pascack Valley and Port Jervis lines. Metro-North Railroad contracts with New Jersey Transit to operate service on these lines.
</DESCRIPTION>
  <PERIOD_YEAR>2008</PERIOD_YEAR>
  <PERIOD_MONTH>1</PERIOD_MONTH>
  <CATEGORY>Service Indicators</CATEGORY>
  <FREQUENCY>M</FREQUENCY>
  <DESIRED_CHANGE>U</DESIRED_CHANGE>
  <INDICATOR_UNIT>%</INDICATOR_UNIT>
  <DECIMAL_PLACES>1</DECIMAL_PLACES>
  <YTD_TARGET>95.00</YTD_TARGET>
  <YTD_ACTUAL>96.90</YTD_ACTUAL>
  <MONTHLY_TARGET>95.00</MONTHLY_TARGET>
  <MONTHLY_ACTUAL>96.90</MONTHLY_ACTUAL>
</INDICATOR>
<INDICATOR>
  <INDICAT

  <DECIMAL_PLACES>1</DECIMAL_PLACES>
  <YTD_TARGET>95.20</YTD_TARGET>
  <YTD_ACTUAL>96.70</YTD_ACTUAL>
  <MONTHLY_TARGET>95.20</MONTHLY_TARGET>
  <MONTHLY_ACTUAL>96.70</MONTHLY_ACTUAL>
</INDICATOR>
<INDICATOR>
  <INDICATOR_SEQ>28460</INDICATOR_SEQ>
  <PARENT_SEQ>28445</PARENT_SEQ>
  <AGENCY_NAME>Metro-North Railroad</AGENCY_NAME>
  <INDICATOR_NAME>Port Jervis Line - OTP</INDICATOR_NAME>
  <DESCRIPTION>Percent of commuter trains that arrive at their destinations within 5 minutes and 59 seconds of the scheduled time. Metro-North Railroad contracts with New York Transit to operate service on the Port Jervis Line.</DESCRIPTION>
  <PERIOD_YEAR>2010</PERIOD_YEAR>
  <PERIOD_MONTH>2</PERIOD_MONTH>
  <CATEGORY>Service Indicators</CATEGORY>
  <FREQUENCY>M</FREQUENCY>
  <DESIRED_CHANGE>U</DESIRED_CHANGE>
  <INDICATOR_UNIT>%</INDICATOR_UNIT>
  <DECIMAL_PLACES>1</DECIMAL_PLACES>
  <YTD_TARGET>95.20</YTD_TARGET>
  <YTD_ACTUAL>93.40</YTD_ACTUAL>
  <MONTHLY_TARGET>95.20</MONTHLY_TARGET>
  <MONTHLY_ACT

  <INDICATOR_SEQ>28463</INDICATOR_SEQ>
  <PARENT_SEQ></PARENT_SEQ>
  <AGENCY_NAME>Metro-North Railroad</AGENCY_NAME>
  <INDICATOR_NAME>Mean Distance Between Failures</INDICATOR_NAME>
  <DESCRIPTION>Average number of miles a railcar travels before a mechanical failure makes the train arrive at its final destination later than 5 minutes and 59 seconds</DESCRIPTION>
  <PERIOD_YEAR>2010</PERIOD_YEAR>
  <PERIOD_MONTH>12</PERIOD_MONTH>
  <CATEGORY>Service Indicators</CATEGORY>
  <FREQUENCY>M</FREQUENCY>
  <DESIRED_CHANGE>U</DESIRED_CHANGE>
  <INDICATOR_UNIT>-</INDICATOR_UNIT>
  <DECIMAL_PLACES>0</DECIMAL_PLACES>
  <YTD_TARGET>115,000.00</YTD_TARGET>
  <YTD_ACTUAL>129,329.00</YTD_ACTUAL>
  <MONTHLY_TARGET>115,000.00</MONTHLY_TARGET>
  <MONTHLY_ACTUAL>102,536.00</MONTHLY_ACTUAL>
</INDICATOR>
<INDICATOR>
  <INDICATOR_SEQ>28463</INDICATOR_SEQ>
  <PARENT_SEQ></PARENT_SEQ>
  <AGENCY_NAME>Metro-North Railroad</AGENCY_NAME>
  <INDICATOR_NAME>Mean Distance Between Failures</INDICATOR_NAME>
  <DESCRIP

  <INDICATOR_SEQ>28627</INDICATOR_SEQ>
  <PARENT_SEQ></PARENT_SEQ>
  <AGENCY_NAME>Metro-North Railroad</AGENCY_NAME>
  <INDICATOR_NAME>Employee Lost Time and Restricted Duty Rate</INDICATOR_NAME>
  <DESCRIPTION>An employee lost time injury or illness is one that prevents an employee from returning to work for at least one full shift. The rate is injuries and illnesses per 200,000 worker hours.</DESCRIPTION>
  <PERIOD_YEAR>2011</PERIOD_YEAR>
  <PERIOD_MONTH>6</PERIOD_MONTH>
  <CATEGORY>Safety Indicators</CATEGORY>
  <FREQUENCY>M</FREQUENCY>
  <DESIRED_CHANGE>D</DESIRED_CHANGE>
  <INDICATOR_UNIT>-</INDICATOR_UNIT>
  <DECIMAL_PLACES>2</DECIMAL_PLACES>
  <YTD_TARGET>1.70</YTD_TARGET>
  <YTD_ACTUAL></YTD_ACTUAL>
  <MONTHLY_TARGET>1.70</MONTHLY_TARGET>
  <MONTHLY_ACTUAL></MONTHLY_ACTUAL>
</INDICATOR>
<INDICATOR>
  <INDICATOR_SEQ>28627</INDICATOR_SEQ>
  <PARENT_SEQ></PARENT_SEQ>
  <AGENCY_NAME>Metro-North Railroad</AGENCY_NAME>
  <INDICATOR_NAME>Employee Lost Time and Restricted Duty Rate</IN

  <MONTHLY_TARGET>98.20</MONTHLY_TARGET>
  <MONTHLY_ACTUAL></MONTHLY_ACTUAL>
</INDICATOR>
<INDICATOR>
  <INDICATOR_SEQ>28345</INDICATOR_SEQ>
  <PARENT_SEQ>55526</PARENT_SEQ>
  <AGENCY_NAME>Metro-North Railroad</AGENCY_NAME>
  <INDICATOR_NAME>Hudson Line - OTP</INDICATOR_NAME>
  <DESCRIPTION>Percent of commuter trains that arrive at their destinations within 5 minutes and 59 seconds of the scheduled time.</DESCRIPTION>
  <PERIOD_YEAR>2011</PERIOD_YEAR>
  <PERIOD_MONTH>9</PERIOD_MONTH>
  <CATEGORY>Service Indicators</CATEGORY>
  <FREQUENCY>M</FREQUENCY>
  <DESIRED_CHANGE>U</DESIRED_CHANGE>
  <INDICATOR_UNIT>%</INDICATOR_UNIT>
  <DECIMAL_PLACES>1</DECIMAL_PLACES>
  <YTD_TARGET>98.20</YTD_TARGET>
  <YTD_ACTUAL></YTD_ACTUAL>
  <MONTHLY_TARGET>98.20</MONTHLY_TARGET>
  <MONTHLY_ACTUAL></MONTHLY_ACTUAL>
</INDICATOR>
<INDICATOR>
  <INDICATOR_SEQ>28345</INDICATOR_SEQ>
  <PARENT_SEQ>55526</PARENT_SEQ>
  <AGENCY_NAME>Metro-North Railroad</AGENCY_NAME>
  <INDICATOR_NAME>Hudson Line - OTP</INDICATOR

  <INDICATOR_NAME>On-Time Performance (East of Hudson)</INDICATOR_NAME>
  <DESCRIPTION>Percent of commuter trains that arrive at their destinations within 5 minutes and 59 seconds of the scheduled time. East of Hudson service includes the Harlem, Hudson and New Haven lines.</DESCRIPTION>
  <PERIOD_YEAR>2008</PERIOD_YEAR>
  <PERIOD_MONTH>12</PERIOD_MONTH>
  <CATEGORY>Service Indicators</CATEGORY>
  <FREQUENCY>M</FREQUENCY>
  <DESIRED_CHANGE>U</DESIRED_CHANGE>
  <INDICATOR_UNIT>%</INDICATOR_UNIT>
  <DECIMAL_PLACES>1</DECIMAL_PLACES>
  <YTD_TARGET>97.60</YTD_TARGET>
  <YTD_ACTUAL>97.50</YTD_ACTUAL>
  <MONTHLY_TARGET>97.60</MONTHLY_TARGET>
  <MONTHLY_ACTUAL>96.50</MONTHLY_ACTUAL>
</INDICATOR>
<INDICATOR>
  <INDICATOR_SEQ>55526</INDICATOR_SEQ>
  <PARENT_SEQ></PARENT_SEQ>
  <AGENCY_NAME>Metro-North Railroad</AGENCY_NAME>
  <INDICATOR_NAME>On-Time Performance (East of Hudson)</INDICATOR_NAME>
  <DESCRIPTION>Percent of commuter trains that arrive at their destinations within 5 minutes and 59 s

  <MONTHLY_ACTUAL>100.00</MONTHLY_ACTUAL>
</INDICATOR>
<INDICATOR>
  <INDICATOR_SEQ>373889</INDICATOR_SEQ>
  <PARENT_SEQ></PARENT_SEQ>
  <AGENCY_NAME>Metro-North Railroad</AGENCY_NAME>
  <INDICATOR_NAME>Escalator Availability</INDICATOR_NAME>
  <DESCRIPTION>Percent of the time that escalators are operational systemwide. The availability rate is based on physical observations performed the morning of regular business days only. This is a new indicator the agency began reporting in 2009.</DESCRIPTION>
  <PERIOD_YEAR>2010</PERIOD_YEAR>
  <PERIOD_MONTH>5</PERIOD_MONTH>
  <CATEGORY>Service Indicators</CATEGORY>
  <FREQUENCY>M</FREQUENCY>
  <DESIRED_CHANGE>U</DESIRED_CHANGE>
  <INDICATOR_UNIT>%</INDICATOR_UNIT>
  <DECIMAL_PLACES>1</DECIMAL_PLACES>
  <YTD_TARGET>97.00</YTD_TARGET>
  <YTD_ACTUAL>99.58</YTD_ACTUAL>
  <MONTHLY_TARGET>97.00</MONTHLY_TARGET>
  <MONTHLY_ACTUAL>100.00</MONTHLY_ACTUAL>
</INDICATOR>
<INDICATOR>
  <INDICATOR_SEQ>373889</INDICATOR_SEQ>
  <PARENT_SEQ></PARENT_SEQ>
  <AGE

In [303]:
from lxml import objectify

path = 'ch06\mta_perf\Performance_MNR.xml'
parsed = objectify.parse(open(path))
root = parsed.getroot()
parsed
root

<lxml.etree._ElementTree at 0xa812d48>

<Element PERFORMANCE at 0xc6b8548>

In [304]:
data = []

skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',
               'DESIRED_CHANGE', 'DECIMAL_PLACES']

for elt in root.INDICATOR:
    el_data = {}
    for child in elt.getchildren():
        if child.tag in skip_fields:
            continue
        el_data[child.tag] = child.pyval
    data.append(el_data)
data

[{'AGENCY_NAME': 'Metro-North Railroad',
  'CATEGORY': 'Service Indicators',
  'DESCRIPTION': 'Percent of commuter trains that arrive at their destinations within 5 minutes and 59 seconds of the scheduled time. West of Hudson services include the Pascack Valley and Port Jervis lines. Metro-North Railroad contracts with New Jersey Transit to operate service on these lines.\n',
  'FREQUENCY': 'M',
  'INDICATOR_NAME': 'On-Time Performance (West of Hudson)',
  'INDICATOR_UNIT': '%',
  'MONTHLY_ACTUAL': 96.9,
  'MONTHLY_TARGET': 95.0,
  'PERIOD_MONTH': 1,
  'PERIOD_YEAR': 2008,
  'YTD_ACTUAL': 96.9,
  'YTD_TARGET': 95.0},
 {'AGENCY_NAME': 'Metro-North Railroad',
  'CATEGORY': 'Service Indicators',
  'DESCRIPTION': 'Percent of commuter trains that arrive at their destinations within 5 minutes and 59 seconds of the scheduled time. West of Hudson services include the Pascack Valley and Port Jervis lines. Metro-North Railroad contracts with New Jersey Transit to operate service on these lines.\

In [305]:
perf = DataFrame(data); perf

Unnamed: 0,AGENCY_NAME,CATEGORY,DESCRIPTION,FREQUENCY,INDICATOR_NAME,INDICATOR_UNIT,MONTHLY_ACTUAL,MONTHLY_TARGET,PERIOD_MONTH,PERIOD_YEAR,YTD_ACTUAL,YTD_TARGET
0,Metro-North Railroad,Service Indicators,Percent of commuter trains that arrive at thei...,M,On-Time Performance (West of Hudson),%,96.9,95,1,2008,96.9,95
1,Metro-North Railroad,Service Indicators,Percent of commuter trains that arrive at thei...,M,On-Time Performance (West of Hudson),%,95,95,2,2008,96,95
2,Metro-North Railroad,Service Indicators,Percent of commuter trains that arrive at thei...,M,On-Time Performance (West of Hudson),%,96.9,95,3,2008,96.3,95
3,Metro-North Railroad,Service Indicators,Percent of commuter trains that arrive at thei...,M,On-Time Performance (West of Hudson),%,98.3,95,4,2008,96.8,95
4,Metro-North Railroad,Service Indicators,Percent of commuter trains that arrive at thei...,M,On-Time Performance (West of Hudson),%,95.8,95,5,2008,96.6,95
5,Metro-North Railroad,Service Indicators,Percent of commuter trains that arrive at thei...,M,On-Time Performance (West of Hudson),%,94.4,95,6,2008,96.2,95
6,Metro-North Railroad,Service Indicators,Percent of commuter trains that arrive at thei...,M,On-Time Performance (West of Hudson),%,96,95,7,2008,96.2,95
7,Metro-North Railroad,Service Indicators,Percent of commuter trains that arrive at thei...,M,On-Time Performance (West of Hudson),%,96.4,95,8,2008,96.2,95
8,Metro-North Railroad,Service Indicators,Percent of commuter trains that arrive at thei...,M,On-Time Performance (West of Hudson),%,93.7,95,9,2008,95.9,95
9,Metro-North Railroad,Service Indicators,Percent of commuter trains that arrive at thei...,M,On-Time Performance (West of Hudson),%,96.4,95,10,2008,96,95


In [306]:
root

<Element PERFORMANCE at 0xc6b8548>

In [308]:
root.get('href')
root.text

## Oracle DB 연동

In [317]:
from sqlalchemy import create_engine
import cx_Oracle
con = cx_Oracle.connect('scott/tiger@192.168.56.101/orcl')
print(con.version)
con.close()

11.2.0.1.0


In [319]:
from sqlalchemy import create_engine
import cx_Oracle
con = cx_Oracle.connect('scott/tiger@192.168.56.101/orcl')
cur = con.cursor()
cur.execute('select * from emp order by empno')
for result in cur:
    print(result)
cur.close()
con.close()

<cx_Oracle.Cursor on <cx_Oracle.Connection to scott@192.168.56.101/orcl>>

(7369, 'SMITH', 'CLERK', 7902, datetime.datetime(1980, 12, 17, 0, 0), 800.0, None, 20)
(7499, 'ALLEN', 'SALESMAN', 7698, datetime.datetime(1981, 2, 20, 0, 0), 1600.0, 300.0, 30)
(7521, 'WARD', 'SALESMAN', 7698, datetime.datetime(1981, 2, 22, 0, 0), 1250.0, 500.0, 30)
(7566, 'JONES', 'MANAGER', 7839, datetime.datetime(1981, 4, 2, 0, 0), 2975.0, None, 20)
(7654, 'MARTIN', 'SALESMAN', 7698, datetime.datetime(1981, 9, 28, 0, 0), 1250.0, 1400.0, 30)
(7698, 'BLAKE', 'MANAGER', 7839, datetime.datetime(1981, 5, 1, 0, 0), 2850.0, None, 30)
(7782, 'CLARK', 'MANAGER', 7839, datetime.datetime(1981, 6, 9, 0, 0), 2450.0, None, 10)
(7788, 'SCOTT', 'ANALYST', 7566, datetime.datetime(1987, 4, 19, 0, 0), 3000.0, None, 20)
(7839, 'KING', 'PRESIDENT', None, datetime.datetime(1981, 11, 17, 0, 0), 5000.0, None, 10)
(7844, 'TURNER', 'SALESMAN', 7698, datetime.datetime(1981, 9, 8, 0, 0), 1500.0, 0.0, 30)
(7876, 'ADAMS', 'CLERK', 7788, datetime.datetime(1987, 5, 23, 0, 0), 1100.0, None, 20)
(7900, 'JAMES', 'CL

In [2]:
import numpy as np
from pandas import DataFrame, Series
import pandas as pd

from sqlalchemy import create_engine
import cx_Oracle

In [3]:
engine = create_engine('oracle://scott:tiger@192.168.56.101:1521/orcl')

In [4]:
with engine.connect() as conn, conn.begin():
    data = pd.read_sql_table('emp',conn)

In [5]:
data

Unnamed: 0,empno,ename,job,mgr,hiredate,sal,comm,deptno
0,7369,SMITH,CLERK,7902.0,1980-12-17,800,,20
1,7499,ALLEN,SALESMAN,7698.0,1981-02-20,1600,300.0,30
2,7521,WARD,SALESMAN,7698.0,1981-02-22,1250,500.0,30
3,7566,JONES,MANAGER,7839.0,1981-04-02,2975,,20
4,7654,MARTIN,SALESMAN,7698.0,1981-09-28,1250,1400.0,30
5,7698,BLAKE,MANAGER,7839.0,1981-05-01,2850,,30
6,7782,CLARK,MANAGER,7839.0,1981-06-09,2450,,10
7,7788,SCOTT,ANALYST,7566.0,1987-04-19,3000,,20
8,7839,KING,PRESIDENT,,1981-11-17,5000,,10
9,7844,TURNER,SALESMAN,7698.0,1981-09-08,1500,0.0,30
