In [1]:
import numpy as np
import pandas as pd

In [3]:
# A Series built from a plain list gets a default integer index (0..n-1).
# NaN and inf are ordinary float values, so the dtype stays float64.
pd.Series([1.0, 3.0, np.nan, np.inf, -1.0])

0    1.0
1    3.0
2    NaN
3    inf
4   -1.0
dtype: float64

In [5]:
# Same values as before, now labelled with explicit string index entries.
al = pd.Series(
    data=[1.0, 3.0, np.nan, np.inf, -1.0],
    index=list('abcde'),
)
al

a    1.0
b    3.0
c    NaN
d    inf
e   -1.0
dtype: float64

In [6]:
# .index exposes the row labels of the Series.
al.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [7]:
# .values exposes the underlying data as a NumPy array (the labels are not included).
al.values

array([ 1.,  3., nan, inf, -1.])

In [9]:
# A dict turns into a Series whose index is the dict keys and whose values are the dict values.
dict_date = dict(zip(['국어', '영어', '수학'], [100, 99, 65]))
a2 = pd.Series(data=dict_date)
a2

국어    100
영어     99
수학     65
dtype: int64

## Series 연산

In [17]:
# Ten consecutive floats 0.0 .. 9.0 with the default integer index.
s1 = pd.Series(np.arange(10, dtype=float))
s1

0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
dtype: float64

In [14]:
# Nine evenly spaced values 0.1 .. 0.9.
# linspace states the element count explicitly; with a fractional step the length
# of np.arange depends on floating-point rounding of (stop - start) / step.
s2 = pd.Series(np.linspace(0.1, 0.9, 9))
s2

0    0.1
1    0.2
2    0.3
3    0.4
4    0.5
5    0.6
6    0.7
7    0.8
8    0.9
dtype: float64

In [18]:
# Arithmetic between Series aligns on index labels rather than position:
# label 9 exists only in s1, so both results are NaN at that label.
print(s1 + s2, s1 * s2, sep='\n')

0    0.1
1    1.2
2    2.3
3    3.4
4    4.5
5    5.6
6    6.7
7    7.8
8    8.9
9    NaN
dtype: float64
0    0.0
1    0.2
2    0.6
3    1.2
4    2.0
5    3.0
6    4.2
7    5.6
8    7.2
9    NaN
dtype: float64


### date_range

In [19]:
# Without a freq argument, date_range yields one entry per calendar day, both endpoints included.
pd.date_range(start='2018-03-05', end='2020-03-05')

DatetimeIndex(['2018-03-05', '2018-03-06', '2018-03-07', '2018-03-08',
               '2018-03-09', '2018-03-10', '2018-03-11', '2018-03-12',
               '2018-03-13', '2018-03-14',
               ...
               '2020-02-25', '2020-02-26', '2020-02-27', '2020-02-28',
               '2020-02-29', '2020-03-01', '2020-03-02', '2020-03-03',
               '2020-03-04', '2020-03-05'],
              dtype='datetime64[ns]', length=732, freq='D')

In [20]:
# Month-end dates that fall inside the range.
# An explicit MonthEnd offset is used instead of the 'M' string alias, which newer
# pandas deprecates in favour of 'ME'; the offset object works on old and new versions.
pd.date_range('2018-03-05', '2020-03-05', freq=pd.offsets.MonthEnd())

DatetimeIndex(['2018-03-31', '2018-04-30', '2018-05-31', '2018-06-30',
               '2018-07-31', '2018-08-31', '2018-09-30', '2018-10-31',
               '2018-11-30', '2018-12-31', '2019-01-31', '2019-02-28',
               '2019-03-31', '2019-04-30', '2019-05-31', '2019-06-30',
               '2019-07-31', '2019-08-31', '2019-09-30', '2019-10-31',
               '2019-11-30', '2019-12-31', '2020-01-31', '2020-02-29'],
              dtype='datetime64[ns]', freq='M')

## DataFrame 2차원 데이터

In [24]:
# Despite the list* names these are NumPy arrays; list1 is rebound in the next cell.
list1, list2 = np.arange(10), np.arange(0, 1.0, 0.1)

In [25]:
# 0..14 laid out row by row as a 5x3 array.
list1 = np.reshape(np.arange(15), (5, 3))
list1

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [31]:
# Wrap the 5x3 array in a DataFrame with named columns and 1-based row labels.
df1 = pd.DataFrame(data=list1, index=[1, 2, 3, 4, 5], columns=list('abc'))
df1

Unnamed: 0,a,b,c
1,0,1,2
2,3,4,5
3,6,7,8
4,9,10,11
5,12,13,14


In [37]:
# Seed the global NumPy RNG so the random scores are identical on every run
# (randint's upper bound is exclusive).
np.random.seed(42)
english_score = np.random.randint(70, 100, 5)
kor_score = np.random.randint(50, 100, 5)
math_score = np.random.randint(0, 100, 5)

# Each dict key becomes a column; rows get the default 0..4 index.
score_df = pd.DataFrame({
    'english': english_score,
    'kor': kor_score,
    'math': math_score
})

In [38]:
# Display the score table built in the previous cell.
score_df

Unnamed: 0,english,kor,math
0,91,56,10
1,87,88,53
2,95,54,67
3,75,87,72
4,87,93,29


In [39]:
# No index was supplied, so the row labels are a default RangeIndex.
score_df.index

RangeIndex(start=0, stop=5, step=1)

In [40]:
# Column labels come from the keys of the dict used to build the frame.
score_df.columns

Index(['english', 'kor', 'math'], dtype='object')

In [41]:
# Assigning a list to .columns relabels the columns by position ('english' becomes 'eng')
# and modifies score_df in place.
score_df.columns = ['eng','kor','math']
score_df

Unnamed: 0,eng,kor,math
0,91,56,10
1,87,88,53
2,95,54,67
3,75,87,72
4,87,93,29


## indexing, slicing

In [99]:
# Yearly figures per KTX line; np.nan marks years with no value for the 동해선 line.
KTX_data = {
    '경부선 KTX': [39060, 39896, 42005, 43621, 41702, 41266, 32427],
    '호남선 KTX': [7313, 6967, 6873, 6626, 8675, 10622, 9228],
    '경전선 KTX': [3627, 4168, 4088, 4424, 4606, 4984, 5570],
    '전라선 KTX': [309, 1771, 1954, 2244, 3146, 3945, 5766],
    '동해선 KTX': [np.nan, np.nan, np.nan, np.nan, 2395, 3786, 6667],
}

# Column order follows the dict's insertion order; row labels are the years as strings.
col_list = list(KTX_data)
index_list = [str(year) for year in range(2011, 2018)]

In [100]:
# Write the table to 'ktx_data.csv' using the cp949 encoding.
# index=False leaves the year labels out of the file, so they must be re-attached after reading it back.
pd.DataFrame(KTX_data,columns=col_list,index=index_list).to_csv('ktx_data.csv',index=False, encoding='cp949')

In [230]:
# The CSV carries no index column, so the year labels are attached right after loading.
ktx_df = pd.read_csv('ktx_data.csv', encoding='cp949').set_index(pd.Index(index_list))
ktx_df

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2011,39060,7313,3627,309,
2012,39896,6967,4168,1771,
2013,42005,6873,4088,1954,
2014,43621,6626,4424,2244,
2015,41702,8675,4606,3146,2395.0
2016,41266,10622,4984,3945,3786.0
2017,32427,9228,5570,5766,6667.0


In [53]:
ktx_df.tail()  # with no argument, tail() returns the last 5 rows

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2013,42005,6873,4088,1954,
2014,43621,6626,4424,2244,
2015,41702,8675,4606,3146,2395.0
2016,41266,10622,4984,3945,3786.0
2017,32427,9228,5570,5766,6667.0


In [56]:
ktx_df.head(3)  # head() defaults to the first 5 rows; here only 3 are requested

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2011,39060,7313,3627,309,
2012,39896,6967,4168,1771,
2013,42005,6873,4088,1954,


In [55]:
# info() prints the index range, per-column non-null counts and dtypes, and memory usage.
ktx_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, 2011 to 2017
Data columns (total 5 columns):
경부선 KTX    7 non-null int64
호남선 KTX    7 non-null int64
경전선 KTX    7 non-null int64
전라선 KTX    7 non-null int64
동해선 KTX    3 non-null float64
dtypes: float64(1), int64(4)
memory usage: 336.0+ bytes


In [57]:
# Show the full frame as a reference for the column selection below.
ktx_df

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2011,39060,7313,3627,309,
2012,39896,6967,4168,1771,
2013,42005,6873,4088,1954,
2014,43621,6626,4424,2244,
2015,41702,8675,4606,3146,2395.0
2016,41266,10622,4984,3945,3786.0
2017,32427,9228,5570,5766,6667.0


In [74]:
# Indexing with a list of column names returns a DataFrame holding just those columns.
ktx_df[['경부선 KTX','전라선 KTX']]

Unnamed: 0,경부선 KTX,전라선 KTX
2011,39060,309
2012,39896,1771
2013,42005,1954
2014,43621,2244
2015,41702,3146
2016,41266,3945
2017,32427,5766


## loc, iloc

In [76]:
# .loc[row_label, column_label] returns the single value at that intersection.
ktx_df.loc['2011','경부선 KTX']

39060

In [77]:
# Label slices in .loc include the end label: rows up to and including '2014', all columns.
ktx_df.loc[:'2014',:]

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2011,39060,7313,3627,309,
2012,39896,6967,4168,1771,
2013,42005,6873,4088,1954,
2014,43621,6626,4424,2244,


In [79]:
# Both slices are label-based and end-inclusive: rows through '2014', columns through '전라선 KTX'.
ktx_df.loc[:'2014',:'전라선 KTX']

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX
2011,39060,7313,3627,309
2012,39896,6967,4168,1771
2013,42005,6873,4088,1954
2014,43621,6626,4424,2244


In [80]:
# iloc: show the full frame as a reference for the positional indexing below
ktx_df

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2011,39060,7313,3627,309,
2012,39896,6967,4168,1771,
2013,42005,6873,4088,1954,
2014,43621,6626,4424,2244,
2015,41702,8675,4606,3146,2395.0
2016,41266,10622,4984,3945,3786.0
2017,32427,9228,5570,5766,6667.0


In [83]:
# .iloc slices by integer position with an exclusive end: first 5 rows, first 3 columns.
ktx_df.iloc[:5,:3]

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX
2011,39060,7313,3627
2012,39896,6967,4168
2013,42005,6873,4088
2014,43621,6626,4424
2015,41702,8675,4606


### 조건으로 slicing

In [96]:
# Row mask and a one-element column list in a single .loc call;
# the list (rather than a bare label) keeps the result a DataFrame.
ktx_df.loc[ktx_df['호남선 KTX'] > 7000, ['호남선 KTX']]

Unnamed: 0,호남선 KTX
2011,7313
2015,8675
2016,10622
2017,9228


In [102]:
# Load the same CSV without attaching the year labels: the rows keep the default 0..n-1 index.
temp_df = pd.read_csv('ktx_data.csv',encoding='cp949')
temp_df

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
0,39060,7313,3627,309,
1,39896,6967,4168,1771,
2,42005,6873,4088,1954,
3,43621,6626,4424,2244,
4,41702,8675,4606,3146,2395.0
5,41266,10622,4984,3945,3786.0
6,32427,9228,5570,5766,6667.0


In [106]:
# First three rows by position.
temp_df.iloc[:3]

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
0,39060,7313,3627,309,
1,39896,6967,4168,1771,
2,42005,6873,4088,1954,


## 메소드

In [107]:
# Show the full frame as a reference for the aggregate methods below.
ktx_df

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2011,39060,7313,3627,309,
2012,39896,6967,4168,1771,
2013,42005,6873,4088,1954,
2014,43621,6626,4424,2244,
2015,41702,8675,4606,3146,2395.0
2016,41266,10622,4984,3945,3786.0
2017,32427,9228,5570,5766,6667.0


In [110]:
# sum() totals each column down the rows; NaN entries are skipped
# (the 동해선 KTX total comes from its three non-null values).
ktx_df.sum()

경부선 KTX    279977.0
호남선 KTX     56304.0
경전선 KTX     31467.0
전라선 KTX     19135.0
동해선 KTX     12848.0
dtype: float64

In [112]:
# axis=0 is the default direction, so this gives the same per-column totals as sum().
ktx_df.sum(axis=0)

경부선 KTX    279977.0
호남선 KTX     56304.0
경전선 KTX     31467.0
전라선 KTX     19135.0
동해선 KTX     12848.0
dtype: float64

In [115]:
# Per-column mean, computed twice to show that axis=0 is the default; the cell displays a tuple of both results.
ktx_df.mean(),ktx_df.mean(axis=0)

(경부선 KTX    39996.714286
 호남선 KTX     8043.428571
 경전선 KTX     4495.285714
 전라선 KTX     2733.571429
 동해선 KTX     4282.666667
 dtype: float64, 경부선 KTX    39996.714286
 호남선 KTX     8043.428571
 경전선 KTX     4495.285714
 전라선 KTX     2733.571429
 동해선 KTX     4282.666667
 dtype: float64)

In [116]:
# Per-column variance, computed twice to show that axis=0 is the default; the cell displays a tuple of both results.
ktx_df.var(),ktx_df.var(axis=0)

(경부선 KTX    1.331265e+07
 호남선 KTX    2.253726e+06
 경전선 KTX    4.063516e+05
 전라선 KTX    3.080964e+06
 동해선 KTX    4.747504e+06
 dtype: float64, 경부선 KTX    1.331265e+07
 호남선 KTX    2.253726e+06
 경전선 KTX    4.063516e+05
 전라선 KTX    3.080964e+06
 동해선 KTX    4.747504e+06
 dtype: float64)

In [117]:
# Running totals: down each column by default, and across each row with axis=1.
# Cells that are NaN in the input stay NaN in the output.
ktx_df.cumsum(),ktx_df.cumsum(axis=1)

(       경부선 KTX  호남선 KTX  경전선 KTX  전라선 KTX  동해선 KTX
 2011   39060.0   7313.0   3627.0    309.0      NaN
 2012   78956.0  14280.0   7795.0   2080.0      NaN
 2013  120961.0  21153.0  11883.0   4034.0      NaN
 2014  164582.0  27779.0  16307.0   6278.0      NaN
 2015  206284.0  36454.0  20913.0   9424.0   2395.0
 2016  247550.0  47076.0  25897.0  13369.0   6181.0
 2017  279977.0  56304.0  31467.0  19135.0  12848.0,
       경부선 KTX  호남선 KTX  경전선 KTX  전라선 KTX  동해선 KTX
 2011  39060.0  46373.0  50000.0  50309.0      NaN
 2012  39896.0  46863.0  51031.0  52802.0      NaN
 2013  42005.0  48878.0  52966.0  54920.0      NaN
 2014  43621.0  50247.0  54671.0  56915.0      NaN
 2015  41702.0  50377.0  54983.0  58129.0  60524.0
 2016  41266.0  51888.0  56872.0  60817.0  64603.0
 2017  32427.0  41655.0  47225.0  52991.0  59658.0)

In [119]:
# Summary statistics per column; count covers only the non-null values.
ktx_df.describe()

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
count,7.0,7.0,7.0,7.0,3.0
mean,39996.714286,8043.428571,4495.285714,2733.571429,4282.666667
std,3648.650331,1501.241359,637.457113,1755.267392,2178.876851
min,32427.0,6626.0,3627.0,309.0,2395.0
25%,39478.0,6920.0,4128.0,1862.5,3090.5
50%,41266.0,7313.0,4424.0,2244.0,3786.0
75%,41853.5,8951.5,4795.0,3545.5,5226.5
max,43621.0,10622.0,5570.0,5766.0,6667.0


In [120]:
# .T transposes the frame: the year labels become columns and the KTX lines become rows.
ktx_df.T

Unnamed: 0,2011,2012,2013,2014,2015,2016,2017
경부선 KTX,39060.0,39896.0,42005.0,43621.0,41702.0,41266.0,32427.0
호남선 KTX,7313.0,6967.0,6873.0,6626.0,8675.0,10622.0,9228.0
경전선 KTX,3627.0,4168.0,4088.0,4424.0,4606.0,4984.0,5570.0
전라선 KTX,309.0,1771.0,1954.0,2244.0,3146.0,3945.0,5766.0
동해선 KTX,,,,,2395.0,3786.0,6667.0


In [122]:
ktx_df.values  # returned as a NumPy array

array([[39060.,  7313.,  3627.,   309.,    nan],
       [39896.,  6967.,  4168.,  1771.,    nan],
       [42005.,  6873.,  4088.,  1954.,    nan],
       [43621.,  6626.,  4424.,  2244.,    nan],
       [41702.,  8675.,  4606.,  3146.,  2395.],
       [41266., 10622.,  4984.,  3945.,  3786.],
       [32427.,  9228.,  5570.,  5766.,  6667.]])

## 메소드 (append, join)

In [123]:
# Show the full frame as a reference before rows are appended below.
ktx_df

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2011,39060,7313,3627,309,
2012,39896,6967,4168,1771,
2013,42005,6873,4088,1954,
2014,43621,6626,4424,2244,
2015,41702,8675,4606,3146,2395.0
2016,41266,10622,4984,3945,3786.0
2017,32427,9228,5570,5766,6667.0


In [129]:
# One-row frame for 2018 that reuses ktx_df's column labels.
# NOTE(review): the label 2018 is an int while ktx_df's row labels are strings such as '2011' — confirm this is intended.
new_df = pd.DataFrame(
    data=[[50000, 35000, 7000, 7000, 8000]],
    columns=ktx_df.columns,
    index=[2018],
)
new_df

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2018,50000,35000,7000,7000,8000


In [133]:
# Stack new_df underneath ktx_df. ignore_index=True renumbers the rows 0..n-1,
# discarding the year labels of both frames.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
appended_ktx = pd.concat([ktx_df, new_df], ignore_index=True)
appended_ktx

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
0,39060,7313,3627,309,
1,39896,6967,4168,1771,
2,42005,6873,4088,1954,
3,43621,6626,4424,2244,
4,41702,8675,4606,3146,2395.0
5,41266,10622,4984,3945,3786.0
6,32427,9228,5570,5766,6667.0
7,50000,35000,7000,7000,8000.0


In [162]:
# The concatenation returned a new frame; ktx_df itself still has its seven labelled rows.
ktx_df

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2011,39060,7313,3627,309,
2012,39896,6967,4168,1771,
2013,42005,6873,4088,1954,
2014,43621,6626,4424,2244,
2015,41702,8675,4606,3146,2395.0
2016,41266,10622,4984,3945,3786.0
2017,32427,9228,5570,5766,6667.0


In [170]:
# joine데이터 준비
data = np.random.randint(0,100000,(7,1))
data

array([[82132],
       [31141],
       [65172],
       [17345],
       [30105],
       [60040],
       [23284]])

In [192]:
# joine데이터 준비
air_df = pd.DataFrame(data,columns=['항공편'], index=index_list)
air_df

Unnamed: 0,항공편
2011,82132
2012,31141
2013,65172
2014,17345
2015,30105
2016,60040
2017,23284


In [193]:
# join() matches rows on the index labels and adds air_df's column on the right.
# NOTE(review): the saved output shows a 'rank' column that is only created in a later cell,
# so this cell was last executed out of order.
ktx_df.join(air_df)

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX,rank,항공편
2011,39060,7313,3627,309,,B,82132
2012,39896,6967,4168,1771,,B,31141
2013,42005,6873,4088,1954,,B,65172
2014,43621,6626,4424,2244,,C,17345
2015,41702,8675,4606,3146,2395.0,A,30105
2016,41266,10622,4984,3945,3786.0,C,60040
2017,32427,9228,5570,5766,6667.0,A,23284


## df에 새로운 column을 할당

In [173]:
# Show air_df again as a reference for the column assignment below.
air_df

Unnamed: 0,항공편
2011,82132
2012,31141
2013,65172
2014,17345
2015,30105
2016,60040
2017,23284


In [194]:
# Assigning a Series as a new column aligns it on the index labels and modifies ktx_df in place.
ktx_df['항공편'] = air_df['항공편']
ktx_df

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX,rank,항공편
2011,39060,7313,3627,309,,B,82132
2012,39896,6967,4168,1771,,B,31141
2013,42005,6873,4088,1954,,B,65172
2014,43621,6626,4424,2244,,C,17345
2015,41702,8675,4606,3146,2395.0,A,30105
2016,41266,10622,4984,3945,3786.0,C,60040
2017,32427,9228,5570,5766,6667.0,A,23284


In [195]:
# drop() returns a new frame without the '항공편' column; rebinding ktx_df keeps that result.
ktx_df = ktx_df.drop(columns=['항공편'])
ktx_df

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX,rank
2011,39060,7313,3627,309,,B
2012,39896,6967,4168,1771,,B
2013,42005,6873,4088,1954,,B
2014,43621,6626,4424,2244,,C
2015,41702,8675,4606,3146,2395.0,A
2016,41266,10622,4984,3945,3786.0,C
2017,32427,9228,5570,5766,6667.0,A


In [161]:
# Drop the '항공편' column in place.
# errors='ignore' makes this a no-op when the column is already gone (the previous
# cell removes it), so the notebook still runs from top to bottom.
ktx_df.drop('항공편', axis=1, inplace=True, errors='ignore')
ktx_df

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2011,39060,7313,3627,309,
2012,39896,6967,4168,1771,
2013,42005,6873,4088,1954,
2014,43621,6626,4424,2244,
2015,41702,8675,4606,3146,2395.0
2016,41266,10622,4984,3945,3786.0
2017,32427,9228,5570,5766,6667.0


## merge

In [176]:
# Show the full frame as a reference before the merge examples.
ktx_df

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2011,39060,7313,3627,309,
2012,39896,6967,4168,1771,
2013,42005,6873,4088,1954,
2014,43621,6626,4424,2244,
2015,41702,8675,4606,3146,2395.0
2016,41266,10622,4984,3945,3786.0
2017,32427,9228,5570,5766,6667.0


In [231]:
# Start with rank 'A' on every row, then overwrite subsets by label; the order of these statements matters.
ktx_df['rank'] = 'A'
# The label slice is end-inclusive: rows up to and including '2013' become 'B'.
ktx_df.loc[:'2013','rank'] = 'B'
# A list of labels targets exactly those rows.
ktx_df.loc[['2014','2016'],'rank']='C'

ktx_df

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX,rank
2011,39060,7313,3627,309,,B
2012,39896,6967,4168,1771,,B
2013,42005,6873,4088,1954,,B
2014,43621,6626,4424,2244,,C
2015,41702,8675,4606,3146,2395.0,A
2016,41266,10622,4984,3945,3786.0,C
2017,32427,9228,5570,5766,6667.0,A


In [183]:
# Lookup table mapping each rank to a price; 'D' is a rank that no ktx_df row uses.
rank_df = pd.DataFrame(dict(rank=list('ABCD'), price=[100, 10, 1, 0.1]))
rank_df

Unnamed: 0,rank,price
0,A,100.0
1,B,10.0
2,C,1.0
3,D,0.1


In [186]:
# Outer merge on 'rank': ranks from both frames are kept, so 'D' (used by no ktx_df row)
# shows up with NaN in every KTX column. The result is renumbered 0..n-1.
pd.merge(ktx_df, rank_df, how='outer', on='rank')

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX,rank,price
0,39060.0,7313.0,3627.0,309.0,,B,10.0
1,39896.0,6967.0,4168.0,1771.0,,B,10.0
2,42005.0,6873.0,4088.0,1954.0,,B,10.0
3,43621.0,6626.0,4424.0,2244.0,,C,1.0
4,41266.0,10622.0,4984.0,3945.0,3786.0,C,1.0
5,41702.0,8675.0,4606.0,3146.0,2395.0,A,100.0
6,32427.0,9228.0,5570.0,5766.0,6667.0,A,100.0
7,,,,,,D,0.1


In [187]:
# Left merge on 'rank': every ktx_df row is kept and gains its matching price;
# ranks that appear only in rank_df are left out.
pd.merge(ktx_df, rank_df, how='left', on='rank')

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX,rank,price
0,39060,7313,3627,309,,B,10.0
1,39896,6967,4168,1771,,B,10.0
2,42005,6873,4088,1954,,B,10.0
3,43621,6626,4424,2244,,C,1.0
4,41702,8675,4606,3146,2395.0,A,100.0
5,41266,10622,4984,3945,3786.0,C,1.0
6,32427,9228,5570,5766,6667.0,A,100.0


In [188]:
# Right merge on 'rank': every rank_df row is kept, so 'D' appears with NaN in the KTX columns.
pd.merge(ktx_df, rank_df, how='right', on='rank')

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX,rank,price
0,39060.0,7313.0,3627.0,309.0,,B,10.0
1,39896.0,6967.0,4168.0,1771.0,,B,10.0
2,42005.0,6873.0,4088.0,1954.0,,B,10.0
3,43621.0,6626.0,4424.0,2244.0,,C,1.0
4,41266.0,10622.0,4984.0,3945.0,3786.0,C,1.0
5,41702.0,8675.0,4606.0,3146.0,2395.0,A,100.0
6,32427.0,9228.0,5570.0,5766.0,6667.0,A,100.0
7,,,,,,D,0.1


In [189]:
# value_counts: number of rows for each distinct rank, most frequent first
ktx_df['rank'].value_counts()

B    3
C    2
A    2
Name: rank, dtype: int64

## sort_values

In [196]:
# Show the full frame as a reference for the sorting examples below.
ktx_df

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX,rank
2011,39060,7313,3627,309,,B
2012,39896,6967,4168,1771,,B
2013,42005,6873,4088,1954,,B
2014,43621,6626,4424,2244,,C
2015,41702,8675,4606,3146,2395.0,A
2016,41266,10622,4984,3945,3786.0,C
2017,32427,9228,5570,5766,6667.0,A


In [197]:
# Sort the rows by the '경부선 KTX' column, smallest first (the default); a new frame is returned.
ktx_df.sort_values('경부선 KTX')

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX,rank
2017,32427,9228,5570,5766,6667.0,A
2011,39060,7313,3627,309,,B
2012,39896,6967,4168,1771,,B
2016,41266,10622,4984,3945,3786.0,C
2015,41702,8675,4606,3146,2395.0,A
2013,42005,6873,4088,1954,,B
2014,43621,6626,4424,2244,,C


In [199]:
# ascending=False reverses the order: largest '경부선 KTX' value first.
ktx_df.sort_values('경부선 KTX', ascending=False)

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX,rank
2014,43621,6626,4424,2244,,C
2013,42005,6873,4088,1954,,B
2015,41702,8675,4606,3146,2395.0,A
2016,41266,10622,4984,3945,3786.0,C
2012,39896,6967,4168,1771,,B
2011,39060,7313,3627,309,,B
2017,32427,9228,5570,5766,6667.0,A


### corr 상관관계

In [203]:
# Pairwise correlation between the numeric columns, using the default method.
# The string column 'rank' is filtered out explicitly: recent pandas no longer
# drops non-numeric columns silently and raises instead.
ktx_df.select_dtypes(include='number').corr()

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
경부선 KTX,1.0,-0.344196,-0.541461,-0.564343,-0.96016
호남선 KTX,-0.344196,1.0,0.7258,0.720398,0.0804
경전선 KTX,-0.541461,0.7258,1.0,0.993434,0.997205
전라선 KTX,-0.564343,0.720398,0.993434,1.0,0.99974
동해선 KTX,-0.96016,0.0804,0.997205,0.99974,1.0


In [204]:
# Pearson correlation between the numeric columns.
# The string column 'rank' is filtered out explicitly: recent pandas no longer
# drops non-numeric columns silently and raises instead.
ktx_df.select_dtypes(include='number').corr('pearson')

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
경부선 KTX,1.0,-0.344196,-0.541461,-0.564343,-0.96016
호남선 KTX,-0.344196,1.0,0.7258,0.720398,0.0804
경전선 KTX,-0.541461,0.7258,1.0,0.993434,0.997205
전라선 KTX,-0.564343,0.720398,0.993434,1.0,0.99974
동해선 KTX,-0.96016,0.0804,0.997205,0.99974,1.0


In [205]:
# Spearman (rank-order) correlation between the numeric columns.
# The string column 'rank' is filtered out explicitly: recent pandas no longer
# drops non-numeric columns silently and raises instead.
ktx_df.select_dtypes(include='number').corr('spearman')

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
경부선 KTX,1.0,-0.607143,-0.178571,-0.071429,-1.0
호남선 KTX,-0.607143,1.0,0.642857,0.607143,0.5
경전선 KTX,-0.178571,0.642857,1.0,0.964286,1.0
전라선 KTX,-0.071429,0.607143,0.964286,1.0,1.0
동해선 KTX,-1.0,0.5,1.0,1.0,1.0


### apply

In [211]:
# apply(<func>): show the full frame as a reference for the apply examples below
ktx_df

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX,rank
2011,39060,7313,3627,309,,B
2012,39896,6967,4168,1771,,B
2013,42005,6873,4088,1954,,B
2014,43621,6626,4424,2244,,C
2015,41702,8675,4606,3146,2395.0,A
2016,41266,10622,4984,3945,3786.0,C
2017,32427,9228,5570,5766,6667.0,A


In [212]:
def apply_func_sample(data):
    """Print the type of ``data``, then ``data`` itself, then a 30-dash separator line.

    Returns nothing; it exists to reveal what ``apply`` hands to its callback.
    """
    print(type(data), data, "-" * 30, sep="\n")

In [213]:
# DataFrame.apply calls the function once per column, passing each column as a Series.
# apply_func_sample returns nothing, so the result is a Series of None keyed by column name.
ktx_df.apply(apply_func_sample)

<class 'pandas.core.series.Series'>
2011    39060
2012    39896
2013    42005
2014    43621
2015    41702
2016    41266
2017    32427
Name: 경부선 KTX, dtype: object
------------------------------
<class 'pandas.core.series.Series'>
2011     7313
2012     6967
2013     6873
2014     6626
2015     8675
2016    10622
2017     9228
Name: 호남선 KTX, dtype: object
------------------------------
<class 'pandas.core.series.Series'>
2011    3627
2012    4168
2013    4088
2014    4424
2015    4606
2016    4984
2017    5570
Name: 경전선 KTX, dtype: object
------------------------------
<class 'pandas.core.series.Series'>
2011     309
2012    1771
2013    1954
2014    2244
2015    3146
2016    3945
2017    5766
Name: 전라선 KTX, dtype: object
------------------------------
<class 'pandas.core.series.Series'>
2011     NaN
2012     NaN
2013     NaN
2014     NaN
2015    2395
2016    3786
2017    6667
Name: 동해선 KTX, dtype: object
------------------------------
<class 'pandas.core.series.Series'>
2011    B
2012 

경부선 KTX    None
호남선 KTX    None
경전선 KTX    None
전라선 KTX    None
동해선 KTX    None
rank       None
dtype: object

In [241]:
# Label every value of the column as '>=5000' or '<5000'.
# where() can only replace one side of a condition per call, so a single map
# is used to produce both labels at once.
ktx_df['경부선 KTX'].map(lambda riders: '>=5000' if riders >= 5000 else '<5000')


2011    >=5000
2012    >=5000
2013    >=5000
2014    >=5000
2015    >=5000
2016    >=5000
2017    >=5000
Name: 경부선 KTX, dtype: object

In [244]:
def categorizing_by_5000(data):
    """Flag each value as 1 when it is at least 5000, otherwise 0.

    ``data`` is a pandas object supporting ``where`` (a Series, or a DataFrame).
    Anything that fails the ``>= 5000`` comparison, NaN included, ends up as 0.
    """
    # Step 1: zero out everything that is not >= 5000.
    floored = data.where(data >= 5000, 0)
    # Step 2: whatever is still >= 5000 becomes 1; the zeros pass through unchanged.
    return floored.where(floored < 5000, 1)

In [245]:
#inplace사용하면 실제값도 바뀜
ktx_df.iloc[:,:-2].apply(categorizing_by_5000)

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX
2011,1,1,0,0
2012,1,1,0,0
2013,1,1,0,0
2014,1,1,0,0
2015,1,1,0,0
2016,1,1,0,0
2017,1,1,1,1


# 연습문제