# Numpy 
 - Python에서 과학 계산을 위한 패키지. 이후에 배울 pandas, scikit-learn 등등이 모두 numpy에 바탕을 두고 있다

In [2]:
import numpy as np

## Array

In [4]:
a = np.array([1,2,3])
a

array([1, 2, 3])

In [6]:
a[:2]

array([1, 2])

## array와 리스트의 차이 
 - array와 리스트는 사용방법이 거의 비슷하지만 내부 구조가 다르다. 똑같은 내용이라도 array로 저장하면 메모리 용량도 더 적게 차지하고 속도도 더 빠르다.
 - 리스트는 여러 가지 자료형을 섞어서 쓸 수 있으나 **(예: [1, 'hello', 3.14')**, array는 한 가지 자료형만 쓸 수 있다.
 - 리스트는 Linked List 형태로 되어있고 Numpy 의 Array는 배열 형태로 되어있다. 

# Pandas 
 - R의 데이터프레임(data.frame)과 같은 형식으로 데이터를 다룰 수 있게 해주는 라이브러리
 - Series는 R의 vector와, DataFrame은 R의 data.frame과 유사

In [7]:
from pandas import DataFrame, Series
import pandas as dp

## Series

In [9]:
s = Series([1, 3, 5, numpy.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [11]:
s[0]

1.0

In [13]:
s[1]

3.0

## DataFrame

In [14]:
df = DataFrame({'name': ['유재명', '이정민', '박희영'],
                'age': [17, 20, 23]})

In [16]:
df

Unnamed: 0,age,name
0,17,유재명
1,20,이정민
2,23,박희영


In [18]:
df['age']

0    17
1    20
2    23
Name: age, dtype: int64

In [20]:
df['name']

0    유재명
1    이정민
2    박희영
Name: name, dtype: object

## .loc

In [21]:
df.loc[0]

age      17
name    유재명
Name: 0, dtype: object

In [22]:
df.loc[0,'name']

'유재명'

## .iloc

In [24]:
df.iloc[0,1]

'유재명'

## .ix
 - 알아서 .loc과 .iloc을 적용해준다

In [25]:
df.ix[0]

age      17
name    유재명
Name: 0, dtype: object

In [26]:
df.ix[0, 'name']

'유재명'

In [27]:
df.ix[0, 1]

'유재명'

## boolean indexing

In [28]:
df[df.age > 19]

Unnamed: 0,age,name
1,20,이정민
2,23,박희영


## csv로 저장

In [29]:
df.to_csv('people.csv', encoding='cp949')

## csv 읽기
 - <a href="http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html"> pandas.read_csv() </a>

In [30]:
from pandas import read_csv

In [31]:
read_csv('people.csv', encoding='cp949')

Unnamed: 0.1,Unnamed: 0,age,name
0,0,17,유재명
1,1,20,이정민
2,2,23,박희영


## 실습

In [32]:
car = read_csv('automobile.csv')

In [34]:
car.shape

(159, 26)

In [35]:
car.head()

Unnamed: 0,symboling,normalized_losses,maker,fuel,aspiration,doors,body,wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
1,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450
2,1,158,audi,gas,std,four,sedan,fwd,front,105.8,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,17710
3,1,158,audi,gas,turbo,four,sedan,fwd,front,105.8,...,131,mpfi,3.13,3.4,8.3,140,5500,17,20,23875
4,2,192,bmw,gas,std,two,sedan,rwd,front,101.2,...,108,mpfi,3.5,2.8,8.8,101,5800,23,29,16430


## filtering

In [36]:
car.loc[car.wheels=="4wd"]

Unnamed: 0,symboling,normalized_losses,maker,fuel,aspiration,doors,body,wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
1,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450
99,2,83,subaru,gas,std,two,hatchback,4wd,front,93.3,...,108,2bbl,3.62,2.64,8.7,73,4400,26,31,7603
103,0,102,subaru,gas,std,four,sedan,4wd,front,97.0,...,108,2bbl,3.62,2.64,9.0,82,4800,24,25,9233
104,0,102,subaru,gas,turbo,four,sedan,4wd,front,97.0,...,108,mpfi,3.62,2.64,7.7,111,4800,24,29,11259
107,0,85,subaru,gas,std,four,wagon,4wd,front,96.9,...,108,2bbl,3.62,2.64,9.0,82,4800,23,29,8013
108,0,85,subaru,gas,turbo,four,wagon,4wd,front,96.9,...,108,mpfi,3.62,2.64,7.7,111,4800,23,23,11694
113,0,81,toyota,gas,std,four,wagon,4wd,front,95.7,...,92,2bbl,3.05,3.03,9.0,62,4800,27,32,7898
114,0,91,toyota,gas,std,four,wagon,4wd,front,95.7,...,92,2bbl,3.05,3.03,9.0,62,4800,27,32,8778


In [37]:
car.loc[car.wheels=="4wd", 'symboling']

1      2
99     2
103    0
104    0
107    0
108    0
113    0
114    0
Name: symboling, dtype: int64

In [38]:
car.loc[car.wheels == '4wd', 'symboling'].mean()

0.5

In [39]:
car.loc[car.wheels == 'fwd', 'symboling'].mean()

0.89523809523809528

In [40]:
car.loc[car.wheels == 'rwd', 'symboling'].mean()

0.41304347826086957

## Grouping

In [41]:
grouped = car.groupby('wheels')

In [42]:
grouped

<pandas.core.groupby.DataFrameGroupBy object at 0x00000126B42C4E80>

In [43]:
grouped.get_group('4wd')

Unnamed: 0,symboling,normalized_losses,maker,fuel,aspiration,doors,body,wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
1,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450
99,2,83,subaru,gas,std,two,hatchback,4wd,front,93.3,...,108,2bbl,3.62,2.64,8.7,73,4400,26,31,7603
103,0,102,subaru,gas,std,four,sedan,4wd,front,97.0,...,108,2bbl,3.62,2.64,9.0,82,4800,24,25,9233
104,0,102,subaru,gas,turbo,four,sedan,4wd,front,97.0,...,108,mpfi,3.62,2.64,7.7,111,4800,24,29,11259
107,0,85,subaru,gas,std,four,wagon,4wd,front,96.9,...,108,2bbl,3.62,2.64,9.0,82,4800,23,29,8013
108,0,85,subaru,gas,turbo,four,wagon,4wd,front,96.9,...,108,mpfi,3.62,2.64,7.7,111,4800,23,23,11694
113,0,81,toyota,gas,std,four,wagon,4wd,front,95.7,...,92,2bbl,3.05,3.03,9.0,62,4800,27,32,7898
114,0,91,toyota,gas,std,four,wagon,4wd,front,95.7,...,92,2bbl,3.05,3.03,9.0,62,4800,27,32,8778


In [44]:
grouped['symboling'].mean()

wheels
4wd    0.500000
fwd    0.895238
rwd    0.413043
Name: symboling, dtype: float64

## Aggregation

In [45]:
grouped['symboling'].agg(numpy.mean)

wheels
4wd    0.500000
fwd    0.895238
rwd    0.413043
Name: symboling, dtype: float64

In [47]:
grouped['symboling'].agg([numpy.mean,numpy.sum])

Unnamed: 0_level_0,mean,sum
wheels,Unnamed: 1_level_1,Unnamed: 2_level_1
4wd,0.5,4
fwd,0.895238,94
rwd,0.413043,19


In [48]:
grouped['symboling'].agg({"평균":numpy.mean,"합계":numpy.sum})

Unnamed: 0_level_0,합계,평균
wheels,Unnamed: 1_level_1,Unnamed: 2_level_1
4wd,4,0.5
fwd,94,0.895238
rwd,19,0.413043


## OrderedDict
 - python에서 dictionary는 순서를 보존하지 않기 때문에 열 순서를 뜻대로 할 수 없다. 이때는 OrderedDict를 사용한다

In [49]:
from collections import OrderedDict

In [50]:
d = OrderedDict([('평균', numpy.mean), ('합계', numpy.sum)])

In [51]:
d

OrderedDict([('평균', <function numpy.core.fromnumeric.mean>),
             ('합계', <function numpy.core.fromnumeric.sum>)])

In [52]:
d['평균']

<function numpy.core.fromnumeric.mean>

## aggregation with OrderedDict 

In [53]:
grouped['symboling'].agg(OrderedDict([('평균', numpy.mean), ('합계', numpy.sum)]))

Unnamed: 0_level_0,평균,합계
wheels,Unnamed: 1_level_1,Unnamed: 2_level_1
4wd,0.5,4
fwd,0.895238,94
rwd,0.413043,19
