# Numpy

In [1]:
import numpy as np

## array 생성

In [2]:
# `a` is a list of ints; `b` mixes ints, floats and a numeric string.
a = [1, 2, 3, 4, 5]
b = [3, 5, 0.4, 4.77, '4']

In [3]:
# Build an ndarray from the list, requesting an integer dtype.
arr_a = np.array(a, int)
arr_a

array([1, 2, 3, 4, 5])

In [4]:
# An ndarray holds a single dtype, so every element is cast on creation
# (here the string '4' is converted to the float 4.0).
arr_b = np.array(b, float)
arr_b

array([3.  , 5.  , 0.4 , 4.77, 4.  ])

In [6]:
# a[2] and b[0] are both the int 3. They are the same object here because
# CPython reuses a single object for small integers, not because of anything
# the lists themselves do; this is an implementation detail.
print(a[2] is b[0])
print(id(a[2]))
print(id(b[0]))

True
140734494613312
140734494613312


In [7]:
# Indexing an ndarray builds a new NumPy scalar object on each access, so the
# two results are distinct objects with different ids. They also differ in
# dtype: arr_a was created as integer and arr_b as float.
print(arr_a[2] is arr_b[0])
print(id(arr_a[2]))
print(id(arr_b[0]))

False
2334197015152
2334197015472


In [8]:
# Element data type of the array
print(arr_a.dtype)
# Shape of the array (size along each dimension)
print(arr_a.shape)

int32
(5,)


## handling shape

In [9]:
# 2-D array with 2 rows and 4 columns
arr = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
arr.shape

(2, 4)

### reshape

In [11]:
# Change the array's shape; the 8 elements are rearranged into 1 row of 8.
reshape_arr = arr.reshape(1, 8)
reshape_arr.shape

(1, 8)

In [12]:
# -1 lets NumPy infer that dimension from the total size (8 / 4 = 2).
re2 = arr.reshape(4, -1)
re2.shape

(4, 2)

### flatten

In [13]:
# Flatten to a 1-D array
flat_arr = arr.flatten()
flat_arr.shape

(8,)

## indexing and slicing

In [15]:
# Rows and columns can be sliced independently:
# here all rows, and columns from index 2 onward.
arr[:, 2:]

array([[3, 4],
       [7, 8]])

In [17]:
arr[1:]

array([[5, 6, 7, 8]])

In [18]:
arr2 = np.arange(100).reshape(10, 10)
arr2

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
       [50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
       [60, 61, 62, 63, 64, 65, 66, 67, 68, 69],
       [70, 71, 72, 73, 74, 75, 76, 77, 78, 79],
       [80, 81, 82, 83, 84, 85, 86, 87, 88, 89],
       [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]])

In [21]:
arr2[:, -2]

array([ 8, 18, 28, 38, 48, 58, 68, 78, 88, 98])

## creation function

### arange

In [22]:
# Create an array from a range of values (0 up to, but not including, 10)
np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [23]:
# start, stop (exclusive) and step can all be specified
np.arange(10, 100, 10)

array([10, 20, 30, 40, 50, 60, 70, 80, 90])

### ones, zeros, empty

In [24]:
np.ones((3, 4), float)

array([[1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.]])

In [25]:
np.zeros((2, 3), int)

array([[0, 0, 0],
       [0, 0, 0]])

In [26]:
# Uninitialized array: the memory is not cleared, so it may contain
# arbitrary leftover values.
np.empty((2, 5), int)

array([[          0,  1074266112,           0,  1075052544, -1717986918],
       [ 1071225241,  -515396076,  1074992250,           0,  1074790400]])

### something_like

In [27]:
np.zeros_like(arr)

array([[0, 0, 0, 0],
       [0, 0, 0, 0]])

### eye, identity, diag

In [28]:
np.identity(3, int)

array([[1, 0, 0],
       [0, 1, 0],
       [0, 0, 1]])

In [30]:
np.eye(3, 4, k=1)

array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [32]:
np.diag(arr, k=1)

array([2, 7])

## random sampling

In [37]:
# Draw 10 samples from a normal distribution (mean 0, std 1) using a seeded
# Generator so the output is reproducible on "Restart & Run All".
# The seed value itself is arbitrary.
np.random.default_rng(42).normal(0, 1, 10)

array([-0.0622927 ,  1.52834243,  1.14804604, -1.85714166, -2.14649139,
        1.88453224,  0.49335554,  1.08514382,  0.99140463,  0.7969362 ])

## operation functions

In [42]:
arr3 = np.arange(1, 37).reshape(3, 3, 4)
arr3

array([[[ 1,  2,  3,  4],
        [ 5,  6,  7,  8],
        [ 9, 10, 11, 12]],

       [[13, 14, 15, 16],
        [17, 18, 19, 20],
        [21, 22, 23, 24]],

       [[25, 26, 27, 28],
        [29, 30, 31, 32],
        [33, 34, 35, 36]]])

### axis

In [39]:
arr3.sum(axis=0)

array([[39, 42, 45, 48],
       [51, 54, 57, 60],
       [63, 66, 69, 72]])

In [40]:
arr3.sum(axis=1)

array([[15, 18, 21, 24],
       [51, 54, 57, 60],
       [87, 90, 93, 96]])

In [41]:
arr3.sum(axis=2)

array([[ 10,  26,  42],
       [ 58,  74,  90],
       [106, 122, 138]])

### concatenate

In [43]:
# NOTE: rebinds a and b (plain Python lists in the array-creation section) to
# 1-D integer ndarrays.
a = np.array([1, 2, 3], int)
b = np.array([4, 5, 6], int)

In [45]:
np.vstack((a, b))

array([[1, 2, 3],
       [4, 5, 6]])

In [54]:
np.hstack((a, b))

array([1, 2, 3, 4, 5, 6])

In [53]:
np.hstack((a.reshape(-1, 1), b.reshape(-1, 1)))

array([[1, 4],
       [2, 5],
       [3, 6]])

In [51]:
np.concatenate((a.reshape(-1, 1), b.reshape(-1, 1)), axis=1)

array([[1, 4],
       [2, 5],
       [3, 6]])

In [56]:
# np.newaxis inserts a new axis: b[np.newaxis, :] has shape (1, 3),
# and .T turns it into a (3, 1) column.
b[np.newaxis, :].T

array([[4],
       [5],
       [6]])

## array operation

In [57]:
# Arithmetic operators act element-wise on arrays of the same shape
a + b

array([5, 7, 9])

In [58]:
a - b

array([-3, -3, -3])

In [59]:
a * b

array([ 4, 10, 18])

In [68]:
# dot: matrix multiplication; for two 1-D arrays it is the inner product
# (1*4 + 2*5 + 3*6 = 32).
a.dot(b)

32

In [70]:
c = np.array([[1, 2], [3, 4]])
d = np.array([[5, 6], [7, 8]])
c.dot(d)

array([[19, 22],
       [43, 50]])

In [62]:
# transpose
np.vstack((a, b)).T

array([[1, 4],
       [2, 5],
       [3, 6]])

In [64]:
np.vstack((a, b)).transpose()

array([[1, 4],
       [2, 5],
       [3, 6]])

In [71]:
# broadcasting
# array + scalar: the scalar is added to every element
np.array([1, 2, 3]) + 3

array([4, 5, 6])

In [72]:
# matrix + vector: the length-2 list is added to each row of c
c + [6, 7]

array([[ 7,  9],
       [ 9, 11]])

## numpy performance

In [73]:
def sclar_vector_product(scalar, vector):
    """Multiply every element of ``vector`` by ``scalar`` with an explicit loop.

    Serves as the pure-Python for-loop baseline in the timing comparison.

    Args:
        scalar: Factor applied to each element.
        vector: Iterable of values to scale.

    Returns:
        A new list holding ``scalar * element`` for each element, in order.
    """
    products = []
    for element in vector:
        scaled = scalar * element
        products.append(scaled)
    return products

# Benchmark setup: 100,000,000 elements.
# NOTE(review): materializing this many Python ints in a list needs a lot of
# memory - confirm it fits on the machine before running.
iternation_max = 100000000
vector = list(range(iternation_max))
scalar = 2

# Explicit for loop over the pre-built list
%timeit sclar_vector_product(scalar, vector) 
# List comprehension (iterates range(...) directly rather than `vector`)
%timeit [scalar * value for value in range(iternation_max)]
# NumPy vectorized multiply (the timing includes creating the array with arange)
%timeit np.arange(iternation_max) * scalar

10.8 s ± 103 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
11.3 s ± 56.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
246 ms ± 4.87 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## comparisons

In [74]:
ar = np.arange(10)

In [75]:
ar < 5

array([ True,  True,  True,  True,  True, False, False, False, False,
       False])

In [76]:
np.all(ar < 10)

True

In [77]:
np.any(ar < 5)

True

In [81]:
ar < np.arange(-5, 5)

array([False, False, False, False, False, False, False, False, False,
       False])

In [82]:
np.logical_and(ar < 8, ar > 3)

array([False, False, False, False,  True,  True,  True,  True, False,
       False])

In [83]:
np.logical_or(ar < 8, ar > 3)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [85]:
np.logical_not(ar < 3)

array([False, False, False,  True,  True,  True,  True,  True,  True,
        True])

In [86]:
# With only a condition, np.where returns the indices where it is True
# (as a tuple of index arrays).
np.where(ar < 5)

(array([0, 1, 2, 3, 4], dtype=int64),)

In [87]:
# With three arguments: 1 where the condition is True, 0 elsewhere
np.where(ar < 5, 1, 0)

array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

In [88]:
# Sample array with a finite value, NaN and infinity. Uses the lowercase
# np.nan / np.inf names; the np.NaN / np.Inf aliases were removed in NumPy 2.0.
test = np.array([1, np.nan, np.inf])

In [89]:
# Detect NaN entries
np.isnan(test)

array([False,  True, False])

In [90]:
# Detect finite entries (neither NaN nor infinite)
np.isfinite(test)

array([ True, False, False])

In [91]:
# Index of the maximum value
np.argmax(ar)

9

In [92]:
# Index of the minimum value
np.argmin(ar)

0

In [94]:
# With axis=1, the index of the maximum within each row
np.argmax(ar.reshape(2, 5), axis=1)

array([4, 4], dtype=int64)

In [95]:
# Indices that would sort the array in ascending order
np.argsort(np.array([4, 2, 56, 6]))

array([1, 0, 3, 2], dtype=int64)

In [96]:
# Boolean indexing: keep only the elements where the mask is True
ar[ar < 5]

array([0, 1, 2, 3, 4])

In [97]:
# Fancy indexing: select elements by an array of integer positions
# (positions may repeat).
i = np.array([3, 4, 1, 1, 7, 9])
ar[i]

array([3, 4, 1, 1, 7, 9])

In [98]:
# Fancy indexing on a 2-D array: r holds row positions and c column positions,
# paired element-wise, so the result is [m[1,3], m[1,4], m[0,1], m[0,2]].
# NOTE: this rebinds `c`, which held a 2x2 array in earlier cells.
r = [1, 1, 0, 0]
c = [3, 4, 1, 2]
ar.reshape(2, 5)[r, c]

array([8, 9, 1, 2])

---

# Pandas

In [99]:
import pandas as pd

In [100]:
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'

In [102]:
# Whitespace-delimited file with no header row. The separator is a regular
# expression, so it is written as a raw string; a plain '\s+' is an invalid
# escape sequence and triggers a warning on recent Python versions.
df = pd.read_csv(data_url, sep=r'\s+', header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


## series

In [104]:
pd.Series(data=[1, 2, 3, 4, 5])

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [105]:
# A custom index can be supplied
pd.Series(data=[1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [107]:
# When data is a dict, its keys become the index
dict_data = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}
series_obj = pd.Series(data=dict_data)

In [108]:
series_obj.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [109]:
series_obj.values

array([1, 2, 3, 4, 5], dtype=int64)

In [113]:
# A list of data and an index of different lengths raise ValueError.
# The error is caught and printed so the demonstration stays visible without
# halting "Restart & Run All" at this cell.
try:
    pd.Series(data=[1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Length of values (5) does not match length of index (7)

In [112]:
# With dict data, index labels that have no matching key are filled with NaN
pd.Series(data={'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}, index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])

a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
f    NaN
g    NaN
dtype: float64

## data frame

In [114]:
pd.DataFrame(df, columns=[0, 1, 2])

Unnamed: 0,0,1,2
0,0.00632,18.0,2.31
1,0.02731,0.0,7.07
2,0.02729,0.0,7.07
3,0.03237,0.0,2.18
4,0.06905,0.0,2.18
...,...,...,...
501,0.06263,0.0,11.93
502,0.04527,0.0,11.93
503,0.06076,0.0,11.93
504,0.10959,0.0,11.93


In [115]:
# indexing - .loc selects by index label; the slice end ('c') is included
pd.Series(data={'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}, index=['a', 'b', 'c', 'd', 'e', 'f', 'g']).loc[:'c']

a    1.0
b    2.0
c    3.0
dtype: float64

In [117]:
# .iloc selects by integer position; the slice end (3) is excluded
pd.Series(data={'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}, index=['a', 'b', 'c', 'd', 'e', 'f', 'g']).iloc[:3]

a    1.0
b    2.0
c    3.0
dtype: float64

In [129]:
# First 10 rows of columns 0-2. .copy() makes df_temp an independent frame, so
# the column assignments in later cells do not trigger SettingWithCopyWarning.
df_temp = pd.DataFrame(df, columns=[0, 1, 2]).iloc[:10].copy()
df_temp

Unnamed: 0,0,1,2
0,0.00632,18.0,2.31
1,0.02731,0.0,7.07
2,0.02729,0.0,7.07
3,0.03237,0.0,2.18
4,0.06905,0.0,2.18
5,0.02985,0.0,2.18
6,0.08829,12.5,7.87
7,0.14455,12.5,7.87
8,0.21124,12.5,7.87
9,0.17004,12.5,7.87


In [130]:
# Create a new column: a boolean mask of whether column 1 exceeds 10
df_temp[3] = df_temp[1] > 10
df_temp

Unnamed: 0,0,1,2,3
0,0.00632,18.0,2.31,True
1,0.02731,0.0,7.07,False
2,0.02729,0.0,7.07,False
3,0.03237,0.0,2.18,False
4,0.06905,0.0,2.18,False
5,0.02985,0.0,2.18,False
6,0.08829,12.5,7.87,True
7,0.14455,12.5,7.87,True
8,0.21124,12.5,7.87,True
9,0.17004,12.5,7.87,True


In [131]:
# Assigning a Series aligns on the index; rows with no matching label get NaN
df_temp[4] = pd.Series(data=['a', 'a', 'a'], index=[4, 5, 9])
df_temp

Unnamed: 0,0,1,2,3,4
0,0.00632,18.0,2.31,True,
1,0.02731,0.0,7.07,False,
2,0.02729,0.0,7.07,False,
3,0.03237,0.0,2.18,False,
4,0.06905,0.0,2.18,False,a
5,0.02985,0.0,2.18,False,a
6,0.08829,12.5,7.87,True,
7,0.14455,12.5,7.87,True,
8,0.21124,12.5,7.87,True,
9,0.17004,12.5,7.87,True,a


In [133]:
# transpose
df_temp.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.00632,0.02731,0.02729,0.03237,0.06905,0.02985,0.08829,0.14455,0.21124,0.17004
1,18.0,0.0,0.0,0.0,0.0,0.0,12.5,12.5,12.5,12.5
2,2.31,7.07,7.07,2.18,2.18,2.18,7.87,7.87,7.87,7.87
3,True,False,False,False,False,False,True,True,True,True
4,,,,,a,a,,,,a


In [135]:
# to_csv: with no path argument the CSV text is returned as a str
# (it can also write to a file).
# NOTE(review): `csv` is also the name of a standard-library module; consider
# renaming this variable if that module is ever imported in this notebook.
csv = df_temp.to_csv()
print(csv)
print(type(csv))

,0,1,2,3,4
0,0.00632,18.0,2.31,True,
1,0.02731,0.0,7.07,False,
2,0.02729,0.0,7.07,False,
3,0.03237,0.0,2.18,False,
4,0.06905,0.0,2.18,False,a
5,0.02985,0.0,2.18,False,a
6,0.08829,12.5,7.87,True,
7,0.14455,12.5,7.87,True,
8,0.21124,12.5,7.87,True,
9,0.17004,12.5,7.87,True,a

<class 'str'>


In [138]:
# drop returns a new frame without column 4 (axis=1 selects columns)
df_temp.drop(4, axis=1)

Unnamed: 0,0,1,2,3
0,0.00632,18.0,2.31,True
1,0.02731,0.0,7.07,False
2,0.02729,0.0,7.07,False
3,0.03237,0.0,2.18,False
4,0.06905,0.0,2.18,False
5,0.02985,0.0,2.18,False
6,0.08829,12.5,7.87,True
7,0.14455,12.5,7.87,True
8,0.21124,12.5,7.87,True
9,0.17004,12.5,7.87,True


In [139]:
# The original frame is unchanged
df_temp

Unnamed: 0,0,1,2,3,4
0,0.00632,18.0,2.31,True,
1,0.02731,0.0,7.07,False,
2,0.02729,0.0,7.07,False,
3,0.03237,0.0,2.18,False,
4,0.06905,0.0,2.18,False,a
5,0.02985,0.0,2.18,False,a
6,0.08829,12.5,7.87,True,
7,0.14455,12.5,7.87,True,
8,0.21124,12.5,7.87,True,
9,0.17004,12.5,7.87,True,a


In [140]:
# del removes column 4 from df_temp itself (in place).
# NOTE: re-running this cell fails because the column no longer exists.
del df_temp[4]
df_temp

Unnamed: 0,0,1,2,3
0,0.00632,18.0,2.31,True
1,0.02731,0.0,7.07,False
2,0.02729,0.0,7.07,False
3,0.03237,0.0,2.18,False
4,0.06905,0.0,2.18,False
5,0.02985,0.0,2.18,False
6,0.08829,12.5,7.87,True
7,0.14455,12.5,7.87,True
8,0.21124,12.5,7.87,True
9,0.17004,12.5,7.87,True


In [141]:
# Build a DataFrame from a nested dict: outer keys become columns, inner keys
# become the index, and missing combinations are NaN.
# NOTE(review): `json` here is a plain dict and shares its name with the
# standard-library module - consider renaming.
json = {'col1': {'a': 23, 'b': 44}, 'col2': {'c': 52, 'b': 22}}
pd.DataFrame(json)

Unnamed: 0,col1,col2
a,23.0,
b,44.0,22.0
c,,52.0
