In [1]:
import pandas as pd
import numpy as np

#### pandas : 파이썬 진영에서 R에 대응하기 위해 만든 라이브러리
- 기반은 numpy
- pandas의 자료구조 : Series(1차원 데이터), DataFrame(2차원 데이터)

#### numpy : 고속연산, 대용량 데이터 처리 등 파이썬 대비 10~수백배의 처리속도를 제공한다
- 분석 작업 시 파이썬 기본 자료구조(리스트 등)로 직접 반복 처리하지 않고 numpy/pandas의 벡터 연산을 사용한다
- numpy의 자료구조 -> ndarray(배열)

#### np.nan
- 파이썬의 None에 대응하는 값 -> 값이 없다 -> 결측치, 보통 제거(또는 대체) 대상
- np.nan 자체는 float 타입이므로, 정수 데이터에 섞이면 전체 dtype이 float64로 바뀐다

In [2]:
# Build a one-dimensional Series; the np.nan entry stands in for a missing value.
s = pd.Series(data=[1, 2, 3, np.nan, 5, 6])
s

0    1.0
1    2.0
2    3.0
3    NaN
4    5.0
5    6.0
dtype: float64

In [3]:
# Element type of the Series: float64, because np.nan is a float and the integers are stored alongside it.
s.dtype

dtype('float64')

In [4]:
# shape: reports the number of elements and the dimensionality of the data as a tuple
# e.g. (6,) means there are six elements along a single axis, i.e. the data is 1-D
s.shape

(6,)

## < DataFrame 생성하기 >

#### 1. 인덱스 데이터 생성 (행) 

In [5]:
# Generate 37 consecutive calendar days starting at 2019-01-21;
# they serve as the row index of the DataFrame built further down.
dates = pd.date_range(start='20190121', periods=37, freq='D')
dates

DatetimeIndex(['2019-01-21', '2019-01-22', '2019-01-23', '2019-01-24',
               '2019-01-25', '2019-01-26', '2019-01-27', '2019-01-28',
               '2019-01-29', '2019-01-30', '2019-01-31', '2019-02-01',
               '2019-02-02', '2019-02-03', '2019-02-04', '2019-02-05',
               '2019-02-06', '2019-02-07', '2019-02-08', '2019-02-09',
               '2019-02-10', '2019-02-11', '2019-02-12', '2019-02-13',
               '2019-02-14', '2019-02-15', '2019-02-16', '2019-02-17',
               '2019-02-18', '2019-02-19', '2019-02-20', '2019-02-21',
               '2019-02-22', '2019-02-23', '2019-02-24', '2019-02-25',
               '2019-02-26'],
              dtype='datetime64[ns]', freq='D')

#### 2. 컬럼 생성 (열)

In [6]:
# Column labels for the DataFrame: one single-letter name per column.
cols = list('ABCD')

#### 3. 임의의 배열 값 생성

In [9]:
# Draw random numbers (samples from the standard normal distribution)
# NumPy's data structure is the array: the result is a 37 x 4 ndarray
np.random.randn(37,4)

array([[ 0.09806809, -0.65009407, -0.2008858 ,  2.21399769],
       [-0.25871029,  0.72193583, -0.35644257,  0.73424657],
       [ 0.94875051,  0.13200786,  1.37379253,  0.86327168],
       [ 0.63774345, -1.53992156,  0.08813565, -0.6914163 ],
       [-1.6896839 , -2.03305828,  0.88620276, -0.83541939],
       [-1.71312675, -0.56133048,  0.16988115, -1.76599494],
       [-0.30799566,  0.01386468,  0.43871846,  0.45279958],
       [ 1.2325874 , -1.25847682, -1.19809312,  0.00876499],
       [-0.73269658,  0.97765436, -0.34968407, -0.15059944],
       [-1.11610795, -1.47780994,  1.45929891, -0.64486936],
       [ 0.67878405,  0.6593138 , -0.51208101,  0.96452412],
       [-0.41690573,  1.8757979 ,  1.70155991,  0.363742  ],
       [ 0.3114188 , -0.69048159,  0.63453405, -1.65541121],
       [ 0.46820062,  0.57174955, -0.09184948, -0.65624646],
       [-1.76820887, -1.67744268,  1.07935602, -1.43672759],
       [-0.90046427, -0.39328884, -0.44302623,  0.19226464],
       [-0.83111068, -0.

#### 4. DataFrame 생성하기 => pd.DataFrame(배열값, index=인덱스값, columns=컬럼값)

In [10]:
# Derive the matrix dimensions from the labels so the data always lines up
# with the row index (dates) and the column names (cols).
index_size, cols_size = len(dates), len(cols)
df = pd.DataFrame(
    data=np.random.randn(index_size, cols_size),  # standard-normal samples, one row per date
    index=dates,
    columns=cols,
)
df.head()

Unnamed: 0,A,B,C,D
2019-01-21,0.268979,-0.15295,0.120132,-0.303722
2019-01-22,-0.531976,-0.228745,-0.620546,0.66363
2019-01-23,0.804702,-1.082933,-1.134419,1.078503
2019-01-24,1.609795,-1.477634,0.681511,2.02941
2019-01-25,-1.363776,-0.162394,0.110927,0.635739


## < DataFrame 살펴보기 >

In [11]:
# Dimensions of the frame as a (rows, columns) tuple
df.shape

(37, 4)

In [12]:
# The data held by a DataFrame is exposed through .values as a NumPy ndarray.
type(df.values)

numpy.ndarray

In [14]:
df.describe() # per-column summary statistics of the DataFrame (count, mean, std, min, quartiles, max)

Unnamed: 0,A,B,C,D
count,37.0,37.0,37.0,37.0
mean,0.075304,-0.068579,-0.058296,-0.122897
std,0.998986,1.108758,0.997242,1.091076
min,-2.927973,-2.61963,-1.633163,-2.489418
25%,-0.538233,-1.038459,-0.920316,-0.820533
50%,0.268979,-0.162394,0.002257,-0.144209
75%,0.748563,0.474349,0.478917,0.66363
max,1.609795,2.241661,2.73441,2.02941


In [15]:
# Print a structural summary: index range, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 37 entries, 2019-01-21 to 2019-02-26
Freq: D
Data columns (total 4 columns):
A    37 non-null float64
B    37 non-null float64
C    37 non-null float64
D    37 non-null float64
dtypes: float64(4)
memory usage: 1.4 KB


In [18]:
# Show the frame's underlying data as a 2-D NumPy array (index and column labels are dropped).
# NOTE(review): the pandas docs recommend DataFrame.to_numpy() over .values on versions that
# provide it — confirm the installed pandas version before switching.
df.values

array([[ 2.68979039e-01, -1.52950207e-01,  1.20131554e-01,
        -3.03722171e-01],
       [-5.31976066e-01, -2.28744913e-01, -6.20545843e-01,
         6.63630111e-01],
       [ 8.04702126e-01, -1.08293330e+00, -1.13441858e+00,
         1.07850261e+00],
       [ 1.60979501e+00, -1.47763431e+00,  6.81510975e-01,
         2.02941031e+00],
       [-1.36377624e+00, -1.62393854e-01,  1.10927183e-01,
         6.35739348e-01],
       [-1.04711999e+00,  7.30862952e-01,  2.25747061e-03,
        -1.07539821e+00],
       [ 1.03687928e+00,  3.49057321e-01,  4.80598063e-01,
         9.49977386e-02],
       [ 2.09672632e-01,  1.27921349e-01, -7.61123766e-01,
        -1.26317251e+00],
       [-2.55933208e-01, -4.26402328e-01,  6.43899951e-01,
        -5.49792191e-01],
       [ 6.77091509e-01, -2.62015355e-01,  2.50460856e-01,
         4.84319410e-01],
       [ 1.14362663e+00, -1.03845918e+00,  1.65338579e+00,
         1.39849731e+00],
       [-1.09032900e+00,  1.76946808e+00,  1.44769453e-01,
      