<a href="https://colab.research.google.com/github/akasia1/AA04/blob/master/py_module/py_module_3_pandas_ipynb%EC%9D%98_%EC%82%AC%EB%B3%B8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Python module 3. **pandas**

# Using pandas

* [10 Minutes to pandas](https://pandas.pydata.org/pandas-docs/stable/10min.html)
* [Pandas tutorial with interactive exercises](https://www.kaggle.com/pistak/pandas-tutorial-with-interactive-exercises)

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## [1] Make data: Series and DataFrame
> pandas의 데이터 구조
- Series
- DataFrame

### Series
> 1차원 데이터

In [0]:
# Build a one-dimensional Series from a plain Python list; np.nan marks a missing entry.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

In [0]:
# Six consecutive dates starting at 2019-11-29; later cells use them as the row index of df.
dates = pd.date_range(start='20191129', periods=6)
dates

In [0]:
# Build a 6x4 frame of standard-normal samples: one row per entry of `dates`,
# one column per letter of 'ABCD'.
# The samples come from a fixed-seed generator so that Restart & Run All
# reproduces the same numbers (and therefore the same tables and plots).
df = pd.DataFrame(
    np.random.default_rng(42).standard_normal((6, 4)),
    index=dates,
    columns=list('ABCD'),
)
df

In [0]:
# Inspect the dtype of each column of df (all four columns were built from one float array).
df.dtypes

### 데이터프레임 (DataFrame)
- 2차원 데이터
- 다차원 데이터

In [0]:
# Build a frame from a dict: each key becomes a column. The scalar entries
# ('A', 'B', 'F') sit next to four-element array-likes ('C', 'D', 'E').
df2_columns = {
    'A': 1.,
    'B': pd.Timestamp('20191129'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
}
df2 = pd.DataFrame(df2_columns)

In [0]:
# Display the frame that was built from the dict.
df2

In [0]:
# Inspect the dtype of each column of df2; unlike df, its columns were built
# from different kinds of values (float, timestamp, float32, int32, categorical, string).
df2.dtypes



---



## [2] Handling data
- head()
- tail()
- describe()

In [0]:
# head() without an argument shows the leading rows of df
# (df is the 6x4 random frame indexed by `dates` that was built earlier).
df.head()

In [0]:
# Only the first two rows.
df.head(n=2)

In [0]:
# Only the last three rows.
df.tail(n=3)

In [0]:
# Display the row index of df (the column labels are shown in a later cell via df.columns).
df.index

In [0]:
# describe() shows a quick statistical summary of the data in df.
df.describe()

In [0]:
# Column labels of df.
df.columns

In [0]:
# Quick statistical summary of df2, the mixed-dtype frame.
df2.describe()

In [0]:
# Transpose df: rows and columns swap places.
df.T

### Sorting

#### Sort by index
- sort_index(axis=0, ascending=False)
- sort_index(axis=1, ascending=False)

In [0]:
# Show the original frame next to a copy whose rows are ordered by index label
# in descending order (axis=0 sorts the row labels).
(df, df.sort_index(axis=0, ascending=False))

In [0]:
# Show the original frame next to a copy whose columns are ordered by label
# in descending order (axis=1 sorts the column labels).
(df, df.sort_index(axis=1, ascending=False))

#### Sort by value
- sort_values(by='column')

In [0]:
# Order the rows by the values in column 'B' (ascending; pass ascending=False to reverse).
df.sort_values(by='B', ascending=True)

#### Selecting data by indexing and slicing
- indexing
- slicing


In [0]:
# Selecting a single column by name, which yields a Series.
df['A']

In [0]:
# A slice inside [] selects rows by position: here the first three rows.
df[:3]

In [0]:
# Slicing rows with date-string labels instead of positions: the whole specified
# range is selected, i.e. the end label '20191201' is included.
df['20191129':'20191201']

#### Selecting data by label

> **loc, iloc**


In [0]:
# Show df again as a reference for the label-based selections that follow.
df

In [0]:
# .loc is label-based: select the row whose index label is dates[0].
df.loc[dates[0]]

In [0]:
# Label-based selection on both axes: all rows (:) and the columns 'A' and 'B'.
df.loc[:,['A','B']]

#### [도전코딩]

> Select the data for the first two days AND columns 3 and 4 from df.

In [0]:
# Candidate answers, left commented out so the exercise can be attempted first.
# NOTE(review): the first candidate slices with integer positions, which are not
# labels of this date index — it is expected to fail with .loc; confirm before un-commenting.
# df.loc[0:2,['C','D']]
# Label slice using date strings:
# df.loc['20191129':'20191130',['C','D']]
# Label list taken from the first two entries of `dates`:
# df.loc[dates[:2],['C','D']]

#### Selecting data by position (`iloc[]`)

In [0]:
# Show df again as a reference for the position-based selections that follow.
df

In [0]:
# Select the row at position 3; the result is shown in reduced-dimension form.
df.iloc[3]

In [0]:
# [Try again]
# Select the data for the first two days AND columns 3 and 4 from df.
# Use iloc: rows at positions 0-1, columns at positions 2-3.
df.iloc[:2,2:4]

In [0]:
# Select one item by position: second row, second column.
df.iloc[1,1]

#### Selecting data by Boolean indexing

In [0]:
# Show df again as a reference for the boolean selections that follow.
df

In [0]:
# Keep only the rows whose value in column 'A' is positive.
df[df['A'] > 0]

In [0]:
# Boolean mask over the whole frame (element-wise comparison) instead of a single column.
df[df > 0]

### 데이터 재구성(setting) 또는 확장

In [0]:
# A Series of the values 1..6 labelled with six dates starting 2019-11-29.
# When it is assigned as a new column, the data is aligned by these index labels.
s1 = pd.Series(
    data=[1, 2, 3, 4, 5, 6],
    index=pd.date_range(start='20191129', periods=6),
)
s1

In [0]:
# df before the new column is added.
df

In [0]:
# Add s1 as column 'F'. s1 was built with the same six dates as the index of df,
# so each value lines up with its row.
df['F'] = s1

In [0]:
# df has been extended and restructured to fit its existing structure.
df

#### Setting data by label

> **at, iat**

In [0]:
# Setting a single value by label: the row labelled dates[0], column 'A'.
df.at[dates[0],'A'] = 0
df

In [0]:
# Setting a single value by position: first row, second column.
df.iat[0,1] = 0
df

In [0]:
# Three size measures of df: len(), the shape tuple, and the total element count.
len(df), df.shape, df.size

In [0]:
# Setting by assigning with a NumPy array:
# every row of column 'D' is overwritten with 5 (the array has one entry per row of df).
df.loc[:,'D'] = np.array([5] * len(df))
df

### Missing data 처리
- pandas primarily uses the value **np.nan** to represent missing data. 
- dropna()
- fillna()

In [0]:
# Column labels of df at this point; the reindex example in the next cell extends this list.
df.columns

In [0]:
# reindex() lets you change/add/delete labels on a given axis.
# Here: keep the first four dates as rows, and use the existing columns plus
# a column 'E' that df does not have.
df1 = df.reindex(index=dates[:4], columns=[*df.columns, 'E'])
df1

In [0]:
# Set column 'E' to 1 for the rows labelled dates[0] through dates[1];
# the other rows of 'E' are not assigned here.
df1.loc[dates[0]:dates[1],'E'] = 1
df1

In [0]:
# Drop any rows that have missing data (how='any').
# The result is only displayed; df1 is not reassigned.
df1.dropna(how='any')

In [0]:
# Fill missing data with the value 5.
# The result is only displayed; df1 is not reassigned.
df1.fillna(value=5)

In [0]:
# Boolean mask of df1 that marks where values are NaN.
df1.isna()



---



### 데이터 통계 (Statistics)

In [0]:
# Current df: columns A-D and F, including the values overwritten via at/iat/loc.
df

In [0]:
# Mean of df using the default axis.
df.mean()

In [0]:
# Mean along axis 0, i.e. one value per column.
df.mean(axis=0)

In [0]:
# Mean along axis 1, i.e. one value per row (date).
df.mean(axis=1)

In [0]:
# Standard deviation along axis 0, i.e. one value per column.
df.std(axis=0)

#### [도전] 데이터프레임 df의 평균(mean(0))과 표준편차를 이용한 그래프
- 평균에 대한 꺾은선그래프
- 평균과 표준편차를 이용한 막대그래프
> x-축은 A,B,C,D,F

In [0]:
# numpy and matplotlib.pyplot are already imported in the notebook's first code cell;
# they are repeated here, presumably so the plotting exercise can be run on its own.
import numpy as np
import matplotlib.pyplot as plt
# Render figures inside the notebook cell output.
%matplotlib inline

In [0]:
# Draw a bar chart with error bars from small hand-written sample data.
means = [1, 2, 3]
stddevs = [0.2, 0.4, 0.5]
bar_labels = ['bar 1', 'bar 2', 'bar 3']

# One x position per bar. The labels are attached as tick labels so the x-axis
# shows 'bar 1'..'bar 3' instead of bare numeric positions.
x_pos = list(range(len(bar_labels)))
plt.bar(x_pos, means, yerr=stddevs, tick_label=bar_labels)

plt.show()

In [0]:
# Line graph of the per-column mean of df, with circle markers of size 8.
plt.plot(df.mean(axis=0), '-o', markersize=8)

In [0]:
# Bar chart of the per-column mean of df, with the per-column standard
# deviation drawn as error bars; the column labels name the bars.
bar_labels = df.columns
plt.bar(bar_labels, df.mean(axis=0), yerr=df.std(axis=0))

***

## pandas의 데이터 시각화

In [0]:
# The frame that the pandas plotting examples draw from.
df

### 꺾은선 그래프

In [0]:
# Line plot of column 'A', drawn in red ('r') with circle markers of size 8.
df['A'].plot(marker='o', c='r', ms=8)

In [0]:
# Line plot of column 'F', drawn in red ('r') with circle markers of size 8.
df['F'].plot(marker='o', c='r', ms=8)

In [0]:
# Same plot of 'F' through DataFrame.plot, selecting the column with y=;
# the x-axis is the date index.
df.plot(y='F', marker='o', c='r', ms=8)

In [0]:
# Draw two columns on one set of axes.
# Selecting several columns needs a list inside the brackets: df['A', 'F'] looks up
# the single key ('A', 'F') and raises KeyError on this frame.
# The fixed colour is dropped so the two lines get different colours and can be told apart.
df[['A', 'F']].plot(marker='o', ms=8)

In [0]:
# Two columns on one set of axes via DataFrame.plot, passing a list of column names as y.
df.plot(y=['A','F'], marker='o')

### 산포도 (Scatter graph)

In [0]:
# Scatter plot of column 'A' (y) against column 'F' (x): red circle markers, marker size s=32.
df.plot(kind='scatter',x='F',y='A', color='r', marker='o', s=32)  

In [0]:
# Scatter plot of column 'C' (y) against column 'B' (x): blue 'd' markers, marker size s=50.
df.plot(kind='scatter',x='B',y='C', color='blue', marker='d', s=50)  

### 바 그래프

In [0]:
# Bar chart of column 'A' through the Series plot accessor.
df['A'].plot.bar()

In [0]:
# Bar chart of column 'F' through the Series plot accessor.
df['F'].plot.bar()

In [0]:
# Bar chart of 'F' again, this time through DataFrame.plot with kind='bar'.
df.plot(kind='bar', y='F')

#### 다중 바그래프

In [0]:
# Multiple-bar chart: columns 'A', 'C' and 'F' plotted together in one bar chart.
df.plot(kind='bar',y=['A','C','F'])

#### 평균/표준편차 그래프

In [0]:
# The three inputs of the mean/std chart: per-column mean, per-column
# standard deviation, and the column labels.
(df.mean(axis=0), df.std(axis=0), df.columns)

In [0]:
# Mean/std bar chart drawn through the pandas plot accessor: one bar per
# column of df, with the per-column standard deviation as error bars.
column_means = df.mean(axis=0)
column_stds = df.std(axis=0)
column_means.plot.bar(yerr=column_stds)



---



### 그래프 이어 그리기

In [0]:
# Sample sin(2*pi*x) on [0, 5) in steps of 0.01 (five full periods)
# and draw the samples as dots of size 4 on a 10x4 figure.
x1 = np.arange(start=0.0, stop=5.0, step=0.01)
y1 = np.sin(2 * np.pi * x1)

plt.figure(figsize=(10, 4))
plt.plot(x1, y1, 'o', markersize=4)

In [0]:
# Shapes of the two arrays (y1 is computed element-wise from x1).
x1.shape, y1.shape

#### pandas 데이터프레임을 이용한 그래프 

In [0]:
# Wrap the sine samples in a one-column frame indexed by x1.
# Note: this rebinds the name `df`; the random frame used earlier is no longer reachable under it.
df = pd.DataFrame({'sine': y1}, index=x1, columns=['sine'])
df.head()

In [0]:
# Trailing rows of the sine frame.
df.tail()

In [0]:
# 80/20 positional split of df: the leading part becomes `pre`, the remainder `post`.
n_rows = len(df)
pre_size = int(n_rows * 0.8)
post_size = n_rows - pre_size
pre = df.iloc[:pre_size]
post = df.iloc[pre_size:]
print(len(pre), len(post))

In [0]:
# Shape of the trailing part of the split.
post.shape

In [0]:
# Noisy copy of `post`: add zero-mean Gaussian noise (scale 0.2) to every row.
# The noise comes from a fixed-seed generator so the comparison plots look the
# same on every run. It is shaped (len(post), 1) to line up with the single
# 'sine' column of `post`.
noise = np.random.default_rng(0).normal(scale=0.2, size=(len(post), 1))
post2 = post + noise

In [0]:
# Plot the three pieces on one time-step axis: `pre` occupies steps [0, len(pre)),
# while `post` and its noisy copy `post2` share the steps that follow.
pre_steps = np.arange(len(pre))
post_steps = np.arange(len(pre), len(pre) + len(post))

plt.figure(figsize=(10, 5))
plt.plot(pre_steps, pre, 'g', label="pre")
plt.plot(post_steps, post, 'b', marker='.', label="post")
plt.plot(post_steps, post2, 'r', label="post2")
plt.xlabel('Time Step')
plt.ylabel('Value')
plt.legend()
plt.show()

#### post와 post2를 동시에 그려서 두 그래프를 비교해 보시오.

In [0]:
# Compare `post` with its noisy copy `post2` on the same axes; both are drawn
# over the time steps that follow the `pre` segment.
post_steps = np.arange(len(pre), len(pre) + len(post))

plt.figure(figsize=(10, 5))
plt.plot(post_steps, post, 'b', marker='.', label="post")
plt.plot(post_steps, post2, 'r', label="post2")
plt.xlabel('Time Step')
plt.ylabel('Value')
plt.legend()
plt.show()
