# optional_1_data_exploration

training-data-analyst/courses/machine_learning/deepdive2/time_series_prediction/labs/optional_1_data_exploration.ipynb

#### global variable로 cell contents replace

In [None]:
# Lets a cell (e.g. a SQL query) refer to Python variables as {name}.
@register_cell_magic('with_globals')
def with_globals(line, cell):
    """Fill the cell's `{name}` placeholders from globals and run the result.

    Args:
        line: text following `%%with_globals` on the magic line; when it
            contains 'print', the substituted cell text is echoed first.
        cell: cell body, used as a `str.format` template over `globals()`.
    """
    rendered = cell.format(**globals())
    wants_echo = 'print' in line
    if wants_echo:
        # Show exactly what is about to be executed.
        print(rendered)
    shell = get_ipython()
    shell.run_cell(rendered)

#### create the dataset in our project BigQuery

In [None]:
!bq mk stock_src

#### upload csv to bigquery

In [None]:
%%bash
# The %%bash cell magic has to be the first line of the cell, so notes go below it.
# https://cloud.google.com/bigquery/docs/loading-data-local?hl=ko#bq
# Loads price_history.csv into the BigQuery table stock_src.price_history.
# NOTE(review): BUCKET is expected to be set in the environment — confirm.

TABLE=price_history
SCHEMA=symbol:STRING,Date:DATE,Open:FLOAT,Close:FLOAT

# Unzip the archive only when the CSV is not already present.
test -f $TABLE.csv || unzip ../stock_src/$TABLE.csv.zip
# Copy the CSV to Cloud Storage, then load it into BigQuery from there.
gsutil -m cp $TABLE.csv gs://$BUCKET/stock_src/$TABLE.csv
bq load --source_format=CSV --skip_leading_rows=1 \
    stock_src.$TABLE gs://$BUCKET/stock_src/$TABLE.csv  $SCHEMA

#### bigquery table columns 목록 확인

In [None]:
%%bigquery --project {PROJECT}
# List the name and data type of every column in the stock_src dataset,
# ordered by table and then by column position.
SELECT table_name, column_name, data_type
FROM `stock_src.INFORMATION_SCHEMA.COLUMNS`
ORDER BY table_name, ordinal_position

In [None]:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.rolling.html
# https://ordo.tistory.com/67
# Centered rolling mean over a 100-row window of Close, giving a smoother
# series to plot.
df_stock['Close_smoothed'] = (
    df_stock['Close'].rolling(window=100, center=True).mean()
)

In [None]:
# Convert the DataFrame's Date column to a datetime type.
df_stock['Date'] = pd.to_datetime(df_stock['Date'])

#### 하나의 도표에 여러 값 plot 하기

In [None]:
# Plot the price of one symbol together with the S&P 500 index.
def plot_with_sp(symbol):
    """Draw `symbol`'s close price and the S&P close price in one figure.

    The S&P series (from the global `df_sp`) is drawn on the first axes and
    the stock series (from `query_stock(symbol)`) on a twin axes created
    with `twinx()`, so each series has its own y scale.

    Args:
        symbol: ticker passed to `query_stock`; also used for the legend
            label, the title and the second y-axis label.
    """
    stock_frame = query_stock(symbol)
    stock_frame.Date = pd.to_datetime(stock_frame.Date)

    figure = plt.figure()
    sp_axes = figure.add_subplot(111)
    stock_axes = sp_axes.twinx()

    df_sp.plot(
        x='Date', y='Close', label='S&P', color='green', ax=sp_axes,
        alpha=0.7,
    )
    plotted_axes = stock_frame.plot(
        x='Date', y='Close', label=symbol,
        title=symbol + ' and S&P index', ax=stock_axes, alpha=0.7,
    )

    # Separate legend positions so the two legends do not overlap.
    sp_axes.legend(loc=3)
    stock_axes.legend(loc=4)

    sp_axes.set_ylabel('S&P price')
    stock_axes.set_ylabel(symbol + ' price')

    # Restrict the x axis to a fixed date window.
    window_start = pd.to_datetime('2004-08-05')
    window_end = pd.to_datetime('2013-08-05')
    plotted_axes.set_xlim(window_start, window_end)

In [39]:
%%bigquery df --project {PROJECT}
# The %%bigquery magic has to be the first line of the cell and its argument
# line cannot hold a comment, so all notes live in the query body.
# `df` on the magic line names the DataFrame that receives the query result.
# https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions?hl=ko#lag
# https://gent.tistory.com/339
# LAG(col, offset): value of col from the row `offset` rows earlier in the window.
# OVER (PARTITION BY symbol ORDER BY year DESC): rows are ordered newest-first
# within each symbol, so LAG(avg_close, 1) returns the following year's average.
WITH
with_year AS  # temporary table with_year
    (
    SELECT symbol,
    EXTRACT(YEAR FROM date) AS year,  # keep only the year part of the date
    close
    FROM `stock_src.price_history`
    WHERE symbol in (SELECT symbol FROM `stock_src.snp500`)
    ),
year_aggregated AS # temporary table year_aggregated
    (
    SELECT year, symbol, AVG(close) as avg_close
    FROM with_year
    WHERE year >= 2000
    GROUP BY year, symbol
    )
SELECT year, symbol, avg_close as close,
(LAG(avg_close, 1) OVER (PARTITION BY symbol ORDER BY year DESC) # per symbol
) AS next_yr_close
FROM year_aggregated
ORDER BY symbol, year

In [None]:
# dropna: remove every row that contains a missing (NA) value.
# inplace=True: apply the change to df itself instead of returning a new frame.
df.dropna(inplace=True)

In [None]:
# Add a percent_increase column: the change from close to next_yr_close,
# expressed as a fraction of close.
df['percent_increase'] = (df['next_yr_close'] - df['close']) / df['close']

In [None]:
# Draw 3 values from the symbol column (fixed random_state, so repeatable).
# Only the symbol values are kept.
random_stocks = df.symbol.sample(n=3, random_state=3)

# Merge df with the sampled symbols to get the rows belonging to them.
# NOTE(review): the merged frame is not assigned, so it is discarded here.
df.merge(random_stocks)

# Sort by the given column.
# NOTE(review): sort_values returns a new frame; df itself stays unsorted.
df.sort_values('percent_increase')

In [82]:
df.head()

Unnamed: 0,Date,industry,close
0,2000-01-03,Consumer Discretionary,46.464
1,2000-01-04,Consumer Discretionary,45.101538
2,2000-01-05,Consumer Discretionary,45.071077
3,2000-01-06,Consumer Discretionary,44.745077
4,2000-01-07,Consumer Discretionary,45.738308


In [None]:
# set_index(['industry', 'Date']): make industry and Date the index levels.
# unstack(0): pivot index level 0 (industry) so each industry becomes a column.
# dropna(): drop rows that contain NaN.
# NOTE(review): the result is not assigned here; presumably it is what the
# plotting cell refers to as df_ind — confirm.
df.set_index(['industry', 'Date']).unstack(0).dropna()

In [None]:
# Plot every column of df_ind on one 16x8 figure.
ax = df_ind.plot(figsize=(16, 8))
# Place the legend below the plot area, centered, in two columns.
ax.legend(
    loc='upper center',
    bbox_to_anchor=(0.5, -0.05),
    shadow=True,
    ncol=2,
)

In [None]:
# Rescale values (e.g. close prices) into the range 0 to 1.
def min_max_scale(df):
    """Return `df` linearly rescaled so its minimum maps to 0 and maximum to 1.

    Uses (x - min) / (max - min). The denominator is the value range, not
    the maximum alone; dividing by the maximum would leave the largest value
    below 1 whenever the minimum is non-zero.

    Args:
        df: pandas Series or DataFrame of numeric values. For a DataFrame,
            min and max are taken per column, so each column is scaled
            independently.

    Returns:
        An object of the same shape as `df` with values in [0, 1]. A column
        whose values are all equal has a zero range and yields NaN.
    """
    lowest = df.min()
    value_range = df.max() - lowest
    return (df - lowest) / value_range

In [None]:
# http://www.gurubee.net/lecture/2674
# https://geniusjo.tistory.com/entry/%EC%9C%88%EB%8F%84%EC%9A%B0-%ED%95%A8%EC%88%98WINDOW-FUNCTION

# Window frame: from the row n_days before the current row through the row
# immediately before the current row.
# ROWS BETWEEN {n_days} PRECEDING AND 1 PRECEDING

def get_window_fxn(agg_fxn, n_days):
    """Generate the SQL expression for one time-series feature.

    E.g., compute the average of the price over the past 5 days.

    Args:
        agg_fxn: name of the SQL aggregate applied to `close` (inserted
            into the query text as-is).
        n_days: number of rows before the current one where the window
            frame starts; the frame ends one row before the current row.

    Returns:
        A SQL select-list expression string: the windowed aggregate divided
        by `close`, aliased `close_{agg_fxn}_prior_{n_days}_days`.
    """
    SCALE_VALUE = 'close'
    # Rows are ordered by ascending date so that PRECEDING rows are earlier
    # days; a descending order would aggregate future prices instead.
    sql = '''
    ({agg_fxn}(close) OVER (PARTITION BY symbol
                      ORDER BY date
                      ROWS BETWEEN {n_days} PRECEDING AND 1 PRECEDING))/{scale}
                      AS close_{agg_fxn}_prior_{n_days}_days'''.format(
                          agg_fxn=agg_fxn, n_days=n_days, scale=SCALE_VALUE)
    return sql
