# Import Library

In [1]:
import os
import fnmatch as fn
import numpy as np
import pandas as pd
import dbfread

import pandas_profiling as pp
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from matplotlib import font_manager, rc
%matplotlib inline

import descartes
import geopandas as gpd
from shapely.geometry import Point, Polygon
from math import sin,cos,sqrt,atan2,radians 
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.gofplots import qqplot
from scipy.stats import shapiro
import tbats

import requests
import logging
import time
import googlemaps
import reverse_geocoder
import pprint
import folium
import zipfile
import json

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import export_graphviz
from xgboost import XGBRegressor
import lightgbm as lgb


# Keep the minus sign renderable once a non-default font family is selected.
matplotlib.rcParams['axes.unicode_minus'] = False

# Select a Korean-capable font so Hangul text in figures is not garbled.
# Candidate files are tried in order; if none exists on this machine the
# matplotlib default font is kept instead of aborting the whole import cell.
_korean_font_paths = (
    'c:/Windows/Fonts/malgun.ttf',       # Windows
    '/Library/Fonts/AppleGothic.ttf',    # macOS
)
font_name = None  # stays None when no candidate font file is found
for _font_path in _korean_font_paths:
    if os.path.isfile(_font_path):
        font_name = font_manager.FontProperties(fname = _font_path).get_name()
        rc('font',family = font_name)
        break

import plotly
import cufflinks as cf
import plotly.graph_objs as go
import plotly.plotly as py
import plotly.figure_factory as ff
from plotly.offline import iplot
from plotly.offline import init_notebook_mode, plot,iplot

# Initialise plotly's notebook mode so figures passed to iplot() render inline.
init_notebook_mode(connected = True)
# Switch cufflinks to offline mode as well.
cf.go_offline()

import warnings
warnings.filterwarnings(action='ignore') # hide all warning messages

# Import Data

In [2]:
# Load the two merged CSV extracts.
# For each file: drop the first column by position, then parse `date`
# as a YYYY-MM-DD timestamp.
df1 = (
    pd.read_csv('가공데이터/merge_카드_유동_환경_edit1.csv')
    .iloc[:, 1:]
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%Y-%m-%d'))
)

df2 = (
    pd.read_csv('가공데이터/merge_카드_유동_환경_edit2.csv')
    .iloc[:, 1:]
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%Y-%m-%d'))
)

# Exploratory Data Analysis

## Null 값은 존재하지 않는다.

In [3]:
# Per-column count of missing values in df2, smallest first.
df2.isna().sum().sort_values()

date                 0
42M_45A_F_USE_CNT    0
43M_45A_F_USE_CNT    0
44M_45A_F_USE_CNT    0
50M_45A_F_USE_CNT    0
52M_45A_F_USE_CNT    0
60M_45A_F_USE_CNT    0
62M_45A_F_USE_CNT    0
70M_45A_F_USE_CNT    0
71M_45A_F_USE_CNT    0
80M_45A_F_USE_CNT    0
81M_45A_F_USE_CNT    0
92M_45A_F_USE_CNT    0
40M_45A_F_USE_CNT    0
10M_50A_F_USE_CNT    0
21M_50A_F_USE_CNT    0
22M_50A_F_USE_CNT    0
30M_50A_F_USE_CNT    0
31M_50A_F_USE_CNT    0
32M_50A_F_USE_CNT    0
33M_50A_F_USE_CNT    0
34M_50A_F_USE_CNT    0
35M_50A_F_USE_CNT    0
40M_50A_F_USE_CNT    0
42M_50A_F_USE_CNT    0
43M_50A_F_USE_CNT    0
44M_50A_F_USE_CNT    0
20M_50A_F_USE_CNT    0
35M_45A_F_USE_CNT    0
34M_45A_F_USE_CNT    0
                    ..
92M_25A_M_USE_AMT    0
70M_20A_M_USE_AMT    0
71M_20A_M_USE_AMT    0
80M_20A_M_USE_AMT    0
81M_20A_M_USE_AMT    0
92M_20A_M_USE_AMT    0
10M_25A_M_USE_AMT    0
20M_25A_M_USE_AMT    0
21M_25A_M_USE_AMT    0
22M_25A_M_USE_AMT    0
30M_25A_M_USE_AMT    0
31M_25A_M_USE_AMT    0
32M_25A_M_U

## 월을 기준으로 살펴보기

- 동별로 '의료기관(70) 카테고리'에 대한 변동폭(분산)이 큰 동과 작은 동이 발견된다.
- USE_AMT(사용금액)인 경우, 이상치(Outlier)가 많이 관측된다.
- 관측치가 많은 '요식업소(80)' 기준으로 보았을 때도 같은 현상이 발견된다.

In [4]:
# Box plots of 70M_25A_M_USE_CNT by month, one figure per district.
count_col = '70M_25A_M_USE_CNT'
monthly_mean = df2.groupby(['month', 'district'])[count_col].mean().reset_index()

# The aggregate is only used for its district labels; every box is drawn
# from the raw rows of df2 for that district.
for district_name in np.unique(monthly_mean['district']):
    district_rows = df2.loc[df2['district'] == district_name]
    box_trace = go.Box(x=district_rows['month'], y=district_rows[count_col])
    figure = go.Figure(
        data=[box_trace],
        layout=go.Layout(
            yaxis=dict(title='USE_CNT'),
            title='70cate USE_CNT ' + 'in ' + district_name,
        ),
    )
    iplot(figure)

In [5]:
# Box plots of 80M_25A_M_USE_CNT by month, one figure per district.
count_col = '80M_25A_M_USE_CNT'
monthly_mean = df2.groupby(['month', 'district'])[count_col].mean().reset_index()

# The aggregate is only used for its district labels; every box is drawn
# from the raw rows of df2 for that district.
for district_name in np.unique(monthly_mean['district']):
    district_rows = df2.loc[df2['district'] == district_name]
    box_trace = go.Box(x=district_rows['month'], y=district_rows[count_col])
    figure = go.Figure(
        data=[box_trace],
        layout=go.Layout(
            yaxis=dict(title='USE_CNT'),
            title='80cate USE_CNT ' + 'in ' + district_name,
        ),
    )
    iplot(figure)

In [6]:
# Box plots of 70M_25A_M_USE_AMT by month, one figure per district.
amount_col = '70M_25A_M_USE_AMT'
monthly_mean = df2.groupby(['month', 'district'])[amount_col].mean().reset_index()

# The aggregate is only used for its district labels; every box is drawn
# from the raw rows of df2 for that district.
for district_name in np.unique(monthly_mean['district']):
    district_rows = df2.loc[df2['district'] == district_name]
    box_trace = go.Box(x=district_rows['month'], y=district_rows[amount_col])
    figure = go.Figure(
        data=[box_trace],
        layout=go.Layout(
            yaxis=dict(title='USE_AMT'),
            title='70cate USE_AMT ' + 'in ' + district_name,
        ),
    )
    iplot(figure)

## Weekday를 기준으로 살펴보기

- 토요일과 일요일에 카드 소비건수와 카드 사용금액이 적은 반면, 월요일에 가장 많음을 발견할 수 있다.
- 동별로 '의료기관(70) 카테고리'에 대한 변동폭(분산)이 큰 동과 작은 동이 발견된다.
- 관측치가 많은 '요식업소(80)' 기준으로 보았을 때도 같은 현상이 발견된다.

In [7]:
# Mean USE_CNT per weekday for rows with MCT_CAT_CD == 70,
# drawn as one bar chart per district.
category_rows = df1.loc[df1['MCT_CAT_CD'] == 70]
weekday_mean = category_rows.groupby(['weekday', 'district'])['USE_CNT'].mean().reset_index()

for district_name in np.unique(weekday_mean['district']):
    district_mean = weekday_mean.loc[weekday_mean['district'] == district_name]
    bar_trace = go.Bar(x=district_mean['weekday'], y=district_mean['USE_CNT'])
    figure = go.Figure(
        data=[bar_trace],
        layout=go.Layout(
            yaxis=dict(title='USE_CNT'),
            title='70 cate USE_CNT ' + 'in ' + district_name,
        ),
    )
    iplot(figure)

In [8]:
# Mean USE_AMT per weekday for rows with MCT_CAT_CD == 70,
# drawn as one bar chart per district.
category_rows = df1.loc[df1['MCT_CAT_CD'] == 70]
weekday_mean = category_rows.groupby(['weekday', 'district'])['USE_AMT'].mean().reset_index()

for district_name in np.unique(weekday_mean['district']):
    district_mean = weekday_mean.loc[weekday_mean['district'] == district_name]
    bar_trace = go.Bar(x=district_mean['weekday'], y=district_mean['USE_AMT'])
    figure = go.Figure(
        data=[bar_trace],
        layout=go.Layout(
            yaxis=dict(title='USE_AMT'),
            title='70 cate USE_AMT ' + 'in ' + district_name,
        ),
    )
    iplot(figure)

In [9]:
# Mean USE_CNT per weekday for rows with MCT_CAT_CD == 80,
# drawn as one bar chart per district.
category_rows = df1.loc[df1['MCT_CAT_CD'] == 80]
weekday_mean = category_rows.groupby(['weekday', 'district'])['USE_CNT'].mean().reset_index()

for district_name in np.unique(weekday_mean['district']):
    district_mean = weekday_mean.loc[weekday_mean['district'] == district_name]
    bar_trace = go.Bar(x=district_mean['weekday'], y=district_mean['USE_CNT'])
    figure = go.Figure(
        data=[bar_trace],
        layout=go.Layout(
            yaxis=dict(title='USE_CNT'),
            title='80 cate USE_CNT ' + 'in ' + district_name,
        ),
    )
    iplot(figure)

In [10]:
# Mean USE_AMT per weekday for rows with MCT_CAT_CD == 80,
# drawn as one bar chart per district.
category_rows = df1.loc[df1['MCT_CAT_CD'] == 80]
weekday_mean = category_rows.groupby(['weekday', 'district'])['USE_AMT'].mean().reset_index()

for district_name in np.unique(weekday_mean['district']):
    district_mean = weekday_mean.loc[weekday_mean['district'] == district_name]
    bar_trace = go.Bar(x=district_mean['weekday'], y=district_mean['USE_AMT'])
    figure = go.Figure(
        data=[bar_trace],
        layout=go.Layout(
            yaxis=dict(title='USE_AMT'),
            title='80 cate USE_AMT ' + 'in ' + district_name,
        ),
    )
    iplot(figure)

## 일일을 기준으로 살펴보기

- 동별로 '의료기관(70) 카테고리'에 대한 변동폭(분산)이 큰 동과 작은 동이 발견된다.
- 일일을 기준으로 보았을 때는 특정한 트렌드 및 계절성이 발견되지 않는다.
- 관측치가 많은 '요식업소(80)' 기준으로 보았을 때도 같은 현상이 발견된다.

In [11]:
# Daily line chart of 70M_25A_M_USE_CNT, one figure per district.
# Aggregate the column that is actually plotted: the mean per (date, district)
# gives exactly one point per date, and groupby returns them in date order.
a1 = df2.groupby(['date','district'])['70M_25A_M_USE_CNT'].mean().to_frame().reset_index()

for i in np.unique(a1['district']):
    # Draw from the aggregated frame (not the raw df2 rows) so the trace
    # matches the aggregation above.
    a2 = a1[a1['district'] == i]
    vi_data = [go.Scatter(x = a2['date'],y=a2['70M_25A_M_USE_CNT'])]
    layout = go.Layout(yaxis = dict(title='USE_CNT'),
                       title = '70cate USE_CNT ' + 'in ' + i)
    fig = go.Figure(data = vi_data,layout = layout)
    iplot(fig)

In [13]:
# Daily line chart of 80M_25A_M_USE_CNT, one figure per district.
# Aggregate the column that is actually plotted: the mean per (date, district)
# gives exactly one point per date, and groupby returns them in date order.
a1 = df2.groupby(['date','district'])['80M_25A_M_USE_CNT'].mean().to_frame().reset_index()

for i in np.unique(a1['district']):
    # Draw from the aggregated frame (not the raw df2 rows) so the trace
    # matches the aggregation above.
    a2 = a1[a1['district'] == i]
    vi_data = [go.Scatter(x = a2['date'],y=a2['80M_25A_M_USE_CNT'])]
    layout = go.Layout(yaxis = dict(title='USE_CNT'),
                       title = '80cate USE_CNT ' + 'in ' + i)
    fig = go.Figure(data = vi_data,layout = layout)
    iplot(fig)