In [None]:
import requests
# Synoptic weather observation (ASOS) lookup
def get_weather_data(stnIds, date, hour):
    """Fetch one hourly ASOS observation from the data.go.kr weather API.

    The same ``date``/``hour`` pair is sent as both the start and the end of
    the query window, so the request covers a single hour.

    Args:
        stnIds: Station id, sent as the ``stnIds`` query parameter.
        date: Date string, sent as ``startDt`` and ``endDt``.
        hour: Hour string, sent as ``startHh`` and ``endHh``.

    Returns:
        A tuple ``(ta, rn, hm, icsr, dsnw)`` taken from the first returned
        item: temperature, precipitation, humidity, solar radiation and snow
        depth, exactly as they appear in the JSON payload.

    Raises:
        KeyError: If the payload lacks the expected keys or holds no items.
        requests.RequestException: On network failure or timeout.
    """
    url = 'http://apis.data.go.kr/1360000/AsosHourlyInfoService/getWthrDataList'
    params = {
        'serviceKey': 'your_api_key',  # data.go.kr (public data portal) API key
        'pageNo': '1',
        'numOfRows': '10',
        'dataType': 'JSON',
        'dataCd': 'ASOS',
        'dateCd': 'HR',
        'startDt': date,
        'startHh': hour,
        'endDt': date,
        'endHh': hour,
        'stnIds': stnIds,
    }

    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, params=params, timeout=10)
    data = response.json()
    items = data["response"]["body"]["items"]["item"]
    if not items:
        # Callers already treat KeyError as "no observation"; raising it here
        # avoids silently returning None, which would break tuple unpacking.
        raise KeyError("item")

    first = items[0]
    return first['ta'], first['rn'], first['hm'], first['icsr'], first['dsnw']


In [None]:
import requests
# address = "시 구 동 번지"  (city, district, neighbourhood, lot number)
def address_to_xy(address):
    """Geocode an address with the Kakao local address-search API.

    Args:
        address: Address string sent as the ``query`` parameter.

    Returns:
        The ``(x, y)`` pair from the ``address`` entry of the first matching
        document, as returned by the API.
        NOTE(review): presumably x is longitude and y is latitude (the
        heat-map code in this file passes them as [y, x]) — confirm.

    Raises:
        IndexError: If the search returns no documents.
        requests.RequestException: On network failure or timeout.
    """
    url = "https://dapi.kakao.com/v2/local/search/address.json"
    key = "your_api_key"  # Kakao REST API key
    headers = {"Authorization": f"KakaoAK {key}"}
    params = {"query": address}

    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, headers=headers, params=params, timeout=10)
    documents = response.json()["documents"]
    result = documents[0]["address"]  # IndexError here means "no match"

    return result["x"], result["y"]


In [None]:
import requests
from collections import Counter
# Extract POI category names
def address_to_category(address):
    """Count the category names of places returned by a Kakao keyword search.

    Args:
        address: Text sent as the ``query`` parameter of the keyword search.

    Returns:
        A ``Counter`` mapping each ``category_name`` found in the returned
        documents to the number of documents carrying it. Empty when the
        search returns no documents.

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    url = "https://dapi.kakao.com/v2/local/search/keyword.json"
    key = "your_api_key"  # Kakao REST API key
    headers = {"Authorization": f"KakaoAK {key}"}
    params = {"query": address}

    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, headers=headers, params=params, timeout=10)
    documents = response.json()["documents"]

    return Counter(doc["category_name"] for doc in documents)


In [None]:
# Univariate analysis
def univariate_analysis(df, variable):
    """Print a one-column summary: name, describe() output and value counts.

    Args:
        df: DataFrame containing the column.
        variable: Name of the column to summarise.
    """
    print(variable)

    column = df[variable]
    summary = column.describe()
    print(summary)

    # Frequencies ordered by value rather than by count.
    frequencies = column.value_counts().sort_index()
    print(frequencies)

    print("\n")

In [None]:
import scipy.stats as stats
import pandas as pd
# Bivariate analysis
def bivariate_analysis(df, Y, X):
    """Print association statistics between two columns of ``df``.

    Output, in order: the Pearson correlation, the chi-square statistic and
    p-value of the Y-by-X contingency table, the frequency table with
    margins, and the covariance between the two columns.

    Args:
        df: DataFrame holding both columns.
        Y: Name of the dependent-variable column.
        X: Name of the independent-variable column.
    """
    # Imported here because the cell that defines this function imports only
    # scipy and pandas; relying on a later cell's numpy import raised
    # NameError when this function was called first.
    import numpy as np

    print(Y, "*", X)

    # Dependent and independent variables
    dependent_variable = df[Y]
    independent_variable = df[X]

    # Pearson correlation coefficient
    corr, _ = stats.pearsonr(dependent_variable, independent_variable)
    print('Pearsons correlation: %.3f' % corr)

    # Chi-square test on the contingency table
    contingency_table = pd.crosstab(dependent_variable, independent_variable)
    chi2, p, _dof, _expected = stats.chi2_contingency(contingency_table)
    print('Chi-square statistic: %.3f' % chi2)
    print('p-value: %.3f' % p)

    # Frequency table including row/column totals
    frequency_table = pd.crosstab(dependent_variable, independent_variable, margins=True)
    print(frequency_table)

    # Covariance: off-diagonal entry of the 2x2 covariance matrix
    covariance = np.cov(dependent_variable, independent_variable)
    print('Covariance: %.3f' % covariance[0][1])

    print("\n")

# 데이터 전처리

In [None]:
import pandas as pd
import re
import numpy as np

df_reservation = pd.read_csv('reservation.csv', encoding='CP949')

# Keep only reservations whose destination address starts with a Seoul token.
dest_city = df_reservation['destAddress'].str.split(' ').str[0]
df_reservation = df_reservation[dest_city.isin(['서울', '서울시', '서울특별시'])]
# The filter leaves gaps in the index labels; renumber so that positional
# loops further down (range(len(df)) used as labels) address existing rows.
df_reservation = df_reservation.reset_index(drop=True)

# Cut each departure address right after its first number followed by a word
# boundary, dropping the detailed address that follows it. Rows whose address
# has no such number (or is not a string) are removed.
lot_number = re.compile(r'\d+\b')
trimmed_addresses = []
for address in df_reservation['deptAddress']:
    found = lot_number.search(address) if isinstance(address, str) else None
    trimmed_addresses.append(address[:found.end()] if found else None)
df_reservation['deptAddress'] = trimmed_addresses
# Drop after the loop (not during it) and renumber, so no label is skipped.
df_reservation = df_reservation.dropna(subset=['deptAddress']).reset_index(drop=True)

# Map each departure city to the nearest of the 103 weather stations.
# Keys are the first two characters of the address's first token; the trailing
# comment names the station when it differs from the city itself.
city_to_stnId = {
    '서울': 108,
    '용인': 119,  # Suwon
    '의정': 108,  # Seoul
    '부천': 112,  # Incheon
    '안양': 116,  # Gwanaksan
    '화성': 119,  # Suwon
    '성남': 116,  # Gwanaksan
    '군포': 116,  # Gwanaksan
    '하남': 108,  # Seoul
    '오산': 119,  # Suwon
    '인천': 112,
    '부산': 159,
    '고양': 108,  # Seoul
    '시흥': 116,  # Gwanaksan
    '수원': 119,
    '광주': 202,  # Yangpyeong
    '김포': 112,  # Incheon
    '과천': 116,  # Gwanaksan
    '파주': 99,
    '평택': 232,  # Cheonan
    '아산': 232,  # Cheonan
    '대전': 133,
    '안산': 119,  # Suwon
    '김천': 135,  # Chupungnyeong
    '양평': 203,  # Icheon
    '동두': 98,
    '가평': 101,  # Chuncheon
    '양주': 98,  # Dongducheon
    '광명': 116,  # Gwanaksan
    '천안': 232,
    '포항': 138,
    '남양': 108,  # Seoul
    '사천': 192,  # Jinju
    '안성': 232,  # Cheonan
    '의왕': 119,  # Suwon
    '김해': 257,  # Yangsan
    '이천': 203,  # Icheon
    '경산': 176,  # Daegu (Gi)
    '구리': 108,  # Seoul
    '나주': 156,  # Gwangju
    '대구': 176,
    '춘천': 101,
    '창원': 155,
    '구미': 279,
    '익산': 146,  # Jeonju
    '원주': 114,
    '동해': 106,
    '청주': 131,
    '제주': 184,
    '세종': 239,
    '제천': 221,
}
# Prefixes missing from the table map to NaN.
first_token = df_reservation['deptAddress'].str.split(' ').str[0]
city_prefix = first_token.str[:2]
df_reservation['stnIds'] = city_prefix.map(city_to_stnId)

# add time
# Split the reservation timestamp into the strings used by the weather query:
# the first whitespace-separated part with '-' removed (date) and the part of
# the second one before ':' (hour). Done column-wise instead of writing through
# df[col][i], which is chained assignment and may not update the frame.
timestamp_parts = df_reservation['reserveDate'].str.split()
df_reservation['date'] = timestamp_parts.str[0].str.replace('-', '', regex=False)
df_reservation['hour'] = timestamp_parts.str[1].str.split(':').str[0]

# add weather
# ta: temperature, rn: precipitation, hm: humidity,
# icsr: solar radiation, dsnw: snow depth
weather_columns = ['ta', 'rn', 'hm', 'icsr', 'dsnw']
observations = {}
for label, stnIds, date, hour in zip(
    df_reservation.index,
    df_reservation['stnIds'],
    df_reservation['date'],
    df_reservation['hour'],
):
    if pd.isna(stnIds):
        # No station was mapped for this city, so there is nothing to query.
        continue
    try:
        # int(): a column containing NaN is float, which would be sent as "108.0".
        observation = get_weather_data(int(stnIds), date, '{0:0>2}'.format(hour))
    except KeyError:
        # Payload without the expected keys: treat as "no observation".
        continue
    if observation is not None:
        observations[label] = observation

weather = pd.DataFrame(
    list(observations.values()),
    index=list(observations.keys()),
    columns=weather_columns,
)
# NOTE(review): the values come straight from the JSON payload and are
# presumably strings (blank when nothing was measured) — confirm against a
# live response. Coercing to numbers lets fillna and the later `== 0` rain
# test operate on numeric values.
weather = weather.apply(pd.to_numeric, errors='coerce')
weather = weather.fillna({'rn': 0, 'icsr': 0, 'dsnw': 0})

# Inner join keeps only reservations that received an observation (the
# original drop(i, inplace=False) discarded its result and removed nothing);
# the index is renumbered so later positional loops stay valid.
df_reservation = (
    df_reservation.drop(columns=weather_columns, errors='ignore')
    .join(weather, how='inner')
    .reset_index(drop=True)
)

# Rain indicator
def map_rn(x):
    """Return 0 when ``x`` equals 0, otherwise 1."""
    return 0 if x == 0 else 1

# One-hot encode the rain indicator into rn_cat_<value> columns.
df_reservation['rn_cat'] = df_reservation['rn'].apply(map_rn)
# dtype=int keeps the dummies numeric 0/1 regardless of the pandas version
# (newer versions default to bool).
df_reservation = pd.get_dummies(df_reservation, columns=['rn_cat'], prefix='rn_cat', dtype=int)
# Strip a literal ".0" from column names. regex=False is explicit because, as
# a regular expression, '.0' matches any character followed by 0 and would
# turn 'rn_cat_0' into 'rn_cat'.
df_reservation.columns = df_reservation.columns.str.replace('.0', '', regex=False)

# add dow 
# Parse the reservation timestamp once, then derive the day name and the
# numeric weekday from the parsed values.
reserved_at = pd.to_datetime(df_reservation['reserveDate'])
df_reservation['reserveDate'] = reserved_at
df_reservation['dayOfWeek'] = reserved_at.dt.day_name()
df_reservation['weekday'] = reserved_at.dt.weekday

# Reservation status: collapse the raw codes into a binary outcome.
# Codes 2 and 3 become 1; codes 0, 1, 4 and 5 become 0; any other value is
# left untouched. Both masks are taken from a snapshot of the raw codes so the
# second assignment never sees values written by the first.
raw_status = df_reservation['status'].copy()
df_reservation.loc[raw_status.isin([0, 1, 4, 5]), 'status'] = 0
df_reservation.loc[raw_status.isin([2, 3]), 'status'] = 1

# add x, y
# Geocode every departure address. Results are collected first and joined
# afterwards: the 'x'/'y' columns did not exist before this step, so writing
# df['x'][i] raised KeyError, and dropping rows inside a positional loop
# skipped labels.
coordinates = {}
for label, address in df_reservation['deptAddress'].items():
    try:
        coordinates[label] = address_to_xy(address)
    except IndexError:
        # The geocoder returned no match for this address.
        continue

xy = pd.DataFrame(
    list(coordinates.values()),
    index=list(coordinates.keys()),
    columns=['x', 'y'],
)
# Inner join removes reservations that could not be geocoded; renumber the
# index so later positional loops stay valid.
df_reservation = (
    df_reservation.drop(columns=['x', 'y'], errors='ignore')
    .join(xy, how='inner')
    .reset_index(drop=True)
)
    
# add city, district
# NOTE(review): this reloads the saved checkpoint and therefore replaces the
# frame built by the preceding steps — confirm that this is intended.
df_reservation = pd.read_csv('df_reservation.csv', encoding='utf-8-sig')

# City and district are the first and second space-separated tokens of the
# departure address. Column-wise assignment replaces the per-row df[col][i]
# writes (chained assignment may not update the frame); an address with a
# single token now yields a missing district instead of raising IndexError.
address_tokens = df_reservation['deptAddress'].str.split(' ')
df_reservation['city'] = address_tokens.str[0]
df_reservation['district'] = address_tokens.str[1]

# index=False: this file is read back above without index_col, so writing the
# index added one more 'Unnamed: 0' column on every read/write round trip.
df_reservation.to_csv('df_reservation.csv', encoding='utf-8-sig', index=False)

# 데이터 추출

In [None]:
# 로지스틱 회귀분석
import statsmodels.formula.api as smf
# Dependent variable: reservation status; independent variable: rain indicator.
formula = 'status ~ rn_cat_1'
model = smf.logit(formula, data=df_reservation).fit()
summary = model.summary()
print(summary)

In [None]:
# count dow
import matplotlib.pyplot as plt

# Reservations per day name, as a one-column frame.
day_counts = df_reservation['dayOfWeek'].value_counts()
df_dayOfWeek = day_counts.to_frame(name='count')
df_dayOfWeek.index.name = 'dayOfWeek'

# Append a grand-total row at the bottom of the table.
df_dayOfWeek.loc['Total'] = df_dayOfWeek.sum()

# Plot every row except the trailing 'Total'.
per_day = df_dayOfWeek.iloc[:-1]
plt.bar(per_day.index, per_day['count'])
plt.title('service_use')
plt.xlabel('dow')
plt.ylabel('sum')
plt.show()

# t-test dow
import pandas as pd
import scipy.stats as stats

# Reservations per weekday number. Reindexing to 0..6 gives days without any
# reservation an explicit count of 0, so the two groups below are selected by
# weekday label; a positional [:5]/[5:] split shifts days into the wrong group
# whenever a weekday is absent from the data.
weekday_counts = (
    df_reservation.groupby('weekday')['weekday'].count()
    .reindex(range(7), fill_value=0)
)
# Two-sample t-test: weekdays 0-4 against weekdays 5-6.
_, p_value = stats.ttest_ind(weekday_counts.loc[0:4], weekday_counts.loc[5:6])
print(p_value)
if p_value < 0.05:
    print('요일별 서비스 이용량에 차이가 있습니다.')
else:
    print('요일별 서비스 이용량에 차이가 없습니다.')


In [None]:
# add category name
# Accumulate POI category counts over every destination address.
result_sunday = Counter()

# Iterate over the values themselves: df[col][i] looks rows up by label and
# raises KeyError as soon as the index is not exactly 0..n-1.
for position, destination in enumerate(df_reservation['destAddress']):
    print(position)  # progress indicator: one line per processed row
    result_sunday += address_to_category(destination)

# Categories ordered from most to least frequent.
df = pd.DataFrame(result_sunday.most_common(), columns=['word', 'count'])
df.to_csv('result_sunday.csv', encoding='utf-8-sig')

In [None]:
# 오버레이
import folium
from folium.plugins import HeatMap

def add_marker(m, x, y, popup):
    """Add a red info marker and a blue filled circle to map ``m``.

    Both shapes are placed at ``[x, y]`` and share the same popup.

    NOTE(review): folium locations are [latitude, longitude], and the heat-map
    code in this file passes [y, x]; confirm that callers pass the latitude as
    ``x`` here, otherwise the two are inconsistent.

    Args:
        m: Map object the shapes are added to.
        x: First coordinate of the location.
        y: Second coordinate of the location.
        popup: Popup content attached to both shapes.
    """
    pin = folium.Marker(
        location=[x, y],
        popup=popup,
        icon=folium.Icon(color='red', icon='info-sign'),
    )
    pin.add_to(m)

    ring = folium.Circle(
        radius=100,
        location=[x, y],
        popup=popup,
        color='#3186cc',
        fill=True,
        fill_color='#3186cc',
    )
    ring.add_to(m)


# Build the heat-map input: one [y, x] pair per reservation.
data = [list(pair) for pair in zip(df_reservation['y'], df_reservation['x'])]
print(data)
# Create the base map at a fixed centre point.
m = folium.Map(location=[37.5665, 126.9780], zoom_start=13)
# Heat-map layer; only the first 20 points are drawn.
heat_layer = HeatMap(data[:20])
heat_layer.add_to(m)

m.save('heatmap.html')

