In [8]:
import numpy as np
import pandas as pd
from datetime import timedelta
import glob
import os

import warnings
warnings.filterwarnings(action='ignore')

### 1. 전처리
#### 1) 가구수 데이터

In [2]:
# 가구수 데이터
home_1 = pd.read_csv('data/corr/1인가구_증감률(2017_2019, cp949).csv', encoding='cp949')
home_2 = pd.read_csv('data/corr/2인가구_증감률(2017_2019, cp949).csv', encoding='cp949')
home_4 = pd.read_csv('data/corr/4인가구_증감률(2017_2019, cp949).csv', encoding='cp949')
home_4

Unnamed: 0,자치구,2017년 대비 2018년 증감률,2018년 대비 2019년 증감률
0,종로구,-4.05,-5.53
1,중구,-3.48,-2.65
2,용산구,-3.87,-2.99
3,성동구,-1.55,-5.98
4,광진구,-3.83,-4.03
5,동대문구,-3.12,-4.59
6,중랑구,-4.89,-5.43
7,성북구,-3.93,-1.08
8,강북구,-4.67,-5.82
9,도봉구,-4.8,-5.06


In [3]:
# 증감률 평균 내서 새로운 컬럼 생성
home_1['1인 가구'] = (home_1['2017년 대비 2018년 증감률'] + home_1['2018년 대비 2019년 증감률'])/2
home_2['2인 가구'] = (home_2['2017년 대비 2018년 증감률'] + home_2['2018년 대비 2019년 증감률'])/2
home_4['4인 가구'] = (home_4['2017년 대비 2018년 증감률'] + home_4['2018년 대비 2019년 증감률'])/2
home_4

Unnamed: 0,자치구,2017년 대비 2018년 증감률,2018년 대비 2019년 증감률,4인 가구
0,종로구,-4.05,-5.53,-4.79
1,중구,-3.48,-2.65,-3.065
2,용산구,-3.87,-2.99,-3.43
3,성동구,-1.55,-5.98,-3.765
4,광진구,-3.83,-4.03,-3.93
5,동대문구,-3.12,-4.59,-3.855
6,중랑구,-4.89,-5.43,-5.16
7,성북구,-3.93,-1.08,-2.505
8,강북구,-4.67,-5.82,-5.245
9,도봉구,-4.8,-5.06,-4.93


In [4]:
# 자치구랑 2017-2019 증감률만 남겨두기
home_1 = home_1[['자치구', '1인 가구']].sort_values(by='자치구').reset_index(drop=True)
home_2 = home_2[['자치구', '2인 가구']].sort_values(by='자치구').reset_index(drop=True)
home_4 = home_4[['자치구', '4인 가구']].sort_values(by='자치구').reset_index(drop=True)
home_4

Unnamed: 0,자치구,4인 가구
0,강남구,-3.395
1,강동구,-3.675
2,강북구,-5.245
3,강서구,-4.33
4,관악구,-4.865
5,광진구,-3.93
6,구로구,-3.94
7,금천구,-4.325
8,노원구,-5.02
9,도봉구,-4.93


#### 2) cctv 데이터

In [5]:
cctv = pd.read_csv('data/corr/CCTV_증감률(2017_2019, cp949).csv', encoding='cp949')
cctv

Unnamed: 0,데이터구분,자치구,2017년 대비 2018년 증감률,2018년 대비 2019년 증감률
0,CCTV,강 남 구,9.39,16.09
1,CCTV,강 동 구,25.77,20.81
2,CCTV,강 북 구,33.72,24.27
3,CCTV,강 서 구,18.64,22.39
4,CCTV,관 악 구,20.25,30.36
5,CCTV,광 진 구,28.02,25.3
6,CCTV,구 로 구,17.73,19.09
7,CCTV,금 천 구,48.01,46.57
8,CCTV,노 원 구,8.91,22.56
9,CCTV,도 봉 구,9.02,17.37


In [6]:
# 자치구에 들어간 공백 제거
for i in range(25) :
    cctv['자치구'][i] = ''.join(cctv['자치구'][i].split())

In [7]:
# 증감률 평균 내서 새로운 컬럼 생성
cctv['cctv'] = (cctv['2017년 대비 2018년 증감률'] + cctv['2018년 대비 2019년 증감률'])/2

In [8]:
# 자치구랑 2017-2019 증감률만 남겨두기
cctv = cctv[['자치구', 'cctv']].sort_values(by='자치구').reset_index(drop=True)
cctv

Unnamed: 0,자치구,cctv
0,강남구,12.74
1,강동구,23.29
2,강북구,28.995
3,강서구,20.515
4,관악구,25.305
5,광진구,26.66
6,구로구,18.41
7,금천구,47.29
8,노원구,15.735
9,도봉구,13.195


In [9]:
# 병합
data_1 = pd.merge(home_1, cctv, how='left', on='자치구')
data_2 = pd.merge(home_2, cctv, how='left', on='자치구')
data_4 = pd.merge(home_4, cctv, how='left', on='자치구')
data_1

Unnamed: 0,자치구,1인 가구,cctv
0,강남구,2.63,12.74
1,강동구,4.945,23.29
2,강북구,4.64,28.995
3,강서구,7.855,20.515
4,관악구,5.905,25.305
5,광진구,3.45,26.66
6,구로구,5.42,18.41
7,금천구,7.135,47.29
8,노원구,4.08,15.735
9,도봉구,5.265,13.195


---
### 함수 생성

In [10]:
def data_1_percentage (data_name, name, data_1) :
    data_name[name] = (data_name['2017년 대비 2018년 증감률'] + data_name['2018년 대비 2019년 증감률'])/2
    data_name = data_name[['자치구', name]]
    
    data_1 = pd.merge(data_1, data_name, how='left', on='자치구').sort_values(by='자치구').reset_index(drop=True)
    return data_1

def data_2_percentage (data_name, name, data_2) :
    data_name[name] = (data_name['2017년 대비 2018년 증감률'] + data_name['2018년 대비 2019년 증감률'])/2
    data_name = data_name[['자치구', name]]
    
    data_2 = pd.merge(data_2, data_name, how='left', on='자치구').sort_values(by='자치구').reset_index(drop=True)
    return data_2

def data_4_percentage (data_name, name, data_4) :
    data_name[name] = (data_name['2017년 대비 2018년 증감률'] + data_name['2018년 대비 2019년 증감률'])/2
    data_name = data_name[['자치구', name]]
    
    data_4 = pd.merge(data_4, data_name, how='left', on='자치구').sort_values(by='자치구').reset_index(drop=True)
    return data_4

---

#### 3) 가로등 데이터

In [11]:
light = pd.read_csv('data/corr/가로등_증감률(2017_2019, cp949).csv', encoding='cp949')
light

Unnamed: 0,데이터구분,자치구,2017년 대비 2018년 증감률,2018년 대비 2019년 증감률
0,가로등,종로구,2.6,0.49
1,가로등,중구,0.61,-0.33
2,가로등,용산구,-0.59,5.37
3,가로등,성동구,2.08,4.67
4,가로등,광진구,-1.61,-3.31
5,가로등,동대문구,0.65,0.76
6,가로등,중랑구,0.54,0.03
7,가로등,성북구,-1.34,5.34
8,가로등,강북구,1.8,-4.11
9,가로등,도봉구,0.16,1.9


In [12]:
data_1_percentage(light, '가로등', data_1)
data_2_percentage(light, '가로등', data_2)
data_4_percentage(light, '가로등', data_4)

Unnamed: 0,자치구,4인 가구,cctv,가로등
0,강남구,-3.395,12.74,-1.425
1,강동구,-3.675,23.29,-2.59
2,강북구,-5.245,28.995,-1.155
3,강서구,-4.33,20.515,16.675
4,관악구,-4.865,25.305,4.18
5,광진구,-3.93,26.66,-2.46
6,구로구,-3.94,18.41,0.78
7,금천구,-4.325,47.29,1.21
8,노원구,-5.02,15.735,1.09
9,도봉구,-4.93,13.195,1.03


#### 4) 어린이집/유치원 데이터

In [13]:
pre_school = pd.read_csv('data/corr/어린이집_유치원 증감률(2017~2019, cp949).csv', encoding='cp949')
pre_school

Unnamed: 0,데이터,구,2017년 대비 2018년 증감률,2018년 대비 2019년 증감률
0,어린이집,강남구,11.67,-0.84
1,어린이집,강동구,13.53,-6.4
2,어린이집,강북구,10.39,-6.94
3,어린이집,강서구,7.05,-3.31
4,어린이집,관악구,12.5,-3.7
5,어린이집,광진구,11.25,-1.27
6,어린이집,구로구,12.41,-3.79
7,어린이집,금천구,10.75,-8.14
8,어린이집,노원구,10.59,-3.66
9,어린이집,도봉구,8.08,-12.5


In [14]:
# 어린이집/유치원 데이터 합치기
pre_school = pre_school.groupby([pre_school['구']]).mean().reset_index()

# rename : 자치구로 변경
pre_school.rename({'구' : '자치구'}, axis=1, inplace=True)
pre_school

Unnamed: 0,자치구,2017년 대비 2018년 증감률,2018년 대비 2019년 증감률
0,강남구,7.62,-2.27
1,강동구,10.765,-5.285
2,강북구,5.195,-10.135
3,강서구,5.61,-2.72
4,관악구,15.34,-5.075
5,광진구,5.625,-0.635
6,구로구,10.555,-4.17
7,금천구,5.375,-8.235
8,노원구,12.67,-6.295
9,도봉구,11.445,-10.25


In [15]:
data_1 = data_1_percentage(pre_school, '어린이집/유치원', data_1)
data_2 = data_2_percentage(pre_school, '어린이집/유치원', data_2)
data_4 = data_4_percentage(pre_school, '어린이집/유치원', data_4)
data_4

Unnamed: 0,자치구,4인 가구,cctv,어린이집/유치원
0,강남구,-3.395,12.74,2.675
1,강동구,-3.675,23.29,2.74
2,강북구,-5.245,28.995,-2.47
3,강서구,-4.33,20.515,1.445
4,관악구,-4.865,25.305,5.1325
5,광진구,-3.93,26.66,2.495
6,구로구,-3.94,18.41,3.1925
7,금천구,-4.325,47.29,-1.43
8,노원구,-5.02,15.735,3.1875
9,도봉구,-4.93,13.195,0.5975


#### 5) 초중고등학교 데이터

In [16]:
school = pd.read_csv('data/corr/학교 증감률(2017~2019, cp949).csv', encoding='cp949')
school

Unnamed: 0,데이터,구,2017년 대비 2018년 증감률,2018년 대비 2019년 증감률
0,초등학교,강남구,0.0,0.0
1,초등학교,강동구,0.0,0.0
2,초등학교,강북구,0.0,0.0
3,초등학교,강서구,0.0,0.0
4,초등학교,관악구,0.0,0.0
...,...,...,...,...
70,고등학교,용산구,0.0,0.0
71,고등학교,은평구,0.0,0.0
72,고등학교,종로구,0.0,0.0
73,고등학교,중구,0.0,0.0


In [17]:
# 초중고등학교 데이터 합치기
school = school.groupby([school['구']]).mean().reset_index()

# rename : 자치구로 변경
school.rename({'구' : '자치구'}, axis=1, inplace=True)
school

Unnamed: 0,자치구,2017년 대비 2018년 증감률,2018년 대비 2019년 증감률
0,강남구,0.0,0.0
1,강동구,0.0,0.0
2,강북구,0.0,0.0
3,강서구,0.0,0.0
4,관악구,0.0,0.0
5,광진구,0.0,0.0
6,구로구,0.0,2.563333
7,금천구,0.0,0.0
8,노원구,0.0,0.0
9,도봉구,0.0,0.0


In [18]:
data_1 = data_1_percentage(school, '초중고등학교', data_1)
data_2 = data_2_percentage(school, '초중고등학교', data_2)
data_4 = data_4_percentage(school, '초중고등학교', data_4)

#### 6) 대학교 데이터

In [19]:
univ = pd.read_csv('data/corr/대학교 증감률(2017~2019, cp949).csv', encoding='cp949')
univ

Unnamed: 0,데이터,구,2017년 대비 2018년 증감률,2018년 대비 2019년 증감률
0,대학교,강남구,0.0,0.0
1,대학교,강동구,0.0,0.0
2,대학교,강북구,0.0,0.0
3,대학교,강서구,0.0,0.0
4,대학교,관악구,0.0,0.0
5,대학교,광진구,0.0,0.0
6,대학교,구로구,0.0,0.0
7,대학교,금천구,0.0,0.0
8,대학교,노원구,0.0,0.0
9,대학교,도봉구,0.0,0.0


In [20]:
# rename : 자치구로 변경
univ.rename({'구' : '자치구'}, axis=1, inplace=True)

In [21]:
data_1 = data_1_percentage(univ, '대학교', data_1)
data_2 = data_2_percentage(univ, '대학교', data_2)
data_4 = data_4_percentage(univ, '대학교', data_4)
data_1

Unnamed: 0,자치구,1인 가구,cctv,어린이집/유치원,초중고등학교,대학교
0,강남구,2.63,12.74,2.675,0.0,0.0
1,강동구,4.945,23.29,2.74,0.0,0.0
2,강북구,4.64,28.995,-2.47,0.0,0.0
3,강서구,7.855,20.515,1.445,0.0,0.0
4,관악구,5.905,25.305,5.1325,0.0,0.0
5,광진구,3.45,26.66,2.495,0.0,0.0
6,구로구,5.42,18.41,3.1925,1.281667,0.0
7,금천구,7.135,47.29,-1.43,0.0,0.0
8,노원구,4.08,15.735,3.1875,0.0,0.0
9,도봉구,5.265,13.195,0.5975,0.0,0.0


#### 7) 백화점 데이터

In [22]:
department = pd.read_csv('data/corr/백화점_증감_갯수(2017_2019, cp949).csv', encoding='cp949')
department

Unnamed: 0,자치구,2017년 대비 2018년 증감 갯수,2018년 대비 2019년 증감 갯수
0,강남구,0.0,0.0
1,강동구,0.0,0.0
2,강북구,0.0,0.0
3,강서구,0.0,0.0
4,관악구,0.0,0.0
5,광진구,0.0,0.0
6,구로구,0.0,0.0
7,금천구,0.0,0.0
8,노원구,0.0,0.0
9,도봉구,0.0,0.0


In [23]:
# rename : 증감개수로 변경
department.rename({'2017년 대비 2018년 증감 갯수' : '2017년 대비 2018년 증감률',
            '2018년 대비 2019년 증감 갯수' : '2018년 대비 2019년 증감률'}, axis=1, inplace=True)

In [24]:
data_1 = data_1_percentage(department, '백화점', data_1)
data_2 = data_2_percentage(department, '백화점', data_2)
data_4 = data_4_percentage(department, '백화점', data_4)
data_1

Unnamed: 0,자치구,1인 가구,cctv,어린이집/유치원,초중고등학교,대학교,백화점
0,강남구,2.63,12.74,2.675,0.0,0.0,0.0
1,강동구,4.945,23.29,2.74,0.0,0.0,0.0
2,강북구,4.64,28.995,-2.47,0.0,0.0,0.0
3,강서구,7.855,20.515,1.445,0.0,0.0,0.0
4,관악구,5.905,25.305,5.1325,0.0,0.0,0.0
5,광진구,3.45,26.66,2.495,0.0,0.0,0.0
6,구로구,5.42,18.41,3.1925,1.281667,0.0,0.0
7,금천구,7.135,47.29,-1.43,0.0,0.0,0.0
8,노원구,4.08,15.735,3.1875,0.0,0.0,0.0
9,도봉구,5.265,13.195,0.5975,0.0,0.0,0.0


#### 8) 버스정류장 데이터

In [25]:
bus = pd.read_csv('data/corr/버스정류장 증감률(2017~2019, cp949).csv', encoding='cp949')
bus

Unnamed: 0,데이터,자치구,2017년 대비 2018년 증감률,2018년 대비 2019년 증감률
0,버스정류장,종로구,0.0,-1.87
1,버스정류장,서대문구,0.0,53.69
2,버스정류장,중구,0.0,-108.06
3,버스정류장,광진구,0.0,-82.88
4,버스정류장,성동구,0.0,21.04
5,버스정류장,용산구,0.0,16.86
6,버스정류장,동대문구,0.0,0.31
7,버스정류장,중랑구,0.0,0.26
8,버스정류장,성북구,0.0,-1.16
9,버스정류장,노원구,0.0,1.29


In [26]:
data_1 = data_1_percentage(bus, '버스정류장', data_1)
data_2 = data_2_percentage(bus, '버스정류장', data_2)
data_4 = data_4_percentage(bus, '버스정류장', data_4)
data_1

Unnamed: 0,자치구,1인 가구,cctv,어린이집/유치원,초중고등학교,대학교,백화점,버스정류장
0,강남구,2.63,12.74,2.675,0.0,0.0,0.0,18.795
1,강동구,4.945,23.29,2.74,0.0,0.0,0.0,4.675
2,강북구,4.64,28.995,-2.47,0.0,0.0,0.0,-1.34
3,강서구,7.855,20.515,1.445,0.0,0.0,0.0,2.24
4,관악구,5.905,25.305,5.1325,0.0,0.0,0.0,-0.93
5,광진구,3.45,26.66,2.495,0.0,0.0,0.0,-41.44
6,구로구,5.42,18.41,3.1925,1.281667,0.0,0.0,3.41
7,금천구,7.135,47.29,-1.43,0.0,0.0,0.0,-17.955
8,노원구,4.08,15.735,3.1875,0.0,0.0,0.0,0.645
9,도봉구,5.265,13.195,0.5975,0.0,0.0,0.0,0.64


#### 9) 병원 데이터

In [27]:
hospital = pd.read_csv('data/corr/병원 증감률(2017~2019, cp949).csv', encoding='cp949')
hospital

Unnamed: 0,데이터,구,2017년 대비 2018년 증감률,2018년 대비 2019년 증감률
0,병원,강남구,8.22,1.35
1,병원,강동구,2.86,-6.06
2,병원,강북구,6.25,0.0
3,병원,강서구,3.12,0.0
4,병원,관악구,6.67,11.76
5,병원,광진구,0.0,8.33
6,병원,구로구,9.52,12.5
7,병원,금천구,0.0,14.29
8,병원,노원구,-4.55,0.0
9,병원,도봉구,0.0,0.0


In [28]:
# rename : 자치구로 변경
hospital.rename({'구' : '자치구'}, axis=1, inplace=True)

In [29]:
data_1 = data_1_percentage(hospital, '병원', data_1)
data_2 = data_2_percentage(hospital, '병원', data_2)
data_4 = data_4_percentage(hospital, '병원', data_4)
data_1

Unnamed: 0,자치구,1인 가구,cctv,어린이집/유치원,초중고등학교,대학교,백화점,버스정류장,병원
0,강남구,2.63,12.74,2.675,0.0,0.0,0.0,18.795,4.785
1,강동구,4.945,23.29,2.74,0.0,0.0,0.0,4.675,-1.6
2,강북구,4.64,28.995,-2.47,0.0,0.0,0.0,-1.34,3.125
3,강서구,7.855,20.515,1.445,0.0,0.0,0.0,2.24,1.56
4,관악구,5.905,25.305,5.1325,0.0,0.0,0.0,-0.93,9.215
5,광진구,3.45,26.66,2.495,0.0,0.0,0.0,-41.44,4.165
6,구로구,5.42,18.41,3.1925,1.281667,0.0,0.0,3.41,11.01
7,금천구,7.135,47.29,-1.43,0.0,0.0,0.0,-17.955,7.145
8,노원구,4.08,15.735,3.1875,0.0,0.0,0.0,0.645,-2.275
9,도봉구,5.265,13.195,0.5975,0.0,0.0,0.0,0.64,0.0


#### 10) 영화관 데이터

In [30]:
theater = pd.read_csv('data/corr/영화관 증감률(2017~2019, cp949).csv', encoding='cp949')
theater

Unnamed: 0,데이터,구,2017년 대비 2018년 증감률,2018년 대비 2019년 증감률
0,영화관,강남구,-6.67,0.0
1,영화관,강동구,0.0,0.0
2,영화관,강북구,0.0,0.0
3,영화관,강서구,0.0,0.0
4,영화관,관악구,0.0,0.0
5,영화관,광진구,20.0,0.0
6,영화관,구로구,0.0,0.0
7,영화관,금천구,0.0,0.0
8,영화관,노원구,0.0,0.0
9,영화관,도봉구,0.0,0.0


In [31]:
# rename : 자치구로 변경
theater.rename({'구' : '자치구'}, axis=1, inplace=True)

In [32]:
data_1 = data_1_percentage(theater, '영화관', data_1)
data_2 = data_2_percentage(theater, '영화관', data_2)
data_4 = data_4_percentage(theater, '영화관', data_4)
data_1

Unnamed: 0,자치구,1인 가구,cctv,어린이집/유치원,초중고등학교,대학교,백화점,버스정류장,병원,영화관
0,강남구,2.63,12.74,2.675,0.0,0.0,0.0,18.795,4.785,-3.335
1,강동구,4.945,23.29,2.74,0.0,0.0,0.0,4.675,-1.6,0.0
2,강북구,4.64,28.995,-2.47,0.0,0.0,0.0,-1.34,3.125,0.0
3,강서구,7.855,20.515,1.445,0.0,0.0,0.0,2.24,1.56,0.0
4,관악구,5.905,25.305,5.1325,0.0,0.0,0.0,-0.93,9.215,0.0
5,광진구,3.45,26.66,2.495,0.0,0.0,0.0,-41.44,4.165,10.0
6,구로구,5.42,18.41,3.1925,1.281667,0.0,0.0,3.41,11.01,0.0
7,금천구,7.135,47.29,-1.43,0.0,0.0,0.0,-17.955,7.145,0.0
8,노원구,4.08,15.735,3.1875,0.0,0.0,0.0,0.645,-2.275,0.0
9,도봉구,5.265,13.195,0.5975,0.0,0.0,0.0,0.64,0.0,0.0


#### 11) 음식점

In [33]:
store = pd.read_csv('data/corr/음식점_증감률(2017_2019, cp949).csv', encoding='cp949')
store

Unnamed: 0,자치구,2017년 대비 2018년 증감률,2018년 대비 2019년 증감률
0,강남구,-3.34,1.89
1,강동구,-1.93,0.3
2,강북구,-3.36,-1.34
3,강서구,6.43,6.42
4,관악구,1.94,3.25
5,광진구,3.73,1.2
6,구로구,-2.68,-0.4
7,금천구,-2.29,-1.08
8,노원구,-1.11,2.74
9,도봉구,2.91,2.02


In [34]:
data_1 = data_1_percentage(store, '음식점', data_1)
data_2 = data_2_percentage(store, '음식점', data_2)
data_4 = data_4_percentage(store, '음식점', data_4)
data_1

Unnamed: 0,자치구,1인 가구,cctv,어린이집/유치원,초중고등학교,대학교,백화점,버스정류장,병원,영화관,음식점
0,강남구,2.63,12.74,2.675,0.0,0.0,0.0,18.795,4.785,-3.335,-0.725
1,강동구,4.945,23.29,2.74,0.0,0.0,0.0,4.675,-1.6,0.0,-0.815
2,강북구,4.64,28.995,-2.47,0.0,0.0,0.0,-1.34,3.125,0.0,-2.35
3,강서구,7.855,20.515,1.445,0.0,0.0,0.0,2.24,1.56,0.0,6.425
4,관악구,5.905,25.305,5.1325,0.0,0.0,0.0,-0.93,9.215,0.0,2.595
5,광진구,3.45,26.66,2.495,0.0,0.0,0.0,-41.44,4.165,10.0,2.465
6,구로구,5.42,18.41,3.1925,1.281667,0.0,0.0,3.41,11.01,0.0,-1.54
7,금천구,7.135,47.29,-1.43,0.0,0.0,0.0,-17.955,7.145,0.0,-1.685
8,노원구,4.08,15.735,3.1875,0.0,0.0,0.0,0.645,-2.275,0.0,0.815
9,도봉구,5.265,13.195,0.5975,0.0,0.0,0.0,0.64,0.0,0.0,2.465


#### 12) 제과제빵 데이터

In [35]:
bread = pd.read_csv('data/corr/제과제빵_증감률(2017_2019, cp949).csv', encoding='cp949')
bread

Unnamed: 0,자치구,2017년 대비 2018년 증감률,2018년 대비 2019년 증감률
0,강남구,-9.66,-11.45
1,강동구,-25.0,-11.11
2,강북구,-27.27,-12.5
3,강서구,-7.69,-25.0
4,관악구,-10.0,0.0
5,광진구,17.39,-3.7
6,구로구,-28.57,0.0
7,금천구,0.0,0.0
8,노원구,-9.09,0.0
9,도봉구,-10.0,0.0


In [36]:
data_1 = data_1_percentage(bread, '제과제빵', data_1)
data_2 = data_2_percentage(bread, '제과제빵', data_2)
data_4 = data_4_percentage(bread, '제과제빵', data_4)
data_1

Unnamed: 0,자치구,1인 가구,cctv,어린이집/유치원,초중고등학교,대학교,백화점,버스정류장,병원,영화관,음식점,제과제빵
0,강남구,2.63,12.74,2.675,0.0,0.0,0.0,18.795,4.785,-3.335,-0.725,-10.555
1,강동구,4.945,23.29,2.74,0.0,0.0,0.0,4.675,-1.6,0.0,-0.815,-18.055
2,강북구,4.64,28.995,-2.47,0.0,0.0,0.0,-1.34,3.125,0.0,-2.35,-19.885
3,강서구,7.855,20.515,1.445,0.0,0.0,0.0,2.24,1.56,0.0,6.425,-16.345
4,관악구,5.905,25.305,5.1325,0.0,0.0,0.0,-0.93,9.215,0.0,2.595,-5.0
5,광진구,3.45,26.66,2.495,0.0,0.0,0.0,-41.44,4.165,10.0,2.465,6.845
6,구로구,5.42,18.41,3.1925,1.281667,0.0,0.0,3.41,11.01,0.0,-1.54,-14.285
7,금천구,7.135,47.29,-1.43,0.0,0.0,0.0,-17.955,7.145,0.0,-1.685,0.0
8,노원구,4.08,15.735,3.1875,0.0,0.0,0.0,0.645,-2.275,0.0,0.815,-4.545
9,도봉구,5.265,13.195,0.5975,0.0,0.0,0.0,0.64,0.0,0.0,2.465,-5.0


#### 13) 지하철역 데이터

In [37]:
station = pd.read_csv('data/corr/지하철역 증감률(2017~2019, cp949).csv', encoding='cp949')
station

Unnamed: 0,데이터,호선,구,2017년 대비 2018년 증감률,2018년 대비 2019년 증감률
0,지하철역,1,금천구,0.0,0.0
1,지하철역,1,구로구,0.0,0.0
2,지하철역,1,노원구,0.0,0.0
3,지하철역,1,용산구,0.0,0.0
4,지하철역,1,동작구,0.0,0.0
...,...,...,...,...,...
395,지하철역,우이신설,은평구,0.0,0.0
396,지하철역,우이신설,강북구,0.0,0.0
397,지하철역,우이신설,성북구,0.0,0.0
398,지하철역,우이신설,강동구,0.0,0.0


In [38]:
# 지하철역 데이터 합치기
station = station.groupby([station['구']]).mean().reset_index()

# rename : 자치구로 변경
station.rename({'구' : '자치구'}, axis=1, inplace=True)
station

Unnamed: 0,자치구,2017년 대비 2018년 증감률,2018년 대비 2019년 증감률
0,강남구,0.0,0.0
1,강동구,6.25,0.0
2,강북구,0.0,0.0
3,강서구,0.0,0.0
4,관악구,0.0,0.0
5,광진구,0.0,0.0
6,구로구,0.0,0.0
7,금천구,0.0,0.0
8,노원구,0.0,0.0
9,도봉구,0.0,0.0


In [39]:
data_1 = data_1_percentage(station, '지하철역', data_1)
data_2 = data_2_percentage(station, '지하철역', data_2)
data_4 = data_4_percentage(station, '지하철역', data_4)
data_1

Unnamed: 0,자치구,1인 가구,cctv,어린이집/유치원,초중고등학교,대학교,백화점,버스정류장,병원,영화관,음식점,제과제빵,지하철역
0,강남구,2.63,12.74,2.675,0.0,0.0,0.0,18.795,4.785,-3.335,-0.725,-10.555,0.0
1,강동구,4.945,23.29,2.74,0.0,0.0,0.0,4.675,-1.6,0.0,-0.815,-18.055,3.125
2,강북구,4.64,28.995,-2.47,0.0,0.0,0.0,-1.34,3.125,0.0,-2.35,-19.885,0.0
3,강서구,7.855,20.515,1.445,0.0,0.0,0.0,2.24,1.56,0.0,6.425,-16.345,0.0
4,관악구,5.905,25.305,5.1325,0.0,0.0,0.0,-0.93,9.215,0.0,2.595,-5.0,0.0
5,광진구,3.45,26.66,2.495,0.0,0.0,0.0,-41.44,4.165,10.0,2.465,6.845,0.0
6,구로구,5.42,18.41,3.1925,1.281667,0.0,0.0,3.41,11.01,0.0,-1.54,-14.285,0.0
7,금천구,7.135,47.29,-1.43,0.0,0.0,0.0,-17.955,7.145,0.0,-1.685,0.0,0.0
8,노원구,4.08,15.735,3.1875,0.0,0.0,0.0,0.645,-2.275,0.0,0.815,-4.545,0.0
9,도봉구,5.265,13.195,0.5975,0.0,0.0,0.0,0.64,0.0,0.0,2.465,-5.0,0.0


#### 14) 카페 데이터

In [40]:
cafe = pd.read_csv('data/corr/카페_증감률(2017_2019, cp949).csv', encoding='cp949')
cafe

Unnamed: 0,자치구,2017년 대비 2018년 증감률,2018년 대비 2019년 증감률
0,강남구,18.01,11.34
1,강동구,0.0,11.09
2,강북구,2.38,2.79
3,강서구,14.82,12.41
4,관악구,11.32,9.04
5,광진구,14.4,17.25
6,구로구,-1.38,4.42
7,금천구,15.3,7.41
8,노원구,9.24,13.02
9,도봉구,9.38,11.02


In [41]:
data_1 = data_1_percentage(cafe, '카페', data_1)
data_2 = data_2_percentage(cafe, '카페', data_2)
data_4 = data_4_percentage(cafe, '카페', data_4)
data_1

Unnamed: 0,자치구,1인 가구,cctv,어린이집/유치원,초중고등학교,대학교,백화점,버스정류장,병원,영화관,음식점,제과제빵,지하철역,카페
0,강남구,2.63,12.74,2.675,0.0,0.0,0.0,18.795,4.785,-3.335,-0.725,-10.555,0.0,14.675
1,강동구,4.945,23.29,2.74,0.0,0.0,0.0,4.675,-1.6,0.0,-0.815,-18.055,3.125,5.545
2,강북구,4.64,28.995,-2.47,0.0,0.0,0.0,-1.34,3.125,0.0,-2.35,-19.885,0.0,2.585
3,강서구,7.855,20.515,1.445,0.0,0.0,0.0,2.24,1.56,0.0,6.425,-16.345,0.0,13.615
4,관악구,5.905,25.305,5.1325,0.0,0.0,0.0,-0.93,9.215,0.0,2.595,-5.0,0.0,10.18
5,광진구,3.45,26.66,2.495,0.0,0.0,0.0,-41.44,4.165,10.0,2.465,6.845,0.0,15.825
6,구로구,5.42,18.41,3.1925,1.281667,0.0,0.0,3.41,11.01,0.0,-1.54,-14.285,0.0,1.52
7,금천구,7.135,47.29,-1.43,0.0,0.0,0.0,-17.955,7.145,0.0,-1.685,0.0,0.0,11.355
8,노원구,4.08,15.735,3.1875,0.0,0.0,0.0,0.645,-2.275,0.0,0.815,-4.545,0.0,11.13
9,도봉구,5.265,13.195,0.5975,0.0,0.0,0.0,0.64,0.0,0.0,2.465,-5.0,0.0,10.2


#### 15) 편의점 데이터

In [42]:
conv = pd.read_csv('data/corr/편의점_증감률(2017_2019, cp949).csv', encoding='cp949')
conv

Unnamed: 0,자치구,2017년 대비 2018년 증감률,2018년 대비 2019년 증감률
0,강남구,13.03,4.67
1,강동구,13.55,16.05
2,강북구,8.51,17.65
3,강서구,15.35,22.05
4,관악구,16.25,12.19
5,광진구,24.9,9.35
6,구로구,8.64,22.73
7,금천구,21.14,26.17
8,노원구,18.98,11.04
9,도봉구,20.0,9.09


In [43]:
data_1 = data_1_percentage(conv, '편의점', data_1)
data_2 = data_2_percentage(conv, '편의점', data_2)
data_4 = data_4_percentage(conv, '편의점', data_4)
data_1

Unnamed: 0,자치구,1인 가구,cctv,어린이집/유치원,초중고등학교,대학교,백화점,버스정류장,병원,영화관,음식점,제과제빵,지하철역,카페,편의점
0,강남구,2.63,12.74,2.675,0.0,0.0,0.0,18.795,4.785,-3.335,-0.725,-10.555,0.0,14.675,8.85
1,강동구,4.945,23.29,2.74,0.0,0.0,0.0,4.675,-1.6,0.0,-0.815,-18.055,3.125,5.545,14.8
2,강북구,4.64,28.995,-2.47,0.0,0.0,0.0,-1.34,3.125,0.0,-2.35,-19.885,0.0,2.585,13.08
3,강서구,7.855,20.515,1.445,0.0,0.0,0.0,2.24,1.56,0.0,6.425,-16.345,0.0,13.615,18.7
4,관악구,5.905,25.305,5.1325,0.0,0.0,0.0,-0.93,9.215,0.0,2.595,-5.0,0.0,10.18,14.22
5,광진구,3.45,26.66,2.495,0.0,0.0,0.0,-41.44,4.165,10.0,2.465,6.845,0.0,15.825,17.125
6,구로구,5.42,18.41,3.1925,1.281667,0.0,0.0,3.41,11.01,0.0,-1.54,-14.285,0.0,1.52,15.685
7,금천구,7.135,47.29,-1.43,0.0,0.0,0.0,-17.955,7.145,0.0,-1.685,0.0,0.0,11.355,23.655
8,노원구,4.08,15.735,3.1875,0.0,0.0,0.0,0.645,-2.275,0.0,0.815,-4.545,0.0,11.13,15.01
9,도봉구,5.265,13.195,0.5975,0.0,0.0,0.0,0.64,0.0,0.0,2.465,-5.0,0.0,10.2,14.545


In [44]:
# 데이터 저장
data_1.to_csv('data/corr/FIN/1인 가구 증감률.csv', encoding='cp949', index=False)
data_2.to_csv('data/corr/FIN/2인 가구 증감률.csv', encoding='cp949', index=False)
data_4.to_csv('data/corr/FIN/4인 가구 증감률.csv', encoding='cp949', index=False)

### 2. Correlation

In [9]:
df1 = pd.read_csv('data/corr/FIN/1인 가구 증감률.csv', encoding='cp949', index_col=0)
df2 = pd.read_csv('data/corr/FIN/2인 가구 증감률.csv', encoding='cp949', index_col=0)
df4 = pd.read_csv('data/corr/FIN/4인 가구 증감률.csv', encoding='cp949', index_col=0)
df1

Unnamed: 0_level_0,1인 가구,cctv,어린이집/유치원,초중고등학교,대학교,백화점,버스정류장,병원,영화관,음식점,제과제빵,지하철역,카페,편의점
자치구,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
강남구,2.63,12.74,2.675,0.0,0.0,0.0,18.795,4.785,-3.335,-0.725,-10.555,0.0,14.675,8.85
강동구,4.945,23.29,2.74,0.0,0.0,0.0,4.675,-1.6,0.0,-0.815,-18.055,3.125,5.545,14.8
강북구,4.64,28.995,-2.47,0.0,0.0,0.0,-1.34,3.125,0.0,-2.35,-19.885,0.0,2.585,13.08
강서구,7.855,20.515,1.445,0.0,0.0,0.0,2.24,1.56,0.0,6.425,-16.345,0.0,13.615,18.7
관악구,5.905,25.305,5.1325,0.0,0.0,0.0,-0.93,9.215,0.0,2.595,-5.0,0.0,10.18,14.22
광진구,3.45,26.66,2.495,0.0,0.0,0.0,-41.44,4.165,10.0,2.465,6.845,0.0,15.825,17.125
구로구,5.42,18.41,3.1925,1.281667,0.0,0.0,3.41,11.01,0.0,-1.54,-14.285,0.0,1.52,15.685
금천구,7.135,47.29,-1.43,0.0,0.0,0.0,-17.955,7.145,0.0,-1.685,0.0,0.0,11.355,23.655
노원구,4.08,15.735,3.1875,0.0,0.0,0.0,0.645,-2.275,0.0,0.815,-4.545,0.0,11.13,15.01
도봉구,5.265,13.195,0.5975,0.0,0.0,0.0,0.64,0.0,0.0,2.465,-5.0,0.0,10.2,14.545


In [10]:
# 상관계수 측정
corr1 = df1.corr(method='pearson')
corr2 = df2.corr(method='pearson')
corr4 = df4.corr(method='pearson')
corr2

Unnamed: 0,2인 가구,cctv,어린이집/유치원,초중고등학교,대학교,백화점,버스정류장,병원,영화관,음식점,제과제빵,지하철역,카페,편의점
2인 가구,1.0,0.327351,0.092261,0.350073,,-0.134271,-0.240309,0.058194,-0.038948,0.061029,-0.135929,0.109487,-0.019614,0.267358
cctv,0.327351,1.0,-0.133019,-0.083253,,0.090069,-0.440723,0.309809,0.149197,-0.091794,-0.129988,0.046036,0.082922,0.234082
어린이집/유치원,0.092261,-0.133019,1.0,0.241349,,-0.493889,0.060692,0.057091,-0.201394,0.098965,0.092796,0.051255,0.143882,-0.222962
초중고등학교,0.350073,-0.083253,0.241349,1.0,,0.0,0.184963,0.166348,0.05222,-0.425666,-0.260907,-0.085246,-0.413334,-0.117178
대학교,,,,,,,,,,,,,,
백화점,-0.134271,0.090069,-0.493889,0.0,,1.0,-0.002081,0.21488,-0.15571,-0.039045,-0.227601,0.0,-0.412798,-0.332203
버스정류장,-0.240309,-0.440723,0.060692,0.184963,,-0.002081,1.0,-0.036092,0.008332,-0.108315,-0.252188,0.075986,-0.187915,-0.022839
병원,0.058194,0.309809,0.057091,0.166348,,0.21488,-0.036092,1.0,-0.059827,0.0542,-0.392614,-0.141384,-0.045705,0.06639
영화관,-0.038948,0.149197,-0.201394,0.05222,,-0.15571,0.008332,-0.059827,1.0,0.154821,0.148593,0.025524,0.308785,0.333391
음식점,0.061029,-0.091794,0.098965,-0.425666,,-0.039045,-0.108315,0.0542,0.154821,1.0,0.036652,-0.061289,0.608818,0.091678


In [11]:
# 상관계수 df 생성
idx = corr1.index.to_list()
idx[0] = '가구 수'
idx
corr = pd.DataFrame(idx, columns=['요소'])
corr['1인 가구 상관계수'] = corr1['1인 가구'].values
corr['2인 가구 상관계수'] = corr2['2인 가구'].values
corr['4인 가구 상관계수'] = corr4['4인 가구'].values
corr.fillna(0, inplace=True)
corr.drop(0, axis=0, inplace=True)
corr.reset_index(drop=True, inplace=True)
corr

Unnamed: 0,요소,1인 가구 상관계수,2인 가구 상관계수,4인 가구 상관계수
0,cctv,0.461408,0.327351,0.206458
1,어린이집/유치원,-0.2006,0.092261,0.333847
2,초중고등학교,0.281919,0.350073,0.321225
3,대학교,0.0,0.0,0.0
4,백화점,0.07657,-0.134271,0.110704
5,버스정류장,-0.114068,-0.240309,-0.005293
6,병원,0.06563,0.058194,0.153373
7,영화관,0.075359,-0.038948,-0.211472
8,음식점,0.018174,0.061029,-0.250236
9,제과제빵,-0.373063,-0.135929,-0.130313


In [5]:
# # 점수화(-3~3)
# list1 = corr['1인 가구 상관계수'].values
# list2 = corr['2인 가구 상관계수'].values
# list3 = corr['4인 가구 상관계수'].values

# def corr_score(list_name, corr_list_name) :
#     for i in range(13):
#         if list_name[i] >= 0.7 :
#             corr_list_name.append(3)
#         elif list_name[i] >= 0.3 :
#             corr_list_name.append(2)
#         elif list_name[i] >= 0.1 :
#             corr_list_name.append(1)
#         elif list_name[i] == 0 :
#             corr_list_name.append(0)
#         elif list_name[i] >= -0.1 :
#             corr_list_name.append(-1)
#         elif list_name[i] >= -0.3 :
#             corr_list_name.append(-2)
#         elif list_name[i] >= -0.7 :
#             corr_list_name.append(-3)
            
#     return corr_list_name

# corr_list1 = []
# corr_list2 = []
# corr_list3 = []

# corr_list1 = corr_score(list1, corr_list1)
# corr_list2 = corr_score(list2, corr_list2)
# corr_list3 = corr_score(list3, corr_list3)

In [None]:
# score_idx = corr['요소'].values
# score_df = pd.DataFrame([ x for x in zip(score_idx, corr_list1, corr_list2, corr_list3)], columns=['요소', '1인 가구', '2인 가구', '4인 가구'])
# score_df

In [7]:
# score_df.to_csv('data/corr/FIN/가구별 가중치(-3~3).csv', encoding='cp949', index=False)

In [None]:
# # 점수화(0~6)
# list1 = corr['1인 가구 상관계수'].values
# list2 = corr['2인 가구 상관계수'].values
# list3 = corr['4인 가구 상관계수'].values

# def corr_score(list_name, corr_list_name) :
#     for i in range(13):
#         if list_name[i] >= 0.7 :
#             corr_list_name.append(6)
#         elif list_name[i] >= 0.3 :
#             corr_list_name.append(5)
#         elif list_name[i] >= 0.1 :
#             corr_list_name.append(4)
#         elif list_name[i] == 0 :
#             corr_list_name.append(3)
#         elif list_name[i] >= -0.1 :
#             corr_list_name.append(2)
#         elif list_name[i] >= -0.3 :
#             corr_list_name.append(1)
#         elif list_name[i] >= -0.7 :
#             corr_list_name.append(0)
            
#     return corr_list_name

# corr_list1 = []
# corr_list2 = []
# corr_list3 = []

# corr_list1 = corr_score(list1, corr_list1)
# corr_list2 = corr_score(list2, corr_list2)
# corr_list3 = corr_score(list3, corr_list3)

In [6]:
# score_idx = corr['요소'].values
# score_df = pd.DataFrame([ x for x in zip(score_idx, corr_list1, corr_list2, corr_list3)], columns=['요소', '1인 가구', '2인 가구', '4인 가구'])
# score_df

Unnamed: 0,요소,1인 가구,2인 가구,4인 가구
0,cctv,2,2,1
1,어린이집/유치원,-2,-1,2
2,초중고등학교,1,2,2
3,대학교,0,0,0
4,백화점,-1,-2,1
5,버스정류장,-2,-2,-1
6,병원,-1,-1,1
7,영화관,-1,-1,-2
8,음식점,-1,-1,-2
9,제과제빵,-3,-2,-2


In [9]:
# score_df.to_csv('data/corr/FIN/가구별 가중치(0~6).csv', encoding='cp949', index=False)

In [12]:
# 점수화(0.5단위)
list1 = corr['1인 가구 상관계수'].values
list2 = corr['2인 가구 상관계수'].values
list3 = corr['4인 가구 상관계수'].values

def corr_score(list_name, corr_list_name) :
    for i in range(13):
        if list_name[i] >= 0.7 :
            corr_list_name.append(4)
        elif list_name[i] >= 0.3 :
            corr_list_name.append(3.5)
        elif list_name[i] >= 0.1 :
            corr_list_name.append(3)
        elif list_name[i] == 0 :
            corr_list_name.append(2.5)
        elif list_name[i] >= -0.1 :
            corr_list_name.append(2)
        elif list_name[i] >= -0.3 :
            corr_list_name.append(1.)
        elif list_name[i] >= -0.7 :
            corr_list_name.append(1)
            
    return corr_list_name

corr_list1 = []
corr_list2 = []
corr_list3 = []

corr_list1 = corr_score(list1, corr_list1)
corr_list2 = corr_score(list2, corr_list2)
corr_list3 = corr_score(list3, corr_list3)

In [13]:
score_idx = corr['요소'].values
score_df = pd.DataFrame([ x for x in zip(score_idx, corr_list1, corr_list2, corr_list3)], columns=['요소', '1인 가구', '2인 가구', '4인 가구'])
score_df

Unnamed: 0,요소,1인 가구,2인 가구,4인 가구
0,cctv,3.5,3.5,3.0
1,어린이집/유치원,1.0,2.0,3.5
2,초중고등학교,3.0,3.5,3.5
3,대학교,2.5,2.5,2.5
4,백화점,2.0,1.0,3.0
5,버스정류장,1.0,1.0,2.0
6,병원,2.0,2.0,3.0
7,영화관,2.0,2.0,1.0
8,음식점,2.0,2.0,1.0
9,제과제빵,1.0,1.0,1.0


In [14]:
score_df.to_csv('data/corr/FIN/가구별 가중치(0.5단위).csv', encoding='cp949', index=False)