In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import glob

import pandas as pd
import numpy as np

from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()

## 01 데이터 병합 및 전처리

### 연령 기준 설정

In [2]:
# Age label of children born in the reference year
birth_age = '0세'

# Youth population: ages 0 through 14 inclusive
# (the name `yount_max_age` is kept exactly as spelled in case other cells use it)
yount_max_age = 14
youth_people_age = list(map('{}세'.format, range(0, yount_max_age + 1)))

# Women of childbearing age: 20 through 39 inclusive
max_young_women_age = 39
young_women_age = list(map('{}세'.format, range(20, max_young_women_age + 1)))

# Elderly population: 65 and over; ages 65-99 get individual labels and
# everything above shares the single '100세 이상' label
old_min_age = 65
old_people_age = list(map('{}세'.format, range(old_min_age, 100))) + ['100세 이상']

### 인구데이터 병합

In [3]:
def data_merge(file_name):
    """Merge the raw population CSVs of one group into a single DataFrame.

    Reads every ``./Data/01 RAW/*{file_name}*.csv`` file (cp949-encoded),
    skipping previously written ``*{file_name}_전체.csv`` merge outputs, and
    stacks them in sorted path order. Repeated header rows and '계' (total)
    rows are removed, rows containing missing values are dropped, and the
    2012-2022 columns are cast to int64. The result is also written to
    ``./Data/01 RAW/주민등록인구(행정구역별)_{file_name}_전체.csv`` (utf-8-sig).

    Parameters
    ----------
    file_name : str
        Keyword embedded in the raw file names (this notebook passes
        '총인구', '남자' and '여자').

    Returns
    -------
    pandas.DataFrame
        The merged, cleaned frame.

    Raises
    ------
    FileNotFoundError
        If no raw input file matches the keyword.
    """
    raw_dir = './Data/01 RAW'
    merged_suffix = f'{file_name}_전체.csv'

    # Exclude earlier merge outputs by suffix. The previous
    # `data_files.remove(target_file[0])` raised IndexError on a first run,
    # when no merged file exists yet.
    data_files = sorted(
        path
        for path in glob.glob(f'{raw_dir}/*{file_name}*.csv')
        if not path.endswith(merged_suffix)
    )
    if not data_files:
        raise FileNotFoundError(
            f"no raw CSV files matching '*{file_name}*.csv' in '{raw_dir}'"
        )

    # Read everything first and concatenate once instead of growing the
    # frame inside the loop.
    df = pd.concat(
        [pd.read_csv(path, encoding='cp949') for path in data_files],
        axis=0,
    )

    # Drop header rows repeated inside the data and the '계' (total) rows.
    df = df.loc[(df['연령별'] != '연령별') & (df['연령별'] != '계')]
    df = df.reset_index(drop=True)

    # TODO (from the original author): revisit this step; for metropolitan
    # cities the district name should be prefixed with the city name.
    # NOTE(review): presumably the dropped rows are related to that naming
    # issue - confirm against the raw files.
    df = df.dropna()

    year_columns = [str(year) for year in range(2012, 2023)]
    df = df.astype({column: np.int64 for column in year_columns})

    df.to_csv(
        './Data/01 RAW/주민등록인구(행정구역별)_{}_전체.csv'.format(file_name),
        index=False,
        encoding='utf-8-sig',
    )
    return df

In [4]:
# Merge the raw files of each population group and print the merged columns:
# '총인구' = total population, '남자' = men, '여자' = women.
merged_frames = {}
for group_name in ('총인구', '남자', '여자'):
    merged_frames[group_name] = data_merge(group_name)
    print(merged_frames[group_name].columns)

total_df = merged_frames['총인구']
men_df = merged_frames['남자']
women_df = merged_frames['여자']

Index(['행정구역(시군구)별', '연령별', '2012', '2013', '2014', '2015', '2016', '2017',
       '2018', '2019', '2020', '2021', '2022'],
      dtype='object')
Index(['행정구역(시군구)별', '연령별', '2012', '2013', '2014', '2015', '2016', '2017',
       '2018', '2019', '2020', '2021', '2022'],
      dtype='object')
Index(['행정구역(시군구)별', '연령별', '2012', '2013', '2014', '2015', '2016', '2017',
       '2018', '2019', '2020', '2021', '2022'],
      dtype='object')


### 각 기준 별 인구 데이터 추출

In [5]:
# Women of childbearing age: keep only the age labels in young_women_age and
# total the yearly counts per region (the result is indexed by region name).
# numeric_only=True sums just the numeric year columns, so the string column
# '연령별' is neither concatenated nor in need of a separate drop; the
# previous sum-then-drop depended on the pandas version's numeric_only default.
young_women_df = (
    women_df.loc[women_df['연령별'].isin(young_women_age)]
    .groupby('행정구역(시군구)별')
    .sum(numeric_only=True)
)
young_women_df.head()

Unnamed: 0_level_0,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
행정구역(시군구)별,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
가평군,6122,6035,5808,5772,5771,5795,5702,5471,5265,5054,4913
강남구,100087,97967,99540,97733,94421,90806,87491,85959,82908,78992,76222
강동구,75845,73979,71972,67952,65184,63570,62126,63076,67281,66827,64911
강릉시,25821,25004,24290,23631,23098,22846,22426,22212,21679,21114,20346
강북구,50159,48191,46487,45123,44053,43511,42717,41484,40300,38643,37591


In [6]:
# Elderly population: keep only the age labels in old_people_age and total
# the yearly counts per region, then turn the region index back into a column.
# numeric_only=True sums just the numeric year columns, so the string column
# '연령별' is neither concatenated nor in need of a separate drop; the
# previous sum-then-drop depended on the pandas version's numeric_only default.
old_people_df = (
    total_df.loc[total_df['연령별'].isin(old_people_age)]
    .groupby('행정구역(시군구)별')
    .sum(numeric_only=True)
    .reset_index()
)
old_people_df.head()

Unnamed: 0,행정구역(시군구)별,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,가평군,12076,12518,12874,13350,13569,14271,14802,15423,16324,17035,17841
1,강남구,50206,53300,57291,60191,61822,64946,66957,70896,74959,78078,81564
2,강동구,44850,47722,50167,51848,53121,56080,58669,63341,69903,73943,78191
3,강릉시,33639,34855,36009,36924,37679,39784,41214,43113,45484,47318,49575
4,강북구,46147,48465,50755,52503,53964,56437,58103,60474,63313,64218,66170


## 02 시계열 클러스터분석

In [7]:
year_columns = [str(year) for year in range(2012, 2023)]

# Region names in old_people_df row order; this order defines the sample order.
index_values = old_people_df['행정구역(시군구)별'].values.tolist()
old_people_timeseries = old_people_df[year_columns].values.tolist()

# young_women_df is indexed by region name, so select its rows by label in
# the same order instead of trusting that both frames happen to be sorted
# alike. .loc raises KeyError if a region is missing rather than silently
# pairing the wrong rows.
young_women_timeseries = young_women_df.loc[index_values, year_columns].values.tolist()

# Each sample is [old-people series, young-women series]. The loop variables
# were previously swapped (`young` held the old-people series); the element
# order is unchanged here, only the names are corrected.
# NOTE(review): this lays each sample out as (2 series, 11 years), whereas
# tslearn documents datasets as (n_ts, sz, d) - confirm the orientation is
# intended before switching to a time-warping metric.
time_series_data = [
    [old, young]
    for old, young in zip(old_people_timeseries, young_women_timeseries)
]

In [8]:
from tslearn.clustering import TimeSeriesKMeans

In [9]:
# Cluster the regions into 4 groups; random_state is fixed so reruns give
# the same assignment.
km = TimeSeriesKMeans(
    n_clusters=4,
    random_state=42,
    verbose=False,
    n_jobs=-1,
)
y_pred = km.fit_predict(time_series_data)

# Sanity check: there should be exactly one cluster label per region name.
count_message = 'index 개수: {} / 클러스터 개수: {}'
print(count_message.format(len(index_values), len(y_pred)))

index 개수: 233 / 클러스터 개수: 233


In [10]:
# Pair every region name with its predicted cluster label, one row per region.
result_list = [
    [city_name, cluster_id]
    for city_name, cluster_id in zip(index_values, y_pred)
]

result_df = pd.DataFrame(result_list, columns=['city', 'cluster_n'])
result_df.head()

Unnamed: 0,city,cluster_n
0,가평군,0
1,강남구,1
2,강동구,1
3,강릉시,3
4,강북구,3


In [11]:
# Number of regions assigned to each cluster
cluster_sizes = result_df['cluster_n'].value_counts()
cluster_sizes

cluster_n
0    112
3     74
1     37
2     10
Name: count, dtype: int64