# 데이터 수집 및 전처리

# 데이터 병합

## 데이터셋 목록 (출처, 데이터명, 컬럼명, 데이터시작지점, 최종지점, 수집주기 등) 

|활용|파일명|데이터|시작 시점|최종지점|수집주기|기준일|
|-|-|-|-|-|-|-|
|범죄 발생 건수 집계|crime_location.csv|2010|2020|1년|2021|
|<font color = 'gray'>사회적 요소 집계 <font/>|income_level_by_region.csv|2000|2019|1년|2021|
|<font color = 'gray'>사회적 요소 집계 <font/>|economic_activity_by_region.csv|2000|2020|1년|2021|
|<font color = 'gray'>사회적 요소 집계 <font/>|olice_officer_num_by_region.csv|2009|2017|1년|2021|
|인구적 요소|population_by_region.csv|1992|2020|1년|2021|
|인구적 요소|population_movement_by_region.csv|2001|2020|1년|2021|
|인구적 요소|외국인_거주자.csv|2003|2020|1년|2021|
|<font color = 'gray'>공간적 요소 집계<font/>|local_data.csv|2000|2020|1년|2021|
|<font color = 'gray'>공간적 요소 집계<font/>|universities_by_region.csv|2003|2020|1년|2021|
|기타 요인|시도_산업_조직형태별_사업체수_종사자수.csv|2000|2019|1년|2021|
|기타 요인|number_of_psychotropic_drug_requests_by_region.csv|2010|2020|1년|2021|





## 필요 라이브러리, 데이터 불러오기 및 경로 설정

In [402]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from matplotlib import font_manager,rc
import seaborn as sns
from sklearn.preprocessing import minmax_scale
from pyarrow import csv
import json
import collections
import cx_Oracle
import warnings
from sklearn.linear_model import LinearRegression
warnings.filterwarnings("ignore")

In [146]:
data_path ='../data/'
# Json 파일 읽어오기
def read_json(jsonPath ,mod = 'r',encoding ='utf-8'):
    with open(jsonPath, mod, encoding = encoding) as common:
        config = json.load(common)
    return config
config = read_json(data_path + "json/config.json")

# Orcale 연동
CONN_INFO = {
        'NAME': 'XEPDB1',
        'USER': 'iitp',
        'PASSWORD': 'iitp',
        'HOST': '172.16.5.231',
        'PORT': '11521',
}
CONN_STR = '{USER}/{PASSWORD}@{HOST}:{PORT}/{NAME}'.format(**CONN_INFO)
conn = cx_Oracle.connect(CONN_STR)
cursor = conn.cursor()



def transColName(cols):
    cols = cols.replace(' (%)', '')
    cols = cols.replace(' (명)', '')
    cols = cols.replace('[명]', '')
    cols = cols.replace('[백분율]', '')
    cols = cols.replace(')', '')
    cols = cols.replace('(', '_')
    cols = cols.replace(' ', '_')
    cols = cols.replace('1', '')
    return cols 
    

## 공간적 요소 병합

In [19]:
local_path = config['data_path']+ 'csv/localData/cleaned/'

flag = True
for fileN in os.listdir(local_path):
    readed_file = csv.read_csv(local_path +fileN).to_pandas()
    readed_file = readed_file.dropna()
    tmp = pd.DataFrame(columns=['시도','년도',readed_file.columns[-1]])
    for city in readed_file.시도.unique():
        for year in readed_file[readed_file.시도 == city].연도.unique():
            tmp= tmp.append(pd.Series(
                [
                    city,
                    year,
                    readed_file[((readed_file.시도 == city) & (readed_file.연도== year))].iloc[:,-1].sum()
                    ] , index = tmp.columns)
                ,ignore_index= True)
    if flag :
        result = tmp.copy()
        flag = False
        continue
    
    result = pd.merge(result, tmp, left_on=['시도','년도'], right_on=['시도','년도'], how='left')
result = result.fillna(0) 
# 병합할 칼럼들 ( 유사 의미를 가지고 있는 칼럼 병합)
restrant = ['일반음식점','휴게음식점','관광식당']
bar = ['단란주점', '유흥주점', '외국인전용유흥음식점업']

result['음식점'] =result[restrant].sum(axis = 1)
result['유흥가'] = result[bar].sum(axis = 1)
#병합된 칼럼 제거
result = result.drop(restrant+bar,axis= 1)

# 단위개 개수이기 때문에 소수점 제거
result.iloc[:,1:] = result.iloc[:,1:].astype(int)

result.to_csv(config['data_path']+ 'csv/semicleaned/local.csv',encoding='utf-8-sig',index=False)

## 범죄 발생지 병합

In [28]:
# 범죄 발생지 병합
merged_df = pd.DataFrame()
flag = True
for fp in os.listdir(config['data_path']+ 'csv/crime_location'):     
    merged_csv = csv.read_csv(config['data_path']+ 'csv/crime_location/'+fp).to_pandas()
    merged_csv['시도'] = merged_csv['시도'].map(config['cities_mapping'])
    if flag :
        merged_df = merged_csv
        flag = False
    else:
        merged_df = pd.concat([merged_df,merged_csv], ignore_index=True)

# 데이터 병합
merged_df['폭행'] = merged_df['폭행'].add(merged_df['상해'])
merged_df['공갈'] = merged_df['공갈'].add(merged_df['협박'])
merged_df = merged_df.drop(['상해','협박'],axis=1)
merged_df = merged_df.rename(columns={'폭행':'폭행및상해','공갈':'공갈및협박'})



# -로 저장되어있는 값들 NA값으로 변경
def cleaning(x) :
  if str(x).find("-") != -1 :
    return str(x).replace("-",'')
  return x

for colName in merged_df.columns:
    merged_df[colName] = merged_df[colName].apply(cleaning)

# result.drop_duplicates()
merged_df.drop_duplicates().to_csv(config['data_path']+ 'csv/semicleaned/crime_location.csv',encoding='utf-8-sig',index=False)


## 모든 데이터 병합

### 제거한 데이터

- 실질적 일반 사람에게 피해가 없는 `범죄 파라미터` 제거
- 특정 대학의 수는 의미가 없다 판단하여 대학교 총계를 제외한 `대학수 제거`
- 성비를 이용한 인구수는 기타 데이터와 연계시 데이터 부족에 의하여 제거 `성별로 나누어진 데이터 제거`
- 시도간 전입,전출 칼럼을 보유하면 기다 전입 `전출 데이터`가 불필요하다 판단이 되어 제거

In [29]:
# 모든 데이터 병합
result = pd.DataFrame()
flag = True
for fp in os.listdir(config['data_path']+ 'csv/semicleaned'):
    
    merged_csv = csv.read_csv(config['data_path']+ 'csv/semicleaned/'+fp).to_pandas()
    merged_csv['시도'] = merged_csv['시도'].map(config['cities_mapping'])
    if flag :
        result = merged_csv
        flag = False
    else:
        result = pd.merge(
            result, merged_csv,
            left_on=['년도','시도'],
            right_on=['년도','시도'],
            how='left')
# 만약 중복되어 삽입된 행이 있다면 제거
result.drop_duplicates()
# 불필요하다 판단된 칼럼 제거
result = result.drop(config['dropCols'],axis = 1).drop_duplicates()
result.to_csv(config['data_path']+ 'csv/cleaned/crime_merged.csv',encoding='utf-8-sig',index=False)

##  상관계수 값이 0.7이상인 칼럼만 추출 후 상위 5개 칼럼 추출

In [17]:
df = csv.read_csv(config['data_path']+ 'csv/cleaned/crime_merged.csv').to_pandas()
df = df[(df.년도 >= 2002)&(df.년도 <= 2019)]
crime_name = df.columns[2:15] 
independent_val = df.columns[15:] 


df['범죄소계'] = df[crime_name].sum(axis = 1)
crime_name = crime_name.to_list()+['범죄소계']

corr_df = pd.DataFrame(columns=['시도','범죄','칼럼리스트'])

for city in df['시도'].unique():
    tmp_df = df[df.시도 == city]
    for crime_ in crime_name:
        tmp_list = []
        for key,value in  (tmp_df[[crime_] + independent_val.to_list()].corr().iloc[0,1:].abs() >= 0.7).to_dict().items():
            if value == True :
                tmp_list += [transColName(key)]
        corr_df = corr_df.append(
            pd.Series(
                [city,
                crime_,
                tmp_list],
                index = corr_df.columns
            ),
            ignore_index=True
        )

corr_result = pd.DataFrame(columns=['범죄','칼럼리스트'])

for c_name in corr_df.범죄.unique():
    corr_df[corr_df.범죄 == c_name]
    tmp_list = []
    for i in corr_df[corr_df.범죄 == c_name].칼럼리스트:
        tmp_list += i

    corr_Top_Five = sorted(collections.Counter(tmp_list).items(), key=lambda x: x[1])[-5:]
    t_list = []
    for Top_Five_Name in corr_Top_Five:
        t_list+= [Top_Five_Name[0]]
    # print(c_name,t_list)
    corr_result = corr_result.append(
        pd.Series(
            [c_name,
            t_list],
            index = corr_result.columns
        ),
        ignore_index=True
    )
corr_result.to_csv(config['data_path']+ 'csv/cleaned/Top_Five_Cols.csv',encoding = 'utf-8-sig',index =False)


## 범죄별 인덱싱

In [149]:
merged_df = csv.read_csv(config['data_path']+ 'csv/cleaned/crime_merged.csv').to_pandas()
# 범죄 소계 데이터가 2002 ~ 2019까지 있기 떄문에 그 이외의 데이터 제거
merged_df = merged_df[(merged_df.년도 >=2002)& (merged_df.년도 <=2019)]


yearNcities_name= merged_df.columns[:2].to_list()
crimes = merged_df.columns[2:15].to_list()
indiventent_val_name = merged_df.columns[15:].to_list()
idx = 0
# 범죄 소계로 초기화

crime_Specification=pd.DataFrame([[idx,'범죄소계']],columns = ['idx','범죄명'])

result = merged_df[yearNcities_name + indiventent_val_name]
result['범죄수'] = merged_df[crimes].sum(axis = 1)
result['범죄종류'] = idx
result = result[yearNcities_name + ['범죄종류','범죄수'] + indiventent_val_name]
idx+=1

for crime_name in  crimes:
    for_Merge = merged_df[yearNcities_name + [crime_name] + indiventent_val_name]
    for_Merge = for_Merge.rename(columns={crime_name:'범죄수'})
    for_Merge['범죄종류'] = idx   

    crime_Specification = crime_Specification.append(pd.Series(
            [
                idx,
                crime_name
            ] , index = crime_Specification.columns)
            ,ignore_index= True)
    result =pd.concat(
        [
        result,
        for_Merge[yearNcities_name + ['범죄종류','범죄수'] + indiventent_val_name]
        ],
        ignore_index=True)
    idx+=1
result.to_csv(config['data_path']+ 'csv/cleaned/crime_indexed.csv',encoding='utf-8-sig',index=False)
crime_Specification.to_csv(config['data_path']+ 'csv/cleaned/crime_Specification.csv',encoding='utf-8-sig',index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['범죄수'] = merged_df[crimes].sum(axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['범죄종류'] = idx


# 예측한 데이터프레임에 기존 데이터 삽입

In [385]:
def df_round(df , cols = ['이혼율','고용률','실업률','한국인_남녀비율','한국인_인구밀도','경찰청_인원_명당_담당_인구']):
    count_col_name= df.columns.to_list()
    for colN in cols:
        count_col_name.remove(colN)
    for colN in count_col_name:
        df[colN] = df[colN].round()
    df['시도'] = df['시도'].astype(int)
    df['년도'] = df['년도'].astype(int)
    return df
def alter_pridict(df,raw_df):
    for colName in raw_df.columns[2:]:
        for _,items in raw_df[raw_df[colName].isna() !=True][['시도','년도',colName]].iterrows():
            df.loc[
            (df.시도==items[0]) & (df.년도==items[1]),colName
            ] = items[2]
    return df   

sido_code = pd.read_sql("select * from sido_code",con=conn) 
sido_code['CODE'] = sido_code['CODE'].astype(int)
sejong_code = sido_code[sido_code.KOR_NAME == '세종'].iloc[0]['CODE']

indi_val = df_round(pd.read_sql("select * from independent_val",con=conn).astype(float) )
indi_val = indi_val[indi_val.시도 != sejong_code]

indi_zero = df_round(pd.read_sql("select * from independent_zero",con=conn).astype(float) )
indi_zero = indi_zero[indi_zero.시도 != sejong_code]

indi_mean = df_round(pd.read_sql("select * from independent_mean",con=conn).astype(float) )
indi_mean = indi_mean[indi_mean.시도 != sejong_code]

raw_02_19 = indi_val[(indi_val.년도 >= 2002)].sort_values(['시도' ,'년도'])
zero_02_19 = indi_zero[(indi_zero.년도 >= 2002)].sort_values(['시도' ,'년도'])
mean_02_19 = indi_mean[(indi_mean.년도 >= 2002)].sort_values(['시도' ,'년도'])
 
alter_pridict(zero_02_19,raw_02_19).to_csv(config['data_path']+ 'csv/cleaned/independent/zero_insert_rawData.csv',encoding='utf-8-sig',index=False)
alter_pridict(mean_02_19,raw_02_19).to_csv(config['data_path']+ 'csv/cleaned/independent/mean_insert_rawData.csv',encoding='utf-8-sig',index=False)


# 결측값 체우기

In [403]:
crime_df  = csv.read_csv('../data/csv/cleaned/crime_merged.csv').to_pandas()
crime_df = crime_df[crime_df.시도 != '세종']
indi_df = crime_df.drop(crime_df.columns[2:15],axis = 1 )

def predict_(df_,sido,years):
    df_.sort_values(by = '년도',inplace= True)    
    x = df_.년도
    y = df_.iloc[:,2:]
    # 모델 생성
    model = LinearRegression()
    model.fit(x.values.reshape(-1,1),y)
    # 예측
    result = model.predict(np.array(years).reshape(-1,1))
    return pd.concat(
        [
            pd.DataFrame({'시도': [sido], '년도' : [years]}),
            pd.DataFrame(result, columns=indi_df.columns[2:])
            ],axis=1)

def predict_regression(df_,sido,years):
    df_.sort_values(by = '년도',inplace= True)    
    x = df_.년도
    y = df_.iloc[:,2:]
    # 모델 생성
    model = LinearRegression()
    model.fit(x.values.reshape(-1,1),y)
    # 예측
    result = model.predict(years.reshape(-1,1))

    year_sido=  pd.DataFrame(np.arange(2000,2020),columns=['년도'])
    year_sido['시도'] = sido    
    return pd.concat(
            [
                year_sido[year_sido.columns[::-1]],pd.DataFrame(result, columns=indi_df.columns[2:])
                ],axis=1)

# result = pd.DataFrame(columns = indi_df.columns)
# 결측값 0으로 삽입
result = pd.DataFrame(columns = indi_df.columns)
for sido in indi_df.시도.unique():
    for_predict = indi_df[indi_df.시도 == sido].fillna(0)
    for year in range(2021,2031):
        for_predict = for_predict.append(predict_(for_predict,sido,year))
    result= result.append(for_predict)
result.to_csv(config['data_path']+ 'csv/cleaned/independent/zero_predict.csv',encoding='utf-8-sig',index=False)

# 결측값 평균값으로 삽입
result = pd.DataFrame(columns = indi_df.columns)
for sido in indi_df.시도.unique():
    for_predict = indi_df[indi_df.시도 == sido].fillna(indi_df.where(pd.notnull(indi_df),indi_df.mean(),axis=  'columns'))
    for year in range(2021,2031):
        for_predict = for_predict.append(predict_(for_predict,sido,year))
    result= result.append(for_predict)
result.to_csv(config['data_path']+ 'csv/cleaned/independent/mean_predict.csv',encoding='utf-8-sig',index=False)

# 결측값 회기분석으로 삽입
result = pd.DataFrame(columns = indi_df.columns)
for sido in indi_df.시도.unique():
    # 선형회기로 데이터프레임 생성 후 결측값 대체
    for_predict = alter_pridict(
                        predict_regression(
                            indi_df[indi_df.시도 == sido].dropna() ,sido, np.arange(2000,2020)
                            ) , indi_df[indi_df.시도 == sido])
    for year in range(2021,2031):
        for_predict = for_predict.append(predict_(for_predict,sido,year))
    result= result.append(for_predict)
result.to_csv(config['data_path']+ 'csv/cleaned/independent/regression_predict.csv',encoding='utf-8-sig',index=False)



# 데이터들 인구수로 나누어 정규화

In [408]:
def div_by_pop(df):    
    percent_cols = [
    '이혼율','고용률 (%)','실업률 (%)','1인당 지역내총생산', '1인당 지역총소득', '1인당 개인소득', '1인당 민간소비','경찰청 인원 1명당 담당 인구','한국인(남녀비율[백분율])','한국인(인구밀도)'
    ]
    percent_cols = list(map(transColName,percent_cols))

    other_cols = df.columns.drop(percent_cols)
    population = df[transColName('총인구수 (명)')]
    other_cols.drop(transColName('한국인(총인구수[명])'))
    div_by_population = df[other_cols].iloc[:,2:].div(population,axis = 0).iloc[:,:-1]
    df[div_by_population.columns] = div_by_population
    
    drop_list= ['한국인(총인구수[명])' ,'총인구수 (명)','경찰청 인원 1명당 담당 인구']
    drop_list= list(map(transColName,drop_list))
    df.drop(drop_list,axis =1,inplace=True)
    return df

    
mean_ = csv.read_csv(config['data_path']+ 'csv/cleaned/independent/mean_insert_rawData.csv').to_pandas()
mean_div = div_by_pop(mean_.copy())
pd.concat( [mean_div, mean_.iloc[:,-1]] , axis = 1).to_csv(config['data_path']+ 'csv/cleaned/independent/mean_insert_div_pop.csv' , encoding='utf-8-sig',index = False)

zero_ = csv.read_csv(config['data_path']+ 'csv/cleaned/independent/zero_insert_rawData.csv').to_pandas()
zero_div = div_by_pop(zero_.copy())
pd.concat( [zero_div, zero_.iloc[:,-1]] , axis = 1).to_csv(config['data_path']+ 'csv/cleaned/independent/zero_insert_div_pop.csv' , encoding='utf-8-sig',index = False)



##############

zero_predict = csv.read_csv(config['data_path']+ 'csv/cleaned/independent/zero_predict.csv').to_pandas()
zero_predict.columns = percent_cols = list(map(transColName,zero_predict.columns))
zero_predict_div = div_by_pop(zero_predict.copy())
pd.concat( [zero_predict_div, zero_predict.iloc[:,-1]] , axis = 1).to_csv(config['data_path']+ 'csv/cleaned/independent/zero_predict_div_pop.csv' , encoding='utf-8-sig',index = False)

mean_predict = csv.read_csv(config['data_path']+ 'csv/cleaned/independent/mean_predict.csv').to_pandas()
mean_predict.columns = percent_cols = list(map(transColName,mean_predict.columns))
mean_predict_div = div_by_pop(mean_predict.copy())
pd.concat( [mean_predict_div, mean_predict.iloc[:,-1]] , axis = 1).to_csv(config['data_path']+ 'csv/cleaned/independent/mean_predict_div_pop.csv' , encoding='utf-8-sig',index = False)

regression_predict = csv.read_csv(config['data_path']+ 'csv/cleaned/independent/regression_predict.csv').to_pandas()
regression_predict.columns = percent_cols = list(map(transColName,regression_predict.columns))
regression_predict_div = div_by_pop(regression_predict.copy())
pd.concat( [regression_predict_div, regression_predict.iloc[:,-1]] , axis = 1).to_csv(config['data_path']+ 'csv/cleaned/independent/regression_predict_div_pop.csv' , encoding='utf-8-sig',index = False)



