# Import Library

In [141]:
# Import Libraries
import os
import pandas as pd
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Visuzliation Setting
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from matplotlib import rc
from matplotlib import colors
import seaborn as sns

---

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [142]:
root = os.path.join(os.getcwd(), 'DATA')

In [226]:
from datetime import datetime


def set_week(df, date):
    '''
    df : datetime 형식의 컬럼을 가지고 있는 dataframe
    date : df에서 datetime 형식을 가진 컬럼명
    return : date의 연도 컬럼과 주차 컬럼을 추가한 dataframe
    '''
    df[date] = pd.to_datetime(df[date])
    df[date] = df[date].dt.date
    df['year'] = df.apply(func=lambda x: x[date].isocalendar()[0], axis=1)
    df['week'] = df.apply(func=lambda x: x[date].isocalendar()[1], axis=1)
    df.drop(date, axis=1, inplace=True)
    

def check_week(df):
    '''
    df에 date가 전부 있는지 확인
    '''
    cnt = 0
    year, week = 2015, datetime(2015, 12, 28).isocalendar()[1]
    if df[(df['year'] == year) & (df['week'] == week)].empty:
        print((year, week), end="")
        cnt += 1
    for year in range(2016, 2020):
        for week in range(datetime(year, 1, 1).isocalendar()[1], datetime(year, 12, 31).isocalendar()[1] + 1):
            if df[(df['year'] == year) & (df['week'] == week)].empty:
                print((year, week), end="")
                cnt += 1
    if cnt > 0:
        print()
    print("missing", cnt, "values")    

# Training Data
빅콘테스트 제공 데이터 전처리

## Import Data

In [152]:
df_raw = pd.read_excel(os.path.join(root, 'train.xlsx'))
df_train = df_raw.copy()

### One hot encoding
P_IMPORT_TYPE 이라는 특수한 컬럼에 대한 전처리

In [189]:
import_type_list = set()
for tmp in df_train.P_IMPORT_TYPE.unique():
    for a in tmp.split(','):
        import_type_list.add(a)

In [190]:
for name in import_type_list:
    df_train[name] = 0
    df_train.loc[df_train['P_IMPORT_TYPE'].str.contains(name, regex=False), name] = 1

### Add Columns

In [156]:
set_week(df_train, 'REG_DATE')

In [178]:
check_week(df_train)

missing 0 values


In [179]:
# 어종별 수 컬럼 추가
df_train['name_cnt'] = 0;
value_dict = {}
for name, value in zip(df_train['P_NAME'].value_counts().index,df_train['P_NAME'].value_counts()):
    value_dict[name] = value

def value(col):
    return value_dict[col]

df_train['name_cnt'] = df_train['P_NAME'].apply(value)

In [186]:
# 가공 여부 컬럼 추가
df_train['is_processed'] = (df_train['CTRY_1'] != df_train['CTRY_2'])

In [None]:
df_train['import_cnt'] = 0
for i, row in df_train.iterrows():
    for name in import_type_list:
        if row[name] == 1:
            df_train.at[i, 'import_cnt'] += 1

---

# Weather data

## Import Data

In [15]:
df_weather_code = pd.read_csv(os.path.join(root, 'raw_weather_code.csv'), header=0, index_col=0)

In [138]:
weather_list = [pd.read_csv(os.path.join(root, 'raw_weather_20151228_20161227.csv'), encoding='euc-kr') , 
                pd.read_csv(os.path.join(root, 'raw_weather_20161228_20171227.csv'), encoding='euc-kr') , 
                pd.read_csv(os.path.join(root, 'raw_weather_20171228_20181227.csv'), encoding='euc-kr') , 
                pd.read_csv(os.path.join(root, 'raw_weather_20181228_20191227.csv'), encoding='euc-kr') , 
                pd.read_csv(os.path.join(root, 'raw_weather_20191228_20201227.csv'), encoding='euc-kr') , 
                pd.read_csv(os.path.join(root, 'raw_weather_20201228_20210818.csv'), encoding='euc-kr')]


## Preprocess
- '지점'에 따른 나라명 컬럼(CTRY_1)과 해안가여부(is_waterfront) 추가
- 각 나라, 일자 별로 평균 강수량, 풍속, 기온 계산

In [191]:
# 지점에 따라 나라명 추가
def set_country(row):
    data = df_weather_code[df_weather_code['지점'] == row['지점']]
    if data.empty:
        return ""
    return data.iloc[0]['국가명']


def set_waterfront(row):
    data = df_weather_code[df_weather_code['지점'] == row['지점']]
    if data.empty or data.iloc[0]['해안가여부'] != 1:
        return False
    return True
    

def preprocess_weather(df):
    df = df[(df['강수량'] >= 0) & (df['풍속'] >= 0)] # 이상치/결측치 제거
    set_week(df, '일시')  # 날짜 정보 처리
    
    # 나라명 추가
    df['CTRY_1'] = ""
    for i, row in df.iterrows():
        df.at[i, 'CTRY_1'] = set_country(row)
    
    # 해안가 여부 추가
    df['is_waterfront'] = False
    for i, row in df.iterrows():
        df.at[i, 'is_waterfront'] = set_waterfront(row)
        
#     df = df[df['is_waterfront']]  # 해안가가 아닌 데이터 제외
    
    # 평균
    columns = ['year', 'week', 'CTRY_1']
    df['rain'] = df[columns + ['강수량']].groupby(columns).transform('mean')
    df['wind'] = df[columns + ['풍속']].groupby(columns).transform('mean')
    df['temperature'] = df[columns + ['기온']].groupby(columns).transform('mean')

    # 컬럼/행 정리
    df.drop(columns=['지점명', '지점', '강수량', '풍속', '기온', 'is_waterfront'], inplace=True)
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)
    
    return df

In [197]:
for i in range(len(weather_list)):
    weather_list[i] = preprocess_weather(weather_list[i])
df_weather = pd.concat(weather_list)

## Check

In [198]:
df_weather.describe()

Unnamed: 0,year,week,rain,wind,temperature
count,3042.0,3042.0,3042.0,3042.0,3042.0
mean,2018.241617,25.81098,5.599044,3.729863,15.529427
std,1.597784,14.990738,24.266469,2.941408,10.109316
min,2015.0,1.0,0.0,0.0,-21.6
25%,2017.0,13.0,1.60113,1.931809,8.719672
50%,2018.0,25.0,2.925263,2.5,15.845238
75%,2020.0,38.0,6.95063,5.372435,25.754902
max,2021.0,53.0,915.0,60.0,30.1


In [229]:
for country in df_weather['CTRY_1'].unique():
    if country == "":
        continue
    print()
    print(country)
    check_week(df_weather[df_weather['CTRY_1'] == country])


노르웨이
missing 0 values

태국
missing 0 values

베트남
missing 0 values

중국
missing 0 values

페루
(2015, 53)(2017, 52)(2018, 1)(2019, 1)
missing 4 values

칠레
missing 0 values


## Filling Missing Values

페루의 경우 결측치가 존재  
전주/차주 데이터 값의 평균으로 채워넣기

In [238]:
df_weather['year'].dtype

dtype('int64')

In [239]:
df_weather.append({'year': 2015, 'week': 53, "CTRY_1": "페루"}, ignore_index=True)
df_weather.append({'year': 2017, 'week': 52, "CTRY_1": "페루"}, ignore_index=True)
df_weather.append({'year': 2018, 'week': 1, "CTRY_1": "페루"}, ignore_index=True)
df_weather.append({'year': 2019, 'week': 1, "CTRY_1": "페루"}, ignore_index=True)
check_week(df_weather[df_weather['CTRY_1'] == "페루"])

(2015, 53)(2017, 52)(2018, 1)(2019, 1)
missing 4 values


In [243]:
df_weather[df_weather['CTRY_1'] == "페루"].sort_values(by=['year', 'week'])

Unnamed: 0,year,week,CTRY_1,is_waterfront,rain,wind,temperature
457,2016,50,페루,False,0.1,4.0,25.6
468,2017,11,페루,False,0.0,3.0,19.0
467,2017,49,페루,False,1.0,3.0,25.2
463,2018,7,페루,True,161.0,60.0,23.2
464,2018,50,페루,False,888.0,5.0,16.7
458,2019,49,페루,False,300.0,60.0,27.9
225,2021,28,페루,False,915.0,3.0,2.3


In [244]:
df_weather[df_weather['CTRY_1'] == "칠레"].sort_values(by=['year', 'week'])

Unnamed: 0,year,week,CTRY_1,is_waterfront,rain,wind,temperature
476,2015,53,칠레,True,0.580000,6.833333,16.963333
504,2015,53,칠레,False,0.580000,6.833333,16.963333
470,2016,1,칠레,True,2.600000,6.000000,19.366667
503,2016,2,칠레,True,1.000000,2.666667,15.333333
529,2016,3,칠레,False,1.500000,3.500000,15.200000
...,...,...,...,...,...,...,...
257,2021,31,칠레,False,1.979452,7.287671,8.743836
251,2021,32,칠레,True,0.752778,8.722222,10.780556
286,2021,32,칠레,False,0.752778,8.722222,10.780556
235,2021,33,칠레,True,3.554839,10.612903,10.017742


## Add to Training Data

In [None]:
df_train = pd.merge(df_train, df_weather, how='left', on=['year', 'week', 'CTRY_1'])

In [None]:
pd.set_option("display.max_columns", None)
df_train.sample(5)

---

# Salinity

## Import Data

In [None]:
# df_salinity = pd.read_csv(os.path.join(root, 'raw_salinity.csv'))

## Drop Columns & Rows

In [None]:
# df_salinity.drop(df_salinity.columns[2], inplace=True, axis=1)
# df_salinity = df_salinity[(2015 <= df_salinity['obs_year']) & (df_salinity['obs_year'] <= 2021)]

2020, 2021 데이터의 부재로 인해 보류

---

# Oil

## Import Data

In [None]:
df_oil = pd.read_csv(os.path.join(root, 'raw_oil.csv'))
df_oil_dubai = pd.read_csv(os.path.join(root, 'raw_oil_dubai.csv'))
df_oil_brent = pd.read_csv(os.path.join(root, 'raw_oil_brent.csv'))

## Preprocess

In [None]:
def preprocess_oil(df):
    df['date'] = datetime(df['날짜'].str[:4] + df['날짜'].str[6:8] + df['날짜'].str[10:12])
    set_week(df, 'date')  # 날짜 데이터 정리
    df['oil'] = df['year', 'week', '종가'].groupby(['year', 'week']).transform('mean')
    df.drop(columns = ['오픈', '고가', '저가', '거래량', '변동 %'], inplace=True, axis=1)
    df.sort_index(ascending=False)
    return df

In [None]:
df_oil = preprocess_oil(df_oil)
df_oil_dubai = preprocess_oil(df_oil_dubai)
df_oil_brent = preprocess_oil(df_oil_brent)

## Check

In [None]:
check_week(df_oil)
check_week(df_oil_dubai)
check_week(df_oil_brent)

oil과 비교하였을 때 dubai의 경우 28개, brent의 경우 3개의 데이터가 적다  
따라서 df_oil 데이터를 사용하도록 한다

## Add to Training Data

In [None]:
df_train = pd.merge(df_train, df_oil, how='left', on=['year', 'week'])

In [None]:
pd.set_option("display.max_columns", None)
df_train.sample(5)

---

# Korea Weather

## Import Data

In [None]:
df_weather_kr = pd.read_csv(os.path.join(root, 'raw_weather_korea.csv'), encoding='euc-kr')

## Preprocess Date

In [None]:
df_weather_kr['date'] = datetime(df_weather_kr['일시'].str[:4] + df_weather_kr['일시'].str[5:7] + df_weather_kr['일시'].str[8:])
set_week(df_weather_kr, 'date')

## Group
일자 별로 평균 구하기

In [None]:
df_weather_kr['wind_kr'] = df_weather_kr[['일시', '평균 풍속(m/s)']].groupby(['일시']).transform('mean')
df_weather_kr['temperature_kr'] = df_weather_kr[['일시', '평균 기온(°C)']].groupby(['일시']).transform('mean')
df_weather_kr['water_temp_kr'] = df_weather_kr[['일시', '평균 수온(°C)']].groupby(['일시']).transform('mean')

## Drop Column
어차피 전부 해안가 대한민국이라 지점은 필요없다  
사용한 컬럼은 제거한다

In [None]:
drop = ['지점', '평균 풍속(m/s)', '평균 기온(°C)', '평균 수온(°C)']

In [None]:
df_weather_kr.drop(columns=drop, inplace=True, axis=1)

## Add to Training Data

In [None]:
df_train = pd.merge(df_train, df_weather_kr, how='left', on=['year', 'week'])

In [None]:
pd.set_option("display.max_columns", None)
df_train.sample(5)

---

# 소비자물가지수

## Import Data

In [None]:
cpi_purpose = pd.read_csv("/content/drive/MyDrive/빅콘/Seafood_Import_Price_Prediction/DATA/cpi/raw_cpi_purpose.csv",encoding='cp949')  # CPI

cpi_purpose

## 소비자물가 총지수

### Drop Columns/Rows

In [None]:
cpi_purpose.drop('시도별',axis=1,inplace=True)

cpi_purpose_copy = cpi_purpose.copy()

cpi_purpose_copy.drop('지출목적별',axis=1,inplace=True)

cpi_total = cpi_purpose_copy.iloc[0,:] # 전국 소비자물가 총지수만 추출

cpi_total = pd.DataFrame(cpi_total.values,columns=['cpi'],index=cpi_total.index)

cpi_total

### Pivoting

In [None]:
cpi_purpose_copy = cpi_purpose.copy()

cpi_purpose_copy.drop('지출목적별',axis=1,inplace=True)

cpi_total = cpi_purpose_copy.iloc[0,:] # 전국 소비자물가 총지수만 추출

cpi_total = pd.DataFrame(cpi_total.values,columns=['cpi'],index=cpi_total.index)

cpi_total

## 지출목적별 소비자물가지수

In [354]:
cpi_purpose2 = cpi_purpose.iloc[[1,11],:]

cpi_purpose2 # 전국 식료품별, 음식서비스별 cpi만 추출

Unnamed: 0,지출목적별,2015. 12,2016. 01,2016. 02,2016. 03,2016. 04,2016. 05,2016. 06,2016. 07,2016. 08,2016. 09,2016. 10,2016. 11,2016. 12,2017. 01,2017. 02,2017. 03,2017. 04,2017. 05,2017. 06,2017. 07,2017. 08,2017. 09,2017. 10,2017. 11,2017. 12,2018. 01,2018. 02,2018. 03,2018. 04,2018. 05,2018. 06,2018. 07,2018. 08,2018. 09,2018. 10,2018. 11,2018. 12,2019. 01,2019. 02,2019. 03,2019. 04,2019. 05,2019. 06,2019. 07,2019. 08,2019. 09,2019. 10,2019. 11,2019. 12,2020. 01,2020. 02,2020. 03,2020. 04,2020. 05,2020. 06,2020. 07,2020. 08,2020. 09,2020. 10,2020. 11,2020. 12,2021. 01,2021. 02,2021. 03,2021. 04,2021. 05,2021. 06
1,01 식료품 · 비주류음료,99.95,100.76,103.43,102.25,102.3,101.24,99.66,99.65,101.02,105.94,104.48,102.93,104.02,107.16,107.26,106.32,105.19,104.85,104.09,104.39,107.5,108.84,106.16,103.2,104.44,106.0,109.07,107.32,107.86,106.85,105.48,105.8,111.34,115.2,112.16,108.67,108.86,108.8,109.54,108.48,109.4,108.83,107.64,106.68,107.7,110.52,110.74,107.89,109.01,110.81,110.27,111.26,111.29,111.48,111.18,111.28,114.78,119.68,119.82,115.35,115.79,118.04,120.97,120.61,120.34,119.7,118.43
11,11 음식 및 숙박,101.06,101.4,101.67,101.98,102.26,102.4,102.5,102.74,102.91,102.92,103.02,103.07,103.27,103.75,104.02,104.31,104.49,104.8,104.87,105.2,105.5,105.36,105.57,105.67,106.07,106.55,106.94,107.29,107.7,108.07,108.17,108.52,108.76,108.58,108.74,108.96,109.35,109.7,109.89,109.63,109.8,110.01,110.11,110.35,110.64,110.09,110.27,110.29,110.45,110.97,110.93,110.82,110.9,110.86,110.95,111.13,111.35,111.25,111.51,111.41,111.67,111.96,112.26,112.42,112.93,113.11,113.37


In [355]:
cpi_purpose3 = pd.DataFrame(cpi_purpose2.iloc[0,:].values,index = cpi_purpose2.columns,columns=['식료품 cpi'])

cpi_purpose3['음식 및 숙박 cpi'] = cpi_purpose2.iloc[1,:].values

cpi_purpose3.drop('지출목적별',axis=0,inplace=True) # 지출목적별 행 삭제

cpi_purpose3 # 식료품, 음식 cpi

Unnamed: 0,식료품 cpi,음식 및 숙박 cpi
2015. 12,99.95,101.06
2016. 01,100.76,101.4
2016. 02,103.43,101.67
2016. 03,102.25,101.98
2016. 04,102.3,102.26
...,...,...
2021. 02,120.97,112.26
2021. 03,120.61,112.42
2021. 04,120.34,112.93
2021. 05,119.7,113.11


## Preprocess Date

In [None]:
cpi_total.reset_index(inplace=True)  # 날짜 인덱스 > 컬럼으로 변경
cpi_purpose3.reset_index(inplace=True)  # 날짜 인덱스 > 컬럼으로 변경

In [None]:
cpi_total['year'] = cpi_total['index'].str[:4].astype('int')
cpi_total['month'] = cpi_total['index'].str[5:].astype('int')
cpi_total.drop(columns=['index'], inplace=True, axis=1)

cpi_purpose3['year'] = cpi_purpose3['index'].str[:4].astype('int')
cpi_purpose3['month'] = cpi_purpose3['index'].str[5:].astype('int')
cpi_purpose3.drop(columns=['index'], inplace=True, axis=1)

## Add to Training Data

In [None]:
df_train = pd.merge(df_train, df_cpi1, how='left', on=['year', 'month'])
df_train = pd.merge(df_train, df_cpi2, how='left', on=['year', 'month'])

In [None]:
pd.set_option("display.max_columns", None)
df_train.sample(5)

---

# Final

## One-hot Encoding

In [None]:
one_hot = ['CTRY_1', 'CTRY_2', 'P_PURPOSE', 'CATEGORY_1', 'CATEGORY_2', 'P_NAME']

In [None]:
df_train = pd.get_dummies(df_train, columns=one_hot)

## Drop Columns

In [None]:
drop = ['REG_DATE', 'P_TYPE', 'P_IMPORT_TYPE']

In [None]:
df_train.drop(columns = drop, inplace=True)

In [None]:
pd.set_option("display.max_columns", None)
df_train.sample(5)

## Save Files

In [None]:
def save_file(df, file_name):
    df.to_csv(os.path.join(root, file_name), encoding='utf-8', index=False)

In [None]:
save_file(df_weather, 'preprocessed_weather.csv')  # 제조국 날씨
save_file(df_oil, 'preprocessed_oil.csv')  # 원유 종가
save_file(df_weather_kr, 'preprocessed_weather_korea.csv')  # 한국 날씨
save_file(df_cpi1, 'preprocessed_cpi_region.csv')  # 전체 소비자물가지수
save_file(df_cpi2, 'preprocessed_cpi_purpose.csv')  # 음식/음식서비스 소비자물가지수
save_file(df_train, 'preprocessed_train.csv')  # 최종 df