# Import Library

In [1]:
# Import Libraries
import os
import pandas as pd
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Visuzliation Setting
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from matplotlib import rc
from matplotlib import colors
import seaborn as sns

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


---

In [3]:
root = os.path.join('/content/drive/MyDrive/BigContest', 'DATA')

In [4]:
from datetime import date, timedelta


def set_week(df, date):
    '''
    df : datetime 형식의 컬럼을 가지고 있는 dataframe
    date : df에서 datetime 형식을 가진 컬럼명
    return : date의 연도 컬럼과 주차 컬럼을 추가한 dataframe
    '''
    df[date] = pd.to_datetime(df[date])
    df[date] = df[date].dt.date
    df['year'] = df.apply(func=lambda x: x[date].isocalendar()[0], axis=1)
    df['week'] = df.apply(func=lambda x: x[date].isocalendar()[1], axis=1)
    df.drop(date, axis=1, inplace=True)
    

def check_week(df):
    '''
    df에 date가 전부 있는지 확인
    '''
    cnt = 0
    sdate = date(2015, 12, 28)   # start date
    edate = date(2019, 12, 30)   # end date
    delta = edate - sdate       # as timedelta
    mem = set()
    
    for i in range(delta.days + 1):
        day = sdate + timedelta(days=i)
        year, week = day.isocalendar()[0], day.isocalendar()[1]
        if year * 100 + week in mem:
          continue
        mem.add(year * 100 + week)
        if df[(df['year'] == year) & (df['week'] == week)].empty:
          print((year, week), end="")
          cnt += 1
    if cnt > 0:
        print()
    print("missing", cnt, "values")    

# Training Data
빅콘테스트 제공 데이터 전처리

## Import Data

In [5]:
df_raw = pd.read_excel(os.path.join(root, 'train.xlsx'))
df_train = df_raw.copy()

### One hot encoding
P_IMPORT_TYPE 이라는 특수한 컬럼에 대한 전처리

In [6]:
import_type_list = set()
for tmp in df_train.P_IMPORT_TYPE.unique():
    for a in tmp.split(','):
        import_type_list.add(a)

In [7]:
for name in import_type_list:
    df_train[name] = 0
    df_train.loc[df_train['P_IMPORT_TYPE'].str.contains(name, regex=False), name] = 1

### Add Columns

In [8]:
set_week(df_train, 'REG_DATE')

In [9]:
check_week(df_train)

(2017, 2)
missing 1 values


In [10]:
# 어종별 수 컬럼 추가
df_train['name_cnt'] = 0;
value_dict = {}
for name, value in zip(df_train['P_NAME'].value_counts().index,df_train['P_NAME'].value_counts()):
    value_dict[name] = value

def value(col):
    return value_dict[col]

df_train['name_cnt'] = df_train['P_NAME'].apply(value)

In [11]:
# 가공 여부 컬럼 추가
df_train['is_processed'] = (df_train['CTRY_1'] != df_train['CTRY_2'])

In [12]:
df_train['import_cnt'] = 0
for i, row in df_train.iterrows():
    for name in import_type_list:
        if row[name] == 1:
            df_train.at[i, 'import_cnt'] += 1

---

# Weather data

## Import Data

In [13]:
df_weather_code = pd.read_csv(os.path.join(root, 'raw_weather_code.csv'), header=0, index_col=0)

In [14]:
weather_list = [pd.read_csv(os.path.join(root, 'raw_weather_20151228_20161227.csv'), encoding='euc-kr') , 
                pd.read_csv(os.path.join(root, 'raw_weather_20161228_20171227.csv'), encoding='euc-kr') , 
                pd.read_csv(os.path.join(root, 'raw_weather_20171228_20181227.csv'), encoding='euc-kr') , 
                pd.read_csv(os.path.join(root, 'raw_weather_20181228_20191227.csv'), encoding='euc-kr') , 
                pd.read_csv(os.path.join(root, 'raw_weather_20191228_20201227.csv'), encoding='euc-kr') , 
                pd.read_csv(os.path.join(root, 'raw_weather_20201228_20210818.csv'), encoding='euc-kr')]


## Preprocess
- '지점'에 따른 나라명 컬럼(CTRY_1)과 해안가여부(is_waterfront) 추가
- 각 나라, 일자 별로 평균 강수량, 풍속, 기온 계산

In [15]:
# 지점에 따라 나라명 추가
def set_country(row):
    data = df_weather_code[df_weather_code['지점'] == row['지점']]
    if data.empty:
        return ""
    return data.iloc[0]['국가명']


def set_waterfront(row):
    data = df_weather_code[df_weather_code['지점'] == row['지점']]
    if data.empty or data.iloc[0]['해안가여부'] != 1:
        return False
    return True
    

def preprocess_weather(df):
    df = df[(df['강수량'] >= 0) & (df['풍속'] >= 0)] # 이상치/결측치 제거
    set_week(df, '일시')  # 날짜 정보 처리
    
    # 1차 평균
    columns = ['year', 'week', '지점']
    df['rain'] = df[columns + ['강수량']].groupby(columns).transform('mean')
    df['wind'] = df[columns + ['풍속']].groupby(columns).transform('mean')
    df['temperature'] = df[columns + ['기온']].groupby(columns).transform('mean')
    
    # 나라명 추가
    df['CTRY_1'] = ""
    for i, row in df.iterrows():
        df.at[i, 'CTRY_1'] = set_country(row)
    
    # 해안가 여부 추가
    df['is_waterfront'] = False
    for i, row in df.iterrows():
        df.at[i, 'is_waterfront'] = set_waterfront(row)
        
#     df = df[df['is_waterfront']]  # 해안가가 아닌 데이터 제외
    
    # 2차 평균
    columns = ['year', 'week', 'CTRY_1']
    df['rain'] = df[columns + ['강수량']].groupby(columns).transform('mean')
    df['wind'] = df[columns + ['풍속']].groupby(columns).transform('mean')
    df['temperature'] = df[columns + ['기온']].groupby(columns).transform('mean')

    # 컬럼/행 정리
    df.drop(columns=['지점명', '지점', '강수량', '풍속', '기온', 'is_waterfront'], inplace=True)
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)
    
    return df

In [None]:
for i in range(len(weather_list)):
    weather_list[i] = preprocess_weather(weather_list[i])

In [None]:
df_weather = pd.concat(weather_list)

## Check

In [None]:
df_weather.describe()

In [None]:
for country in df_weather['CTRY_1'].unique():
    if country == "":
        continue
    print()
    print(country, "총", len(df_weather[df_weather['CTRY_1'] == country]), "개")
    check_week(df_weather[df_weather['CTRY_1'] == country])

## Filling Missing Values

- 노르웨이, 중국은 결측치 없음
- 태국, 베트남, 칠레는 결측치 1개
> 전/차주 데이터 평균으로 채워넣기
- 페루는 결측치 299개
> 사용 불가

In [None]:
def get_avg(year, week, country, value):
  return (df_weather[(df_weather['year'] == year) & (df_weather['week'] == week + 1) & (df_weather['CTRY_1'] == country)].iloc[0][value] 
          + df_weather[(df_weather['year'] == year) & (df_weather['week'] == week - 1) & (df_weather['CTRY_1'] == country)].iloc[0][value]) / 2

In [None]:
df_weather = df_weather.append({'year': 2016,
                                'week': 11,
                                'CTRY_1': '태국',
                                'rain': get_avg(2016, 11, '태국', 'rain'),
                                'wind': get_avg(2016, 11, '태국', 'wind'),
                                'temperature': get_avg(2016, 11, '태국', 'temperature')},
                               ignore_index=True)
df_weather = df_weather.append({'year': 2016,
                                'week': 6,
                                'CTRY_1': '베트남',
                                'rain': get_avg(2016, 6, '베트남', 'rain'),
                                'wind': get_avg(2016, 6, '베트남', 'wind'),
                                'temperature': get_avg(2016, 6, '베트남', 'temperature')},
                               ignore_index=True)
df_weather = df_weather.append({'year': 2016,
                                'week': 5,
                                'CTRY_1': '칠레',
                                'rain': get_avg(2016, 5, '칠레', 'rain'),
                                'wind': get_avg(2016, 5, '칠레', 'wind'),
                                'temperature': get_avg(2016, 5, '칠레', 'temperature')},
                               ignore_index=True)

for country in ["태국", "베트남", "칠레"]:
    print()
    print(country, "총", len(df_weather[df_weather['CTRY_1'] == country]), "개")
    check_week(df_weather[df_weather['CTRY_1'] == country])

## Add to Training Data

In [None]:
df_train = pd.merge(df_train, df_weather, how='left', on=['year', 'week', 'CTRY_1'])

---

# Salinity

## Import Data

In [None]:
# df_salinity = pd.read_csv(os.path.join(root, 'raw_salinity.csv'))

## Drop Columns & Rows

In [None]:
# df_salinity.drop(df_salinity.columns[2], inplace=True, axis=1)
# df_salinity = df_salinity[(2015 <= df_salinity['obs_year']) & (df_salinity['obs_year'] <= 2021)]

2020, 2021 데이터의 부재로 인해 보류

---

# Oil

## Import Data

In [None]:
df_oil = pd.read_csv(os.path.join(root, 'raw_oil.csv'))
df_oil_dubai = pd.read_csv(os.path.join(root, 'raw_oil_dubai.csv'))
df_oil_brent = pd.read_csv(os.path.join(root, 'raw_oil_brent.csv'))

## Preprocess

In [None]:
from datetime import datetime
def preprocess_oil(df):
    df['date'] = pd.to_datetime(df['날짜'].str[:4] + df['날짜'].str[6:8] + df['날짜'].str[10:12])
    set_week(df, 'date')  # 날짜 데이터 정리
    df['oil'] = df[['year', 'week', '종가']].groupby(['year', 'week']).transform('mean')
    df.drop(columns = ['날짜', '오픈', '고가', '저가', '거래량', '변동 %', '종가'], inplace=True, axis=1)
    df.drop_duplicates(inplace=True)
    return df

In [None]:
df_oil = preprocess_oil(df_oil)
df_oil_dubai = preprocess_oil(df_oil_dubai)
df_oil_brent = preprocess_oil(df_oil_brent)

## Check

In [None]:
for i in range(1, 53):
  if i not in (list(df_oil_brent[df_oil_brent['year'] == 2019].sort_values(by=['year', 'week'])['week'])):
    print(i)

In [None]:
check_week(df_oil)
check_week(df_oil_dubai)
check_week(df_oil_brent)

모든 데이터가 결측치는 없지만,  
df_oil 에 해당하는 wti 종가가 가장 예민하게 반응하는 값이므로  
이를 사용하도록 한다

## Add to Training Data

In [None]:
df_train = pd.merge(df_train, df_oil, how='left', on=['year', 'week'])

---

# Korea Weather

## Import Data

In [None]:
df_weather_kr = pd.read_csv(os.path.join(root, 'raw_weather_korea.csv'), encoding='euc-kr')

## Preprocess Date

In [None]:
df_weather_kr['date'] = pd.to_datetime(df_weather_kr['일시'].str[:4] + df_weather_kr['일시'].str[5:7] + df_weather_kr['일시'].str[8:])
set_week(df_weather_kr, 'date')

In [33]:
df_weather_kr.sample(5)

Unnamed: 0,지점,일시,평균 풍속(m/s),평균 기온(°C),평균 수온(°C),year,week
31081,22189,2020-10-28,4.5,19.3,21.4,2020,44
22222,22185,2017-10-07,3.0,18.7,19.2,2017,40
30604,22189,2019-06-14,3.2,20.1,19.9,2019,24
6125,22103,2016-07-07,2.3,22.0,22.3,2016,27
35467,22298,2021-03-30,4.2,9.4,9.2,2021,13


## Group
일자 별로 평균 구하기

In [34]:
df_weather_kr['wind_kr'] = df_weather_kr[['year', 'week', '평균 풍속(m/s)']].groupby(['year', 'week']).transform('mean')
df_weather_kr['temperature_kr'] = df_weather_kr[['year', 'week', '평균 기온(°C)']].groupby(['year', 'week']).transform('mean')
df_weather_kr['water_temp_kr'] = df_weather_kr[['year', 'week', '평균 수온(°C)']].groupby(['year', 'week']).transform('mean')

## Drop Column
어차피 전부 해안가 대한민국이라 지점은 필요없다  
사용한 컬럼은 제거한다

In [35]:
drop = ['지점', '일시', '평균 풍속(m/s)', '평균 기온(°C)', '평균 수온(°C)']

In [36]:
df_weather_kr.drop(columns=drop, inplace=True, axis=1)
df_weather_kr.drop_duplicates(inplace=True)

## Check

In [37]:
check_week(df_weather_kr)

missing 0 values


## Add to Training Data

In [38]:
df_train = pd.merge(df_train, df_weather_kr, how='left', on=['year', 'week'])

---

# 소비자물가지수

## Import Data

In [40]:
cpi = pd.read_csv(os.path.join(root, 'raw_cpi_purpose.csv'), encoding='cp949')  # CPI

## 소비자물가 총지수

### Drop Columns/Rows

In [41]:
cpi_purpose.drop('시도별',axis=1,inplace=True)

cpi_purpose_copy = cpi_purpose.copy()

cpi_purpose_copy.drop('지출목적별',axis=1,inplace=True)

cpi_total = cpi_purpose_copy.iloc[0,:] # 전국 소비자물가 총지수만 추출

cpi_total = pd.DataFrame(cpi_total.values,columns=['cpi'],index=cpi_total.index)

cpi_total

NameError: ignored

### Pivoting

In [None]:
cpi_purpose_copy = cpi_purpose.copy()

cpi_purpose_copy.drop('지출목적별',axis=1,inplace=True)

cpi_total = cpi_purpose_copy.iloc[0,:] # 전국 소비자물가 총지수만 추출

cpi_total = pd.DataFrame(cpi_total.values,columns=['cpi'],index=cpi_total.index)

cpi_total

## 지출목적별 소비자물가지수

In [None]:
cpi_purpose2 = cpi_purpose.iloc[[1,11],:]

cpi_purpose2 # 전국 식료품별, 음식서비스별 cpi만 추출

In [None]:
cpi_purpose3 = pd.DataFrame(cpi_purpose2.iloc[0,:].values,index = cpi_purpose2.columns,columns=['식료품 cpi'])

cpi_purpose3['음식 및 숙박 cpi'] = cpi_purpose2.iloc[1,:].values

cpi_purpose3.drop('지출목적별',axis=0,inplace=True) # 지출목적별 행 삭제

cpi_purpose3 # 식료품, 음식 cpi

## Preprocess Date

In [None]:
cpi_total.reset_index(inplace=True)  # 날짜 인덱스 > 컬럼으로 변경
cpi_purpose3.reset_index(inplace=True)  # 날짜 인덱스 > 컬럼으로 변경

In [None]:
cpi_total['year'] = cpi_total['index'].str[:4].astype('int')
cpi_total['month'] = cpi_total['index'].str[5:].astype('int')
cpi_total.drop(columns=['index'], inplace=True, axis=1)

cpi_purpose3['year'] = cpi_purpose3['index'].str[:4].astype('int')
cpi_purpose3['month'] = cpi_purpose3['index'].str[5:].astype('int')
cpi_purpose3.drop(columns=['index'], inplace=True, axis=1)

## Add to Training Data

In [None]:
df_train = pd.merge(df_train, df_cpi1, how='left', on=['year', 'month'])
df_train = pd.merge(df_train, df_cpi2, how='left', on=['year', 'month'])

In [None]:
pd.set_option("display.max_columns", None)
df_train.sample(5)

---

# Final

## One-hot Encoding

In [152]:
one_hot = ['CTRY_1', 'CTRY_2', 'P_PURPOSE', 'CATEGORY_1', 'CATEGORY_2', 'P_NAME']

In [153]:
df_train = pd.get_dummies(df_train, columns=one_hot)

## Drop Columns

In [154]:
drop = ['REG_DATE', 'P_TYPE', 'P_IMPORT_TYPE']

In [155]:
df_train.drop(columns = drop, inplace=True)

KeyError: ignored

# Exchange

In [88]:
root_2 = '/content/drive/MyDrive/BigContest/DATA_기타/환율/'

In [89]:
exchange_list = [pd.read_csv(root_2 + 'exchange_chile_7.csv'),
                 pd.read_csv(root_2 + 'exchange_chn_7.csv'),
                 pd.read_csv(root_2 + 'exchange_nor_7.csv'),
                 pd.read_csv(root_2 + 'exchange_peru_7.csv'),
                 pd.read_csv(root_2 + 'exchange_thai_7.csv'),
                 pd.read_csv(root_2 + 'exchange_vit_final.csv')]

In [90]:
for lst in exchange_list:
  lst.drop('Unnamed: 0', axis=1, inplace=True)

for lst in exchange_list:
  set_week(lst, '날짜')

for lst in exchange_list:
  check_week(lst)

missing 0 values
missing 0 values
missing 0 values
missing 0 values
missing 0 values
missing 0 values


In [91]:
CTRY_list = ['칠레', '중국', '노르웨이', '페루', '태국', '베트남']

for num, nation in enumerate(CTRY_list):
  exchange_list[num]['CTRY_2']=nation

In [92]:
for i in range(1):
  concat_df = pd.concat([exchange_list[i], exchange_list[i+1],exchange_list[i + 2],exchange_list[i+ 3],exchange_list[i + 4],exchange_list[i + 5]], axis=0)
final_merge = pd.merge(df_train, concat_df, how='left', on=['year','week','CTRY_2'])

In [97]:
final_merge

Unnamed: 0,P_TYPE,CTRY_1,CTRY_2,P_PURPOSE,CATEGORY_1,CATEGORY_2,P_NAME,P_IMPORT_TYPE,P_PRICE,횟감,활,난포선,캐비아대용,곤이,알,필렛(F),머리,목살,냉동,다리,턱살,머리_외화획득용,한쪽껍질붙은,눈살,냉장,슬라이스(S),간,집게다리,포장횟감,절단,머리살,지느러미,훈제,개아지살,자숙,껍질,살,내장,외투막,염장,줄기,동체,볼살,건조,창난,꼬리_외화획득용,year,week,name_cnt,is_processed,import_cnt,rain,wind,temperature,oil,wind_kr,temperature_kr,water_temp_kr,종가
0,수산물,아르헨티나,아르헨티나,판매용,갑각류,새우,아르헨티나붉은새우,냉동,7.480000,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2015,53,79,False,1,,,,37.08,5.352941,7.973109,13.732174,
1,수산물,바레인,바레인,판매용,갑각류,게,꽃게,냉동,2.920000,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2015,53,596,False,1,,,,37.08,5.352941,7.973109,13.732174,
2,수산물,바레인,바레인,판매용,갑각류,게,꽃게,"냉동,절단",3.356352,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2015,53,596,False,2,,,,37.08,5.352941,7.973109,13.732174,
3,수산물,칠레,칠레,판매용,패류 멍게류,해삼,해삼,"건조,자숙",18.260870,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,2015,53,328,False,2,0.580000,6.833333,16.963333,37.08,5.352941,7.973109,13.732174,0.0014
4,수산물,중국,중국,판매용,어류,서대 박대 페루다,서대,냉동,4.791340,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2015,53,359,False,1,1.101826,2.242009,6.697717,37.08,5.352941,7.973109,13.732174,0.1541
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59057,수산물,러시아,러시아,판매용,갑각류,게,왕게,활,38.437099,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2020,1,543,False,1,,,,61.66,6.534266,6.942657,13.188652,
59058,수산물,중국,중국,판매용,연체류 해물모듬,낙지,낙지,활,8.282274,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2020,1,1617,False,1,2.045970,2.498507,5.346567,61.66,6.534266,6.942657,13.188652,0.1431
59059,수산물,러시아,러시아,판매용,갑각류,게,대게,활,23.396192,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2020,1,603,False,1,,,,61.66,6.534266,6.942657,13.188652,
59060,수산물,중국,중국,판매용,패류 멍게류,바지락,바지락,활,0.852967,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2020,1,601,False,1,2.045970,2.498507,5.346567,61.66,6.534266,6.942657,13.188652,0.1431


## Save Files

In [None]:
def save_file(df, file_name):
    df.to_csv(os.path.join(root, file_name), encoding='utf-8', index=False)

In [None]:
save_file(df_weather, 'preprocessed_weather.csv')  # 제조국 날씨
save_file(df_oil, 'preprocessed_oil.csv')  # 원유 종가
save_file(df_weather_kr, 'preprocessed_weather_korea.csv')  # 한국 날씨
save_file(df_cpi1, 'preprocessed_cpi_region.csv')  # 전체 소비자물가지수
save_file(df_cpi2, 'preprocessed_cpi_purpose.csv')  # 음식/음식서비스 소비자물가지수
save_file(df_train, 'preprocessed_train.csv')  # 최종 df