# Import

## Library

In [1]:
# Import Libraries
import os
import pandas as pd
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Visualization Setting
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from matplotlib import rc
from matplotlib import colors
import seaborn as sns

In [2]:
# Directory the notebook reads its input files from: "DATA" under the current working directory.
root = os.path.join(os.getcwd(), 'DATA')

## Data

In [3]:
# Read the training workbook once; df_raw stays as loaded, df_train is the working copy
# that the following cells modify.
df_raw = pd.read_excel(
    os.path.join(root, 'train.xlsx'),
)
df_train = df_raw.copy(deep=True)

# Preprocess

In [4]:
# Expand the registration timestamp into separate calendar-part columns.
reg_date_parts = df_train['REG_DATE'].dt
for part in ('year', 'month', 'day'):
    df_train[part] = getattr(reg_date_parts, part)

In [5]:
# Collect every distinct token that appears in the comma-separated P_IMPORT_TYPE values.
# Missing values are skipped first: NaN has no .split() and would raise AttributeError.
import_type_list = {
    token
    for joined in df_train['P_IMPORT_TYPE'].dropna().unique()
    for token in joined.split(',')
}

In [6]:
# One 0/1 indicator column per import type.
# Membership is tested against the exact comma-separated tokens: a substring search
# (str.contains) would also flag rows whose type merely contains `name`, e.g. "A" inside "AB".
import_tokens = df_train['P_IMPORT_TYPE'].str.split(',')
for name in import_type_list:
    # Rows with a missing P_IMPORT_TYPE yield NaN instead of a list and are flagged 0.
    df_train[name] = import_tokens.apply(
        lambda tokens, name=name: int(isinstance(tokens, list) and name in tokens)
    )

In [7]:
# Frequency encoding: VALUE_COUNT is the number of rows that share the row's P_NAME.
# value_counts() is evaluated once and reused for both the lookup table and the column.
name_counts = df_train['P_NAME'].value_counts()
value_dict = name_counts.to_dict()


def value(col):
    """Return how many rows of df_train have P_NAME equal to ``col``.

    Raises KeyError for a name that is not a key of ``value_dict``.
    """
    return value_dict[col]


# Series.map performs the dictionary lookup for the whole column at once. A P_NAME that is
# not a key of value_dict (value_counts() leaves out NaN) becomes NaN instead of raising
# KeyError part-way through the column.
df_train['VALUE_COUNT'] = df_train['P_NAME'].map(value_dict)

## add weather data

In [96]:
# Station-code lookup sheet: read without a header row, first sheet column used as the index,
# so the remaining columns are addressed by their integer labels (1, 2, ...).
df_weather_code = pd.read_excel(
    os.path.join(root, '강수량번호국가매칭.xlsx'),
    index_col=0,
    header=None,
)

In [97]:
# GTS SYNOP weather exports: one EUC-KR encoded CSV per date window, the window being
# part of the file name. Listing the windows once replaces six copy-pasted read_csv calls.
weather_periods = [
    '20151228-20161227',
    '20161228-20171227',
    '20171228-20181227',
    '20181228-20191227',
    '20191228-20201227',
    '20201228-20210818',
]
rwt_list = [
    pd.read_csv(os.path.join(root, f'GTS_SYNOP_TIM_{period}.csv'), encoding='euc-kr')
    for period in weather_periods
]
# Per-period names are still bound (to the same frame objects) for any cell that uses them.
(rwt_20152016, rwt_20162017, rwt_20172018,
 rwt_20182019, rwt_20192020, rwt_20202021) = rwt_list

In [98]:
# Add the country name that belongs to a station
def set_country(row):
    """Return the country for the station id stored in ``row['지점']``.

    The id is compared with column 1 of ``df_weather_code``; the column-2 value of the
    first matching row is returned, or an empty string when no row matches.
    """
    matches = df_weather_code.loc[df_weather_code[1] == row['지점'], 2]
    return matches.iloc[0] if not matches.empty else ""


def preprocess_weather(df_weather, code_table=None):
    """Collapse raw station observations into one row per country and day, in place.

    All steps modify ``df_weather`` itself; nothing is returned.

    1. Derive ``year`` / ``month`` / ``day`` from the ``일시`` timestamp text.
    2. Average ``강수량`` / ``풍속`` / ``기온`` per station (``지점``) and day into
       ``rain`` / ``wind`` / ``temperature``, drop the raw columns and duplicate rows.
    3. Attach ``country`` from the station-to-country table ("" for unknown stations).
    4. Average the three measures again per country and day, drop ``지점`` and
       duplicate rows, and reset the index.

    Parameters
    ----------
    df_weather : pandas.DataFrame
        Must contain ``지점``, ``지점명``, ``일시``, ``강수량``, ``풍속`` and ``기온``.
        Mutated in place, so a second call on the same frame fails (``일시`` is gone).
    code_table : pandas.DataFrame, optional
        Lookup with station ids in column ``1`` and country names in column ``2``.
        Defaults to the notebook-level ``df_weather_code``.
    """
    if code_table is None:
        code_table = df_weather_code

    # Date parts are cut by position: characters 0-3, 5-6 and 8-9 of the timestamp text.
    timestamp = df_weather['일시'].astype('str')
    df_weather['year'] = timestamp.str[:4].astype('int')
    df_weather['month'] = timestamp.str[5:7].astype('int')
    df_weather['day'] = timestamp.str[8:10].astype('int')

    # First pass: mean per station and day, computed for all three measures in one groupby.
    station_day = ['지점', 'year', 'month', 'day']
    station_means = df_weather.groupby(station_day)[['강수량', '풍속', '기온']].transform('mean')
    df_weather['rain'] = station_means['강수량']
    df_weather['wind'] = station_means['풍속']
    df_weather['temperature'] = station_means['기온']

    # Raw columns are no longer needed; identical rows of one station-day collapse to one.
    df_weather.drop(columns=['지점명', '일시', '강수량', '풍속', '기온'], inplace=True)
    df_weather.drop_duplicates(inplace=True)

    # Country name per station, looked up for the whole column at once instead of scanning
    # the code table once per row. The first table row wins when a station id is repeated.
    known_stations = code_table.dropna(subset=[1]).drop_duplicates(subset=[1])
    station_to_country = known_stations.set_index(1)[2]
    station = df_weather['지점']
    # Stations missing from the table get "" (not NaN) so they still form a group below.
    df_weather['country'] = station.map(station_to_country).where(
        station.isin(station_to_country.index), ""
    )

    # Second pass: mean of the station-day means per country and day.
    country_day = ['country', 'year', 'month', 'day']
    measures = ['rain', 'wind', 'temperature']
    df_weather[measures] = df_weather.groupby(country_day)[measures].transform('mean')

    # Stations of one country now carry identical values; keep one row per country-day.
    df_weather.drop(columns=['지점'], inplace=True)
    df_weather.drop_duplicates(inplace=True)
    df_weather.reset_index(drop=True, inplace=True)

In [99]:
# preprocess_weather mutates its argument and drops the columns it reads, so it is run on
# copies: the frames in rwt_list stay as loaded and this cell can be executed again.
rwt_processed = []
for raw_weather in rwt_list:
    weather = raw_weather.copy()
    preprocess_weather(weather)
    rwt_processed.append(weather)
# Each preprocessed frame has its index reset to start at 0; ignore_index gives the
# combined table one continuous index instead of repeated labels.
rwt = pd.concat(rwt_processed, ignore_index=True)

In [107]:
# Summary statistics of the combined weather table; the per-column counts show how many
# rain / wind / temperature values are missing.
rwt.describe()

Unnamed: 0,year,month,day,rain,wind,temperature
count,13947.0,13947.0,13947.0,9561.0,11895.0,11895.0
mean,2018.258407,6.276762,15.697426,-20.15821,3.535292,17.914166
std,1.613987,3.414373,8.809565,79.127632,5.846954,9.676148
min,2015.0,1.0,1.0,-999.0,-156.116667,-10.577841
25%,2017.0,3.0,8.0,-6.661111,2.168701,11.250038
50%,2018.0,6.0,16.0,1.165833,2.958333,20.135
75%,2020.0,9.0,23.0,3.915909,5.76875,25.950781
max,2021.0,12.0,31.0,915.0,15.125,33.145833


#### 확인필요
- outlier 찾아내기 > 값이 너무 크거나 작은 경우 제외
- 날짜별로 확인 후 비어있는 값 채워넣기 (전/다음날 이용)
    - 13947개 데이터 중 rain, wind, temperature 갯수 보면 몇개 비어있는지 확인 가능
- 합치기..

In [None]:
# 
def set_rwt(row, weather=None):
    """Look up rain, wind and temperature for one training row.

    The row's ``CTRY_1``, ``year``, ``month`` and ``day`` are matched against the
    ``country`` / ``year`` / ``month`` / ``day`` columns of the preprocessed weather table.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``CTRY_1``, ``year``, ``month`` and ``day``.
    weather : pandas.DataFrame, optional
        Preprocessed weather table; defaults to the notebook-level ``rwt``.

    Returns
    -------
    pandas.Series
        Indexed by ``rain``, ``wind``, ``temperature``. Holds the first matching weather
        row, or NaN in every field when no country/day matches, so the result has the
        same shape for every row when used with ``DataFrame.apply(..., axis=1)``.
    """
    if weather is None:
        weather = rwt
    fields = ['rain', 'wind', 'temperature']
    # The preprocessed table has no station column any more, only `country`, so the match
    # is made on the country name directly.
    same_day = weather[(weather['country'] == row['CTRY_1'])
                       & (weather['year'] == row['year'])
                       & (weather['month'] == row['month'])
                       & (weather['day'] == row['day'])]
    if same_day.empty:
        return pd.Series(np.nan, index=fields)
    return same_day[fields].iloc[0]

In [None]:
# Empty weather columns, to be filled from the weather table.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
for weather_field in ('rain', 'wind', 'temperature'):
    df_train[weather_field] = np.nan

**전처리해야 할 부분**
- "지점/지점명"을 기본데이터의 CTRY_1의 나라명과 맞춰줘야함
    - df_weather_code을 사용하여 변경 필요
    - df_weather# 에 CTRY_1 컬럼을 추가하여 사용
- 하루에 한 지점에서 측정한 데이터가 여러개임
    - 하루에 한 데이터가 나오도록 전처리 필요
- 강수량의 경우 NaN 값이 너무 많음
    - 위 전처리들을 한 후에도 많다면 사용불가
    - 위 전처리 후에는 적다면 전날/다음날 데이터 기반으로 채워넣기 진행

## add salinity

In [None]:
# TODO: the file name is still an empty string, so the joined path is just the DATA
# directory (with a trailing separator) rather than a CSV file. Fill in the salinity
# file name before running this cell.
df_salinity = pd.read_csv(os.path.join(root, ''))

## onehot encoding

In [None]:
# Columns that get expanded into indicator columns by the one-hot encoding step.
one_hot = [
    'CTRY_1',
    'CTRY_2',
    'P_PURPOSE',
    'CATEGORY_1',
    'CATEGORY_2',
    'P_NAME',
]

In [None]:
# Replace each column listed in one_hot with its indicator columns.
# df_train is rebound, so the listed columns are gone after this cell has run once.
df_train = pd.get_dummies(
    data=df_train,
    columns=one_hot,
)

## drop columns

In [None]:
# Columns removed from the training frame before modelling.
drop = [
    'REG_DATE',
    'P_TYPE',
    'P_IMPORT_TYPE',
]

In [None]:
# Remove the columns listed in `drop` from df_train itself (no new frame is created).
df_train.drop(labels=drop, axis='columns', inplace=True)

# Train

## Set Metric

In [108]:
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression

# Report the MAE (Mean Absolute Error) of a fitted model
def print_score(m : LinearRegression):
    """Print the train and validation MAE of ``m`` as one list.

    Predictions are made on the notebook-level ``X_train`` / ``X_valid`` and compared
    with ``y_train`` / ``y_valid``. When ``m`` exposes ``oob_score_`` that value is
    appended to the printed list. Nothing is returned.
    """
    train_mae = mean_absolute_error(m.predict(X_train), y_train)
    valid_mae = mean_absolute_error(m.predict(X_valid), y_valid)
    res = ['mae train: ', train_mae, 'mae val: ', valid_mae]
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)

## Set Data

In [None]:
# Take the price column out of the feature frame: pop returns it and removes it from
# df_train in one step.
target = df_train.pop('P_PRICE')

In [None]:
# train_test_split is imported here because the cell that imports it sits further down
# the notebook; without this line a fresh-kernel run fails with NameError at this point.
from sklearn.model_selection import train_test_split

# Random split with a fixed seed; the test share is left at train_test_split's default.
x_train, x_test, y_train, y_test = train_test_split(df_train, target, random_state=0)

In [None]:
# Standardization: a single scaler instance, fitted on the training features and reused
# for the validation features later in this cell.
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()

# Positional split into a leading and a trailing part
def split_vals(a, n : int):
    """Return copies of the first ``n`` items of ``a`` and of everything after them.

    ``a`` must support slicing and its slices must provide ``.copy()``.
    """
    head = a[:n]
    tail = a[n:]
    return head.copy(), tail.copy()

val_perc = 0.1 # fraction of rows held out for validation
# The validation size follows the actual number of rows instead of assuming 100000 rows.
n_valid = int(val_perc * len(df_train))
n_trn = len(df_train) - n_valid

# Split data: the first n_trn rows train, the remaining rows validate.
# The names `sample` and `original` used here before are not defined in any visible cell.
# df_raw and df_train are the raw and the feature frame that `target` was taken from and
# keep the same row order, so they are split instead.
raw_train, raw_valid = split_vals(df_raw, n_trn)
X_train, X_valid = split_vals(df_train, n_trn)
# NOTE(review): this rebinds y_train, which the train_test_split cell also assigns; cells
# that pair y_train with x_train after this point see a different number of rows.
y_train, y_valid = split_vals(target, n_trn)

# Fit the scaler on the training rows only and reuse its statistics for validation.
X_train=sc.fit_transform(X_train)
X_valid=sc.transform(X_valid)

# Check dimensions of samples
print('Sample train shape: ', X_train.shape, 
      '\nSample target shape: ', y_train.shape, 
      '\nSample validation shape: ', X_valid.shape)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostRegressor

In [None]:
model = RandomForestRegressor(n_estimators=500, random_state=0)

# cross_val_score fits its own clones of the estimator on each fold, so a prior fit has no
# effect on the score. It is therefore run on the training split, and the hold-out split
# is kept for a separate check of the model fitted below.
cross = cross_val_score(model, x_train, y_train, cv=5)
print('cross_tree',cross.mean())

# Fit on the whole training split and score the rows that took no part in training.
model.fit(x_train, y_train)
print('holdout_tree', model.score(x_test, y_test))

In [None]:
from sklearn.svm import SVR
model = SVR()

# cross_val_score fits its own clones of the estimator on each fold, so a prior fit has no
# effect on the score. It is therefore run on the training split, and the hold-out split
# is kept for a separate check of the model fitted below.
cross_svr = cross_val_score(model, x_train, y_train, cv=5)
print('cross_svr',cross_svr.mean())

# Fit on the whole training split and score the rows that took no part in training.
model.fit(x_train, y_train)
print('holdout_svr', model.score(x_test, y_test))

In [None]:
from xgboost import XGBRegressor
model = XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=3)

# cross_val_score fits its own clones of the estimator on each fold, so a prior fit has no
# effect on the score. It is therefore run on the training split, and the hold-out split
# is kept for a separate check of the model fitted below.
cross_xgboost = cross_val_score(model, x_train, y_train, cv=5)
print('cross_xgboost : ',cross_xgboost.mean())

# Fit on the whole training split and score the rows that took no part in training.
model.fit(x_train, y_train)
print('holdout_xgboost : ', model.score(x_test, y_test))

In [None]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

# cross_val_score fits its own clones of the estimator on each fold, so a prior fit has no
# effect on the score. It is therefore run on the training split, and the hold-out split
# is kept for a separate check of the model fitted below.
cross_linear = cross_val_score(model, x_train, y_train, cv=5)
print(cross_linear)
print('cross_linear : ',cross_linear.mean())

# Fit on the whole training split and score the rows that took no part in training.
model.fit(x_train, y_train)
print('holdout_linear : ', model.score(x_test, y_test))

In [None]:
from sklearn.neural_network import MLPRegressor

# Fixed seed, as in the other cells that take one: MLP weight initialisation is random,
# so without it every run reports different scores.
model = MLPRegressor(random_state=0)

# cross_val_score fits its own clones of the estimator on each fold, so a prior fit has no
# effect on the score. It is therefore run on the training split, and the hold-out split
# is kept for a separate check of the model fitted below.
cross_MLP = cross_val_score(model, x_train, y_train, cv=5)
print('cross_MLP : ',cross_MLP.mean())

# Fit on the whole training split and score the rows that took no part in training.
model.fit(x_train, y_train)
print('holdout_MLP : ', model.score(x_test, y_test))