# Setting

In [1]:
from utility import *

In [2]:
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [3]:
def RMSE(y, y_pred):
    return mean_squared_error(y, y_pred)**0.5


def train_model(train_data, target_data, model=LinearRegression()):  # baseline model : LInearRegression
    x_train, x_test, y_train, y_test = train_test_split(train_data, target_data, random_state=0)

    model.fit(x_train, y_train)
    print("Model Training Complete!")

    pred_train, pred_test = model.predict(x_train), model.predict(x_test)
    
    plt.figure(figsize=(10, 8))
#     plt.scatter(pred_train, y_train, s=10)
    sns.regplot(pred_train, y_train, color='g')
    plt.xlabel("Predicted price")
    plt.ylabel("Actual price")
    plt.show()

    # cvs = cross_val_score(model, x_test, y_test, cv = 5)
    # print(">> cross_val_score mean =", cvs.mean())
    print(">> RMSE train =", RMSE(y_train, pred_train))
    print(">> RMSE validation =", RMSE(y_test, pred_test))
    print(">> MAE train =", mean_absolute_error(pred_train, y_train))
    print(">> MAE validation =", mean_absolute_error(pred_test, y_test))
    print("-------------------------------------------------")
    
    return model

    
def print_importance(model, df, added_columns):
    importance = model.coef_
    fs_data = []
    for i, x in enumerate(importance):
        fs_data.append([abs(x), df.columns[i]])
    fs_data.sort(key=lambda x: x[0], reverse=True)
   
    # 추가한 컬럼의 중요도
    for i in range(len(fs_data)):
        if fs_data[i][1] in added_columns:
            print(fs_data[i][1], ":", fs_data[i][0] ,">", i, "순위")
    print("-------------------------------------------------")
    print("총", len(fs_data) , "개")
    
    return fs_data

In [4]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, VotingRegressor, GradientBoostingRegressor, VotingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from scipy.stats import randint

In [5]:
root = os.path.join(os.getcwd(), 'DATA')

# Import Data

In [6]:
df_train = pd.read_csv(os.path.join(root, 'preprocessed_train_notencoded.csv'))
df_weather_code = pd.read_csv(os.path.join(root, 'raw_weather_code.csv'), header=0, index_col=0)
weather_list = [pd.read_csv(os.path.join(root, 'raw_weather_20151228_20161227.csv'), encoding='cp949') , 
                pd.read_csv(os.path.join(root, 'raw_weather_20161228_20171227.csv'), encoding='euc-kr') , 
                pd.read_csv(os.path.join(root, 'raw_weather_20171228_20181227.csv'), encoding='euc-kr') , 
                pd.read_csv(os.path.join(root, 'raw_weather_20181228_20191227.csv'), encoding='cp949') , 
                pd.read_csv(os.path.join(root, 'raw_weather_20191228_20201227.csv'), encoding='euc-kr') , 
                pd.read_csv(os.path.join(root, 'raw_weather_20201228_20210818.csv'), encoding='euc-kr')]
df_exchange = pd.read_csv(os.path.join(root, 'preprocessed_exchange.csv'))
df_oil = pd.read_csv(os.path.join(root, 'preprocessed_oil.csv'))
df_weather_kr = pd.read_csv(os.path.join(root, 'preprocessed_weather_korea.csv'))
df_cpi = pd.read_csv(os.path.join(root, 'preprocessed_cpi.csv'))
df_weather_with_wf = pd.read_csv(os.path.join(root, 'df_weather_with_wf.csv')) # 3-2 전처리 가설 검증 df_weather_with_wf

In [7]:
final_squid = pd.read_csv(os.path.join(root, 'final_squid.csv'))
final_salmon = pd.read_csv(os.path.join(root, 'final_salmon.csv'))
final_whiteleg_shrimp = pd.read_csv(os.path.join(root, 'final_whiteleg_shrimp.csv'))

# Best Model Selection

In [8]:
def model_scaler(data, col, scaler = None):
  
  '''
  정규화 함수
  data : dataframe
  column : P_PRICE
  scaler : standard, robust, minmax, log

  '''
 
  features = data.drop(col, axis=1)
  target = data[col]

  if scaler == 'standard':
    scaler = StandardScaler()
    features = scaler.fit_transform(features)

    return features, target

  elif scaler == 'robust':
    scaler = RobustScaler()
    features = scaler.fit_transform(features)

    return features, target

  elif scaler == 'minmax':
    scaler = MinMaxScaler()
    features = scaler.fit_transform(features)

    return features, target

  elif scaler == 'log':
    features = np.log1p(features)

    return features, target

  elif scaler == 'None':

    return features, target


################################################################################################################################################

def model_train(data, col, scaler, model = None):

  '''
  
  data : dataframe
  column : P_PRICE
  scaler : standard, robust, minmax, log
  model_name : linear, ridge, lasso, elastic, decisiontree,
               randomforest, ada, gradient, xgb, lgbm

  '''

  features, target = model_scaler(data, col, scaler)
  x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=0)
  
  if model == 'linear': 
    
    model = LinearRegression()
    neg_mse_scores = cross_val_score(model, features, target, scoring = 'neg_mean_squared_error', cv = 10)
    rmse_scores = np.sqrt(-1 * neg_mse_scores)
    avg_rmse = np.mean(rmse_scores)


    print('RMSE : {:.4f}'.format(avg_rmse))

  elif model == 'ridge':
    
    params = {
              'alpha': (0.01, 0.0001, 0.003, 0.5, 0.04, 0.008, 0.001, 0.351, 0.096, 0.853, 0.185, 0.01825, 0.012385, 0.1, 1, 10, 100, 200, 50, 30, 20, 29, 58),            
              'fit_intercept':(True, False),
              'normalize':(True, False),

              }

    ridge = Ridge(random_state=0)
    final = RandomizedSearchCV(ridge, param_distributions = params, cv = 10, scoring = 'neg_mean_squared_error', n_iter = 1000, n_jobs = -1 ,random_state=0)
    final.fit(x_train, y_train)
    pred = final.predict(x_test)

    print('Best Params:', final.best_params_)
    print('Best Score:', np.sqrt(-1 *final.best_score_))
    print('Predict RMSE:',(np.sqrt(mean_squared_error(y_test, pred))))
    
  elif model == 'lasso':

    params = {
              'alpha': (0.01, 0.0001, 0.003, 0.5, 0.04, 0.008, 0.001, 0.351, 0.096, 0.853, 0.185, 0.01825, 0.012385, 0.1, 1, 10, 100, 200, 50, 30, 20, 29, 58),            
              'fit_intercept':(True, False),
              'normalize':(True, False),

              }

    lasso = Lasso(random_state=0)
    final = RandomizedSearchCV(lasso, param_distributions = params, cv = 10, scoring = 'neg_mean_squared_error', n_iter = 1000, n_jobs = -1 ,random_state=0)
    final.fit(features, target)
    pred = final.predict(x_test)

    print('Best Params:', final.best_params_)
    print('Best Score:', np.sqrt(-1 *final.best_score_))
    print('Predict RMSE:',(np.sqrt(mean_squared_error(y_test, pred))))
  
  elif model == 'elastic':
    

    params = {
       'alpha': (0.1, 0.01, 0.5, 1, 3, 5, 10),
       'l1_ratio':(0.01, 0.0001, 0.003, 0.5, 0.04, 0.008, 0.1, 0.1)
    }

    elastic = ElasticNet()
    final = RandomizedSearchCV(elastic, param_distributions = params, cv = 10, scoring = 'neg_mean_squared_error', n_iter = 1000, n_jobs = -1 ,random_state=0)
    final.fit(features, target)
    pred = final.predict(x_test)

    print('Best Params:', final.best_params_)
    print('Best Score:', np.sqrt(-1 *final.best_score_))
    print('Predict RMSE:',(np.sqrt(mean_squared_error(y_test, pred))))
  elif model == 'decisiontree':
    
    params = {
              'max_depth': [1,3,5,7,9,15,25,50,75,100,125,150,175,200,225,250,275,300,325,350,375,400,425,450,475,500,550,600,700,800,900,1000,1100,1200,1500],            
               #'min_child_samples': randint(5, 50),
              'min_samples_split':[1,3,5,7,10,15,20,25,30,45,50,60,70,80,90,100],
              'min_samples_leaf': [1,3,5,7,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100],
              
    }

    dt = DecisionTreeRegressor(random_state=0)
    final = RandomizedSearchCV(dt, param_distributions = params, cv = 10, scoring = 'neg_mean_squared_error', n_iter = 1000, n_jobs = -1 ,random_state=0)
    final.fit(features, target)
    pred = final.predict(x_test)

    print('Best Params:', final.best_params_)
    print('Best Score:', np.sqrt(-1 *final.best_score_))
    print('Predict RMSE:',(np.sqrt(mean_squared_error(y_test, pred))))
  elif model == 'randomforest':
    
    params = {
              'max_depth': [1,3,5,7,9,15,25,50,75,100,125,150,175,200,225,250,275,300,325,350,375,400,425,450,475,500,550,600,700,800,900,1000,1100,1200,1500],           
              'n_estimators':[1,5,10,30,50,70,100,200,500,750,1000],
               #'min_child_samples': randint(5, 50),
              'min_samples_leaf':[1,3,5,7,10,20,30,50,70,100],
              'min_samples_split': [1,3,5,7,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100],
              'max_leaf_nodes': [1,3,5,7,10,20,30,50,70,100,200,500,700,800,900,1000],

              }

    rf = RandomForestRegressor(random_state=0)
    final = RandomizedSearchCV(rf, param_distributions = params, cv = 10, scoring = 'neg_mean_squared_error', n_iter = 50, n_jobs = -1 ,random_state=0)
    final.fit(features, target)
    pred = final.predict(x_test)

    print('Best Params:', final.best_params_)
    print('Best Score:', np.sqrt(-1 *final.best_score_))
    print('Predict RMSE:',(np.sqrt(mean_squared_error(y_test, pred))))


  elif model == 'gradinet':

    params = {'n_estimators' : [1,5,10,30,50,70,100,200,500,1000],
              'learning_rate' :(0.01, 0.0001, 0.003, 0.5, 0.04, 0.008, 0.001, 0.351, 0.096, 0.853, 0.185, 0.01825, 0.012385, 0.1),
              'subsample' : (0.01, 0.1, 0.5, 0.08, 0.35, 0.3, 0.001, 0.03, 0.006, 0.153, 0.193, 0.0012, 0.0083 ,1),
              'min_samples_split' : [1,3,5,7,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100],
              'max_depth' : [1,3,5,7,9,15,25,50,75,100,125,150,175,200,225,250,275,300,325,350,375,400,425,450,475,500,550,600,700,800,900,1000,1100,1200,1500],  
              }   

    grad = GradientBoostingRegressor()
    final = RandomizedSearchCV(grad, param_distributions = params, cv = 10, scoring = 'neg_mean_squared_error', n_iter = 50, n_jobs = -1 ,random_state=0)
    final.fit(features, target)
    pred = final.predict(x_test)

    print('Best Params:', final.best_params_)
    print('Best Score:', np.sqrt(-1 *final.best_score_))
    print('Predict RMSE:',(np.sqrt(mean_squared_error(y_test, pred))))

  elif model == 'xgb':
    
    params = {'n_estimators' : [1,5,10,30,50,70,100,200,500,1000],
              'learning_rate' :(0.01, 0.0001, 0.003, 0.5, 0.04, 0.008, 0.001, 0.351, 0.096, 0.853, 0.185, 0.01825, 0.012385, 0.1),
              'max_depth' :  [1,3,5,7,9,15,25,50,75,100,125,150,175,200,225,250,275,300,325,350,375,400,425,450,475,500,550,600,700,800,900,1000,1100,1200,1500],  
              'min_child_weight' :[0,0.05,0.5,1,3,5,7,9,15,25,50,100,200,300,500,700],
              }   

    xgb = XGBRegressor()
    final = RandomizedSearchCV(xgb, param_distributions = params, cv = 10, scoring = 'neg_mean_squared_error', n_iter = 50, n_jobs = -1 ,random_state=0)
    final.fit(features, target)

    pred = final.predict(x_test)

    print('Best Params:', final.best_params_)
    print('Best Score:', np.sqrt(-1 *final.best_score_))
    print('Predict RMSE:',(np.sqrt(mean_squared_error(y_test, pred))))

  elif model == 'lgbm':
    params = {'n_estimators' : [1,5,10,30,50,70,100,200,500,1000],
              'learning_rate' :(0.01, 0.0001, 0.003, 0.5, 0.04, 0.008, 0.001, 0.351, 0.096, 0.853, 0.185, 0.01825, 0.012385, 0.1),
              'max_depth' : [1,3,5,7,9,15,25,50,75,100,125,150,175,200,225,250,275,300,325,350,375,400,425,450,475,500,550,600,700,800,900,1000,1100,1200,1500],  
              'min_child_weight' : [0,0.05,0.5,1,3,5,7,9,15,25,50,100,200,300,500,700],
              'num_leaves': [1,3,5,7,9,15,25,50,100,200,300,500,700],
              'min_child_samples':[1,3,5,7,9,15,25,50,100,200,300,500,700],  
              }   

    lgbm = LGBMRegressor()
    final = RandomizedSearchCV(lgbm, param_distributions = params, cv = 10, scoring = 'neg_mean_squared_error', n_iter = 50, n_jobs = -1 ,random_state=0)
    final.fit(features, target)

    pred = final.predict(x_test)

    print('Best Params:', final.best_params_)
    print('Best Score:', np.sqrt(-1 *final.best_score_))
    print('Predict RMSE:',(np.sqrt(mean_squared_error(y_test, pred))))

In [9]:
lr = LinearRegression()

params = {
              'alpha': (0.01, 0.0001, 0.003, 0.5, 0.04, 0.008, 0.001, 0.351, 0.096, 0.853, 0.185, 0.01825, 0.012385, 0.1, 1, 10, 100, 200, 50, 30, 20, 29, 58),            
              'fit_intercept':(True, False),
              'normalize':(True, False),

              }

ridge = Ridge(random_state=0)
ridge = RandomizedSearchCV(ridge, param_distributions = params, cv = 10, scoring = 'neg_mean_squared_error', n_iter = 1000, n_jobs = -1 ,random_state=0)
   
    
params = {
              'alpha': (0.01, 0.0001, 0.003, 0.5, 0.04, 0.008, 0.001, 0.351, 0.096, 0.853, 0.185, 0.01825, 0.012385, 0.1, 1, 10, 100, 200, 50, 30, 20, 29, 58),            
              'fit_intercept':(True, False),
              'normalize':(True, False),

            }

lasso = Lasso(random_state=0)
lasso = RandomizedSearchCV(lasso, param_distributions = params, cv = 10, scoring = 'neg_mean_squared_error', n_iter = 1000, n_jobs = -1 ,random_state=0)
   
  
 
params = {
       'alpha': (0.1, 0.01, 0.5, 1, 3, 5, 10),
       'l1_ratio':(0.01, 0.0001, 0.003, 0.5, 0.04, 0.008, 0.1, 0.1)
    }

elastic = ElasticNet()
elastic = RandomizedSearchCV(elastic, param_distributions = params, cv = 10, scoring = 'neg_mean_squared_error', n_iter = 1000, n_jobs = -1 ,random_state=0)

    
params = {
              'max_depth': [1,3,5,7,9,15,25,50,75,100,125,150,175,200,225,250,275,300,325,350,375,400,425,450,475,500,550,600,700,800,900,1000,1100,1200,1500],            
               #'min_child_samples': randint(5, 50),
              'min_samples_split':[1,3,5,7,10,15,20,25,30,45,50,60,70,80,90,100],
              'min_samples_leaf': [1,3,5,7,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100],
              
}

dt = DecisionTreeRegressor(random_state=0)
dt = RandomizedSearchCV(dt, param_distributions = params, cv = 10, scoring = 'neg_mean_squared_error', n_iter = 1000, n_jobs = -1 ,random_state=0)
    

params = {
              'max_depth': [1,3,5,7,9,15,25,50,75,100,125,150,175,200,225,250,275,300,325,350,375,400,425,450,475,500,550,600,700,800,900,1000,1100,1200,1500],           
              'n_estimators':[1,5,10,30,50,70,100,200,500,750,1000],
               #'min_child_samples': randint(5, 50),
              'min_samples_leaf':[1,3,5,7,10,20,30,50,70,100],
              'min_samples_split': [1,3,5,7,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100],
              'max_leaf_nodes': [1,3,5,7,10,20,30,50,70,100,200,500,700,800,900,1000],

              }

rf = RandomForestRegressor(random_state=0)
rf = RandomizedSearchCV(rf, param_distributions = params, cv = 10, scoring = 'neg_mean_squared_error', n_iter = 50, n_jobs = -1 ,random_state=0)

   
params = {'n_estimators' : [1,5,10,30,50,70,100,200,500,1000],
              'learning_rate' :(0.01, 0.0001, 0.003, 0.5, 0.04, 0.008, 0.001, 0.351, 0.096, 0.853, 0.185, 0.01825, 0.012385, 0.1),
              'subsample' : (0.01, 0.1, 0.5, 0.08, 0.35, 0.3, 0.001, 0.03, 0.006, 0.153, 0.193, 0.0012, 0.0083 ,1),
              'min_samples_split' : [1,3,5,7,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100],
              'max_depth' : [1,3,5,7,9,15,25,50,75,100,125,150,175,200,225,250,275,300,325,350,375,400,425,450,475,500,550,600,700,800,900,1000,1100,1200,1500],  
              }   

grad = GradientBoostingRegressor()
grad = RandomizedSearchCV(grad, param_distributions = params, cv = 10, scoring = 'neg_mean_squared_error', n_iter = 50, n_jobs = -1 ,random_state=0)
 

    
params = {'n_estimators' : [1,5,10,30,50,70,100,200,500,1000],
              'learning_rate' :(0.01, 0.0001, 0.003, 0.5, 0.04, 0.008, 0.001, 0.351, 0.096, 0.853, 0.185, 0.01825, 0.012385, 0.1),
              'max_depth' :  [1,3,5,7,9,15,25,50,75,100,125,150,175,200,225,250,275,300,325,350,375,400,425,450,475,500,550,600,700,800,900,1000,1100,1200,1500],  
              'min_child_weight' :[0,0.05,0.5,1,3,5,7,9,15,25,50,100,200,300,500,700],
              }   

xgb = XGBRegressor()
xgb = RandomizedSearchCV(xgb, param_distributions = params, cv = 10, scoring = 'neg_mean_squared_error', n_iter = 50, n_jobs = -1 ,random_state=0)
   
    
params = {'n_estimators' : [1,5,10,30,50,70,100,200,500,1000],
              'learning_rate' :(0.01, 0.0001, 0.003, 0.5, 0.04, 0.008, 0.001, 0.351, 0.096, 0.853, 0.185, 0.01825, 0.012385, 0.1),
              'max_depth' : [1,3,5,7,9,15,25,50,75,100,125,150,175,200,225,250,275,300,325,350,375,400,425,450,475,500,550,600,700,800,900,1000,1100,1200,1500],  
              'min_child_weight' : [0,0.05,0.5,1,3,5,7,9,15,25,50,100,200,300,500,700],
              'num_leaves': [1,3,5,7,9,15,25,50,100,200,300,500,700],
              'min_child_samples':[1,3,5,7,9,15,25,50,100,200,300,500,700],  
              }   

lgbm = LGBMRegressor()
lgbm = RandomizedSearchCV(lgbm, param_distributions = params, cv = 10, scoring = 'neg_mean_squared_error', n_iter = 50, n_jobs = -1 ,random_state=0)


## Squid

In [10]:
model_train(final_squid,'P_PRICE','None','linear')

RMSE : 0.1504


In [11]:
model_train(final_squid,'P_PRICE','None','ridge')

Best Params: {'normalize': True, 'fit_intercept': True, 'alpha': 0.1}
Best Score: 0.12461289559596812
Predict RMSE: 0.16395828012218305


In [12]:
model_train(final_squid,'P_PRICE','None','lasso')

Best Params: {'normalize': True, 'fit_intercept': True, 'alpha': 0.0001}
Best Score: 0.1571647029703455
Predict RMSE: 0.16141134764055998


In [13]:
model_train(final_squid,'P_PRICE','None','elastic')

Best Params: {'l1_ratio': 0.0001, 'alpha': 0.01}
Best Score: 0.19448430765721741
Predict RMSE: 0.1802555221723289


In [14]:
model_train(final_squid,'P_PRICE','None','decisiontree')

Best Params: {'min_samples_split': 45, 'min_samples_leaf': 7, 'max_depth': 200}
Best Score: 0.18554653606317617
Predict RMSE: 0.14895550821111503


In [15]:
model_train(final_squid,'P_PRICE','None','randomforest')

Best Params: {'n_estimators': 70, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_leaf_nodes': 50, 'max_depth': 900}
Best Score: 0.1811504340147707
Predict RMSE: 0.0690766350174073


In [16]:
model_train(final_squid,'P_PRICE','None','gradinet')

Best Params: {'subsample': 1, 'n_estimators': 500, 'min_samples_split': 100, 'max_depth': 15, 'learning_rate': 0.1}
Best Score: 0.17620129798404802
Predict RMSE: 0.0015613023963505597


In [17]:
model_train(final_squid,'P_PRICE','None','xgb')

Best Params: {'n_estimators': 30, 'min_child_weight': 9, 'max_depth': 300, 'learning_rate': 0.351}
Best Score: 0.16511883322292426
Predict RMSE: 0.029415624763589603


In [18]:
model_train(final_squid,'P_PRICE','None','lgbm')

Best Params: {'num_leaves': 9, 'n_estimators': 500, 'min_child_weight': 1, 'min_child_samples': 7, 'max_depth': 9, 'learning_rate': 0.351}
Best Score: 0.1678278178093434
Predict RMSE: 1.605712197403056e-06


In [None]:
single_models = [ 
    ( 'linear_reg' , lr), 
    ( 'ridge' , ridge), 
    ( 'lasso' , lasso), 
    ( 'elasticnet' , elastic), 
    ( 'decisiontree' , dt), 
    ( 'randomforest' , rf),
    ( 'gradient' , grad),
    ( 'xgb' , xgb),
    ( 'lgbm' , lgbm),
]


squid_voting = VotingRegressor(single_models, n_jobs= -1)

features = final_squid.drop('P_PRICE', axis=1)
target = final_squid['P_PRICE']

x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=0)

squid_voting.fit(x_train,y_train)

pred = squid_voting.predict(x_test) 

print('Predict RMSE:',(np.sqrt(mean_squared_error(y_test, pred))))

Score와 RMSE가 모두 좋은 xgb 모델을 최종 모델로 선택


Voting 모델과 비교 예정

## Salmon

In [None]:
model_train(final_salmon,'P_PRICE','None','linear')

In [None]:
model_train(final_salmon,'P_PRICE','None','ridge')

In [None]:
model_train(final_salmon,'P_PRICE','None','lasso')

In [None]:
model_train(final_salmon,'P_PRICE','None','elastic')

In [None]:
model_train(final_salmon,'P_PRICE','None','decisiontree')

In [None]:
model_train(final_salmon,'P_PRICE','None','randomforest')

In [None]:
model_train(final_salmon,'P_PRICE','None','gradinet')

In [None]:
model_train(final_salmon,'P_PRICE','None','xgb')

In [None]:
model_train(final_salmon,'P_PRICE','None','lgbm')

In [None]:
single_models = [ 
    ( 'linear_reg' , lr), 
    ( 'ridge' , ridge), 
    ( 'lasso' , lasso), 
    ( 'elasticnet' , elastic), 
    ( 'decisiontree' , dt), 
    ( 'randomforest' , rf),
    ( 'gradient' , grad),
    ( 'xgb' , xgb),
    ( 'lgbm' , lgbm),
]


salmon_voting = VotingRegressor(single_models, n_jobs= -1)

features = final_salmon.drop('P_PRICE', axis=1)
target = final_salmon['P_PRICE']

x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=0)

salmon_voting.fit(x_train,y_train)

pred = salmon_voting.predict(x_test) 

print('Predict RMSE:',(np.sqrt(mean_squared_error(y_test, pred))))

Score와 RMSE가 모두 좋은 xgb 모델을 최종 모델로 선택

Voting 모델과 비교 예정

## Whiteleg_shrimp

In [None]:
model_train(final_whiteleg_shrimp,'P_PRICE','None','linear')

In [None]:
model_train(final_whiteleg_shrimp,'P_PRICE','None','ridge')

In [None]:
model_train(final_whiteleg_shrimp,'P_PRICE','None','lasso')

In [None]:
model_train(final_whiteleg_shrimp,'P_PRICE','None','elastic')

In [None]:
model_train(final_whiteleg_shrimp,'P_PRICE','None','decisiontree')

In [None]:
model_train(final_whiteleg_shrimp,'P_PRICE','None','randomforest')

In [None]:
model_train(final_whiteleg_shrimp,'P_PRICE','None','gradinet')

In [None]:
model_train(final_whiteleg_shrimp,'P_PRICE','None','xgb')

In [None]:
model_train(final_whiteleg_shrimp,'P_PRICE','None','lgbm')

In [None]:
single_models = [ 
    ( 'linear_reg' , lr), 
    ( 'ridge' , ridge), 
    ( 'lasso' , lasso), 
    ( 'elasticnet' , elastic), 
    ( 'decisiontree' , dt), 
    ( 'randomforest' , rf),
    ( 'gradient' , grad),
    ( 'xgb' , xgb),
    ( 'lgbm' , lgbm),
]


whiteleg_shrimp_voting = VotingRegressor(single_models, n_jobs= -1)

features = final_whiteleg_shrimp.drop('P_PRICE', axis=1)
target = final_whiteleg_shrimp['P_PRICE']

x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=0)

whiteleg_shrimp_voting.fit(x_train,y_train)

pred = whiteleg_shrimp_voting.predict(x_test) 

print('Predict RMSE:',(np.sqrt(mean_squared_error(y_test, pred))))

Score와 RMSE가 모두 좋은 xgb 모델을 최종 모델로 선택

Voting 모델과 비교 예정

## Best Model

### Squid

In [None]:
params = {'n_estimators' : [1,5,10,30,50,70,100,200,500,1000],
              'learning_rate' :(0.01, 0.0001, 0.003, 0.5, 0.04, 0.008, 0.001, 0.351, 0.096, 0.853, 0.185, 0.01825, 0.012385, 0.1),
              'max_depth' :  [1,3,5,7,9,15,25,50,100,200,300,400,450,500,550,700,800,900,1000],  
              'min_child_weight' :[0,0.05,0.5,1,3,5,7,9,15,25,50,100,200,300,500,700],
              }


xgb = XGBRegressor()
squid_final = RandomizedSearchCV(xgb, param_distributions = params, cv = 10, scoring = 'neg_mean_squared_error', n_iter = 50, n_jobs = -1 ,random_state=0)

features = final_squid.drop('P_PRICE',axis=1)
target = final_squid['P_PRICE']

squid_final.fit(features, target)

In [None]:
# params = {'n_estimators' : [1,5,10,30,50,70,100,200,500,1000],
#               'learning_rate' :(0.01, 0.0001, 0.003, 0.5, 0.04, 0.008, 0.001, 0.351, 0.096, 0.853, 0.185, 0.01825, 0.012385, 0.1),
#               'max_depth' : [1,3,5,7,9,15,25,50,100,200,300,400,450,500,550,700,800,900,1000],  
#               'min_child_weight' : [0,0.05,0.5,1,3,5,7,9,15,25,50,100,200,300,500,700],
#               'num_leaves': [1,3,5,7,9,15,25,50,100,200,300,500,700],
#               'min_child_samples':[1,3,5,7,9,15,25,50,100,200,300,500,700],  
#               }   

# lgbm = LGBMRegressor()
# squid_final = RandomizedSearchCV(lgbm, param_distributions = params, cv = 10, scoring = 'neg_mean_squared_error', n_iter = 50, n_jobs = -1 ,random_state=0)

# features = final_squid.drop('P_PRICE',axis=1)
# target = final_squid['P_PRICE']


# squid_final.fit(features, target)

### Salmon

In [None]:
params = {'n_estimators' : [1,5,10,30,50,70,100,200,500,1000],
              'learning_rate' :(0.01, 0.0001, 0.003, 0.5, 0.04, 0.008, 0.001, 0.351, 0.096, 0.853, 0.185, 0.01825, 0.012385, 0.1),
              'max_depth' :  [1,3,5,7,9,15,25,50,100,200,300,400,450,500,550,700,800,900,1000],  
              'min_child_weight' :[0,0.05,0.5,1,3,5,7,9,15,25,50,100,200,300,500,700],
              }   

xgb = XGBRegressor()
salmon_final = RandomizedSearchCV(xgb, param_distributions = params, cv = 10, scoring = 'neg_mean_squared_error', n_iter = 50, n_jobs = -1 ,random_state=0)

features = final_salmon.drop('P_PRICE',axis=1)
target = final_salmon['P_PRICE']

salmon_final.fit(features, target)

In [None]:
# params = {'n_estimators' : [1,5,10,30,50,70,100,200,500,1000],
#               'learning_rate' :(0.01, 0.0001, 0.003, 0.5, 0.04, 0.008, 0.001, 0.351, 0.096, 0.853, 0.185, 0.01825, 0.012385, 0.1),
#               'max_depth' : [1,3,5,7,9,15,25,50,100,200,300,400,450,500,550,700,800,900,1000],  
#               'min_child_weight' : [0,0.05,0.5,1,3,5,7,9,15,25,50,100,200,300,500,700],
#               'num_leaves': [1,3,5,7,9,15,25,50,100,200,300,500,700],
#               'min_child_samples':[1,3,5,7,9,15,25,50,100,200,300,500,700],  
#               }   

# lgbm = LGBMRegressor()
# salmon_final = RandomizedSearchCV(lgbm, param_distributions = params, cv = 10, scoring = 'neg_mean_squared_error', n_iter = 50, n_jobs = -1 ,random_state=0)

# features = final_salmon.drop('P_PRICE',axis=1)
# target = final_salmon['P_PRICE']


# salmon_final.fit(features, target)

### Whiteleg Shrimp

In [None]:
params = {'n_estimators' : [1,5,10,30,50,70,100,200,500,1000],
              'learning_rate' :(0.01, 0.0001, 0.003, 0.5, 0.04, 0.008, 0.001, 0.351, 0.096, 0.853, 0.185, 0.01825, 0.012385, 0.1),
              'max_depth' :  [1,3,5,7,9,15,25,50,100,200,300,400,450,500,550,700,800,900,1000],  
              'min_child_weight' :[0,0.05,0.5,1,3,5,7,9,15,25,50,100,200,300,500,700],
              }   

xgb = XGBRegressor()
whiteleg_shrimp_final = RandomizedSearchCV(xgb, param_distributions = params, cv = 10, scoring = 'neg_mean_squared_error', n_iter = 50, n_jobs = -1 ,random_state=0)

features = final_whiteleg_shrimp.drop('P_PRICE',axis=1)
target = final_whiteleg_shrimp['P_PRICE']

whiteleg_shrimp_final.fit(features, target)

In [None]:
# params = {'n_estimators' : [1,5,10,30,50,70,100,200,500,1000],
#               'learning_rate' :(0.01, 0.0001, 0.003, 0.5, 0.04, 0.008, 0.001, 0.351, 0.096, 0.853, 0.185, 0.01825, 0.012385, 0.1),
#               'max_depth' : [1,3,5,7,9,15,25,50,100,200,300,400,450,500,550,700,800,900,1000],  
#               'min_child_weight' : [0,0.05,0.5,1,3,5,7,9,15,25,50,100,200,300,500,700],
#               'num_leaves': [1,3,5,7,9,15,25,50,100,200,300,500,700],
#               'min_child_samples':[1,3,5,7,9,15,25,50,100,200,300,500,700],  
#               }   

# lgbm = LGBMRegressor()
# whiteleg_shrimp_final = RandomizedSearchCV(lgbm, param_distributions = params, cv = 10, scoring = 'neg_mean_squared_error', n_iter = 50, n_jobs = -1 ,random_state=0)

# features = final_whiteleg_shrimp.drop('P_PRICE',axis=1)
# target = final_whiteleg_shrimp['P_PRICE']

# whiteleg_shrimp_final.fit(features, target)

# Test

## Test Data

### Import Data

In [None]:
df_test = pd.read_excel(os.path.join(root, 'test.xlsx'))

### Preprocessing

In [None]:
# test data용 
def check_week(df):
    """
    dataframe에 sdate 과 edate 사이에 모든 데이터가 있는지 확인하는 함수
    :param df: 검사하고자 하는 dataframe (set_week 형태)
    :return: 데이터가 전체 존재하는지 여부
    """
    cnt = 0
    sdate = date(2020, 1, 6)  # start date
    edate = date(2020, 12, 28)  # end date
    delta = edate - sdate  # as timedelta
    mem = set()

    for i in range(delta.days + 1):
        day = sdate + timedelta(days=i)
        year, week = day.isocalendar()[0], day.isocalendar()[1]
        if year * 100 + week in mem:
            continue
        mem.add(year * 100 + week)
        if df[(df['year'] == year) & (df['week'] == week)].empty:
            print((year, week), end="")
            cnt += 1
    if cnt > 0:
        print()
    print("missing", cnt, "values")

In [None]:
set_week(df_test, 'REG_DATE')

In [None]:
check_week(df_test)

#### Squid

In [None]:
df_test_squid = df_test[(df_test['P_NAME']=='오징어') & ((df_test['CTRY_1']=='페루') | (df_test['CTRY_1']=='중국') | (df_test['CTRY_1']=='칠레')) & ((df_test['CTRY_2']=='페루') | (df_test['CTRY_2']=='중국') | (df_test['CTRY_2']=='칠레'))]

df_test_squid = df_test_squid[['CTRY_1','CTRY_2','P_PRICE','year','week']]

df_test_squid

##### Merge Features

In [None]:
df_test_squid_add = df_test_squid.copy()

In [None]:
df_test_squid_add = pd.merge(df_test_squid_add, df_weather_with_wf, how='left', on=['year', 'week', 'CTRY_1'])

In [None]:
df_test_squid_add = pd.merge(df_test_squid_add, df_weather_kr, how='left', on=['year', 'week'])

In [None]:
df_test_squid_add = pd.merge(df_test_squid_add, df_oil, how='left', on=['year', 'week'])

In [None]:
df_test_squid_add = pd.merge(df_test_squid_add, df_cpi, how='left', on=['year', 'week'])

In [None]:
df_test_squid_add = pd.merge(df_test_squid_add, df_exchange, how='left', on=['year', 'week', 'CTRY_2'])

##### Filling Missing Values

In [None]:
df_test_squid_add.isna().sum() # 페루 날씨 데이터 결측값 존재

In [None]:
df_test_squid_add = df_test_squid_add.sort_values('CTRY_1')

df_test_squid_add.fillna(method='ffill',inplace=True)  # 가까운 나라인 칠레 데이터로 대체

In [None]:
df_test_squid_add.isna().sum() # 결측값 처리 완료

##### Grouping

In [None]:
df_test_squid_add = df_test_squid_add.groupby(['year','week']).mean()

In [None]:
df_test_squid_add['temp_kr'] = (df_test_squid_add['temperature_kr'] + df_test_squid_add['water_temp_kr'] + df_test_squid_add['wind_kr']) / 3
df_test_squid_add['cpi'] = df_test_squid_add['cpi_fish'] / df_test_squid_add['cpi_total']

In [None]:
df_test_squid_add.drop(columns=['temperature_kr', 'cpi_fish', 'water_temp_kr','wind_kr' ,'cpi_total'], axis=1, inplace=True)

##### Final Test DF

In [None]:
final_test_squid = df_test_squid_add

#### Salmon

In [None]:
df_test_salmon = df_test[(df_test['P_NAME']=='연어') & (df_test['CTRY_1']=='노르웨이') & (df_test['CTRY_2']=='노르웨이')]

df_test_salmon = df_test_salmon[['CTRY_1','CTRY_2','P_PRICE','year','week']]

##### Merge Features

In [None]:
df_test_salmon_add = df_test_salmon.copy()

In [None]:
df_test_salmon_add = pd.merge(df_test_salmon_add, df_weather_with_wf, how='left', on=['year', 'week', 'CTRY_1'])

In [None]:
df_test_salmon_add = pd.merge(df_test_salmon_add, df_weather_kr, how='left', on=['year', 'week'])

In [None]:
df_test_salmon_add = pd.merge(df_test_salmon_add, df_oil, how='left', on=['year', 'week'])

In [None]:
df_test_salmon_add = pd.merge(df_test_salmon_add, df_cpi, how='left', on=['year', 'week'])

In [None]:
df_test_salmon_add = pd.merge(df_test_salmon_add, df_exchange, how='left', on=['year', 'week','CTRY_2'])

##### Filling Missing Values

In [None]:
df_test_salmon_add.isna().sum() # 날씨 데이터 결측값 존재

In [None]:
# 평균 값으로 대체

df_test_salmon_add['rain'].fillna(np.mean(df_test_salmon_add['rain']),inplace=True)
df_test_salmon_add['wind'].fillna(np.mean(df_test_salmon_add['wind']),inplace=True)
df_test_salmon_add['temperature'].fillna(np.mean(df_test_salmon_add['temperature']),inplace=True)

In [None]:
df_test_salmon_add.isna().sum() # 결측값 처리 완료

##### Grouping

In [None]:
df_test_salmon_add = df_test_salmon_add.groupby(['year','week']).mean()

In [None]:
df_test_salmon_add['temp_kr'] = (df_test_salmon_add['temperature_kr'] + df_test_salmon_add['water_temp_kr'] + df_test_salmon_add['wind_kr']) / 3
df_test_salmon_add['cpi'] = df_test_salmon_add['cpi_fish'] / df_test_salmon_add['cpi_total'] 

In [None]:
df_test_salmon_add.drop(columns=['temperature_kr', 'cpi_fish', 'water_temp_kr','wind_kr', 'cpi_total'], axis=1, inplace=True)

##### Final Test DF

In [None]:
final_test_salmon = df_test_salmon_add

#### Whiteleg Shrimp

In [None]:
df_test_whiteleg_shrimp = df_test[(df_test['P_NAME']=='흰다리새우') & ((df_test['CTRY_1']=='베트남') | (df_test['CTRY_1']=='태국')) & ((df_test['CTRY_2']=='베트남') | (df_test['CTRY_2']=='태국'))]

df_test_whiteleg_shrimp = df_test_whiteleg_shrimp[['CTRY_1','CTRY_2','P_PRICE','year','week']]

##### Merge Features

In [None]:
df_test_whiteleg_shrimp_add = df_test_whiteleg_shrimp.copy()

In [None]:
df_test_whiteleg_shrimp_add = pd.merge(df_test_whiteleg_shrimp_add, df_weather_with_wf, how='left', on=['year', 'week', 'CTRY_1'])

In [None]:
df_test_whiteleg_shrimp_add = pd.merge(df_test_whiteleg_shrimp_add, df_weather_kr, how='left', on=['year', 'week'])

In [None]:
df_test_whiteleg_shrimp_add = pd.merge(df_test_whiteleg_shrimp_add, df_oil, how='left', on=['year', 'week'])

In [None]:
df_test_whiteleg_shrimp_add = pd.merge(df_test_whiteleg_shrimp_add, df_cpi, how='left', on=['year', 'week'])

In [None]:
df_test_whiteleg_shrimp_add = pd.merge(df_test_whiteleg_shrimp_add, df_exchange, how='left', on=['year', 'week','CTRY_2'])

##### Filling Missing Values

In [None]:
df_test_whiteleg_shrimp_add.isna().sum()

##### Grouping

In [None]:
df_test_whiteleg_shrimp_add = df_test_whiteleg_shrimp_add.groupby(['year','week']).mean()

In [None]:
df_test_whiteleg_shrimp_add['temp_kr'] = (df_test_whiteleg_shrimp_add['temperature_kr'] + df_test_whiteleg_shrimp_add['water_temp_kr'] + df_test_whiteleg_shrimp_add['wind_kr']) / 3
df_test_whiteleg_shrimp_add['cpi'] = df_test_whiteleg_shrimp_add['cpi_fish'] / df_test_whiteleg_shrimp_add['cpi_total'] 

In [None]:
df_test_whiteleg_shrimp_add.drop(columns=['temperature_kr', 'cpi_fish', 'water_temp_kr','wind_kr', 'cpi_total'], axis=1, inplace=True)

##### Final Test DF

In [None]:
final_test_whiteleg_shrimp = df_test_whiteleg_shrimp_add

## evaluation

### Squid

In [None]:
squid_test_features = final_test_squid.drop('P_PRICE',axis=1)
squid_test_target = final_test_squid['P_PRICE']

pred = squid_final.predict(squid_test_features)

print('Best Params:', squid_final.best_params_)
print('Best Score:', np.sqrt(-1 *squid_final.best_score_))
print('Predict RMSE:',(np.sqrt(mean_squared_error(squid_test_target, np.exp(pred)-1))))

In [None]:
pred = squid_voting.predict(squid_test_features)

print('Predict RMSE:',(np.sqrt(mean_squared_error(squid_test_target, np.exp(pred)-1))))

RMSE이 더 작은 기존의 best 모델 선택

### Salmon

In [None]:
salmon_test_features = final_test_salmon.drop('P_PRICE',axis=1)
salmon_test_target = final_test_salmon['P_PRICE']

pred = salmon_final.predict(salmon_test_features)

print('Best Params:', salmon_final.best_params_)
print('Best Score:', np.sqrt(-1 *salmon_final.best_score_))
print('Predict RMSE:',(np.sqrt(mean_squared_error(salmon_test_target, np.exp(pred)-1))))

In [None]:
pred = salmon_voting.predict(salmon_test_features)

print('Predict RMSE:',(np.sqrt(mean_squared_error(salmon_test_target, np.exp(pred)-1))))

RMSE이 더 작은 기존의 best 모델 선택

### Whiteleg Shrimp

In [None]:
whiteleg_shrimp_test_features = final_test_whiteleg_shrimp.drop('P_PRICE',axis=1)
whiteleg_shrimp_test_target = final_test_whiteleg_shrimp['P_PRICE']

pred = whiteleg_shrimp_final.predict(whiteleg_shrimp_test_features)

print('Best Params:', whiteleg_shrimp_final.best_params_)
print('Best Score:', np.sqrt(-1 *whiteleg_shrimp_final.best_score_))
print('Predict RMSE:',(np.sqrt(mean_squared_error(whiteleg_shrimp_test_target,np.exp(pred)-1))))

In [None]:
pred = whiteleg_shrimp_voting.predict(whiteleg_shrimp_test_features)

print('Predict RMSE:',(np.sqrt(mean_squared_error(whiteleg_shrimp_test_target, np.exp(pred)-1))))

RMSE이 더 작은 기존의 best 모델 선택

# Result

In [None]:
# test data에 맞게 year,week filtering

oil_predict = df_oil[(df_oil['year']==2021) & (df_oil['week']<=26)].groupby(['year','week']).mean()
weather_kr_predict = df_weather_kr[(df_weather_kr['year']==2021) & (df_weather_kr['week']<=26)].groupby(['year','week']).mean()
cpi_predict = df_cpi[(df_cpi['year']==2021) & (df_cpi['week']<=26)].groupby(['year','week']).mean()

In [None]:
# 파생 변수 추가 및 기존 변수들 제거

weather_kr_predict['temp_kr'] = (weather_kr_predict['temperature_kr'] + weather_kr_predict['water_temp_kr'] + weather_kr_predict['wind_kr']) / 3

weather_kr_predict.drop(columns=['temperature_kr', 'wind_kr','water_temp_kr'], axis=1, inplace=True)

cpi_predict['cpi'] = cpi_predict['cpi_fish'] / cpi_predict['cpi_total'] 

cpi_predict.drop(columns=['cpi_fish', 'cpi_total'], axis=1, inplace=True)

## Merge Features

In [None]:
df_predict = oil_predict.copy()

df_predict = pd.merge(df_predict,weather_kr_predict,how='left',on=['year','week'])

In [None]:
df_predict = pd.merge(df_predict,cpi_predict,how='left',on=['year','week'])

### Squid

오징어 모델용 test features들을 생성. -> 최종 결과값 도출

#### squid test features

In [None]:
# 오징어 주요 제조국 및 수출국 filtering

squid_exchange_predict = df_exchange[(df_exchange['CTRY_2'] == '페루') | (df_exchange['CTRY_2'] == '중국') | (df_exchange['CTRY_2'] == '칠레')]
squid_weather_predict = df_weather_with_wf[(df_weather_with_wf['CTRY_1'] == '페루') | (df_weather_with_wf['CTRY_1'] == '중국') | (df_weather_with_wf['CTRY_1'] == '칠레')]

In [None]:
# test data에 맞게 year,week filtering

squid_exchange_predict = squid_exchange_predict[(squid_exchange_predict['year']==2021) & (squid_exchange_predict['week']<=26)]
squid_weather_predict = squid_weather_predict[(squid_weather_predict['year']==2021) & (squid_weather_predict['week']<=26)]

In [None]:
# grouping

squid_exchange_predict = squid_exchange_predict.groupby(['year','week']).mean()
squid_weather_predict = squid_weather_predict.groupby(['year','week']).mean()

In [None]:
# Merge Features

squid_predict = pd.DataFrame()

squid_predict['rain'] = squid_weather_predict['rain']
squid_predict['wind'] = squid_weather_predict['wind']
squid_predict['temperature'] = squid_weather_predict['temperature']
squid_predict['oil'] = df_predict['oil']
squid_predict['exchange'] = squid_exchange_predict
squid_predict['temp_kr'] = df_predict['temp_kr']
squid_predict['cpi'] = df_predict['cpi']

#### Result

In [None]:
squid_pred = squid_final.predict(squid_predict)

pd.DataFrame({'pred':np.exp(squid_pred)-1})

## Salmon

연어 모델용 test features들을 생성. -> 최종 결과값 도출

In [None]:
# 연어 주요 제조국 및 수출국 filtering

salmon_exchange_predict = df_exchange[(df_exchange['CTRY_2'] == '노르웨이')]
salmon_weather_predict = df_weather_with_wf[(df_weather_with_wf['CTRY_1'] == '노르웨이')]

In [None]:
# test data에 맞게 year,week filtering

salmon_exchange_predict = salmon_exchange_predict[(salmon_exchange_predict['year']==2021) & (salmon_exchange_predict['week']<=26)]
salmon_weather_predict = salmon_weather_predict[(salmon_weather_predict['year']==2021) & (salmon_weather_predict['week']<=26)]

In [None]:
# grouping

salmon_exchange_predict = salmon_exchange_predict.groupby(['year','week']).mean()
salmon_weather_predict = salmon_weather_predict.groupby(['year','week']).mean()

In [None]:
# Merge Features

salmon_predict = pd.DataFrame()

salmon_predict['rain'] = salmon_weather_predict['rain']
salmon_predict['wind'] = salmon_weather_predict['wind']
salmon_predict['temperature'] = salmon_weather_predict['temperature']
salmon_predict['oil'] = df_predict['oil']
salmon_predict['exchange'] = salmon_exchange_predict
salmon_predict['temp_kr'] = df_predict['temp_kr']
salmon_predict['cpi'] = df_predict['cpi']

#### Result

In [None]:
salmon_pred = salmon_final.predict(salmon_predict)

pd.DataFrame({'pred':np.exp(salmon_pred)-1})

## whiteleg_shrimp

흰다리새우 모델용 test features들을 생성. -> 최종 결과값 도출

In [None]:
# 흰다리새우 주요 제조국 및 수출국 filtering

whiteleg_shrimp_exchange_predict = df_exchange[(df_exchange['CTRY_2'] == '태국') | (df_exchange['CTRY_2'] == '베트남')]
whiteleg_shrimp_weather_predict = df_weather_with_wf[(df_weather_with_wf['CTRY_1'] == '태국') | (df_weather_with_wf['CTRY_1'] == '베트남')]

In [None]:
# test data에 맞게 year,week filtering

whiteleg_shrimp_exchange_predict = whiteleg_shrimp_exchange_predict[(whiteleg_shrimp_exchange_predict['year']==2021) & (whiteleg_shrimp_exchange_predict['week']<=26)]
whiteleg_shrimp_weather_predict = whiteleg_shrimp_weather_predict[(whiteleg_shrimp_weather_predict['year']==2021) & (whiteleg_shrimp_weather_predict['week']<=26)]

In [None]:
# grouping

whiteleg_shrimp_exchange_predict = whiteleg_shrimp_exchange_predict.groupby(['year','week']).mean()
whiteleg_shrimp_weather_predict = whiteleg_shrimp_weather_predict.groupby(['year','week']).mean()

In [None]:
# Merge Features

whiteleg_shrimp_predict = pd.DataFrame()

whiteleg_shrimp_predict['rain'] = whiteleg_shrimp_weather_predict['rain']
whiteleg_shrimp_predict['wind'] = whiteleg_shrimp_weather_predict['wind']
whiteleg_shrimp_predict['temperature'] = whiteleg_shrimp_weather_predict['temperature']
whiteleg_shrimp_predict['oil'] = df_predict['oil']
whiteleg_shrimp_predict['exchange'] = whiteleg_shrimp_exchange_predict
whiteleg_shrimp_predict['temp_kr'] = df_predict['temp_kr']
whiteleg_shrimp_predict['cpi'] = df_predict['cpi']

#### Result

In [None]:
whiteleg_shrimp_pred = whiteleg_shrimp_final.predict(whiteleg_shrimp_predict)

pd.DataFrame({'pred': np.exp(whiteleg_shrimp_pred)-1})