<a href="https://colab.research.google.com/github/ar851060/ESUN_AI_2021_winter/blob/main/8_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, AdaBoostRegressor, VotingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.cross_decomposition import PLSRegression
from xgboost import XGBRegressor
from sklearn.linear_model import LassoCV, LinearRegression, ElasticNetCV
from sklearn.model_selection import ParameterGrid, cross_val_score
from sklearn.metrics import mean_squared_error
import pickle
import timeit
import pathlib

In [None]:
# Single random seed shared by every stochastic estimator below.
seed = 20211225

# Registry of estimators to run, keyed by a short name.
# Commented-out entries are alternatives that can be switched back on.
models = dict(
    # PLS = PLSRegression(copy = False),
    # Linear = LinearRegression(copy_X = False, n_jobs = -1),
    # Lasso = LassoCV(selection = "random", n_jobs = -1, random_state = seed),
    # Net = ElasticNetCV(selection = "random", n_jobs = -1, random_state = seed),
    # Bag = RandomForestRegressor(n_jobs = -1, max_features = None, random_state = seed, verbose = 2),
    RF = RandomForestRegressor(
        n_estimators = 100,
        max_depth = None,
        max_features = "sqrt",
        max_samples = 0.5,
        n_jobs = -1,
        random_state = seed,
    ),
)
# models = {"XGB": XGBRegressor(tree_method = "gpu_hist", n_jobs = -1, random_state = seed)}

In [None]:
# params = {"XGB": {"n_estimators":[100,500,1000], "learning_rate":[0.01,0.001], "max_depth":[3,5]}}
# Parameter grids keyed by model name; each value maps a parameter name to
# the list of candidate values to try.
params = dict(
    PLS = dict(n_components = [5, 8, 11]),
    Bag = dict(n_estimators = [500, 800], max_depth = [3, 5]),
    RF = dict(n_estimators = [500], max_depth = [None]),
)

In [None]:
# Hyper-parameter tuning
def tuning(tag, name, params=params, models=models):
  """Grid-search ``models[name]`` for one shop tag using cross-validation.

  Every combination in ``params[name]`` is scored with ``cross_val_score``
  using negative mean squared error, so a larger score is better. The mean
  score of each combination is written to
  ``Output/tuning_score/<name>_<tag>_total.csv``.

  Args:
    tag: Shop-tag identifier as a string; used to build the CSV paths.
    name: Key into both ``params`` and ``models``.
    params: Mapping of model name to a parameter grid.
    models: Mapping of model name to an estimator instance. The estimator is
      reconfigured in place through ``set_params``.

  Returns:
    The best parameter dict, including an extra ``"score"`` entry holding its
    mean cross-validation score, or ``None`` when the grid yields no
    combination.
  """
  data_dir = "/content/drive/Shareddrives/統學期末比賽/train_test/"+tag+"/"
  # The data does not depend on the candidate parameters, so it is loaded once
  # instead of on every grid iteration. The identifier columns are dropped and
  # the targets are aligned with the dt filter, mirroring the fit-time
  # preprocessing, so the tuned scores describe the model that gets trained.
  trainx = pd.read_csv(data_dir+"trainx_"+tag+"_rank.csv").drop(["chid","shop_tag"], axis = 1).fillna(0)
  trainx = trainx[trainx['dt']<23]
  trainy = pd.read_csv(data_dir+"trainy_"+tag+"_rank.csv")
  trainy = trainy.iloc[trainx.index]

  # Scores are negative MSE and can be arbitrarily low, so start from -inf
  # rather than a fixed sentinel that a poor model could fall below.
  best_score = -np.inf
  best_param = None
  scores = []
  for p in ParameterGrid(params[name]):
    model = models[name].set_params(**p)
    fold_scores = cross_val_score(model, trainx, trainy, scoring = "neg_mean_squared_error", verbose = 3)
    del model
    mean_score = np.mean(fold_scores)
    # Stored on the candidate itself so it ends up both in the CSV and in the
    # returned best parameters.
    p["score"] = mean_score
    scores.append(p)
    if mean_score > best_score:
      best_score = mean_score
      best_param = p
  print("best parameters are:")
  print(best_param)
  print("best score:")
  # Report the winning score, not the score of the last candidate tried.
  print(best_score)
  pd.DataFrame(scores).to_csv("/content/drive/Shareddrives/統學期末比賽/Output/tuning_score/"+name+"_"+tag+"_total.csv")
  return best_param


In [None]:
# Train the model
def training(tag, name, best_param, models = models):
  """Fit ``models[name]`` on one shop tag and score it on the validation split.

  Training rows are restricted to ``dt < 23`` and validation rows to
  ``txn_amt > 0``. The validation MSE and the fit time are printed/written to
  ``Output/validation_score/<name>_<tag>_total.csv``.

  Args:
    tag: Shop-tag identifier as a string; used to build the CSV paths.
    name: Key into ``models``.
    best_param: Parameter dict to apply with ``set_params``, or ``None`` to
      use the estimator as configured. A ``"score"`` entry, if present, is
      removed from this dict in place.
    models: Mapping of model name to an estimator instance.

  Returns:
    None.
  """
  data_dir = "/content/drive/Shareddrives/統學期末比賽/train_test/"+tag+"/"
  trainx = pd.read_csv(data_dir+"trainx_"+tag+"_rank.csv").drop(["chid","shop_tag"], axis = 1).fillna(0)
  trainx = trainx[trainx['dt']<23]
  trainy = pd.read_csv(data_dir+"trainy_"+tag+"_rank.csv")
  # Keep only the targets of the rows that survived the dt filter.
  trainy = trainy.iloc[trainx.index]
  if best_param is None:
    model = models[name]
  else:
    # "score" is bookkeeping, not an estimator parameter. It is removed in
    # place so the same dict can be reused afterwards, and the default keeps
    # this from raising KeyError when the entry is already gone.
    best_param.pop("score", None)
    model = models[name].set_params(**best_param)
  start = timeit.default_timer()
  model.fit(trainx, trainy)
  stop = timeit.default_timer()
  # Free the training frames before loading the validation data.
  del trainx
  del trainy
  valx = pd.read_csv(data_dir+"valx_"+tag+"_rank.csv").drop(["shop_tag"], axis = 1).fillna(0)
  valx = valx[valx['txn_amt']>0]
  valy = pd.read_csv(data_dir+"valy_"+tag+"_rank.csv")
  valy = valy.iloc[valx.index]
  print("predict validation...")
  pre = model.predict(valx)
  # test_tag = pd.read_csv("/content/drive/Shareddrives/統學期末比賽/train_test/"+tag+"/test_tag_"+tag+"_base.csv")
  # test_tag = test_tag[test_tag['shop_tag']==tag].drop("shop_tag",axis = 1)
  # pre = list(pre)
  # test_tag["prediction"] = pre
  # path = pathlib.Path("/content/drive/Shareddrives/統學期末比賽/Output/val_score/"+name)
  # path.mkdir(parents=True, exist_ok=True)
  # test_tag.to_csv("/content/drive/Shareddrives/統學期末比賽/Output/val_score/"+name+"/"+name+"_"+tag+"_t.csv")
  sc = mean_squared_error(valy, pre)
  print("validation score is")
  print(sc)
  # One-row summary: model name, validation MSE, and seconds spent in fit().
  pd.DataFrame([name, sc, stop - start], index = ["models", "score", "time"]).T.to_csv("/content/drive/Shareddrives/統學期末比賽/Output/validation_score/"+name+"_"+tag+"_total.csv")
  cols = valx.columns
  del valx
  del valy
  # print("importance...")
  # pd.DataFrame(model.feature_importances_, index = cols).to_csv("/content/drive/Shareddrives/統學期末比賽/Output/importance/"+name+"_"+tag+"_correct.csv")

In [None]:
def predict_output(tag, name, best_param, models = models):
  """Refit ``models[name]`` on one shop tag and write its test predictions.

  The estimator is fitted on the training rows with ``dt < 23``, then used to
  predict the test features. Predictions are attached to the matching rows of
  the test-tag file and saved to ``Output/Result/<name>/<name>_<tag>_total.csv``.

  Args:
    tag: Shop-tag identifier as a string; used to build the CSV paths and to
      select rows of the test-tag file.
    name: Key into ``models``.
    best_param: Parameter dict to apply with ``set_params``, or ``None`` to
      use the estimator as configured. A ``"score"`` entry is ignored.
    models: Mapping of model name to an estimator instance.

  Returns:
    The fitted estimator taken from ``models[name]``, so the caller can
    persist it.
  """
  print("refit...")
  data_dir = "/content/drive/Shareddrives/統學期末比賽/train_test/"+tag+"/"
  trainx = pd.read_csv(data_dir+"trainx_"+tag+"_rank.csv").drop(["chid","shop_tag"], axis = 1).fillna(0)
  trainx = trainx[trainx['dt']<23]
  trainy = pd.read_csv(data_dir+"trainy_"+tag+"_rank.csv")
  # Keep only the targets of the rows that survived the dt filter.
  trainy = trainy.iloc[trainx.index]
  if best_param is None:
    model = models[name]
  else:
    # "score" is bookkeeping, not an estimator parameter; filter it out on a
    # copy so the caller's dict is left untouched.
    estimator_param = {key: value for key, value in best_param.items() if key != "score"}
    model = models[name].set_params(**estimator_param)
  model.fit(trainx, trainy)
  # Free the training frames before loading the test data.
  del trainx
  del trainy
  print("predict...")
  testx = pd.read_csv(data_dir+"testx_"+tag+"_rank.csv").drop(["shop_tag"], axis = 1).fillna(0)
  test_tag = pd.read_csv(data_dir+"test_tag_"+tag+"_rank.csv")
  test_tag = test_tag[test_tag['shop_tag']==tag].drop("shop_tag",axis = 1)
  pre = list(model.predict(testx))
  test_tag["prediction"] = pre
  path = pathlib.Path("/content/drive/Shareddrives/統學期末比賽/Output/Result/"+name)
  path.mkdir(parents=True, exist_ok=True)
  test_tag.to_csv("/content/drive/Shareddrives/統學期末比賽/Output/Result/"+name+"/"+name+"_"+tag+"_total.csv")
  # Hand the fitted estimator back; the driver loop pickles this return value.
  return model

In [None]:
from google.colab import drive

# Mount Google Drive; every data and output path in this notebook lives under it.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Driver: for every configured model and shop tag, train + validate, predict
# the test set, then persist the fitted estimator.
linlist = ["Linear","Lasso","Net"]
for name in models.keys():
  print(name+" starts")
  # pred = map(str, [2,6,10,12,13,15,18,19,21,22,25,26,36,39,48])
  pred = ["37"]
  # pred = ["39","48"]
  for tag in pred:
    # print("tuning parameters...")
    # if not name in linlist:
    #   param = tuning(tag,name)
    # else:
    #   param = None
    param = None
    # train and validation
    print("training model...")
    training(tag, name, param)

    # test
    print("predict test...")
    # model = predict_output(tag, name, param)
    model = predict_output(tag, name, param)
    # save model
    print("saving model...")
    model_path = "/content/drive/Shareddrives/統學期末比賽/Output/model/"+name+"_"+tag+"_total.sav"
    # The context manager closes (and flushes) the file even if pickling fails.
    with open(model_path, 'wb') as model_file:
      pickle.dump(model, model_file)
    del model
    print('success in '+name+" at "+tag)
  print(name+" all done!!")


# Lasso index
# Linear columns    



RF starts
training model...


  del sys.path[0]


predict validation...
validation score is
13.796493311103363
predict test...
refit...


  # This is added back by InteractiveShellApp.init_path()


predict...
saving model...
success in RF at 37
RF all done!!
