### 1. 라이브러리 선언

In [1]:
from sklearn import tree
from sklearn import linear_model
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import pandas as pd
import numpy as np

### 2. 데이터 정제

In [3]:
# Load the raw feature table.
featuresData = pd.read_csv("../dataset/kopo_decision_tree_all_new.csv")

# Key columns used to group the rows.
groupKey = ["REGIONID", "PRODUCTGROUP", "PRODUCT", "ITEM"]

# KNOB = number of rows present for each group.
groupData = (
    featuresData.groupby(groupKey)["YEARWEEK"]
    .agg(["size"])
    .reset_index()
    .rename(columns={"size": "KNOB"})
)

# Attach every group's KNOB to each of its rows, then record the largest
# row count found across all groups.
mergedData = featuresData.merge(groupData, on=groupKey, how="left")
maxKnob = mergedData["KNOB"].max()

### 3. 작업할 데이터 선정

In [4]:
# Keep only the groups whose row count reaches the maximum (KNOB == maxKnob).
hasFullHistory = mergedData["KNOB"] >= maxKnob
cleansedData = mergedData.loc[hasFullHistory]
cleansedData

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,KNOB
2352,A01,PG02,P03,ITEM043,201501,2015,1,87,Y,1,Y,0.197590,146
2353,A01,PG02,P03,ITEM043,201502,2015,2,60,N,4,Y,0.197590,146
2354,A01,PG02,P03,ITEM043,201503,2015,3,51,N,4,N,0.000000,146
2355,A01,PG02,P03,ITEM043,201504,2015,4,37,Y,2,N,0.000000,146
2356,A01,PG02,P03,ITEM043,201505,2015,5,136,N,4,Y,0.201205,146
2357,A01,PG02,P03,ITEM043,201506,2015,6,117,N,4,Y,0.201205,146
2358,A01,PG02,P03,ITEM043,201507,2015,7,106,N,4,Y,0.201205,146
2359,A01,PG02,P03,ITEM043,201508,2015,8,56,Y,1,N,0.000000,146
2360,A01,PG02,P03,ITEM043,201509,2015,9,70,N,4,N,0.000000,146
2361,A01,PG02,P03,ITEM043,201510,2015,10,167,N,4,Y,0.201205,146


### 4. 머신러닝 함수 생성

In [12]:
def predict_model(onegroup, yearweek_std=201701):
    """Fit three regressors on one group's history and predict its hold-out rows.

    Rows with ``YEARWEEK < yearweek_std`` are used for training and rows with
    ``YEARWEEK >= yearweek_std`` are the hold-out (test) rows.  Features are
    the numeric columns whose absolute correlation with ``QTY`` is strictly
    between 0.5 and 1.

    When at least one feature qualifies (and both splits are non-empty), a
    decision tree, a linear regression and a random forest are trained and
    their predictions are stored per row.  Otherwise every prediction is set
    to 0 and the rows are flagged so that callers can drop them.

    Args:
        onegroup: DataFrame holding the rows of a single group.  Must contain
            a numeric ``QTY`` column and a ``YEARWEEK`` column.
        yearweek_std: First ``YEARWEEK`` value that belongs to the hold-out
            period.  Defaults to 201701.

    Returns:
        A copy of the hold-out rows with four extra columns: ``DT_PREDICT``,
        ``LR_PREDICT``, ``RF_PREDICT`` and ``PREDICT_YN`` ("Y" when the models
        were actually trained, "N" for the zero-filled fallback).
    """
    # * 1. Feature selection and train/test split
    eachgroup = onegroup.reset_index(drop=True)

    # Correlate numeric (and bool) columns only: recent pandas versions raise
    # on string columns instead of silently dropping them.
    corrdf = eachgroup.select_dtypes(include=["number", "bool"]).corr()
    qty_corr = corrdf["QTY"].abs()
    # NOTE(review): the correlation is computed over all rows, including the
    # hold-out period, so the feature choice sees the test labels.
    features = list(corrdf[(qty_corr > 0.5) & (qty_corr < 1)].index)

    trainingData = eachgroup[eachgroup.YEARWEEK < yearweek_std]
    # Explicit copy so the prediction columns are written to an independent
    # frame rather than to a slice of ``eachgroup``.
    testData_all = eachgroup[eachgroup.YEARWEEK >= yearweek_std].copy()

    # Fallback: nothing to learn from (no usable feature, or one of the two
    # splits is empty) -> predictions are 0 and the rows are flagged "N".
    if not features or trainingData.empty or testData_all.empty:
        for column in ("DT_PREDICT", "LR_PREDICT", "RF_PREDICT"):
            testData_all[column] = 0
        testData_all["PREDICT_YN"] = "N"
        return testData_all

    trainingData_features = trainingData[features]
    # 1-D label: avoids sklearn's column-vector warning and keeps every
    # model's predict() output one-dimensional.
    trainingData_label = trainingData["QTY"]
    testData_features = testData_all[features]

    # * 2. Model declaration (output column -> estimator)
    models = {
        "DT_PREDICT": tree.DecisionTreeRegressor(random_state=1),
        "LR_PREDICT": linear_model.LinearRegression(),
        "RF_PREDICT": ensemble.RandomForestRegressor(
            random_state=1, n_estimators=10
        ),
    }

    # * 3./4. Train each model and predict the hold-out rows
    predictions = {
        column: model.fit(trainingData_features, trainingData_label).predict(
            testData_features
        )
        for column, model in models.items()
    }
    for column, predicted in predictions.items():
        testData_all[column] = predicted
    testData_all["PREDICT_YN"] = "Y"

    return testData_all

### 5. 클린데이터 학습

In [13]:
# Run predict_model once per group and stack the per-group hold-out frames.
finalResult = (
    cleansedData
    .groupby(by=groupKey)
    .apply(predict_model)
)





### 6. 특성이 없던 데이터 제거

In [14]:
# Drop the groups for which no model was trained.  The PREDICT_YN flag is
# used instead of testing that all three predictions are > 0, so rows whose
# trained models predicted 0 or less are still kept.
wasPredicted = finalResult["PREDICT_YN"] == "Y"
finalResult = finalResult.loc[wasPredicted]

### 7. mae 값을 구하여 알고리즘 추천

In [16]:
# Mean absolute error of each model over all remaining hold-out rows.
mae_dt = mean_absolute_error(finalResult["QTY"], finalResult["DT_PREDICT"])
mae_rf = mean_absolute_error(finalResult["QTY"], finalResult["RF_PREDICT"])
mae_lr = mean_absolute_error(finalResult["QTY"], finalResult["LR_PREDICT"])

# [model name, MAE] pairs ordered from lowest (best) to highest MAE.
dicMae = [
    ["Decision Trees", mae_dt],
    ["Linear Regression", mae_lr],
    ["Random Forests", mae_rf],
]
dicMae.sort(key=lambda entry: entry[1])
pd.DataFrame(dicMae)



Unnamed: 0,0,1
0,Random Forests,69.527337
1,Decision Trees,70.336132
2,Linear Regression,79.892643


In [17]:
# dicMae is sorted by ascending MAE, so its first entry is the best model.
print(f"추천 알고리즘 : {dicMae[0][0]}")

추천 알고리즘 : Random Forests
