# Model Sample

## Basic function

### Create folders

In [1]:
import os

def mkdir(path):
    """Create the directory ``path`` (including parents) if it is missing.

    A short status message is printed in both cases; nothing is returned.
    """
    if os.path.exists(path):
        # Nothing to create - just report it.
        print("---  There is this folder!  ---")
        return
    os.makedirs(path)
    print("---  new folder...  ---")
    print("---  OK  ---")

### Evaluation function

In [3]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,median_absolute_error,r2_score,mean_squared_log_error

def calculate(y_true, y_predict, n, p):
    """Compute, print and return regression metrics for a set of predictions.

    Parameters
    ----------
    y_true, y_predict : array-like
        Ground-truth and predicted values. Both are converted to NumPy
        arrays and flattened to a single column before scoring, so lists
        and pandas objects are accepted as well as ndarrays.
    n : int
        Sample count used in the adjusted-R2 formula.
    p : int
        Feature count used in the adjusted-R2 formula.

    Returns
    -------
    tuple
        ``(mse, rmse, mae, r2, mad, mape, r2_adjusted)`` where ``mad`` is the
        median absolute error and ``mape`` is expressed in percent.
    """
    y_true = np.asarray(y_true).reshape(-1, 1)
    y_predict = np.asarray(y_predict).reshape(-1, 1)

    mse = mean_squared_error(y_true, y_predict)
    rmse = np.sqrt(mse)  # reuse the MSE instead of computing it a second time
    mae = mean_absolute_error(y_true, y_predict)
    r2 = r2_score(y_true, y_predict)
    mad = median_absolute_error(y_true, y_predict)
    # NOTE: entries of y_true equal to 0 make this ratio inf/nan.
    mape = np.mean(np.abs((y_true - y_predict) / y_true)) * 100

    # Adjusted R2 is undefined when n - p - 1 == 0; report NaN instead of
    # raising ZeroDivisionError so the other metrics are still returned.
    dof = n - p - 1
    if dof == 0:
        r2_adjusted = float("nan")
    else:
        r2_adjusted = 1 - ((1 - r2) * (n - 1)) / dof

    print('MSE: ', mse)
    print('RMSE: ', rmse)
    print('MAE: ', mae)
    print('R2: ', r2)
    print('MAD:', mad)
    print('MAPE:', mape)
    print('R2_Adjusted: ', r2_adjusted)
    return mse, rmse, mae, r2, mad, mape, r2_adjusted

### Extract the specified file

In [None]:
def file_name(file_dir):
    """Return the paths of every ``.csv`` file found under ``file_dir``.

    The directory tree is walked recursively; the extension comparison is
    case-sensitive.
    """
    return [
        os.path.join(folder, entry)
        for folder, _subdirs, entries in os.walk(file_dir)
        for entry in entries
        if os.path.splitext(entry)[1] == '.csv'
    ]

### Save variables for the results

In [None]:
# Hold-out rows: everything dated 2020-05-31 or later.
Test = data[data["Date"] >= "2020-05-31"]

# Table that receives one prediction column per model, keyed by the
# Date/Name of each hold-out row (index renumbered from 0).
pred = pd.DataFrame()
pred["Date"] = Test["Date"]
pred["Name"] = Test["Name"]
pred = pred.reset_index(drop=True)

# Accumulators filled by the model cells below:
#   all_assess - one [model, metrics...] row per model
#   all_pre    - flattened predictions of every model, concatenated
#   all_mo     - model label repeated once per prediction in all_pre
all_assess, all_pre, all_mo = [], [], []

### Model fitting diagram

In [None]:
def plt_line(name,pred,true,path):
    """Plot predicted vs. real values, save the figure as a PDF and show it.

    The figure is titled ``name`` and written to
    ``./result/<path>/<name>.pdf``; the folder is created when missing.
    """
    line_kwargs = dict(linewidth=5, markersize=5)

    plt.figure(figsize=(20, 10),edgecolor='white',facecolor='white')
    plt.plot(pred, '-', label="Predicted", color="blue", **line_kwargs)
    plt.plot(true, '--', label="Real", color="red", **line_kwargs)
    plt.title(name)
    plt.legend()
    plt.ylabel("Daily Confirmed")
    plt.grid()

    out_dir = "./result/" + path + "/"
    if not os.path.exists(out_dir):
        mkdir(out_dir)
    plt.savefig(out_dir + name + ".pdf")
    plt.show()

![3](./img/3.png)

In [None]:
def plt_line2(Seq,path):
    """Plot, per city, the real series against one model's series.

    ``Seq`` is read through its "Name", "Date", "Label" and "Real" columns
    plus the column named by ``path``. ``path`` therefore serves both as the
    model column to plot and as the output sub-folder: each figure is saved
    to ``./result/<path>/<city>.pdf``.

    Only cities whose "Label" column contains both "Train" and "Test" rows
    are plotted.
    """
    Citys = np.unique(Seq["Name"])
    for c in Citys:
        S = Seq[Seq["Name"]==c]
        S = S.sort_values(by="Date")
        La = S["Label"].values.tolist()
        if not (("Train" in La) and ("Test" in La)):
            continue
        print(c,path)
        S2 = S.set_index(S["Date"],drop=True)
        sns.set_style("ticks")
        plt.figure(figsize=(20, 10),edgecolor='white',facecolor='white')
        S2["Real"].plot(style="--",fontsize=30,linewidth=5,markersize=5)
        S2[path].plot(style="-",fontsize=30,linewidth=5,markersize=5)
        plt.xlabel("Date",fontsize=30)
        plt.ylabel("Daily Confirmed",fontsize=30)
        plt.legend(fontsize=30)
        plt.title(c,fontsize=30)
        plt.grid()
        path1 = "./result/" + path + "/"
        if not os.path.exists(path1):
            mkdir(path1)
        # The original referenced an undefined ``name`` here (NameError);
        # the city being plotted is the file stem.
        path2 = path1 + str(c) + ".pdf"
        plt.savefig(path2)
        plt.show()

## AI

### Traditional ML

#### KNN

##### Regressor

In [None]:
from sklearn.neighbors import KNeighborsRegressor
import csv
import os

def adjust_knn(train_x_s,test_x_s,train_y,test_y,name=0):
    """Grid-search KNeighborsRegressor settings and log every result to CSV.

    Each parameter combination is fitted on the training data, scored on the
    test data with ``calculate`` and appended to two files under
    ``./results/``: ``KNN[_<name>]_assess.csv`` (index + metrics) and
    ``KNN[_<name>]_parameter.csv`` (index + parameters).

    Every combination has a fixed 1-based index given by its position in the
    grid, whether or not it succeeds. If the assess file already exists the
    search resumes after the last index recorded in it. Files written by the
    previous implementation used a different numbering, so resuming from
    them may skip or repeat combinations.

    Parameters
    ----------
    train_x_s, test_x_s : array-like
        Feature matrices; ``test_x_s.shape[1]`` is used as the feature count.
    train_y, test_y : array-like
        Targets for fitting and scoring.
    name : str or 0
        Optional tag inserted into the output file names (0 = no tag).
    """
    import itertools

    if(name!=0):
        path2 = "./results/KNN_" + name + "_" + "assess.csv"
        path3 = "./results/KNN_" + name + "_" + "parameter.csv"
    else:
        path2 = "./results/KNN_assess.csv"
        path3 = "./results/KNN_parameter.csv"

    n_neighbors = [5,7,9,11,13,15,17,19] # default is 5
    weights = ['uniform', 'distance']
    algorithm = ["brute","kd_tree","ball_tree"]
    leaf_size = [25,30,35,40,45] # default is 30
    metric = ["euclidean","manhattan","chebyshev","minkowski","wminkowski","seuclidean","mahalanobis"]
    P = [1,2] # only passed for wminkowski and minkowski
    all_nb = len(n_neighbors) * len(weights) * len(algorithm) * len(leaf_size) * len(metric) * len(P)

    # The output folder must exist before the first append.
    os.makedirs(os.path.dirname(path2), exist_ok=True)

    # Resume point: index of the last combination already written.
    nums = 0
    if(os.path.exists(path2)):
        try:
            data = pd.read_csv(path2,header=None)
            nums = int(data.values[-1,0])
        except pd.errors.EmptyDataError:
            nums = 0

    # Same nesting order as the original loops: n, l, p, a, m, w.
    grid = itertools.product(n_neighbors, leaf_size, P, algorithm, metric, weights)
    for num, (n, l, p, a, m, w) in enumerate(grid, start=1):
        if num <= nums:
            continue
        print("start....{}/{}".format(num,all_nb))
        params = dict(n_neighbors=n,leaf_size=l,weights=w,metric=m,algorithm=a)
        if(m=="wminkowski" or m=="minkowski"):
            params["p"] = p
        try:
            knn = KNeighborsRegressor(**params)
            knn.fit(train_x_s,train_y)
            pred_test = knn.predict(test_x_s).reshape(-1,1)
            sample_n = pred_test.shape[0]
            feature_n = test_x_s.shape[1]
            mse,rmse,mae,r2,mad,mape,r2_adjusted = calculate(test_y,pred_test,sample_n,feature_n)
        except Exception as exc:
            # Some metric/algorithm pairs are rejected by scikit-learn;
            # report the reason and move on to the next combination.
            print("error", repr(exc))
            continue
        all_m = [num,mse,rmse,mae,r2,mad,mape,r2_adjusted]
        all_p = [num,m,a,w,n,l,p]
        print(all_m)
        with open(path2,"a",encoding="utf-8",newline="") as f:
            csv.writer(f).writerow(all_m)
        with open(path3,"a",encoding="utf-8",newline="") as f:
            csv.writer(f).writerow(all_p)
        print("end....",num)
        print("--------------------------------")


In [None]:
# Hyper-parameters used for the final KNN fit.
m, a, w = "manhattan", "kd_tree", "distance"
n, l, p = 5, 25, 2

# Fit on the training split and predict the test split as a column vector.
knn = KNeighborsRegressor(metric=m,algorithm=a,weights=w,n_neighbors=n,leaf_size=l,p=p)
knn.fit(train_X_n,train_y_n)
pred_test = knn.predict(test_X_n).reshape(-1,1)
sample_n, feature_n = pred_test.shape[0], test_X_n.shape[1]

# Score, plot and record the results under the "KNN" label.
mse,rmse,mae,r2,mad,mape,r2_adjusted = calculate(test_y_n,pred_test,sample_n,feature_n)
plt_line("KNN",pred_test,test_y_n,"plt")
all_assess.append(["KNN",mse,rmse,mae,r2,mad,mape,r2_adjusted])
all_pre = all_pre + pred_test.ravel().tolist()
all_mo = all_mo + ["KNN"] * sample_n
pred["KNN"] = pd.DataFrame(pred_test.reshape(-1,1))
# Seq["KNN"] = knn.predict(train_X_n).reshape(1,-1)[0].tolist() + pred_test.reshape(1,-1)[0].tolist()
# The training part of the sequence is filled with the true training
# targets, followed by the test predictions.
Seq["KNN"] = train_y_n.reshape(1,-1)[0].tolist() + pred_test.ravel().tolist()

##### Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
import csv
import os
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

def adjust_knn(train_x_s,test_x_s,train_y,test_y,name=0):
    """Grid-search KNeighborsClassifier settings and log every result to CSV.

    Each parameter combination is fitted on the training data, scored on the
    test data with ``calculate`` and appended to two files under
    ``./results/``: ``KNN[_<name>]_assess.csv`` (index + metrics) and
    ``KNN[_<name>]_parameter.csv`` (index + parameters).

    Every combination has a fixed 1-based index given by its position in the
    grid, whether or not it succeeds. If the assess file already exists the
    search resumes after the last index recorded in it. Files written by the
    previous implementation used a different numbering, so resuming from
    them may skip or repeat combinations.

    Parameters
    ----------
    train_x_s, test_x_s : array-like
        Feature matrices; ``test_x_s.shape[1]`` is used as the feature count.
    train_y, test_y : array-like
        Labels for fitting and scoring.
    name : str or 0
        Optional tag inserted into the output file names (0 = no tag).
    """
    import itertools

    if(name!=0):
        path2 = "./results/KNN_" + name + "_" + "assess.csv"
        path3 = "./results/KNN_" + name + "_" + "parameter.csv"
    else:
        path2 = "./results/KNN_assess.csv"
        path3 = "./results/KNN_parameter.csv"

    n_neighbors = [5,7,9,11,13,15,17,19] # default=5
    weights = ['uniform', 'distance'] # default='uniform'
    algorithm = ["auto","brute","kd_tree","ball_tree"] # default='auto'
    leaf_size = [20,25,30,35,40] # default=30
    metric = ["euclidean","manhattan","chebyshev","minkowski","wminkowski","seuclidean","mahalanobis"] # default='minkowski'
    P = [1,2] # only passed for minkowski, default=2
    all_nb = len(n_neighbors) * len(weights) * len(algorithm) * len(leaf_size) * len(metric) * len(P)
    n_jobs = -1

    # The output folder must exist before the first append.
    os.makedirs(os.path.dirname(path2), exist_ok=True)

    # Resume point: index of the last combination already written.
    nums = 0
    if(os.path.exists(path2)):
        try:
            data = pd.read_csv(path2,header=None)
            nums = int(data.values[-1,0])
        except pd.errors.EmptyDataError:
            nums = 0

    # Same nesting order as the original loops: n, l, p, a, m, w.
    grid = itertools.product(n_neighbors, leaf_size, P, algorithm, metric, weights)
    for num, (n, l, p, a, m, w) in enumerate(grid, start=1):
        if num <= nums:
            continue
        print("start....{}/{}".format(num,all_nb))
        params = dict(n_neighbors=n,leaf_size=l,weights=w,metric=m,algorithm=a,n_jobs=n_jobs)
        if(m=="minkowski"):
            params["p"] = p
        try:
            knn = KNeighborsClassifier(**params)
            knn.fit(train_x_s,train_y)
            pred_test = knn.predict(test_x_s).reshape(-1,1)
            sample_n = pred_test.shape[0]
            feature_n = test_x_s.shape[1]
            mse,rmse,mae,r2,mad,mape,r2_adjusted = calculate(test_y,pred_test,sample_n,feature_n)
        except Exception as exc:
            # Some metric/algorithm pairs are rejected by scikit-learn;
            # report the reason and move on to the next combination.
            print("error", repr(exc))
            continue
        all_m = [num,mse,rmse,mae,r2,mad,mape,r2_adjusted]
        all_p = [num,m,a,w,n,l,p]
        with open(path2,"a",encoding="utf-8",newline="") as f:
            csv.writer(f).writerow(all_m)
        with open(path3,"a",encoding="utf-8",newline="") as f:
            csv.writer(f).writerow(all_p)
        print("end....",num)
        print("--------------------------------")


#### DT

##### Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
import csv
import os

def adjust_dt(train_x_s,test_x_s,train_y,test_y,name):
    """Grid-search DecisionTreeRegressor settings and log every result to CSV.

    Each combination of criterion, splitter, max_features and
    min_samples_split is fitted on the training data, scored on the test
    data with ``calculate`` and appended to
    ``./results/DT_<name>_assess.csv`` (index + metrics) and
    ``./results/DT_<name>_parameter.csv`` (index + parameters).

    If the assess file already exists the search resumes after the last
    index recorded in it. Estimator errors are not caught and stop the
    search.

    Parameters
    ----------
    train_x_s, test_x_s : array-like
        Feature matrices; ``test_x_s.shape[1]`` is used as the feature count.
    train_y, test_y : array-like
        Targets for fitting and scoring.
    name : str
        Tag inserted into the output file names.
    """
    import itertools

    path2 = "./results/DT_" + name + "_" + "assess.csv"
    path3 = "./results/DT_" + name + "_" + "parameter.csv"

    # NOTE(review): "mse"/"mae" and max_features="auto" are spellings from
    # older scikit-learn releases - confirm against the installed version.
    criterion = ["mse","mae"]
    splitter = ["best","random"]
    # The string "None" means "do not pass this argument" (use the default).
    min_samples_split = ["None",2,3,4,5,6,7,8,9,10]
    max_features = ["None","auto","sqrt","log2"]
    random_state = 17
    all_nb = len(criterion) * len(splitter) * len(min_samples_split) * len(max_features)

    # The output folder must exist before the first append.
    os.makedirs(os.path.dirname(path2), exist_ok=True)

    # Resume point: index of the last combination already written.
    nums = 0
    if(os.path.exists(path2)):
        try:
            data = pd.read_csv(path2,header=None)
            nums = int(data.values[-1,0])
        except pd.errors.EmptyDataError:
            nums = 0

    # Same nesting order as the original loops: c, s, ma, mi.
    grid = itertools.product(criterion, splitter, max_features, min_samples_split)
    for num, (c, s, ma, mi) in enumerate(grid, start=1):
        if num <= nums:
            continue
        print("start....{}/{}".format(num,all_nb))
        params = dict(criterion=c,splitter=s,random_state=random_state)
        if(ma!="None"):
            params["max_features"] = ma
        if(mi!="None"):
            params["min_samples_split"] = mi
        dt = DecisionTreeRegressor(**params)
        dt.fit(train_x_s,train_y)
        pred_test = dt.predict(test_x_s).reshape(-1,1)
        sample_n = pred_test.shape[0]
        feature_n = test_x_s.shape[1]
        # calculate() returns seven values; the original unpacked six and
        # therefore raised ValueError on every call.
        mse,rmse,mae,r2,mad,mape,r2_adjusted = calculate(test_y,pred_test,sample_n,feature_n)
        all_m = [num,mse,rmse,mae,r2,mad,mape,r2_adjusted]
        # Record min_samples_split too, otherwise rows that differ only in
        # that parameter are indistinguishable in the parameter log.
        all_p = [num,c,s,ma,mi]
        with open(path2,"a",encoding="utf-8",newline="") as f:
            csv.writer(f).writerow(all_m)
        with open(path3,"a",encoding="utf-8",newline="") as f:
            csv.writer(f).writerow(all_p)
        print("end....",num)
        print("--------------------------------")

In [None]:
# Settings for the final decision-tree fit (``ma`` is assigned but not
# passed to the estimator below).
c, s, ma = "mse", "best", "sqrt"

# Fit on the *_lg training split; predictions are mapped back with 10**x.
dt = DecisionTreeRegressor(criterion=c,splitter=s,random_state=17)
dt.fit(train_X_lg,train_y_lg)
pred_test = 10 ** dt.predict(test_X_lg).reshape(-1,1)
sample_n, feature_n = pred_test.shape[0], test_X_lg.shape[1]

# Score, plot and record the results under the "DT" label.
# NOTE(review): the 10**-transformed predictions are compared with
# test_y_lg - presumably that array is on the original scale; confirm.
mse,rmse,mae,r2,mad,mape,r2_adjusted = calculate(test_y_lg,pred_test,sample_n,feature_n)
plt_line("DT",pred_test,test_y_lg,"plt")
all_assess.append(["DT",mse,rmse,mae,r2,mad,mape,r2_adjusted])
all_pre = all_pre + pred_test.ravel().tolist()
all_mo = all_mo + ["DT"] * sample_n
pred["DT"] = pd.DataFrame(pred_test.reshape(-1,1))
# Seq["DT"] = (10**dt.predict(train_X_lg).reshape(1,-1)[0]).tolist() + pred_test.reshape(1,-1)[0].tolist()
# The training part of the sequence is filled from train_y_n (not
# train_y_lg), followed by the test predictions.
Seq["DT"] = train_y_n.reshape(1,-1)[0].tolist() + pred_test.ravel().tolist()

##### Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
import csv
import os
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

def adjust_dt(train_x_s,test_x_s,train_y,test_y,name=0):
    """Grid-search DecisionTreeClassifier settings and log every result to CSV.

    Each combination of criterion, splitter, max_features,
    min_samples_split and min_samples_leaf is fitted on the training data,
    scored on the test data with ``calculate`` and appended to two files
    under ``./results/``: ``DT[_<name>]_assess.csv`` (index + metrics) and
    ``DT[_<name>]_parameter.csv`` (index + parameters).

    If the assess file already exists the search resumes after the last
    index recorded in it. Estimator errors are not caught and stop the
    search.

    Parameters
    ----------
    train_x_s, test_x_s : array-like
        Feature matrices; ``test_x_s.shape[1]`` is used as the feature count.
    train_y, test_y : array-like
        Labels for fitting and scoring.
    name : str or 0
        Optional tag inserted into the output file names (0 = no tag).
    """
    import itertools

    if(name!=0):
        path2 = "./results/DT_" + name + "_" + "assess.csv"
        path3 = "./results/DT_" + name + "_" + "parameter.csv"
    else:
        path2 = "./results/DT_assess.csv"
        path3 = "./results/DT_parameter.csv"

    criterion = ["gini","entropy"] # default="gini"
    splitter = ["best","random"] # default="best"
    # The string "None" means "do not pass this argument" (use the default).
    min_samples_split = ["None",2,3,4,5,6,7,8,9,10] # default=2
    # NOTE(review): max_features="auto" is a spelling from older
    # scikit-learn releases - confirm against the installed version.
    max_features = ["None","auto","sqrt","log2"] # default=None
    min_samples_leaf = [1,2] # default=1
    random_state = 17 # default=None
    all_nb = len(criterion) * len(splitter) * len(min_samples_split) * len(max_features) * len(min_samples_leaf)

    # The output folder must exist before the first append.
    os.makedirs(os.path.dirname(path2), exist_ok=True)

    # Resume point: index of the last combination already written.
    nums = 0
    if(os.path.exists(path2)):
        try:
            data = pd.read_csv(path2,header=None)
            nums = int(data.values[-1,0])
        except pd.errors.EmptyDataError:
            nums = 0

    # Same nesting order as the original loops: c, s, ma, mi, ml.
    grid = itertools.product(criterion, splitter, max_features, min_samples_split, min_samples_leaf)
    for num, (c, s, ma, mi, ml) in enumerate(grid, start=1):
        if num <= nums:
            continue
        print("start....{}/{}".format(num,all_nb))
        params = dict(min_samples_leaf=ml,criterion=c,splitter=s,random_state=random_state)
        if(ma!="None"):
            params["max_features"] = ma
        if(mi!="None"):
            params["min_samples_split"] = mi
        dt = DecisionTreeClassifier(**params)
        dt.fit(train_x_s,train_y)
        pred_test = dt.predict(test_x_s).reshape(-1,1)
        sample_n = pred_test.shape[0]
        feature_n = test_x_s.shape[1]
        # calculate() returns seven values; the original unpacked six and
        # therefore raised ValueError on every call.
        mse,rmse,mae,r2,mad,mape,r2_adjusted = calculate(test_y,pred_test,sample_n,feature_n)
        all_m = [num,mse,rmse,mae,r2,mad,mape,r2_adjusted]
        all_p = [num,c,s,ma,mi,ml]
        with open(path2,"a",encoding="utf-8",newline="") as f:
            csv.writer(f).writerow(all_m)
        with open(path3,"a",encoding="utf-8",newline="") as f:
            csv.writer(f).writerow(all_p)
        print("end....",num)
        print("--------------------------------")

#### SVM

##### Regressor

In [None]:
from sklearn.svm import SVR
import csv
import os

def _svr_grid(kernel, degree, gamma):
    """Yield ``(kernel, degree, gamma)`` in the order ``adjust_svr`` numbers its runs.

    ``degree`` is only varied for the "poly" kernel and ``gamma`` only for the
    "poly", "rbf" and "sigmoid" kernels; a parameter that is not varied for a
    kernel is reported as the string "None".
    """
    for k in kernel:
        if k == "poly":
            for d in degree:
                for g in gamma:
                    yield k, d, g
        elif k == "rbf" or k == "sigmoid":
            for g in gamma:
                yield k, "None", g
        else:
            yield k, "None", "None"


def adjust_svr(train_x_s,test_x_s,train_y,test_y,name=0):
    """Grid-search ``SVR`` and append one CSV row per fitted model.

    Each combination is fitted on the training data, evaluated on the test
    data with ``calculate`` (which also prints the metrics) and logged under
    a 1-based run number:

    * ``./results/SVM[_<name>]_assess.csv``    -> num, mse, rmse, mae, r2, mad, mape, r2_adjusted
    * ``./results/SVM[_<name>]_parameter.csv`` -> num, kernel, degree, gamma

    If the assess file already holds rows, the run number in its last row is
    read back and every combination up to and including it is skipped, so an
    interrupted search can be resumed by calling the function again.

    Parameters
    ----------
    train_x_s, test_x_s : feature matrices passed to ``fit`` / ``predict``;
        ``test_x_s.shape[1]`` is used as the feature count.
    train_y : training target; flattened with ``.ravel()`` before fitting.
    test_y : test target, handed to ``calculate`` unchanged.
    name : optional string inserted into the output file names; the default
        ``0`` means "no suffix".

    Returns
    -------
    None. Results are only written to the CSV files and printed.
    """
    if name != 0:
        path2 = "./results/SVM_" + name + "_" + "assess.csv"
        path3 = "./results/SVM_" + name + "_" + "parameter.csv"
    else:
        path2 = "./results/SVM_assess.csv"
        path3 = "./results/SVM_parameter.csv"

    kernel = ["rbf", "linear", "poly", "sigmoid"]
    degree = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    gamma = ["auto", "scale"]

    # Materialise the grid once so that the progress total is the number of
    # models actually fitted (degree/gamma are not varied for every kernel).
    grid = list(_svr_grid(kernel, degree, gamma))
    all_nb = len(grid)

    # Last run number already on disk; 0 means "start from the beginning".
    nums = 0
    if os.path.exists(path2) and os.path.getsize(path2) > 0:
        data = pd.read_csv(path2, header=None)
        nums = int(data.values[-1, 0])

    for num, (k, d, g) in enumerate(grid, start=1):
        if num <= nums:
            continue  # already evaluated in a previous session

        print("start....{}/{}".format(num, all_nb))
        # Only pass the parameters that are varied for this kernel; the rest
        # keep the estimator's defaults.
        svr_kwargs = {"kernel": k}
        if d != "None":
            svr_kwargs["degree"] = d
        if g != "None":
            svr_kwargs["gamma"] = g
        svr = SVR(**svr_kwargs)
        svr.fit(train_x_s, train_y.ravel())

        pred_test = svr.predict(test_x_s).reshape(-1, 1)
        sample_n = pred_test.shape[0]
        feature_n = test_x_s.shape[1]
        mse, rmse, mae, r2, mad, mape, r2_adjusted = calculate(test_y, pred_test, sample_n, feature_n)
        all_m = [num, mse, rmse, mae, r2, mad, mape, r2_adjusted]
        all_p = [num, k, d, g]

        # Append immediately so that finished runs survive an interruption.
        with open(path2, "a", encoding="utf-8", newline="") as f:
            csv.writer(f).writerow(all_m)
        with open(path3, "a", encoding="utf-8", newline="") as f:
            csv.writer(f).writerow(all_p)
        print("end....", num)
        print("--------------------------------")


In [None]:
# Fit one SVR with a fixed configuration, score it on the test split and
# record the results in the shared notebook-level containers.
k = "poly"
d = 3
g = "auto"

svr = SVR(kernel=k,degree=d,gamma=g)
svr.fit(train_X_n,train_y_n)
pred_test = svr.predict(test_X_n)
pred_test = pred_test.reshape(-1,1)  # shape (n_samples, 1)
sample_n = pred_test.shape[0]
feature_n = test_X_n.shape[1]
mse,rmse,mae,r2,mad,mape,r2_adjusted = calculate(test_y_n,pred_test,sample_n,feature_n)
plt_line("SVR",pred_test,test_y_n,"plt")
all_assess.append(["SVR",mse,rmse,mae,r2,mad,mape,r2_adjusted])
all_pre = all_pre + pred_test.reshape(1,-1)[0].tolist()
all_mo = all_mo + ["SVR" for n in range(sample_n)]
pred["SVR"] = pd.DataFrame(pred_test.reshape(-1,1))
# Seq["SVR"] = svr.predict(train_X_n).reshape(1,-1)[0].tolist() + pred_test.reshape(1,-1)[0].tolist()
# Both operands must be plain lists so that `+` concatenates the training
# targets with the test predictions; a list added to an ndarray would be
# treated as element-wise addition instead.
Seq["SVR"] = train_y_n.reshape(1,-1)[0].tolist() + pred_test.reshape(1,-1)[0].tolist()

##### Classifier

In [None]:
from sklearn.svm import SVC
import csv
import os
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

def _svc_grid(kernel, degree, gamma, C, shrinking):
    """Yield ``(kernel, degree, gamma, C, shrinking)`` in run-number order.

    ``degree`` is only varied for the "poly" kernel and ``gamma`` only for the
    "poly", "rbf" and "sigmoid" kernels; a parameter that is not varied for a
    kernel is reported as the string "None". ``C`` and ``shrinking`` are
    varied for every kernel.
    """
    for k in kernel:
        if k == "poly":
            degree_gamma = [(d, g) for d in degree for g in gamma]
        elif k == "rbf" or k == "sigmoid":
            degree_gamma = [("None", g) for g in gamma]
        else:
            degree_gamma = [("None", "None")]
        for d, g in degree_gamma:
            for c in C:
                for s in shrinking:
                    yield k, d, g, c, s


def adjust_svc(train_x_s,test_x_s,train_y,test_y,name):
    """Grid-search ``SVC`` and append one CSV row per fitted model.

    Each combination is fitted on the training data, evaluated on the test
    data with ``calculate`` (which also prints the metrics) and logged under
    a 1-based run number:

    * ``./results/SVM_<name>_assess.csv``    -> num, mse, rmse, mae, r2, mad, mape, r2_adjusted
    * ``./results/SVM_<name>_parameter.csv`` -> num, kernel, degree, gamma, C, shrinking

    The ``C`` and ``shrinking`` values written to the parameter file are the
    ones the model is actually built with. Run numbers are the same whether
    the search starts fresh or resumes: if the assess file already holds
    rows, the run number in its last row is read back and every combination
    up to and including it is skipped.

    Parameters
    ----------
    train_x_s, test_x_s : feature matrices passed to ``fit`` / ``predict``;
        ``test_x_s.shape[1]`` is used as the feature count.
    train_y : training labels; flattened with ``.ravel()`` before fitting.
    test_y : test labels, handed to ``calculate`` unchanged.
    name : string inserted into the output file names.

    Returns
    -------
    None. Results are only written to the CSV files and printed.
    """
    path2 = "./results/SVM_" + name + "_" + "assess.csv"
    path3 = "./results/SVM_" + name + "_" + "parameter.csv"

    random_state = 17  # default=None
    C = [0.5, 1.0, 1.5]  # default=1.0
    shrinking = [True, False]  # default=True
    kernel = ["rbf", "linear", "poly", "sigmoid"]  # default='rbf'
    degree = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]  # default=3
    gamma = ["auto", "scale"]  # default='scale'

    # Materialise the grid once so that the progress total is the number of
    # models actually fitted (degree/gamma are not varied for every kernel).
    grid = list(_svc_grid(kernel, degree, gamma, C, shrinking))
    all_nb = len(grid)

    # Last run number already on disk; 0 means "start from the beginning".
    nums = 0
    if os.path.exists(path2) and os.path.getsize(path2) > 0:
        data = pd.read_csv(path2, header=None)
        nums = int(data.values[-1, 0])

    for num, (k, d, g, c, s) in enumerate(grid, start=1):
        if num <= nums:
            continue  # already evaluated in a previous session

        print("start....{}/{}".format(num, all_nb))
        # C and shrinking are part of the grid, so they must reach the
        # estimator; degree/gamma are only passed where they are varied.
        svc_kwargs = {"kernel": k, "C": c, "shrinking": s, "random_state": random_state}
        if d != "None":
            svc_kwargs["degree"] = d
        if g != "None":
            svc_kwargs["gamma"] = g
        svc = SVC(**svc_kwargs)
        svc.fit(train_x_s, train_y.ravel())

        pred_test = svc.predict(test_x_s).reshape(-1, 1)
        sample_n = pred_test.shape[0]
        feature_n = test_x_s.shape[1]
        mse, rmse, mae, r2, mad, mape, r2_adjusted = calculate(test_y, pred_test, sample_n, feature_n)
        all_m = [num, mse, rmse, mae, r2, mad, mape, r2_adjusted]
        all_p = [num, k, d, g, c, s]

        # Append immediately so that finished runs survive an interruption.
        with open(path2, "a", encoding="utf-8", newline="") as f:
            csv.writer(f).writerow(all_m)
        with open(path3, "a", encoding="utf-8", newline="") as f:
            csv.writer(f).writerow(all_p)
        print("end....", num)
        print("--------------------------------")


### Ensemble learning

#### Ada

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
import csv
import os

def adjust_ada(train_x_s,test_x_s,train_y,test_y,name):
    """Grid-search ``AdaBoostRegressor`` on top of a ``DecisionTreeRegressor``.

    Every combination is fitted on the training data and scored on the test
    data with ``calculate``. The metrics row is appended to
    ``./results/Ada_<name>_assess.csv`` and the matching hyper-parameter row
    to ``./results/Ada_<name>_parameter.csv``; both start with the 1-based
    run number. When the assess file already exists, the run number in its
    last row is read and combinations up to that number are skipped.

    The string "None" in ``max_features`` / ``max_leaf_nodes`` means "do not
    pass this argument to the tree".
    """
    assess_csv = "./results/Ada_" + name + "_" + "assess.csv"
    params_csv = "./results/Ada_" + name + "_" + "parameter.csv"

    # Search space.
    n_estimators = [50,100,200,300,400,500,600,700,800]
    learning_rate = [0.1,0.25,0.5,0.75,1]
    loss = ["linear","square","exponential"]
    criterion = ["mse","mae"]
    splitter = ["best","random"]
    max_features = ["None"] # ,"log2","sqrt","auto"
    max_leaf_nodes = ["None"]
    min_samples_split = [2]
    min_samples_leaf = [1]

    # Total number of combinations, used only for the progress message.
    all_nb = 1
    for axis in (n_estimators, learning_rate, loss, criterion, splitter,
                 max_features, max_leaf_nodes, min_samples_leaf, min_samples_split):
        all_nb *= len(axis)

    # Run number of the last row already on disk (0 when starting fresh).
    last_done = 0
    if os.path.exists(assess_csv):
        last_done = int(pd.read_csv(assess_csv, header=None).values[-1, 0])

    def append_row(path, row):
        # One CSV line per call, appended to the end of the file.
        with open(path, "a", encoding="utf-8", newline="") as handle:
            csv.writer(handle).writerow(row)

    # Same iteration order as nesting the loops from n_estimators (outermost)
    # down to criterion (innermost).
    combos = (
        (n, l, lo, mf, mi, ms, ml, sp, c)
        for n in n_estimators
        for l in learning_rate
        for lo in loss
        for mf in max_features
        for mi in min_samples_split
        for ms in min_samples_leaf
        for ml in max_leaf_nodes
        for sp in splitter
        for c in criterion
    )
    for num, (n, l, lo, mf, mi, ms, ml, sp, c) in enumerate(combos, start=1):
        if not last_done < num:
            continue

        print("start....{}/{}".format(num,all_nb))
        tree_kwargs = dict(min_samples_split=mi, min_samples_leaf=ms, splitter=sp, criterion=c)
        if ml != "None":
            tree_kwargs["max_leaf_nodes"] = ml
        if mf != "None":
            tree_kwargs["max_features"] = mf
        ada = AdaBoostRegressor(
            n_estimators=n,
            learning_rate=l,
            loss=lo,
            base_estimator=DecisionTreeRegressor(**tree_kwargs),
        )
        ada.fit(train_x_s, train_y.ravel())

        pred_test = ada.predict(test_x_s).reshape(-1,1)
        sample_n = pred_test.shape[0]
        feature_n = test_x_s.shape[1]
        mse,rmse,mae,r2,mad,mape,r2_adjusted = calculate(test_y,pred_test,sample_n,feature_n)
        all_m = [num,mse,rmse,mae,r2,mad,mape,r2_adjusted]
        all_p = [num,n,l,lo,mf,mi,ms,ml,sp,c]
        print(all_m)
        append_row(assess_csv, all_m)
        append_row(params_csv, all_p)
        print("end....",num)
        print("--------------------------------")


In [None]:
# Fit one AdaBoostRegressor with a fixed configuration (each value also
# appears in the lists searched by adjust_ada), score it on the test split
# and record the results in the shared notebook-level containers.
n, l, lo = 50, 0.25, "exponential"
mi, ms = 2, 1
sp, c = "random", "mse"

base_tree = DecisionTreeRegressor(min_samples_split=mi,min_samples_leaf=ms,splitter=sp,criterion=c)
ada = AdaBoostRegressor(n_estimators=n,learning_rate=l,loss=lo,base_estimator=base_tree)
ada.fit(train_X_n,train_y_n)

pred_test = ada.predict(test_X_n).reshape(-1,1)  # shape (n_samples, 1)
sample_n, feature_n = pred_test.shape[0], test_X_n.shape[1]
mse,rmse,mae,r2,mad,mape,r2_adjusted = calculate(test_y_n,pred_test,sample_n,feature_n)
plt_line("Ada",pred_test,test_y_n,"plt")

all_assess.append(["Ada",mse,rmse,mae,r2,mad,mape,r2_adjusted])
flat_pred = pred_test.reshape(1,-1)[0].tolist()
all_pre = all_pre + flat_pred
all_mo = all_mo + ["Ada"] * sample_n
pred["Ada"] = pd.DataFrame(pred_test)
# Alternative kept for reference (in-sample predictions for the training part):
# Seq["Ada"] = ada.predict(train_X_n).reshape(1,-1)[0].tolist() + pred_test.reshape(1,-1)[0].tolist()
Seq["Ada"] = train_y_n.reshape(1,-1)[0].tolist() + flat_pred

#### GBDT

##### Regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
import csv
import os

def adjust_gbdt(train_x_s,test_x_s,train_y,test_y,name):
    """Grid-search ``GradientBoostingRegressor`` and log every run to CSV.

    Every combination is fitted on the training data and scored on the test
    data with ``calculate``. The metrics row is appended to
    ``./results/GBDT_<name>_assess.csv`` and the matching hyper-parameter row
    to ``./results/GBDT_<name>_parameter.csv``; both start with the 1-based
    run number. When the assess file already exists, the run number in its
    last row is read and combinations up to that number are skipped.

    NOTE(review): a later cell in this notebook defines another function
    with the same name (for GradientBoostingClassifier); once that cell has
    run, this definition is no longer reachable under this name.
    """
    assess_csv = "./results/GBDT_" + name + "_" + "assess.csv"
    params_csv = "./results/GBDT_" + name + "_" + "parameter.csv"

    # Search space.
    random_state = 17
    n_estimators = [100,200,300,400,500,600,700,800]
    learning_rate = [0.1,1e-2,0.2,2e-2,1e-3,0.3]
    loss = ["huber","ls","lad"]
    subsample = [1,0.6,0.2]
    min_samples_split = [2]
    max_depth = [3,7,11]
    min_samples_leaf = [1]

    # Total number of combinations, used only for the progress message.
    all_nb = 1
    for axis in (max_depth, n_estimators, learning_rate, loss, subsample,
                 min_samples_leaf, min_samples_split):
        all_nb *= len(axis)

    # Run number of the last row already on disk (0 when starting fresh).
    last_done = 0
    if os.path.exists(assess_csv):
        last_done = int(pd.read_csv(assess_csv, header=None).values[-1, 0])

    def append_row(path, row):
        # One CSV line per call, appended to the end of the file.
        with open(path, "a", encoding="utf-8", newline="") as handle:
            csv.writer(handle).writerow(row)

    # Same iteration order as nesting the loops from n_estimators (outermost)
    # down to min_samples_leaf (innermost).
    combos = (
        (n, l, lo, sub, mi, ma, ms)
        for n in n_estimators
        for l in learning_rate
        for lo in loss
        for sub in subsample
        for mi in min_samples_split
        for ma in max_depth
        for ms in min_samples_leaf
    )
    for num, (n, l, lo, sub, mi, ma, ms) in enumerate(combos, start=1):
        if not last_done < num:
            continue

        print("start....{}/{}".format(num,all_nb))
        gbrg = GradientBoostingRegressor(
            random_state=random_state,
            n_estimators=n,
            learning_rate=l,
            loss=lo,
            subsample=sub,
            max_depth=ma,
            min_samples_split=mi,
            min_samples_leaf=ms,
        )
        gbrg.fit(train_x_s, train_y)

        pred_test = gbrg.predict(test_x_s).reshape(-1,1)
        sample_n = pred_test.shape[0]
        feature_n = test_x_s.shape[1]
        mse,rmse,mae,r2,mad,mape,r2_adjusted = calculate(test_y,pred_test,sample_n,feature_n)
        all_m = [num,mse,rmse,mae,r2,mad,mape,r2_adjusted]
        all_p = [num,n,l,lo,sub,mi,ma,ms]
        print(all_m)
        append_row(assess_csv, all_m)
        append_row(params_csv, all_p)
        print("end....",num)
        print("--------------------------------")    


In [None]:
# Fit one GradientBoostingRegressor with a fixed configuration on the *_lg
# split, score it on the test split and record the results in the shared
# notebook-level containers.
n, l, lo = 200, 0.3, "huber"
sub = 0.2
mi, ma, ms = 2, 3, 1

gbrg = GradientBoostingRegressor(
    random_state=17,
    n_estimators=n,
    learning_rate=l,
    loss=lo,
    subsample=sub,
    max_depth=ma,
    min_samples_split=mi,
    min_samples_leaf=ms,
)
gbrg.fit(train_X_lg,train_y_lg)

# The model output is used as a base-10 exponent before scoring.
# NOTE(review): presumably the *_lg targets are log10-transformed; test_y_lg
# is passed to calculate unchanged — confirm both are on the same scale.
pred_test = 10 ** gbrg.predict(test_X_lg).reshape(-1,1)
sample_n, feature_n = pred_test.shape[0], test_X_lg.shape[1]
mse,rmse,mae,r2,mad,mape,r2_adjusted = calculate(test_y_lg,pred_test,sample_n,feature_n)
plt_line("GBDT",pred_test,test_y_lg,"plt")

all_assess.append(["GBDT",mse,rmse,mae,r2,mad,mape,r2_adjusted])
flat_pred = pred_test.reshape(1,-1)[0].tolist()
all_pre = all_pre + flat_pred
all_mo = all_mo + ["GBDT"] * sample_n
pred["GBDT"] = pd.DataFrame(pred_test)
# Alternative kept for reference (in-sample predictions for the training part):
# Seq["GBDT"] = (10**gbrg.predict(train_X_lg).reshape(1,-1)[0]).tolist() + pred_test.reshape(1,-1)[0].tolist()
# NOTE(review): the training part comes from train_y_n, not train_y_lg —
# presumably the untransformed targets; confirm.
Seq["GBDT"] = train_y_n.reshape(1,-1)[0].tolist() + flat_pred

##### Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
import csv
import os
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

def adjust_gbdt(train_x_s,test_x_s,train_y,test_y,name=0):
    """Grid-search GradientBoostingClassifier settings and log each run to CSV.

    Every hyper-parameter combination is fitted on the training data, scored
    on the test data with ``calculate`` and appended as one row to an
    assessment CSV (metrics) and a parameter CSV (settings).  Rows in both
    files start with the same 1-based run number.  If the assessment file
    already exists, runs numbered up to its last recorded run are skipped, so
    an interrupted search can be resumed by calling the function again.

    Args:
        train_x_s: Training features passed to ``fit``.
        test_x_s: Test features passed to ``predict``; ``shape[1]`` is used
            as the feature count for the adjusted R2.
        train_y: Training labels passed to ``fit``.
        test_y: Test labels compared against the predictions.
        name: Optional tag inserted into the CSV file names.  The default
            ``0`` selects the untagged file names.

    Returns:
        None.  Results are printed and written to the CSV files only.
    """
    import itertools

    if name != 0:
        path2 = "./results/GBDT_" + name + "_" + "assess.csv"
        path3 = "./results/GBDT_" + name + "_" + "parameter.csv"
    else:
        path2 = "./results/GBDT_assess.csv"
        path3 = "./results/GBDT_parameter.csv"

    random_state = 17
    # NOTE(review): "deviance" and "auto" look like legacy scikit-learn
    # spellings -- confirm they are accepted by the installed version.
    n_estimators = [100,200,300,400,500,600] # default=100
    learning_rate = [0.1,1e-2,0.2,1e-3,0.3,0.5]
    loss = ["deviance","exponential"] # default='deviance'
    subsample = [1,0.6,0.2] # default=1.0
    min_samples_split = [2] # default=2
    max_depth = [3,4,5,6] # default=3
    min_samples_leaf = [1]
    max_features = ["auto","sqrt","log2"]

    # The argument order fixes the run numbering: the first list varies
    # slowest and the last one fastest.
    grid = list(itertools.product(
        n_estimators, learning_rate, loss, subsample,
        min_samples_split, max_depth, min_samples_leaf, max_features,
    ))
    all_nb = len(grid)

    # Resume point: number of the last run already stored (0 = none yet).
    nums = 0
    if os.path.exists(path2):
        data = pd.read_csv(path2,header=None)
        nums = int(data.values[-1,0])

    # Make sure the output directory exists before any model is trained, so
    # a missing directory cannot throw away a finished fit.
    os.makedirs(os.path.dirname(path2), exist_ok=True)

    for num, (n, l, lo, sub, mi, ma, ms, mf) in enumerate(grid, start=1):
        if num <= nums:
            continue  # already evaluated in an earlier call
        print("start....{}/{}".format(num,all_nb))
        gbrg = GradientBoostingClassifier(
            max_features=mf,
            random_state=random_state,
            n_estimators=n,
            learning_rate=l,
            loss=lo,
            subsample=sub,
            max_depth=ma,
            min_samples_split=mi,
            min_samples_leaf=ms,
        )
        gbrg.fit(train_x_s,train_y)
        pred_test = gbrg.predict(test_x_s).reshape(-1,1)
        sample_n = pred_test.shape[0]
        feature_n = test_x_s.shape[1]
        mse,rmse,mae,r2,mad,mape,r2_adjusted = calculate(test_y,pred_test,sample_n,feature_n)
        all_m = [num,mse,rmse,mae,r2,mad,mape,r2_adjusted]
        all_p = [num,n,l,lo,sub,mi,ma,ms,mf]
        # Append immediately so finished runs survive an interruption.
        with open(path2,"a",encoding="utf-8",newline="") as handle:
            csv.writer(handle).writerow(all_m)
        with open(path3,"a",encoding="utf-8",newline="") as handle:
            csv.writer(handle).writerow(all_p)
        print("end....",num)
        print("--------------------------------")


#### Bagging

#### RF

##### Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
import csv
import os

def adjust_rf(train_x_s,test_x_s,train_y,test_y,name):
    """Grid-search RandomForestRegressor settings and log each run to CSV.

    Every hyper-parameter combination is fitted on the training data, scored
    on the test data with ``calculate`` and appended as one row to an
    assessment CSV (metrics) and a parameter CSV (settings).  Rows in both
    files start with the same 1-based run number.  If the assessment file
    already exists, runs numbered up to its last recorded run are skipped, so
    an interrupted search can be resumed by calling the function again.

    NOTE: the notebook later defines a classifier search under the same name
    ``adjust_rf``; whichever definition was executed last is the one in
    effect.

    Args:
        train_x_s: Training features passed to ``fit``.
        test_x_s: Test features passed to ``predict``; ``shape[1]`` is used
            as the feature count for the adjusted R2.
        train_y: Training targets; flattened with ``ravel()`` before fitting.
        test_y: Test targets compared against the predictions.
        name: Tag inserted into the CSV file names.

    Returns:
        None.  Results are printed and written to the CSV files only.
    """
    import itertools

    path2 = "./results/RF_" + name + "_" + "assess.csv"
    path3 = "./results/RF_" + name + "_" + "parameter.csv"

    # NOTE(review): "mae"/"mse"/"auto" look like legacy scikit-learn
    # spellings -- confirm they are accepted by the installed version.
    n_estimators = [100,200,300,400,500,600,700,800]
    criterion = ["mae","mse"]
    # The string "None" means "do not pass this argument".
    max_features = ["None","log2","sqrt","auto"]
    max_leaf_nodes = ["None"]
    min_samples_split = [2,3]
    min_samples_leaf = [1,2,3]
    # Real booleans: the strings "True"/"False" are both truthy, so the
    # "False" half of the grid used to enable the OOB estimate as well.
    # The parameter CSV still shows True/False exactly as before.
    oob_score = [True,False]
    random_state = 17
    n_jobs = -1

    # The argument order fixes the run numbering: the first list varies
    # slowest and the last one fastest.
    grid = list(itertools.product(
        n_estimators, oob_score, max_features, min_samples_split,
        min_samples_leaf, max_leaf_nodes, criterion,
    ))
    all_nb = len(grid)

    # Resume point: number of the last run already stored (0 = none yet).
    nums = 0
    if os.path.exists(path2):
        data = pd.read_csv(path2,header=None)
        nums = int(data.values[-1,0])

    # Make sure the output directory exists before any model is trained.
    os.makedirs(os.path.dirname(path2), exist_ok=True)

    for num, (n, o, mf, mi, ms, ml, c) in enumerate(grid, start=1):
        if num <= nums:
            continue  # already evaluated in an earlier call
        print("start....{}/{}".format(num,all_nb))
        options = dict(
            n_jobs=n_jobs,
            random_state=random_state,
            n_estimators=n,
            oob_score=o,
            min_samples_leaf=ms,
            min_samples_split=mi,
            criterion=c,
        )
        # Optional arguments are only passed when they carry a real value,
        # otherwise the estimator's own default applies.
        if mf != "None":
            options["max_features"] = mf
        if ml != "None":
            options["max_leaf_nodes"] = ml
        rf = RandomForestRegressor(**options)
        rf.fit(train_x_s,train_y.ravel())
        pred_test = rf.predict(test_x_s).reshape(-1,1)
        sample_n = pred_test.shape[0]
        feature_n = test_x_s.shape[1]
        mse,rmse,mae,r2,mad,mape,r2_adjusted = calculate(test_y,pred_test,sample_n,feature_n)
        all_m = [num,mse,rmse,mae,r2,mad,mape,r2_adjusted]
        all_p = [num,n,o,mf,mi,ms,ml,c]
        print(all_m)
        # Append immediately so finished runs survive an interruption.
        with open(path2,"a",encoding="utf-8",newline="") as handle:
            csv.writer(handle).writerow(all_m)
        with open(path3,"a",encoding="utf-8",newline="") as handle:
            csv.writer(handle).writerow(all_p)
        print("end....",num)
        print("--------------------------------")


In [None]:
# Hyper-parameters of the final random-forest regressor.
n = 800
# oob_score expects a boolean; the former string "True" only worked because
# any non-empty string is truthy, and newer scikit-learn rejects it outright.
o = True
mi = 2
ms = 1
c = "mae"

rf = RandomForestRegressor(n_jobs=-1,random_state=17,n_estimators=n,oob_score=o,min_samples_leaf=ms,min_samples_split=mi,criterion=c)
rf.fit(train_X_n,train_y_n)

# Predict with the test features and keep the result as a single column.
pred_test = rf.predict(test_X_n)
pred_test = pred_test.reshape(-1,1)
sample_n = pred_test.shape[0]
feature_n = test_X_n.shape[1]

# Score the predictions and draw the fit diagram.
mse,rmse,mae,r2,mad,mape,r2_adjusted = calculate(test_y_n,pred_test,sample_n,feature_n)
plt_line("RF",pred_test,test_y_n,"plt")

# Record metrics, flat predictions and one model label per test sample.
all_assess.append(["RF",mse,rmse,mae,r2,mad,mape,r2_adjusted])
all_pre = all_pre + pred_test.reshape(1,-1)[0].tolist()
all_mo = all_mo + ["RF" for n in range(sample_n)]
pred["RF"] = pd.DataFrame(pred_test.reshape(-1,1))

# The sequence column is the training targets followed by the test
# predictions (the training part is the observed values, not an in-sample fit).
Seq["RF"] = train_y_n.reshape(1,-1)[0].tolist() + pred_test.reshape(1,-1)[0].tolist()

##### Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
import csv
import os
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier

def adjust_rf(train_x_s,test_x_s,train_y,test_y,name=0):
    """Grid-search RandomForestClassifier settings and log each run to CSV.

    Every hyper-parameter combination is fitted on the training data, scored
    on the test data with ``calculate`` and appended as one row to an
    assessment CSV (metrics) and a parameter CSV (settings).  Rows in both
    files start with the same 1-based run number.  If the assessment file
    already exists, runs numbered up to its last recorded run are skipped, so
    an interrupted search can be resumed by calling the function again.

    NOTE: an earlier notebook cell defines a regressor search under the same
    name ``adjust_rf``; whichever definition was executed last is the one in
    effect.

    Args:
        train_x_s: Training features passed to ``fit``.
        test_x_s: Test features passed to ``predict``; ``shape[1]`` is used
            as the feature count for the adjusted R2.
        train_y: Training labels; flattened with ``ravel()`` before fitting.
        test_y: Test labels compared against the predictions.
        name: Optional tag inserted into the CSV file names.  The default
            ``0`` selects the untagged file names.

    Returns:
        None.  Results are printed and written to the CSV files only.
    """
    import itertools

    if name != 0:
        path2 = "./results/RF_" + name + "_" + "assess.csv"
        path3 = "./results/RF_" + name + "_" + "parameter.csv"
    else:
        path2 = "./results/RF_assess.csv"
        path3 = "./results/RF_parameter.csv"

    # NOTE(review): "auto" looks like a legacy scikit-learn spelling --
    # confirm it is accepted by the installed version.
    n_estimators = [100,200,300,400,500,600,700,800] # default=100
    criterion = ["gini","entropy"] # default="gini"
    # The string "None" is a placeholder for the Python value None.
    max_features = ["None","log2","sqrt","auto"] # default="auto"
    max_leaf_nodes = ["None"] # default=None
    min_samples_split = [2,3] # default=2
    min_samples_leaf = [1,2,3] # default=1
    # Real booleans: the strings "True"/"False" are both truthy, so the
    # "False" half of the grid used to enable the OOB estimate as well.
    # The parameter CSV still shows True/False exactly as before.
    oob_score = [True,False] # default=False
    random_state = 17 # default=None
    n_jobs = -1

    # The argument order fixes the run numbering: the first list varies
    # slowest and the last one fastest.
    grid = list(itertools.product(
        n_estimators, oob_score, max_features, min_samples_split,
        min_samples_leaf, max_leaf_nodes, criterion,
    ))
    all_nb = len(grid)

    # Resume point: number of the last run already stored (0 = none yet).
    nums = 0
    if os.path.exists(path2):
        data = pd.read_csv(path2,header=None)
        nums = int(data.values[-1,0])

    # Make sure the output directory exists before any model is trained.
    os.makedirs(os.path.dirname(path2), exist_ok=True)

    for num, (n, o, mf, mi, ms, ml, c) in enumerate(grid, start=1):
        if num <= nums:
            continue  # already evaluated in an earlier call
        print("start....{}/{}".format(num,all_nb))
        rf = RandomForestClassifier(
            n_jobs=n_jobs,
            random_state=random_state,
            n_estimators=n,
            oob_score=o,
            # Pass None explicitly.  Leaving the argument out (the previous
            # behaviour) silently fell back to the classifier's default, so
            # the row recorded as "None" did not describe the fitted model.
            max_features=None if mf == "None" else mf,
            max_leaf_nodes=None if ml == "None" else ml,
            min_samples_leaf=ms,
            min_samples_split=mi,
            criterion=c,
        )
        rf.fit(train_x_s,train_y.ravel())
        pred_test = rf.predict(test_x_s).reshape(-1,1)
        sample_n = pred_test.shape[0]
        feature_n = test_x_s.shape[1]
        mse,rmse,mae,r2,mad,mape,r2_adjusted = calculate(test_y,pred_test,sample_n,feature_n)
        all_m = [num,mse,rmse,mae,r2,mad,mape,r2_adjusted]
        all_p = [num,n,o,mf,mi,ms,ml,c]
        # Append immediately so finished runs survive an interruption.
        with open(path2,"a",encoding="utf-8",newline="") as handle:
            csv.writer(handle).writerow(all_m)
        with open(path3,"a",encoding="utf-8",newline="") as handle:
            csv.writer(handle).writerow(all_p)
        print("end....",num)
        print("--------------------------------")


#### EXT

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
import csv
import os

def adjust_extreme(train_x_s,test_x_s,train_y,test_y,name):
    """Grid-search ExtraTreesRegressor settings and log each run to CSV.

    Every hyper-parameter combination is fitted on the training data, scored
    on the test data with ``calculate`` and appended as one row to an
    assessment CSV (metrics) and a parameter CSV (settings).  Rows in both
    files start with the same 1-based run number.  If the assessment file
    already exists, runs numbered up to its last recorded run are skipped, so
    an interrupted search can be resumed by calling the function again.

    Args:
        train_x_s: Training features passed to ``fit``.
        test_x_s: Test features passed to ``predict``; ``shape[1]`` is used
            as the feature count for the adjusted R2.
        train_y: Training targets; flattened with ``ravel()`` before fitting.
        test_y: Test targets compared against the predictions.
        name: Tag inserted into the CSV file names.

    Returns:
        None.  Results are printed and written to the CSV files only.
    """
    import itertools

    path2 = "./results/EXT_" + name + "_" + "assess.csv"
    path3 = "./results/EXT_" + name + "_" + "parameter.csv"

    # NOTE(review): "mae"/"mse"/"auto" look like legacy scikit-learn
    # spellings -- confirm they are accepted by the installed version.
    n_estimators = [10,50,100,200,300,400,500,600,700,800]
    criterion = ["mae","mse"]
    # The string "None" means "do not pass this argument".
    max_features = ["None","log2","sqrt","auto"]
    max_leaf_nodes = ["None"]
    min_samples_split = [2,3,4,5,6,7,8]
    min_samples_leaf = [1]
    random_state = 17
    n_jobs = -1

    # The argument order fixes the run numbering: the first list varies
    # slowest and the last one fastest.
    grid = list(itertools.product(
        n_estimators, max_features, min_samples_split,
        min_samples_leaf, max_leaf_nodes, criterion,
    ))
    all_nb = len(grid)

    # Resume point: number of the last run already stored (0 = none yet).
    nums = 0
    if os.path.exists(path2):
        data = pd.read_csv(path2,header=None)
        nums = int(data.values[-1,0])

    # Make sure the output directory exists before any model is trained.
    os.makedirs(os.path.dirname(path2), exist_ok=True)

    for num, (n, mf, mi, ms, ml, c) in enumerate(grid, start=1):
        if num <= nums:
            continue  # already evaluated in an earlier call
        print("start....{}/{}".format(num,all_nb))
        options = dict(
            n_jobs=n_jobs,
            random_state=random_state,
            n_estimators=n,
            min_samples_leaf=ms,
            min_samples_split=mi,
            criterion=c,
        )
        # Optional arguments are only passed when they carry a real value,
        # otherwise the estimator's own default applies.
        if mf != "None":
            options["max_features"] = mf
        if ml != "None":
            options["max_leaf_nodes"] = ml
        ext = ExtraTreesRegressor(**options)
        ext.fit(train_x_s,train_y.ravel())
        pred_test = ext.predict(test_x_s).reshape(-1,1)
        sample_n = pred_test.shape[0]
        feature_n = test_x_s.shape[1]
        mse,rmse,mae,r2,mad,mape,r2_adjusted = calculate(test_y,pred_test,sample_n,feature_n)
        all_m = [num,mse,rmse,mae,r2,mad,mape,r2_adjusted]
        all_p = [num,n,mf,mi,ms,ml,c]
        print(all_m)
        # Append immediately so finished runs survive an interruption.
        with open(path2,"a",encoding="utf-8",newline="") as handle:
            csv.writer(handle).writerow(all_m)
        with open(path3,"a",encoding="utf-8",newline="") as handle:
            csv.writer(handle).writerow(all_p)
        print("end....",num)
        print("--------------------------------")

In [None]:
# Hyper-parameters of the final extra-trees regressor.
n, mi, ms = 500, 2, 1
c = "mse"

ext = ExtraTreesRegressor(
    n_estimators=n,
    criterion=c,
    min_samples_split=mi,
    min_samples_leaf=ms,
    random_state=17,
    n_jobs=-1,
)
ext.fit(train_X_n, train_y_n)

# Predict with the test features and keep the result as a single column.
pred_test = ext.predict(test_X_n).reshape(-1, 1)
sample_n, feature_n = pred_test.shape[0], test_X_n.shape[1]

# Score the predictions and draw the fit diagram.
mse, rmse, mae, r2, mad, mape, r2_adjusted = calculate(test_y_n, pred_test, sample_n, feature_n)
plt_line("ET", pred_test, test_y_n, "plt")

# Record metrics, flat predictions and one model label per test sample.
all_assess.append(["ET", mse, rmse, mae, r2, mad, mape, r2_adjusted])
all_pre = all_pre + pred_test.ravel().tolist()
all_mo = all_mo + ["ET"] * sample_n
pred["ET"] = pd.DataFrame(pred_test)

# The sequence column is the training targets followed by the test
# predictions (the training part is the observed values, not an in-sample fit).
Seq["ET"] = train_y_n.ravel().tolist() + pred_test.ravel().tolist()

#### XGB

In [None]:
import xgboost as xgb
import csv
import os

def adjust_xgb(train_x_s,test_x_s,train_y,test_y,name):
    """Grid-search XGBRegressor settings and log each run to CSV.

    Every hyper-parameter combination is fitted on the training data, scored
    on the test data with ``calculate`` and appended as one row to an
    assessment CSV (metrics) and a parameter CSV (settings).  Rows in both
    files start with the same 1-based run number.  If the assessment file
    already exists, runs numbered up to its last recorded run are skipped, so
    an interrupted search can be resumed by calling the function again.

    A combination that raises is reported and skipped.  Its run number is
    still consumed, so a run number always identifies the same position in
    the grid, also across resumed calls.

    Args:
        train_x_s: Training features passed to ``fit``.
        test_x_s: Test features passed to ``predict``; ``shape[1]`` is used
            as the feature count for the adjusted R2.
        train_y: Training targets passed to ``fit``.
        test_y: Test targets compared against the predictions.
        name: Tag inserted into the CSV file names.

    Returns:
        None.  Results are printed and written to the CSV files only.
    """
    import itertools

    path2 = "./results/XGB_" + name + "_" + "assess.csv"
    path3 = "./results/XGB_" + name + "_" + "parameter.csv"

    max_depth = [5,6,7,8,9]
    n_estimators = [10,50,75,100,150,200,250,300,400,500,600,700]
    learning_rate = [0.01,0.1,0.2,0.3,0.4,0.5]
    subsample = [0.5,0.7,0.9,1]
    gamma = [0.01,1,5]
    reg_lambda = [0.01,1]
    reg_alpha = [0.01,1]
    colsample_bytree = [0.8,0.9,1]

    # The argument order fixes the run numbering: the first list varies
    # slowest and the last one (the number of trees) fastest.
    grid = list(itertools.product(
        max_depth, subsample, learning_rate, gamma,
        reg_lambda, reg_alpha, colsample_bytree, n_estimators,
    ))
    all_nb = len(grid)

    # Resume point: number of the last run already stored (0 = none yet).
    nums = 0
    if os.path.exists(path2):
        data = pd.read_csv(path2,header=None)
        nums = int(data.values[-1,0])

    # Make sure the output directory exists before any model is trained.
    os.makedirs(os.path.dirname(path2), exist_ok=True)

    for num, (ma, s, l, g, rl, ra, c, n) in enumerate(grid, start=1):
        if num <= nums:
            continue  # already evaluated in an earlier call
        try:
            print("train...{}/{}".format(num,all_nb))
            # The keywords must be spelled n_estimators / subsample.  The
            # former n_estimator= and subample= were not model parameters,
            # so the number of trees never actually varied across the grid.
            xg = xgb.XGBRegressor(
                n_estimators=n,
                colsample_bytree=c,
                reg_lambda=rl,
                reg_alpha=ra,
                gamma=g,
                max_depth=ma,
                learning_rate=l,
                subsample=s,
            )
            xg.fit(train_x_s,train_y)
            pred_test = xg.predict(test_x_s).reshape(-1,1)
            sample_n = pred_test.shape[0]
            feature_n = test_x_s.shape[1]
            mse,rmse,mae,r2,mad,mape,r2_adjusted = calculate(test_y,pred_test,sample_n,feature_n)
            all_m = [num,mse,rmse,mae,r2,mad,mape,r2_adjusted]
            all_p = [num,ma,s,l,g,rl,ra,c,n]
            # Append immediately so finished runs survive an interruption.
            with open(path2,"a",encoding="utf-8",newline="") as handle:
                csv.writer(handle).writerow(all_m)
            with open(path3,"a",encoding="utf-8",newline="") as handle:
                csv.writer(handle).writerow(all_p)
            print("end....",num)
            print("--------------------------------")
        except Exception as exc:
            # Best effort: report the failing combination and carry on.
            # Catching Exception (not a bare except) keeps Ctrl-C working.
            print("error", exc)


In [None]:
# Hyper-parameters of the final XGBoost regressor.
ma = 5
s = 0.7
l = 0.2
g = 5
rl = 1
ra = 0.01
c = 0.8
n = 10

# The keywords must be spelled n_estimators / subsample; the former
# n_estimator= and subample= were not model parameters.
# NOTE(review): with the misspelling the model was built with the library's
# default number of trees, so n = 10 takes effect here for the first time --
# re-validate this value.
xg = xgb.XGBRegressor(n_estimators=n,colsample_bytree=c,reg_lambda=rl,reg_alpha=ra,gamma=g,max_depth=ma,learning_rate=l,subsample=s)
xg.fit(train_X_n,train_y_n)

# Predict with the test features and keep the result as a single column.
pred_test = xg.predict(test_X_n)
pred_test = pred_test.reshape(-1,1)
sample_n = pred_test.shape[0]
feature_n = test_X_n.shape[1]

# Score the predictions and draw the fit diagram.
mse,rmse,mae,r2,mad,mape,r2_adjusted = calculate(test_y_n,pred_test,sample_n,feature_n)
plt_line("XGB",pred_test,test_y_n,"plt")

# Record metrics, flat predictions and one model label per test sample.
all_assess.append(["XGB",mse,rmse,mae,r2,mad,mape,r2_adjusted])
all_pre = all_pre + pred_test.reshape(1,-1)[0].tolist()
all_mo = all_mo + ["XGB" for n in range(sample_n)]
pred["XGB"] = pd.DataFrame(pred_test.reshape(-1,1))

# The sequence column is the training targets followed by the test
# predictions (the training part is the observed values, not an in-sample fit).
Seq["XGB"] = train_y_n.reshape(1,-1)[0].tolist() + pred_test.reshape(1,-1)[0].tolist()

#### LGB

In [None]:
import lightgbm as lgb
import csv
import os

def adjust_lgb(train_x_s,test_x_s,train_y,test_y,name):
    """Grid-search LightGBM hyper-parameters and log every run to CSV.

    For each combination of the grid below, an ``LGBMRegressor`` is fitted on
    ``(train_x_s, train_y)``, evaluated on ``(test_x_s, test_y)`` with
    ``calculate`` and one row is appended to each of two files:

    * ``./results/LGB<name>_assess.csv``    -> ``[num, mse, rmse, mae, r2, mad, mape, r2_adjusted]``
    * ``./results/LGB<name>_parameter.csv`` -> ``[num, d, l, n, l1, l2, f]``

    ``num`` is the 1-based position of the combination in the grid. If the
    assess file already exists, every combination whose position is not
    greater than the last logged ``num`` is skipped, so an interrupted search
    can be resumed.

    A combination that raises is reported and skipped; it still consumes its
    ``num`` so that logged ids always match grid positions (previously a
    failure left ``num`` unchanged, which misaligned the resume logic).
    """
    from itertools import product

    path2 = "./results/LGB" + name + "_" + "assess.csv"
    path3 = "./results/LGB" + name + "_" + "parameter.csv"
    depth = [5,6,7,8,9,10]
    learning_rate = [0.01,0.03,0.05,0.07,0.09,0.1,0.15,0.2]
    n_estimators = [100,200,300,400,500,600,700,800,900,1000,1200,1500]
    feature_fraction = [1,0.9,0.8,0.7]
    lambda_l1 = [0,0.01,0.5,1]
    lambda_l2 = [0,0.01,0.5,1]
    all_nb = len(depth)*len(learning_rate)*len(n_estimators)*len(feature_fraction)*len(lambda_l1)*len(lambda_l2)

    # Id of the last run already logged (0 when starting from scratch).
    nums = 0
    if os.path.exists(path2):
        logged = pd.read_csv(path2,header=None)
        nums = int(logged.values[-1,0])

    # Same nesting order as the original loops: d, l, n, l2, l1, f.
    grid = product(depth, learning_rate, n_estimators, lambda_l2, lambda_l1, feature_fraction)
    for num, (d, l, n, l2, l1, f) in enumerate(grid, start=1):
        if num <= nums:
            continue
        try:
            print("train...{}/{}".format(num,all_nb))
            lgbr = lgb.LGBMRegressor(objective='regression',max_depth=d,learning_rate=l,n_estimators=n,lambda_l1=l1,lambda_l2=l2,feature_fraction=f)
            lgbr.fit(train_x_s,train_y)
            pred_test = lgbr.predict(test_x_s).reshape(-1,1)
            sample_n = pred_test.shape[0]
            feature_n = test_x_s.shape[1]
            mse,rmse,mae,r2,mad,mape,r2_adjusted = calculate(test_y,pred_test,sample_n,feature_n)
            all_m = [num,mse,rmse,mae,r2,mad,mape,r2_adjusted]
            all_p = [num,d,l,n,l1,l2,f]
            with open(path2,"a",encoding="utf-8",newline="") as fh:
                csv.writer(fh).writerow(all_m)
            with open(path3,"a",encoding="utf-8",newline="") as fh:
                csv.writer(fh).writerow(all_p)
            print("end....",num)
            print("--------------------------------")
        except Exception as exc:
            # Keep the search going, but say what went wrong instead of hiding it.
            print("error", exc)


In [None]:
# Hyper-parameters of the final LightGBM model.
d = 10      # max_depth
l = 0.1     # learning_rate
n = 1500    # n_estimators
l1 = 0      # lambda_l1
l2 = 0.5    # lambda_l2
f = 0.7     # feature_fraction

lgbr = lgb.LGBMRegressor(objective='regression',max_depth=d,learning_rate=l,n_estimators=n,lambda_l1=l1,lambda_l2=l2,feature_fraction=f)
lgbr.fit(train_X_lg,train_y_lg)
pred_test = lgbr.predict(test_X_lg)
# Predictions are raised to the power of 10 before evaluation
# (presumably the *_lg targets are log10-transformed - confirm against the data preparation).
pred_test = 10**pred_test.reshape(-1,1)
sample_n = pred_test.shape[0]
feature_n = test_X_lg.shape[1]
mse,rmse,mae,r2,mad,mape,r2_adjusted = calculate(test_y_lg,pred_test,sample_n,feature_n)
plt_line("LGB",pred_test,test_y_lg,"plt")
# Append this model's metrics, predictions and labels to the shared result containers.
all_assess.append(["LGB",mse,rmse,mae,r2,mad,mape,r2_adjusted])
all_pre = all_pre + pred_test.reshape(1,-1)[0].tolist()
all_mo = all_mo + ["LGB"] * sample_n
pred["LGB"] = pd.DataFrame(pred_test.reshape(-1,1))
# Seq["LGB"] = (10**lgbr.predict(train_X_lg).reshape(1,-1)[0]).tolist() + pred_test.reshape(1,-1)[0].tolist()
# Both operands must be lists: without .tolist() on the predictions the `+`
# becomes a NumPy element-wise addition instead of a concatenation.
Seq["LGB"] = train_y_n.reshape(1,-1)[0].tolist() + pred_test.reshape(1,-1)[0].tolist()

#### Cat

In [None]:
import catboost as cb
import csv
import os

def adjust_cat(train_x_s,test_x_s,train_y,test_y,name):
    """Grid-search CatBoost hyper-parameters and log every run to CSV.

    For each combination of the grid below, a ``CatBoostRegressor`` is fitted
    on ``(train_x_s, train_y)``, evaluated on ``(test_x_s, test_y)`` with
    ``calculate`` and one row is appended to each of two files:

    * ``./results/CAT_<name>_assess.csv``    -> ``[num, mse, rmse, mae, r2, mad, mape, r2_adjusted]``
    * ``./results/CAT_<name>_parameter.csv`` -> ``[num, d, l, i, l2]``

    ``num`` is the 1-based position of the combination in the grid. If the
    assess file already exists, every combination whose position is not
    greater than the last logged ``num`` is skipped, so an interrupted search
    can be resumed.

    A combination that raises is reported and skipped; it still consumes its
    ``num`` so that logged ids always match grid positions (previously a
    failure left ``num`` unchanged, which misaligned the resume logic).
    """
    from itertools import product

    path2 = "./results/CAT_" + name + "_" + "assess.csv"
    path3 = "./results/CAT_" + name + "_" + "parameter.csv"
    depth=[5,6,7,8,9,10]
    learning_rate=[0.001,0.01,0.03,0.05,0.07,0.09,0.1,0.2,0.3]
    iterations = [1500,1400,1300,1200,1100,1000,900,800]
    l2_leaf_reg = [0,1,2,3,4,5]
    all_nb = len(depth)*len(learning_rate)*len(iterations)*len(l2_leaf_reg)

    # Id of the last run already logged (0 when starting from scratch).
    nums = 0
    if os.path.exists(path2):
        logged = pd.read_csv(path2,header=None)
        nums = int(logged.values[-1,0])

    # Same nesting order as the original loops: d, l, i, l2.
    grid = product(depth, learning_rate, iterations, l2_leaf_reg)
    for num, (d, l, i, l2) in enumerate(grid, start=1):
        if num <= nums:
            continue
        try:
            print("train...{}/{}".format(num,all_nb))
            cbr = cb.CatBoostRegressor(depth=d,learning_rate=l,iterations=i,l2_leaf_reg=l2,logging_level='Silent')
            cbr.fit(train_x_s,train_y)
            pred_test = cbr.predict(test_x_s).reshape(-1,1)
            sample_n = pred_test.shape[0]
            feature_n = test_x_s.shape[1]
            mse,rmse,mae,r2,mad,mape,r2_adjusted = calculate(test_y,pred_test,sample_n,feature_n)
            all_m = [num,mse,rmse,mae,r2,mad,mape,r2_adjusted]
            all_p = [num,d,l,i,l2]
            print(all_m)
            with open(path2,"a",encoding="utf-8",newline="") as fh:
                csv.writer(fh).writerow(all_m)
            with open(path3,"a",encoding="utf-8",newline="") as fh:
                csv.writer(fh).writerow(all_p)
            print("end....",num)
            print("--------------------------------")
        except Exception as exc:
            # Keep the search going, but say what went wrong instead of hiding it.
            print("error", exc)


In [None]:
# Hyper-parameters of the final CatBoost model.
d, l, i, l2 = 5, 0.09, 1500, 5

cbr = cb.CatBoostRegressor(
    depth=d,
    learning_rate=l,
    iterations=i,
    l2_leaf_reg=l2,
    logging_level='Silent',
)
cbr.fit(train_X_n, train_y_n)

# Column vector of test predictions.
pred_test = cbr.predict(test_X_n).reshape(-1, 1)
sample_n = pred_test.shape[0]
feature_n = test_X_n.shape[1]

# Evaluate, plot, then record the results in the shared containers.
mse, rmse, mae, r2, mad, mape, r2_adjusted = calculate(test_y_n, pred_test, sample_n, feature_n)
plt_line("CAT", pred_test, test_y_n, "plt")
all_assess.append(["CAT", mse, rmse, mae, r2, mad, mape, r2_adjusted])
all_pre = all_pre + pred_test.reshape(1, -1)[0].tolist()
all_mo = all_mo + ["CAT"] * sample_n
pred["CAT"] = pd.DataFrame(pred_test)
# Seq["CAT"] = cbr.predict(train_X_n).reshape(1,-1)[0].tolist() + pred_test.reshape(1,-1)[0].tolist()
# Sequence = true training targets followed by the test predictions.
Seq["CAT"] = train_y_n.reshape(1, -1)[0].tolist() + pred_test.reshape(1, -1)[0].tolist()

### Deep learning

#### GPU

In [None]:
!nvidia-smi # show GPU information

In [None]:
torch.cuda.is_available() # check whether CUDA is available

In [None]:
torch.cuda.device_count() # number of GPUs

In [None]:
torch.cuda.current_device() # index of the current GPU

In [None]:
torch.cuda.get_device_name(0) # name of the GPU with the given index

In [None]:
x = x.cuda(0) # move x from the CPU to the GPU with index 0

In [None]:
x.device # device (CPU/GPU) on which x lives

In [None]:
# Example: pick the GPU when available, otherwise the CPU

device = torch.device('cuda' if torch.cuda.is_available() else
'cpu')

x = torch.tensor([1, 2, 3], device=device)
x = torch.tensor([1, 2, 3]).to(device)

In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1' # expose GPUs 0 and 1

In [None]:
x = x.cpu() # move x from the GPU back to the CPU

In [None]:
net.cuda() # move the model to the GPU; the input data must be moved to the GPU as well

In [None]:
net = torch.nn.DataParallel(net) # multi-GPU (data-parallel) wrapper for the model

In [None]:
print(torch.version.cuda) # CUDA version

In [None]:
torch.cuda.empty_cache() # clear the cached GPU memory

In [None]:
tensor = torch.randn(3,4,5)
print(tensor.type())  # data type
print(tensor.size())  # shape of the tensor (a tuple)
print(tensor.dim())   # number of dimensions

In [None]:
# Set the default tensor type (author's note: FloatTensor is much faster than DoubleTensor in PyTorch)
torch.set_default_tensor_type(torch.FloatTensor)

# Type conversions
tensor = tensor.cuda()
tensor = tensor.cpu()
tensor = tensor.float()
tensor = tensor.long()

In [None]:
# Converting between torch.Tensor and np.ndarray.
# The array must be bound to `ndarray`, the name the following lines read
# (it was previously assigned to the misspelled `darray`).
ndarray = tensor.cpu().numpy()
tensor = torch.from_numpy(ndarray).float()
tensor = torch.from_numpy(ndarray.copy()).float() # If ndarray has negative stride.

In [None]:
# Extract the Python value from a tensor that holds a single element
value = torch.rand(1).item()

In [None]:
tensor = tensor[torch.randperm(tensor.size(0))]  # shuffle along the first dimension

In [None]:
# Tensor concatenation.
# torch.cat joins along an existing dimension, torch.stack adds a new one:
# for three 10x5 tensors, torch.cat gives 30x5 and torch.stack gives 3x10x5.
'''
注意torch.cat和torch.stack的区别在于torch.cat沿着给定的维度拼接，
而torch.stack会新增一维。例如当参数是3个10x5的张量，torch.cat的结果是30x5的张量，
而torch.stack的结果是3x10x5的张量。
'''
tensor = torch.cat(list_of_tensors, dim=0)
tensor = torch.stack(list_of_tensors, dim=0)

In [None]:
# Total number of parameters in the model
num_parameters = sum(torch.numel(parameter) for parameter in model.parameters())

#### Change mode

In [None]:
model.eval() # evaluation (test) mode
model.train() # training mode

#### Dropout

In [None]:
nn.Dropout(p=0.5) # p is the probability of an element being zeroed; per the PyTorch docs the kept elements are scaled by 1/(1-p) during training

#### Init net

In [None]:
# Weight initialisation
def weight_init(m):
    """Initialise ``nn.Linear`` modules in place; other module types are left untouched.

    Weights get Xavier-normal initialisation and the bias, when the layer has
    one, is set to 0. Intended to be passed to ``Module.apply``.
    """
    if isinstance(m, nn.Linear):
        # Alternative: nn.init.normal_(m.weight, mean=0, std=1) for a plain normal initialisation.
        nn.init.xavier_normal_(m.weight)  # Xavier weight initialisation
        # Layers built with bias=False have m.bias set to None.
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)

# Custom initialisation: only the values change, no gradient is recorded
@torch.no_grad()
def normal_(tensor, mean=0, std=1):
    """Fill ``tensor`` in place with normally distributed values and return it."""
    return tensor.normal_(mean, std)
        
net.apply(weight_init) # usage: run weight_init on net and its submodules

In [None]:
# Note the difference between model.modules() and model.children(): model.modules() walks
# recursively through all sub-layers of the model, while model.children() only iterates
# over the direct children.
# Common practice for initialization.
for layer in model.modules():
    if isinstance(layer, torch.nn.Conv2d):
        torch.nn.init.kaiming_normal_(layer.weight, mode='fan_out',
                                      nonlinearity='relu')
        if layer.bias is not None:
            torch.nn.init.constant_(layer.bias, val=0.0)
    elif isinstance(layer, torch.nn.BatchNorm2d):
        torch.nn.init.constant_(layer.weight, val=1.0)
        torch.nn.init.constant_(layer.bias, val=0.0)
    elif isinstance(layer, torch.nn.Linear):
        torch.nn.init.xavier_normal_(layer.weight)
        if layer.bias is not None:
            torch.nn.init.constant_(layer.bias, val=0.0)

# Initialization with given tensor.
layer.weight = torch.nn.Parameter(tensor)

#### Loss

In [None]:
class MyLoss(torch.nn.Module):
    """Custom loss: mean of the squared element-wise difference between ``x`` and ``y``."""

    def __init__(self):
        super(MyLoss, self).__init__()

    def forward(self, x, y):
        """Return ``mean((x - y) ** 2)`` as a scalar tensor."""
        loss = torch.mean((x - y) ** 2)
        return loss

#### Optimizer

In [None]:
# Original note: do not apply weight decay to the bias terms.
# NOTE(review): this call applies weight_decay to whatever `parameters` contains;
# excluding biases presumably relies on how `parameters` is built elsewhere - confirm.
# Weight decay is equivalent to L2 regularisation.

optimizer = torch.optim.SGD(parameters, lr=1e-2, momentum=0.9, weight_decay=1e-4)

In [None]:
# Gradient clipping

torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=20)

#### Save model

In [None]:
# Save the latest checkpoint and keep a copy of the best one seen so far.
is_best = current_acc > best_acc
best_acc = max(current_acc, best_acc)
checkpoint = {
    'best_acc': best_acc,
    'epoch': epoch + 1,
    'model': model.state_dict(),
    'optimizer': optimizer.state_dict(),
}
# torch.save does not create missing directories, so make sure the target exists.
os.makedirs('model', exist_ok=True)
model_path = os.path.join('model', 'checkpoint.tar')
best_model_path = os.path.join('model', 'best_checkpoint.tar')
torch.save(checkpoint, model_path)
if is_best:
    shutil.copy(model_path, best_model_path)

#### Model Structure

In [None]:
# import statements
import torch
import torch.nn as nn
from torch.utils import data
...

# set flags / seeds
torch.backends.cudnn.benchmark = True
np.random.seed(1)
torch.manual_seed(1)
torch.cuda.manual_seed(1)
...

# Start with main code
# NOTE: this cell is a skeleton; every `...` is a placeholder to be filled in.
if __name__ == "__main__":
    # argparse for additional flags for experiment
    parser = argparse.ArgumentParser(description="Train a network for ...")
    ...
    opt = parser.parse_args()

    # add code for datasets (we always use train and validation/ test set)
    data_transforms = transforms.Compose([
        transforms.Resize((opt.img_size, opt.img_size)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    train_dataset = datasets.ImageFolder(
        root=os.path.join(opt.path_to_data, "train"),
        transform=data_transforms)
    train_data_loader = data.DataLoader(train_dataset, ...)

    test_dataset = datasets.ImageFolder(
        root=os.path.join(opt.path_to_data, "test"),
        transform=data_transforms)
    test_data_loader = data.DataLoader(test_dataset, ...)
    ...

    # instantiate network (which has been imported from *networks.py*)
    net = MyNetwork(...)
    ...

    # create losses (criterion in pytorch)
    criterion_L1 = torch.nn.L1Loss()
    ...

    # if running on GPU and we want to use cuda move model there
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        net = net.cuda()
        ...

    # create optimizers
    optim = torch.optim.Adam(net.parameters(), lr=opt.lr)
    ...

    # load checkpoint if needed/ wanted
    start_n_iter = 0
    start_epoch = 0
    if opt.resume:
        ckpt = load_checkpoint(opt.path_to_checkpoint) # custom method for loading last checkpoint
        # checkpoint entries are looked up by string key
        net.load_state_dict(ckpt["net"])
        start_epoch = ckpt["epoch"]
        start_n_iter = ckpt["n_iter"]
        optim.load_state_dict(ckpt["optim"])
        print("last checkpoint restored")
        ...

    # if we want to run experiment on multiple GPUs we move the models there
    net = torch.nn.DataParallel(net)
    ...

    # typically we use tensorboardX to keep track of experiments
    writer = SummaryWriter(...)

    # now we start the main loop
    n_iter = start_n_iter
    for epoch in range(start_epoch, opt.epochs):
        # set models to train mode
        net.train()
        ...

        # use prefetch_generator and tqdm for iterating through data
        pbar = tqdm(enumerate(BackgroundGenerator(train_data_loader, ...)),
                    total=len(train_data_loader))
        start_time = time.time()

        # for loop going through dataset
        # (the loop variable is `batch` so the imported `data` module is not shadowed)
        for i, batch in pbar:
            # data preparation
            img, label = batch
            if use_cuda:
                img = img.cuda()
                label = label.cuda()
            ...

            # It's very good practice to keep track of preparation time and computation time using tqdm to find any issues in your dataloader
            # elapsed time since the end of the previous iteration (positive number of seconds)
            prepare_time = time.time()-start_time

            # forward and backward pass
            optim.zero_grad()
            ...
            loss.backward()
            optim.step()
            ...

            # update tensorboardX
            writer.add_scalar(..., n_iter)
            ...

            # compute computation time and *compute_efficiency*
            process_time = time.time()-start_time-prepare_time
            pbar.set_description("Compute efficiency: {:.2f}, epoch: {}/{}:".format(
                process_time/(process_time+prepare_time), epoch, opt.epochs))
            start_time = time.time()

        # maybe do a test pass every x epochs
        if epoch % x == x-1:
            # bring models to evaluation mode
            net.eval()
            ...
            #do some tests
            pbar = tqdm(enumerate(BackgroundGenerator(test_data_loader, ...)),
                    total=len(test_data_loader))
            for i, batch in pbar:
                ...

            # save checkpoint if needed
            ...

#### MLP

In [None]:
from itertools import combinations
from sklearn.neural_network import MLPRegressor
import csv
def adjust_mlp(train_x_s,test_x_s,train_y,test_y,name):
    """Grid-search ``MLPRegressor`` hyper-parameters and log every run to CSV.

    For each combination of the grid below, an MLP is fitted on
    ``(train_x_s, train_y)``, evaluated on ``(test_x_s, test_y)`` with
    ``calculate`` and one row is appended to each of two files:

    * ``./results/MLP_<name>_assess.csv``    -> ``[num, mse, rmse, mae, r2, mad, mape, r2_adjusted]``
    * ``./results/MLP_<name>_parameter.csv`` -> ``[num, m, t, l, hd]``

    ``num`` is the 1-based position of the combination in the grid. If the
    assess file already exists, every combination whose position is not
    greater than the last logged ``num`` is skipped, so an interrupted search
    can be resumed. Errors raised while fitting are not caught.
    """
    from itertools import product

    path2 = "./results/MLP_" + name + "_" + "assess.csv"
    path3 = "./results/MLP_" + name + "_" + "parameter.csv"
    max_iter = [5000,10000,15000,20000]
    tol = [1e-3,2e-3,1e-4,1e-2]
    learning_rate_init = [1e-2,1e-3,1e-4]
    # Every 3-layer architecture drawn (in decreasing width) from these sizes.
    hidden_layer_sizes = list(combinations([64,32,16,8,4], 3))
    all_nb = len(max_iter) * len(tol) * len(learning_rate_init) * len(hidden_layer_sizes)

    # Id of the last run already logged (0 when starting from scratch).
    nums = 0
    if os.path.exists(path2):
        logged = pd.read_csv(path2,header=None)
        nums = int(logged.values[-1,0])

    # Same nesting order as the original loops: m, t, l, hd.
    grid = product(max_iter, tol, learning_rate_init, hidden_layer_sizes)
    for num, (m, t, l, hd) in enumerate(grid, start=1):
        if num <= nums:
            continue
        print("start....{}/{}".format(num,all_nb))
        mlp = MLPRegressor(hidden_layer_sizes=hd, activation="relu",
                           solver='adam', alpha=0.0001,
                           batch_size='auto', learning_rate="constant",
                           learning_rate_init=l,
                           power_t=0.5, max_iter=m,tol=t)
        # Fit on the data passed in (previously the global `train_X` was used by mistake).
        mlp.fit(train_x_s,train_y)
        pred_test = mlp.predict(test_x_s).reshape(-1,1)
        sample_n = pred_test.shape[0]
        feature_n = test_x_s.shape[1]
        mse,rmse,mae,r2,mad,mape,r2_adjusted = calculate(test_y,pred_test,sample_n,feature_n)
        all_m = [num,mse,rmse,mae,r2,mad,mape,r2_adjusted]
        all_p = [num,m,t,l,hd]
        with open(path2,"a",encoding="utf-8",newline="") as fh:
            csv.writer(fh).writerow(all_m)
        with open(path3,"a",encoding="utf-8",newline="") as fh:
            csv.writer(fh).writerow(all_p)
        print("end....",num)
        print("--------------------------------")


#### LinearNet

In [None]:
class LinearNet(nn.Module):
    """Linear regression model: one fully connected layer from ``n_feature`` inputs to 1 output."""

    def __init__(self, n_feature):
        super().__init__()
        self.linear = nn.Linear(in_features=n_feature, out_features=1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.linear(x)
    
# Weight initialisation
def weight_init(m):
    """Initialise ``nn.Linear`` modules in place; other module types are left untouched.

    Weights get Xavier-normal initialisation and the bias, when the layer has
    one, is set to 0. Intended to be passed to ``Module.apply``.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)  # Xavier weight initialisation
        # Layers built with bias=False have m.bias set to None.
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)
        
# Learning-rate decay
# for param_group in optimizer.param_groups:
#     param_group['lr'] *= 0.1 

In [None]:
# Build the model with one input per feature column of train_x
net = LinearNet(train_x.shape[1])
print(net)
# Initialise the weights of every submodule
net.apply(weight_init)

# Plain SGD on a mean-squared-error objective
learning_rate = 0.01
optimizer = optim.SGD(net.parameters(), lr=learning_rate)
loss = nn.MSELoss()

In [None]:
num_epochs = 1000
# loss_1: loss at the previous report, loss_2: loss at the current report
loss_1 = loss_2 = 0
for epoch in range(1, num_epochs + 1):
    # One pass over all mini-batches
    for X, y in data_iter:
        output = net(X)
        l = loss(output, y.view(-1, 1))
        optimizer.zero_grad()
        l.backward()
        optimizer.step()
    # Report only every 50th epoch, using the loss of the last mini-batch
    if epoch % 50 != 0:
        continue
    loss_2 = l.item()
    print('epoch %d, loss: %f ,change : %f' % (epoch, loss_2, loss_2 - loss_1))
    loss_1 = loss_2

#### ANN

#### LSTM

In [None]:
class Lstm1(nn.Module):
    def __init__(self,input_size,hidden_size,output_size=1,num_layers=2,dropout=2):
        super(Lstm1,self).__init__()
        self.lstm1 = nn.LSTM(input_size=input_size,hidden_size=hidden_size,num_layers=num_layers,dropout=dropout,batch_first=True)
        self.reg = nn.Linear(hidden_size,output_size)
        self.relu = nn.ReLU(inplace=True)
        
    def forward(self,x):
        x,_ = self.lstm(x)
        b,s,h = x.shape
        x = x.view(b*s,h)
        x = self.reg(x)
        x = self.relu(x)
        return x

In [None]:
class Lstm2(nn.Module):
    def __init__(self,input_size,hidden_size,output_size=1,num_layers=2,dropout=0):
        super(Lstm2,self).__init__()
        self.lstm2 = nn.LSTM(input_size=input_size,hidden_size=hidden_size,num_layers=num_layers,dropout=dropout,batch_first=True,bidirectional=True)
        self.reg = nn.Linear(hidden_size*2,output_size)
        self.relu = nn.ReLU(inplace=True)
        
    def forward(self,x):
        x,_ = self.lstm(x)
        b,s,h = x.shape
        x = x.view(b*s,h)
        x = self.reg(x)
        x = self.relu(x)
        return x

#### CNN

#### GRU

In [None]:
class Gru1(nn.Module):
    def __init__(self,input_size,hidden_size,output_size=1,num_layers=2,dropout=0):
        super(Gru1,self).__init__()
        self.gru = nn.GRU(input_size=input_size,hidden_size=hidden_size,num_layers=num_layers,dropout=dropout,batch_first=True,bidirectional=True)
        self.reg = nn.Linear(hidden_size,output_size)
        self.relu = nn.ReLU(inplace=True)

    def forward(self,x):
        x,_ = self.gru(x)
        s,b,h = x.shape
        x = x.view(s*b,h)
        x = self.reg(x)
        x = self.relu(x)
        x = x.view(s,b,-1)
        return 

In [None]:
class Gru2(nn.Module):
    def __init__(self,input_size,hidden_size,output_size=1,num_layers=2,dropout=0):
        super(Gru2,self).__init__()
        self.gru = nn.GRU(input_size=input_size,hidden_size=hidden_size,num_layers=num_layers,dropout=dropout,batch_first=True,bidirectional=True)
        self.reg = nn.Linear(hidden_size*2,output_size)
        self.relu = nn.ReLU(inplace=True)

    def forward(self,x):
        x,_ = self.gru(x)
        s,b,h = x.shape
        x = x.view(s*b,h)
        x = self.reg(x)
        x = self.relu(x)
        x = x.view(s,b,-1)
        return 

#### GAN

In [None]:
class Generator(nn.Module):
    """Four-layer fully connected network (input -> 128 -> 64 -> 32 -> output), ReLU after every layer."""

    def __init__(self, input_size, output_size):
        super().__init__()
        self.layer1 = nn.Linear(input_size, 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, 32)
        self.layer4 = nn.Linear(32, output_size)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Run ``x`` through the four linear layers, applying ReLU after each one."""
        for layer in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = self.relu(layer(x))
        return x

class Discriminator(nn.Module):
    """Four-layer fully connected network (input -> 128 -> 64 -> 32 -> output), ReLU after every layer."""

    def __init__(self, input_size, output_size):
        super().__init__()
        self.layer1 = nn.Linear(input_size, 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, 32)
        self.layer4 = nn.Linear(32, output_size)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Run ``x`` through the four linear layers, applying ReLU after each one."""
        for layer in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = self.relu(layer(x))
        return x
    
# Discriminator takes 8 inputs and emits 1 value; generator takes 1 input and emits 8 values.
D = Discriminator(8,1)
G = Generator(1,8)
criterion = nn.MSELoss()
d_optimizer = torch.optim.Adam(D.parameters(), lr=0.0002)
g_optimizer = torch.optim.Adam(G.parameters(), lr=0.0002)

def reset_grad():
    """Zero the gradients held by both optimizers."""
    d_optimizer.zero_grad()
    g_optimizer.zero_grad()

for epoch in range(num_epochs): 
    # ================================================================== #
    #                      Train the discriminator                       #
    # ================================================================== #
    outputs = D(train_d_v)
    d_loss_real = criterion(outputs,train_d_y_v)

    # Loss on fake samples:
    # generate samples with the generator and score them with the discriminator
    g_s = G(train_g_y_v) 
    outputs = D(g_s)
    d_loss_fake = criterion(outputs,train_g_v)
    
    # Backpropagate and optimise
    d_loss = d_loss_real + d_loss_fake
#     if((epoch+1) % 1000 ==0):
#         for param_group in d_optimizer.param_groups:
#             param_group["lr"] *=0.1
    d_loss.backward()
    d_optimizer.step()
    # Gradients are cleared after the step, for both networks.
    reset_grad()


    # ================================================================== #
    #                        Train the generator                         #
    # ================================================================== #

    # The generator produces samples from its input, then the discriminator scores them
    g_s = G(train_g_y_v)
    outputs = D(g_s)

    # Original note: train the generator to maximise f(D(G(z))) rather than minimise f(1-D(G(z))).
    # Early in training the generator is still poor and the discriminator rejects its samples
    # with high confidence because they differ from the training data; f(1-D(G(z))) then
    # nearly saturates and yields tiny gradients, which hampers backpropagation and training.
    # Maximising f(D(G(z))) instead gives larger gradients early on and helps fast convergence.
    g_loss = criterion(outputs,train_g_y_v)

    # Backpropagate and optimise
#     if((epoch+1) % 1000 ==0):
#         for param_group in g_optimizer.param_groups:
#             param_group["lr"] *=0.1
    g_loss.backward()
    g_optimizer.step()
    reset_grad()


    # Report both losses every 100 epochs.
    if (epoch+1) % 100 == 0:
        print('Epoch [{}/{}],, d_loss: {:.4f}, g_loss: {:.4f}' 
              .format(epoch+1, num_epochs,d_loss.item(), g_loss.item()))

### Broad learning

#### BLS

In [None]:
import math
from numpy import random
from sklearn import preprocessing
import csv

def tansig(x):
    """Return ``2 / (1 + exp(-2x)) - 1`` element-wise (the hyperbolic-tangent sigmoid)."""
    denominator = 1 + np.exp(-2 * x)
    return (2 / denominator) - 1

def pinv(A,reg):
    """Return the ridge-regularised pseudo-inverse ``(reg*I + A^T A)^-1 A^T``.

    Args:
        A: 2-D array with shape (rows, cols).
        reg: regularisation strength added to the diagonal of ``A^T A``.

    Returns:
        A ``np.matrix`` with shape (cols, rows).
    """
    gram = reg*np.eye(A.shape[1]) + A.T.dot(A)
    # np.asmatrix replaces the np.mat alias, which was removed in NumPy 2.0.
    return np.asmatrix(gram).I.dot(A.T)

def shrinkage(a,b):
    """Soft-threshold ``a`` by ``b``: ``max(a - b, 0) - max(-a - b, 0)`` element-wise."""
    positive_part = np.maximum(a - b, 0)
    negative_part = np.maximum(-a - b, 0)
    return positive_part - negative_part

def sparse_bls(A,b):
    """Iteratively compute a sparse weight matrix ``w`` such that ``A w`` approximates ``b``.

    Runs 50 fixed iterations that alternate a regularised least-squares
    update (``ck``), a soft-threshold step with ``lam = 0.001`` (``ok``) and an
    accumulated-residual update (``uk``).

    Args:
        A: 2-D array with shape (samples, m).
        b: 2-D array with shape (samples, n).

    Returns:
        The last thresholded iterate, shape (m, n), as an ``np.matrix``.
    """
    lam = 0.001
    itrs = 50
    m = A.shape[1]
    n = b.shape[1]
    ok = np.zeros([m,n],dtype = 'double')
    uk = np.zeros([m,n],dtype = 'double')
    # Loop-invariant factors. np.asmatrix replaces the np.mat alias removed in NumPy 2.0.
    L1 = np.asmatrix(np.dot(A.T,A) + np.eye(m)).I
    L2 = np.dot(np.dot(L1,A.T),b)
    for _ in range(itrs):
        ck = L2 + np.dot(L1,ok - uk)
        ok = shrinkage(ck + uk, lam)
        uk += ck - ok
    return ok

def bls_regression(train_x,train_y,test_x,test_y,s,C,NumFea,NumWin,NumEnhan):
    """Fit a broad-learning-system regressor on the training set and predict the test set.

    Args:
        train_x, train_y: training inputs (2-D) and targets.
        test_x: test inputs (2-D, same number of columns as ``train_x``).
        test_y: not used by this function; kept for interface compatibility.
        s: not used by this function; kept for interface compatibility.
        C: regularisation strength passed to ``pinv`` for the output weights.
        NumFea: number of feature nodes per window.
        NumWin: number of feature windows.
        NumEnhan: number of enhancement nodes.

    Returns:
        Predictions for ``test_x`` (``TT3.dot(WeightTop)``).
    """
    # Random feature-window weights; window i is drawn after seeding with i,
    # so repeated calls produce the same weights.
    WF = list()
    for i in range(NumWin):
        random.seed(i)
        WeightFea=2*random.randn(train_x.shape[1]+1,NumFea)-1
        WF.append(WeightFea)
    # Drawn right after the last window seed, hence deterministic as well.
    WeightEnhan=2*random.randn(NumWin*NumFea+1,NumEnhan)-1

    # ---- training: feature nodes ----
    # Append a constant 0.1 column as bias input.
    H1 = np.hstack([train_x, 0.1 * np.ones([train_x.shape[0],1])])
    y = np.zeros([train_x.shape[0],NumWin*NumFea])
    WFSparse = list()
    distOfMaxAndMin = np.zeros(NumWin)
    meanOfEachWindow = np.zeros(NumWin)
    for i in range(NumWin):
        WeightFea = WF[i]
        A1 = H1.dot(WeightFea)
        scaler1 = preprocessing.MinMaxScaler(feature_range=(-1, 1)).fit(A1)
        A1 = scaler1.transform(A1)
        # Replace the random weights by sparse ones that map A1 back to H1.
        WeightFeaSparse  = sparse_bls(A1,H1).T
        WFSparse.append(WeightFeaSparse)

        T1 = H1.dot(WeightFeaSparse)
        # Remember the per-window normalisation so the test set can reuse it.
        meanOfEachWindow[i] = T1.mean()
        distOfMaxAndMin[i] = T1.max() - T1.min()
        T1 = (T1 - meanOfEachWindow[i])/distOfMaxAndMin[i]
        y[:,NumFea*i:NumFea*(i+1)] = T1

    # ---- training: enhancement nodes and output weights ----
    H2 = np.hstack([y,0.1 * np.ones([y.shape[0],1])])
    T2 = tansig(H2.dot(WeightEnhan))
    T3 = np.hstack([y,T2])
    WeightTop = pinv(T3,C).dot(train_y)

    # ---- test: same pipeline with the weights and statistics learnt above ----
    HH1 = np.hstack([test_x, 0.1 * np.ones([test_x.shape[0],1])])
    yy1=np.zeros([test_x.shape[0],NumWin*NumFea])
    for i in range(NumWin):
        WeightFeaSparse = WFSparse[i]
        TT1 = HH1.dot(WeightFeaSparse)
        TT1  = (TT1 - meanOfEachWindow[i])/distOfMaxAndMin[i]
        yy1[:,NumFea*i:NumFea*(i+1)] = TT1

    HH2 = np.hstack([yy1, 0.1 * np.ones([yy1.shape[0],1])])
    TT2 = tansig(HH2.dot( WeightEnhan))
    TT3 = np.hstack([yy1,TT2])
    NetoutTest = TT3.dot(WeightTop)
    return NetoutTest


def adjust_bl(train_x_s,test_x_s,train_y,test_y,name):
    """Grid-search the BLS hyper-parameters and log every combination to CSV.

    For each combination of (NumFea, NumWin, s, C, NumEnhan) the model is
    trained with ``bls_regression`` and scored with ``calculate``. One row is
    appended to ``./results/BLS_<name>_assess.csv`` (run number + metrics) and
    one to ``./results/BLS_<name>_parameter.csv`` (run number + parameters).

    The search is resumable: if the assess file already exists, the run number
    in its last row is read and all combinations up to it are skipped.

    Parameters
    ----------
    train_x_s, test_x_s : training / test feature matrices.
    train_y, test_y : training / test targets.
    name : tag inserted into the two output file names.
    """
    import itertools

    path2 = "./results/BLS_" + name + "_" + "assess.csv"
    path3 = "./results/BLS_" + name + "_" + "parameter.csv"
    NumFea = list(range(2,40,4))
    NumWin = list(range(5,40,5))
    NumEnhan = list(range(5,60,10))
    S = [0.4,0.6,0.8,1,1.2,4]
    # NOTE(review): 1**-30 evaluates to 1.0; possibly a typo for another power of 2 - confirm.
    C = [2**-30,2**-10,2**-20,2**-40,1**-30]
    all_nb = len(NumFea)*len(NumWin)*len(S)*len(C)*len(NumEnhan)

    # Appending to a file does not create its directory.
    os.makedirs(os.path.dirname(path2), exist_ok=True)

    # Run number of the last finished combination (0 when starting fresh).
    nums = 0
    if os.path.exists(path2):
        data = pd.read_csv(path2,header=None)
        nums = int(data.values[-1,0])

    # product() varies the last factor fastest, matching the original nesting order.
    grid = itertools.product(NumFea, NumWin, S, C, NumEnhan)
    for num, (nf, nw, s, c, ne) in enumerate(grid, start=1):
        if num <= nums:
            continue
        print("train...{}/{}".format(num,all_nb))
        # Use the data passed in, not same-named notebook globals.
        pred_test = bls_regression(train_x_s, train_y, test_x_s, test_y, s=s, C=c, NumFea=nf, NumWin=nw, NumEnhan=ne)
        pred_test = pred_test.reshape(-1,1)
        sample_n = pred_test.shape[0]
        feature_n = test_x_s.shape[1]
        mse,rmse,mae,r2,mad,mape,r2_adjusted = calculate(test_y,pred_test,sample_n,feature_n)
        all_m = [num,mse,rmse,mae,r2,mad,mape,r2_adjusted]
        all_p = [num,s,c,nf,nw,ne]
        with open(path2,"a",encoding="utf-8",newline="") as f:
            csv.writer(f).writerow(all_m)
        with open(path3,"a",encoding="utf-8",newline="") as f:
            csv.writer(f).writerow(all_p)
        print("end....",num)
        print("--------------------------------")


In [None]:
class BLSregressor:
    """Broad Learning System (BLS) regressor with a scikit-learn style interface.

    Inputs are mapped through ``NumWin`` windows of ``NumFea`` sparse feature
    nodes, ``NumEnhan`` enhancement nodes are stacked on top, and the output
    weights come from a ridge-regularised least-squares solve.

    The class relies on two module-level names: ``random`` (must provide
    ``seed`` and ``randn``, presumably ``numpy.random``) and ``preprocessing``
    (must provide ``MinMaxScaler``).
    """

    def __init__(self,s,C,NumFea,NumWin,NumEnhan):
        """Store the hyper-parameters.

        s : stored and reported by ``get_params`` but not used by fit/predict.
        C : regularisation strength of the output-weight solve.
        NumFea : feature nodes per window.
        NumWin : number of windows.
        NumEnhan : number of enhancement nodes.
        """
        self.s = s
        self.C = C
        self.NumFea = NumFea
        self.NumEnhan = NumEnhan
        self.NumWin = NumWin

    def shrinkage(self,a,b):
        """Soft-threshold ``a`` element-wise by ``b`` (move towards 0, clip at 0)."""
        z = np.maximum(a - b, 0) - np.maximum( -a - b, 0)
        return z

    def tansig(self,x):
        """Return ``2 / (1 + exp(-2x)) - 1`` element-wise."""
        return (2/(1+np.exp(-2*x)))-1

    def pinv(self,A,reg):
        """Return the regularised pseudo-inverse ``(reg*I + A.T A)^-1 A.T``."""
        # np.linalg.inv on plain arrays replaces np.mat(...).I: the np.mat alias
        # no longer exists in NumPy 2.0, and .I on a square matrix is the same inverse.
        gram = reg*np.eye(A.shape[1]) + A.T.dot(A)
        return np.linalg.inv(gram).dot(A.T)

    def sparse_bls(self,A,b):
        """Return sparse weights ``w`` (columns of A x columns of b) with ``A w ~ b``.

        Runs 50 fixed iterations that alternate a regularised least-squares
        step, a soft-threshold step (threshold 0.001) and a running-residual
        update; the thresholded iterate is returned.
        """
        lam = 0.001
        itrs = 50
        AA = np.dot(A.T,A)
        m = A.shape[1]
        n = b.shape[1]
        ok = np.zeros([m,n],dtype = 'double')
        uk = np.zeros([m,n],dtype = 'double')
        # Both factors are loop-invariant, so they are computed once.
        L1 = np.linalg.inv(AA + np.eye(m))
        L2 = np.dot(np.dot(L1,A.T),b)
        for _ in range(itrs):
            ck = L2 + np.dot(L1,ok - uk)
            ok = self.shrinkage(ck + uk, lam)
            uk += ck - ok
        return ok

    def fit(self,train_x,train_y):
        """Fit the network on ``train_x`` / ``train_y`` and return ``self``.

        Reseeds the module-level ``random`` object, so results are
        reproducible but the caller's random stream is changed.
        """
        train_y = train_y.reshape(-1,1)
        # One random projection per window, each with its own seed.
        WF = list()
        for i in range(self.NumWin):
            random.seed(i)
            WF.append(2*random.randn(train_x.shape[1]+1,self.NumFea)-1)
        random.seed(100)
        WeightEnhan=2*random.randn(self.NumWin*self.NumFea+1,self.NumEnhan)-1
        # A constant 0.1 column acts as the bias input.
        H1 = np.hstack([train_x, 0.1 * np.ones([train_x.shape[0],1])])
        y = np.zeros([train_x.shape[0],self.NumWin*self.NumFea])
        WFSparse = list()
        distOfMaxAndMin = np.zeros(self.NumWin)
        meanOfEachWindow = np.zeros(self.NumWin)
        for i in range(self.NumWin):
            A1 = H1.dot(WF[i])
            scaler1 = preprocessing.MinMaxScaler(feature_range=(-1, 1)).fit(A1)
            A1 = scaler1.transform(A1)
            WeightFeaSparse  = self.sparse_bls(A1,H1).T
            WFSparse.append(WeightFeaSparse)

            T1 = H1.dot(WeightFeaSparse)
            # Stored so predict() normalises new data with the training statistics.
            meanOfEachWindow[i] = T1.mean()
            distOfMaxAndMin[i] = T1.max() - T1.min()
            T1 = (T1 - meanOfEachWindow[i])/distOfMaxAndMin[i]
            y[:,self.NumFea*i:self.NumFea*(i+1)] = T1
        H2 = np.hstack([y,0.1 * np.ones([y.shape[0],1])])
        T2 = self.tansig(H2.dot(WeightEnhan))
        T3 = np.hstack([y,T2])
        self.WeightTop = self.pinv(T3,self.C).dot(train_y)
        self.WFSparse = WFSparse
        self.meanOfEachWindow = meanOfEachWindow
        self.distOfMaxAndMin = distOfMaxAndMin
        self.WeightEnhan = WeightEnhan
        return self

    def predict(self,test_x):
        """Return predictions for ``test_x`` as an array of shape ``(1, n_samples)``."""
        HH1 = np.hstack([test_x, 0.1 * np.ones([test_x.shape[0],1])])
        yy1=np.zeros([test_x.shape[0],self.NumWin*self.NumFea])
        for i in range(self.NumWin):
            TT1 = HH1.dot(self.WFSparse[i])
            TT1  = (TT1 - self.meanOfEachWindow[i])/self.distOfMaxAndMin[i]
            yy1[:,self.NumFea*i:self.NumFea*(i+1)] = TT1
        HH2 = np.hstack([yy1, 0.1 * np.ones([yy1.shape[0],1])])
        TT2 = self.tansig(HH2.dot(self.WeightEnhan))
        TT3 = np.hstack([yy1,TT2])
        NetoutTest = TT3.dot(self.WeightTop)
        NetoutTest = np.array(NetoutTest).reshape(1,-1)
        return NetoutTest

    def set_params(self, **parameters):
        """Set each keyword as an attribute and return ``self``."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def get_params(self,deep = False):
        """Return the constructor arguments as a dict (``deep`` is ignored)."""
        return {
            's':self.s,
            'C':self.C,
            'NumFea':self.NumFea,
            'NumWin':self.NumWin,
            'NumEnhan':self.NumEnhan
        }


# Fit one BLS model with hand-picked hyper-parameters and record its test-set results.
s = 1
c = 2**-20
nf = 10
nw = 20
ne = 35

BLS = BLSregressor(s=s,C=c,NumFea=nf,NumWin=nw,NumEnhan=ne)
BLS.fit(train_x_s,train_y_n)
# predict() returns a (1, n) row; turn it into a column so plt_line draws one
# curve instead of n single-point lines.
pred_test = BLS.predict(test_x_s).reshape(-1,1)
sample_n,feature_n = test_x_s.shape
# The stray calculate(..., predict, ...) call was dropped: `predict` is not
# defined until the Bagging cell further down.
mse,rmse,mae,r2,mad,mape,r2_adjusted = calculate(test_y_n,pred_test,sample_n,feature_n)
plt_line("BL",pred_test,test_y_n,"plt")
all_assess.append(["BL",mse,rmse,mae,r2,mad,mape,r2_adjusted])
all_pre = all_pre + pred_test.reshape(1,-1)[0].tolist()
all_mo = all_mo + ["BL" for n in range(sample_n)]
pred["BL"] = pd.DataFrame(pred_test.reshape(-1,1))
# Seq["BL"] = np.array(train_p).reshape(1,-1)[0].tolist() + pred_test.reshape(1,-1)[0].tolist()
# Training part of the sequence is the true targets, followed by the test predictions.
Seq["BL"] = train_y_n.reshape(1,-1)[0].tolist() + pred_test.reshape(1,-1)[0].tolist()

## Time series

### ARIMA

![57](./img/57.png)

主要是确定p，d，q三个参数

#### AR

![54](./img/54.png)

#### MA

![55](./img/55.png)

#### ARMA

![56](./img/56.png)

#### Adjust （p,d,q）

![58](./img/58.png)

画图 acf 和 pacf 确定 p,q

In [None]:
import statsmodels.api as sm

# Draw the ACF (top panel) and PACF (bottom panel) of the training series,
# 20 lags each, to help choose the AR and MA orders.
fig = plt.figure(figsize=(12,8))

acf_pacf_axes = []
for subplot_code, draw in ((211, sm.graphics.tsa.plot_acf),
                           (212, sm.graphics.tsa.plot_pacf)):
    axis = fig.add_subplot(subplot_code)
    fig = draw(train, lags=20, ax=axis)
    axis.xaxis.set_ticks_position('bottom')
    fig.tight_layout()
    acf_pacf_axes.append(axis)

ax1, ax2 = acf_pacf_axes
plt.show()

![60](./img/60.png)

使用 BIC（Bayesian Information Criterion）或 AIC（Akaike Information Criterion）确定 p,q

两种方法

In [None]:
# Method 1

# Exhaustive search for suitable (p, d, q) orders
import itertools
import numpy as np
import seaborn as sns
 
p_min = 0
d_min = 0
q_min = 0
p_max = 5
d_max = 0
q_max = 5
 
# Table of results scored by BIC: one row per AR order, one column per MA order
results_bic = pd.DataFrame(index=['AR{}'.format(i) for i in range(p_min,p_max+1)],
                           columns=['MA{}'.format(i) for i in range(q_min,q_max+1)])
 
for p,d,q in itertools.product(range(p_min,p_max+1),
                               range(d_min,d_max+1),
                               range(q_min,q_max+1)):
    if p==0 and d==0 and q==0:
        # The order (0, 0, 0) is not fitted; its cell is left empty.
        results_bic.loc['AR{}'.format(p), 'MA{}'.format(q)] = np.nan
        continue
 
    try:
        model = sm.tsa.ARIMA(train, order=(p, d, q),
                               #enforce_stationarity=False,
                               #enforce_invertibility=False,
                              )
        results = model.fit()
    except Exception as err:
        # Best effort: an order that cannot be fitted keeps its NaN cell, but the
        # reason is reported and Ctrl-C is no longer swallowed as a bare except would.
        print('ARIMA({}, {}, {}) skipped: {}'.format(p, d, q, err))
        continue
    results_bic.loc['AR{}'.format(p), 'MA{}'.format(q)] = results.bic
results_bic = results_bic[results_bic.columns].astype(float)

# Heat map of the BIC table; cells without a fitted model are masked out.
fig, ax = plt.subplots(figsize=(10, 8))
ax = sns.heatmap(results_bic,
                 mask=results_bic.isnull(),
                 ax=ax,
                 annot=True,
                 fmt='.2f',
                 )
ax.set_title('BIC')
plt.show()


![61](./img/61.png)

In [None]:
# Method 2: let statsmodels rank the ARMA orders (up to AR 8 / MA 8) by AIC and BIC
train_results = sm.tsa.arma_order_select_ic(
    train,
    max_ar=8,
    max_ma=8,
    ic=['aic', 'bic'],
    trend='nc',
)

for _criterion in ('aic', 'bic'):
    print(_criterion.upper(), getattr(train_results, _criterion + '_min_order'))

# In general, the BIC criterion selects a lower ARMA order than AIC does.

模型检验

这里的模型检验主要有两个：

1）检验参数估计的显著性（t检验）

2）检验残差序列的随机性，即残差之间是独立的

残差序列的随机性可以通过自相关函数法来检验，即做残差的自相关函数图：

In [None]:
# Fit an AR(1) model and plot the autocorrelation of its residuals (40 lags).
model = sm.tsa.ARIMA(train, order=(1, 0, 0))
results = model.fit()
resid = results.resid  # residuals of the fitted model
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
# Pass the axes explicitly: without ax=, plot_acf opens its own figure and the
# 12x8 figure created above would be shown empty.
fig = sm.graphics.tsa.plot_acf(resid.values.squeeze(), lags=40, ax=ax)
plt.show()

![62](./img/62.png)

预测主要有两个函数，一个是predict函数，一个是forecast函数，predict中进行预测的时间段必须在我们训练ARIMA模型的数据中，forecast则是对训练数据集末尾下一个时间段的值进行预估

In [None]:
# Fit AR(1) on `sub`, predict the 2014-04 .. 2014-05 window and overlay it on the series.
model = sm.tsa.ARIMA(sub, order=(1, 0, 0))
results = model.fit()

forecast_window = {"start": '2014-04', "end": '2014-05'}
predict_sunspots = results.predict(dynamic=False, **forecast_window)
print(predict_sunspots)

fig, ax = plt.subplots(figsize=(12, 8))
ax = sub.plot(ax=ax)
predict_sunspots.plot(ax=ax)
plt.show()

results.forecast()[0]  # estimate of the next value

![63](./img/63.png)

## Math 

#### Linear Regression

In [None]:
# Ordinary least-squares baseline: fit, score on the test split, and store the
# results in the shared result containers under the tag "LR".
lr = LinearRegression().fit(train_X_n,train_y_n)
pred_test = lr.predict(test_X_n).reshape(-1,1)
sample_n = len(pred_test)
feature_n = test_X_n.shape[1]

lr_scores = calculate(test_y_n,pred_test,sample_n,feature_n)
mse,rmse,mae,r2,mad,mape,r2_adjusted = lr_scores

plt_line("LR",pred_test,test_y_n,"plt")
all_assess.append(["LR", *lr_scores])
all_pre = all_pre + pred_test.reshape(1,-1)[0].tolist()
all_mo = all_mo + ["LR"] * sample_n
pred["LR"] = pd.DataFrame(pred_test.reshape(-1,1))
# Seq["LR"] = lr.predict(train_X_n).reshape(1,-1)[0].tolist() + pred_test.reshape(1,-1)[0].tolist()
# True training targets followed by the test predictions.
Seq["LR"] = train_y_n.reshape(1,-1)[0].tolist() + pred_test.reshape(1,-1)[0].tolist()

## Merge or combine models

#### Bagging - BLS

Analogous to Random Forest (RF = bagging + decision trees), but with BLS as the base learner.

In [None]:
import math
from numpy import random
from sklearn import preprocessing
from sklearn.ensemble import BaggingRegressor

class BLSregressor:
    """Broad Learning System (BLS) regressor with a scikit-learn style interface.

    Inputs are mapped through ``NumWin`` windows of ``NumFea`` sparse feature
    nodes, ``NumEnhan`` enhancement nodes are stacked on top, and the output
    weights come from a ridge-regularised least-squares solve.

    ``fit`` / ``predict`` / ``get_params`` / ``set_params`` let the class act
    as the base estimator of ``BaggingRegressor``.
    """

    def __init__(self,s,C,NumFea,NumWin,NumEnhan):
        """Store the hyper-parameters.

        s : stored and reported by ``get_params`` but not used by fit/predict.
        C : regularisation strength of the output-weight solve.
        NumFea : feature nodes per window.
        NumWin : number of windows.
        NumEnhan : number of enhancement nodes.
        """
        self.s = s
        self.C = C
        self.NumFea = NumFea
        self.NumEnhan = NumEnhan
        self.NumWin = NumWin

    def shrinkage(self,a,b):
        """Soft-threshold ``a`` element-wise by ``b`` (move towards 0, clip at 0)."""
        z = np.maximum(a - b, 0) - np.maximum( -a - b, 0)
        return z

    def tansig(self,x):
        """Return ``2 / (1 + exp(-2x)) - 1`` element-wise."""
        return (2/(1+np.exp(-2*x)))-1

    def pinv(self,A,reg):
        """Return the regularised pseudo-inverse ``(reg*I + A.T A)^-1 A.T``."""
        # np.linalg.inv on plain arrays replaces np.mat(...).I: the np.mat alias
        # no longer exists in NumPy 2.0, and .I on a square matrix is the same inverse.
        gram = reg*np.eye(A.shape[1]) + A.T.dot(A)
        return np.linalg.inv(gram).dot(A.T)

    def sparse_bls(self,A,b):
        """Return sparse weights ``w`` (columns of A x columns of b) with ``A w ~ b``.

        Runs 50 fixed iterations that alternate a regularised least-squares
        step, a soft-threshold step (threshold 0.001) and a running-residual
        update; the thresholded iterate is returned.
        """
        lam = 0.001
        itrs = 50
        AA = np.dot(A.T,A)
        m = A.shape[1]
        n = b.shape[1]
        ok = np.zeros([m,n],dtype = 'double')
        uk = np.zeros([m,n],dtype = 'double')
        # Both factors are loop-invariant, so they are computed once.
        L1 = np.linalg.inv(AA + np.eye(m))
        L2 = np.dot(np.dot(L1,A.T),b)
        for _ in range(itrs):
            ck = L2 + np.dot(L1,ok - uk)
            ok = self.shrinkage(ck + uk, lam)
            uk += ck - ok
        return ok

    def fit(self,train_x,train_y):
        """Fit the network on ``train_x`` / ``train_y`` and return ``self``.

        Reseeds the imported ``random`` module, so results are reproducible
        but the caller's random stream is changed.
        """
        train_y = train_y.reshape(-1,1)
        # One random projection per window, each with its own seed.
        WF = list()
        for i in range(self.NumWin):
            random.seed(i)
            WF.append(2*random.randn(train_x.shape[1]+1,self.NumFea)-1)
        random.seed(100)
        WeightEnhan=2*random.randn(self.NumWin*self.NumFea+1,self.NumEnhan)-1
        # A constant 0.1 column acts as the bias input.
        H1 = np.hstack([train_x, 0.1 * np.ones([train_x.shape[0],1])])
        y = np.zeros([train_x.shape[0],self.NumWin*self.NumFea])
        WFSparse = list()
        distOfMaxAndMin = np.zeros(self.NumWin)
        meanOfEachWindow = np.zeros(self.NumWin)
        for i in range(self.NumWin):
            A1 = H1.dot(WF[i])
            scaler1 = preprocessing.MinMaxScaler(feature_range=(-1, 1)).fit(A1)
            A1 = scaler1.transform(A1)
            WeightFeaSparse  = self.sparse_bls(A1,H1).T
            WFSparse.append(WeightFeaSparse)

            T1 = H1.dot(WeightFeaSparse)
            # Stored so predict() normalises new data with the training statistics.
            meanOfEachWindow[i] = T1.mean()
            distOfMaxAndMin[i] = T1.max() - T1.min()
            T1 = (T1 - meanOfEachWindow[i])/distOfMaxAndMin[i]
            y[:,self.NumFea*i:self.NumFea*(i+1)] = T1
        H2 = np.hstack([y,0.1 * np.ones([y.shape[0],1])])
        T2 = self.tansig(H2.dot(WeightEnhan))
        T3 = np.hstack([y,T2])
        self.WeightTop = self.pinv(T3,self.C).dot(train_y)
        self.WFSparse = WFSparse
        self.meanOfEachWindow = meanOfEachWindow
        self.distOfMaxAndMin = distOfMaxAndMin
        self.WeightEnhan = WeightEnhan
        return self

    def predict(self,test_x):
        """Return predictions for ``test_x`` as an array of shape ``(1, n_samples)``."""
        HH1 = np.hstack([test_x, 0.1 * np.ones([test_x.shape[0],1])])
        yy1=np.zeros([test_x.shape[0],self.NumWin*self.NumFea])
        for i in range(self.NumWin):
            TT1 = HH1.dot(self.WFSparse[i])
            TT1  = (TT1 - self.meanOfEachWindow[i])/self.distOfMaxAndMin[i]
            yy1[:,self.NumFea*i:self.NumFea*(i+1)] = TT1
        HH2 = np.hstack([yy1, 0.1 * np.ones([yy1.shape[0],1])])
        TT2 = self.tansig(HH2.dot(self.WeightEnhan))
        TT3 = np.hstack([yy1,TT2])
        NetoutTest = TT3.dot(self.WeightTop)
        NetoutTest = np.array(NetoutTest).reshape(1,-1)
        return NetoutTest

    def set_params(self, **parameters):
        """Set each keyword as an attribute and return ``self``."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def get_params(self,deep = False):
        """Return the constructor arguments as a dict (``deep`` is ignored)."""
        return {
            's':self.s,
            'C':self.C,
            'NumFea':self.NumFea,
            'NumWin':self.NumWin,
            'NumEnhan':self.NumEnhan
        }

    
    
    
def adjust_bl_bagging(train_X,test_X,train_y,test_y,name):
    """Grid-search the bagging settings of a Bagging-BLS ensemble, logging to CSV.

    The BLS base learner uses fixed hyper-parameters; the search runs over
    ``max_features`` (MF), ``max_samples`` (MS) and ``n_estimators`` (N).
    For every combination one row is appended to
    ``./results/Bagging-BLS_<name>_access.csv`` (run number + metrics) and one
    to ``./results/Bagging-BLS_<name>_parameter.csv`` (run number + the fixed
    BLS parameters). The bagging settings themselves are not written; they can
    be recovered from the run number and the iteration order.

    The search is resumable: if the access file already exists, the run number
    in its last row is read and all combinations up to it are skipped.

    Parameters
    ----------
    train_X, test_X : training / test feature matrices.
    train_y, test_y : training / test targets (``train_y`` is flattened).
    name : tag inserted into the two output file names.
    """
    import itertools

    path2 = "./results/Bagging-BLS_" + name + "_" + "access.csv"
    path3 = "./results/Bagging-BLS_" + name + "_" + "parameter.csv"
    s = 1.2
    c = 2**-20
    nf = 10
    nw = 15
    ne = 55
    N = [10+(5*i) for i in range(200)]
    MF = [0.7,0.8,0.9,1.0]
    MS = [0.7,0.8,0.9,1.0]
    all_len = len(N) * len(MF) * len(MS)

    # Appending to a file does not create its directory.
    os.makedirs(os.path.dirname(path2), exist_ok=True)

    # Run number of the last finished combination (0 when starting fresh).
    nums = 0
    if os.path.exists(path2):
        data = pd.read_csv(path2,header=None)
        nums = int(data.values[-1,0])

    # product() varies the last factor fastest: mf outermost, n innermost.
    for num, (mf, ms, n) in enumerate(itertools.product(MF, MS, N), start=1):
        if num <= nums:
            continue
        print("train...{}/{}".format(num,all_len))
        # NOTE(review): newer scikit-learn releases name this keyword `estimator`
        # instead of `base_estimator` - confirm against the installed version.
        regr = BaggingRegressor(base_estimator=BLSregressor(s=s,C=c,NumFea=nf,NumWin=nw,NumEnhan=ne),n_estimators=n, random_state=17,max_samples=ms,max_features=mf).fit(train_X,train_y.ravel())
        pred_test = regr.predict(test_X)
        pred_test = pred_test.reshape(-1,1)
        sample_n = pred_test.shape[0]
        feature_n = test_X.shape[1]
        mse,rmse,mae,r2,mad,mape,r2_adjusted = calculate(test_y,pred_test,sample_n,feature_n)
        all_m = [num,mse,rmse,mae,r2,mad,mape,r2_adjusted]
        all_p = [num,s,c,nf,nw,ne]
        print(all_m)
        with open(path2,"a",encoding="utf-8",newline="") as f:
            csv.writer(f).writerow(all_m)
        with open(path3,"a",encoding="utf-8",newline="") as f:
            csv.writer(f).writerow(all_p)
        print("end....",num)
        print("--------------------------------")
    
    
    
    
# Single Bagging-BLS run: 10 bagged BLS learners with fixed hyper-parameters,
# scored on the test split.
s, c, nf, nw, ne = 1, 2**-20, 10, 20, 35

regr = BaggingRegressor(
    base_estimator=BLSregressor(s=s,C=c,NumFea=nf,NumWin=nw,NumEnhan=ne),
    n_estimators=10,
    random_state=0,
).fit(train_x_s,train_y_n.ravel())

predict = regr.predict(test_x_s)
sample_n,feature_n = test_x_s.shape
calculate(test_y_n,predict,sample_n,feature_n)

#### Adaboost - BLS

In [None]:
import math
from numpy import random
from sklearn import preprocessing
from sklearn.ensemble import AdaBoostRegressor
import os

class BLSregressor:
    """Broad Learning System (BLS) regressor with a scikit-learn style interface.

    Inputs are mapped through ``NumWin`` windows of ``NumFea`` sparse feature
    nodes, ``NumEnhan`` enhancement nodes are stacked on top, and the output
    weights come from a ridge-regularised least-squares solve.

    ``fit`` / ``predict`` / ``get_params`` / ``set_params`` let the class act
    as the base estimator of ``AdaBoostRegressor``; ``predict`` therefore
    returns a flat 1-D array.
    """

    def __init__(self,s,C,NumFea,NumWin,NumEnhan):
        """Store the hyper-parameters.

        s : stored and reported by ``get_params`` but not used by fit/predict.
        C : regularisation strength of the output-weight solve.
        NumFea : feature nodes per window.
        NumWin : number of windows.
        NumEnhan : number of enhancement nodes.
        """
        self.s = s
        self.C = C
        self.NumFea = NumFea
        self.NumEnhan = NumEnhan
        self.NumWin = NumWin

    def shrinkage(self,a,b):
        """Soft-threshold ``a`` element-wise by ``b`` (move towards 0, clip at 0)."""
        z = np.maximum(a - b, 0) - np.maximum( -a - b, 0)
        return z

    def tansig(self,x):
        """Return ``2 / (1 + exp(-2x)) - 1`` element-wise."""
        return (2/(1+np.exp(-2*x)))-1

    def pinv(self,A,reg):
        """Return the regularised pseudo-inverse ``(reg*I + A.T A)^-1 A.T``."""
        # np.linalg.inv on plain arrays replaces np.mat(...).I: the np.mat alias
        # no longer exists in NumPy 2.0, and .I on a square matrix is the same inverse.
        gram = reg*np.eye(A.shape[1]) + A.T.dot(A)
        return np.linalg.inv(gram).dot(A.T)

    def sparse_bls(self,A,b):
        """Return sparse weights ``w`` (columns of A x columns of b) with ``A w ~ b``.

        Runs 50 fixed iterations that alternate a regularised least-squares
        step, a soft-threshold step (threshold 0.001) and a running-residual
        update; the thresholded iterate is returned.
        """
        lam = 0.001
        itrs = 50
        AA = np.dot(A.T,A)
        m = A.shape[1]
        n = b.shape[1]
        ok = np.zeros([m,n],dtype = 'double')
        uk = np.zeros([m,n],dtype = 'double')
        # Both factors are loop-invariant, so they are computed once.
        L1 = np.linalg.inv(AA + np.eye(m))
        L2 = np.dot(np.dot(L1,A.T),b)
        for _ in range(itrs):
            ck = L2 + np.dot(L1,ok - uk)
            ok = self.shrinkage(ck + uk, lam)
            uk += ck - ok
        return ok

    def fit(self,train_x,train_y):
        """Fit the network on ``train_x`` / ``train_y`` and return ``self``.

        Reseeds the imported ``random`` module, so results are reproducible
        but the caller's random stream is changed.
        """
        train_y = train_y.reshape(-1,1)
        # One random projection per window, each with its own seed.
        WF = list()
        for i in range(self.NumWin):
            random.seed(i)
            WF.append(2*random.randn(train_x.shape[1]+1,self.NumFea)-1)
        # Drawn from the RNG state left behind by the last window seed.
        WeightEnhan=2*random.randn(self.NumWin*self.NumFea+1,self.NumEnhan)-1
        # A constant 0.1 column acts as the bias input.
        H1 = np.hstack([train_x, 0.1 * np.ones([train_x.shape[0],1])])
        y = np.zeros([train_x.shape[0],self.NumWin*self.NumFea])
        WFSparse = list()
        distOfMaxAndMin = np.zeros(self.NumWin)
        meanOfEachWindow = np.zeros(self.NumWin)
        for i in range(self.NumWin):
            A1 = H1.dot(WF[i])
            scaler1 = preprocessing.MinMaxScaler(feature_range=(-1, 1)).fit(A1)
            A1 = scaler1.transform(A1)
            WeightFeaSparse  = self.sparse_bls(A1,H1).T
            WFSparse.append(WeightFeaSparse)
            T1 = H1.dot(WeightFeaSparse)
            # Stored so predict() normalises new data with the training statistics.
            meanOfEachWindow[i] = T1.mean()
            distOfMaxAndMin[i] = T1.max() - T1.min()
            T1 = (T1 - meanOfEachWindow[i])/distOfMaxAndMin[i]
            y[:,self.NumFea*i:self.NumFea*(i+1)] = T1
        H2 = np.hstack([y,0.1 * np.ones([y.shape[0],1])])
        T2 = self.tansig(H2.dot(WeightEnhan))
        T3 = np.hstack([y,T2])
        self.WeightTop = self.pinv(T3,self.C).dot(train_y)
        self.WFSparse = WFSparse
        self.meanOfEachWindow = meanOfEachWindow
        self.distOfMaxAndMin = distOfMaxAndMin
        self.WeightEnhan = WeightEnhan
        return self

    def predict(self,test_x):
        """Return predictions for ``test_x`` as a 1-D array of length ``n_samples``."""
        HH1 = np.hstack([test_x, 0.1 * np.ones([test_x.shape[0],1])])
        yy1=np.zeros([test_x.shape[0],self.NumWin*self.NumFea])
        for i in range(self.NumWin):
            TT1 = HH1.dot(self.WFSparse[i])
            TT1  = (TT1 - self.meanOfEachWindow[i])/self.distOfMaxAndMin[i]
            yy1[:,self.NumFea*i:self.NumFea*(i+1)] = TT1
        HH2 = np.hstack([yy1, 0.1 * np.ones([yy1.shape[0],1])])
        TT2 = self.tansig(HH2.dot(self.WeightEnhan))
        TT3 = np.hstack([yy1,TT2])
        NetoutTest = TT3.dot(self.WeightTop)
        NetoutTest = np.array(NetoutTest).reshape(-1,1).ravel()
        return NetoutTest

    def set_params(self, **parameters):
        """Set each keyword as an attribute and return ``self``."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def get_params(self,deep = False):
        """Return the constructor arguments as a dict (``deep`` is ignored)."""
        return {
            's':self.s,
            'C':self.C,
            'NumFea':self.NumFea,
            'NumWin':self.NumWin,
            'NumEnhan':self.NumEnhan
        }
                      
        
def adjust_bl_ada(train_X,test_X,train_y,test_y,name):
    """Grid-search an AdaBoost-BLS ensemble and log every combination to CSV.

    The search covers the BLS parameters (NumFea, NumWin, s, C, NumEnhan) and
    the AdaBoost parameters (n_estimators, learning_rate, loss). For every
    combination one row is appended to ``./Boost4/<name>_access.csv`` (run
    number + metrics) and one to ``./Boost4/<name>_parameter.csv`` (run number
    + parameters).

    The search is resumable: if the access file already exists, the run number
    in its last row is read and all combinations up to it are skipped.

    Parameters
    ----------
    train_X, test_X : training / test feature matrices.
    train_y, test_y : training / test targets (``train_y`` is flattened).
    name : tag inserted into the two output file names.
    """
    import itertools

    path2 = "./Boost4/" + name + "_" + "access.csv"
    path3 = "./Boost4/" + name + "_" + "parameter.csv"
    NumFea = list(range(2,40,4))
    NumWin = list(range(5,40,5))
    NumEnhan = list(range(5,60,10))
    S = [0.4,0.6,0.8,1,1.2,4]
    # NOTE(review): 1**-30 evaluates to 1.0; possibly a typo for another power of 2 - confirm.
    C = [2**-30,2**-10,2**-20,2**-40,1**-30]
    n_estimators = [50,100,200,300,400,500,600]
    learning_rate = [0.25,0.5,0.75,1]
    loss = ["linear","square"] # ,"exponential"
    # The progress total must count every searched factor, NumEnhan included.
    all_len = (len(n_estimators) * len(learning_rate) * len(loss) * len(NumFea)
               * len(NumWin) * len(S) * len(C) * len(NumEnhan))

    # Appending to a file does not create its directory.
    os.makedirs(os.path.dirname(path2), exist_ok=True)

    # Run number of the last finished combination (0 when starting fresh).
    nums = 0
    if os.path.exists(path2):
        data = pd.read_csv(path2,header=None)
        nums = int(data.values[-1,0])

    # product() varies the last factor fastest, matching the original nesting order.
    grid = itertools.product(NumFea, NumWin, S, C, NumEnhan,
                             n_estimators, learning_rate, loss)
    for num, (nf, nw, s, c, ne, n, lr, lo) in enumerate(grid, start=1):
        if num <= nums:
            continue
        print("train...{}/{}".format(num,all_len))
        # Fit on the data passed in, not on same-named notebook globals.
        # NOTE(review): newer scikit-learn releases name this keyword `estimator`
        # instead of `base_estimator` - confirm against the installed version.
        abrbl = AdaBoostRegressor(n_estimators=n,learning_rate=lr,loss=lo,base_estimator=BLSregressor(s=s,C=c,NumFea=nf,NumWin=nw,NumEnhan=ne)).fit(train_X,train_y.ravel())
        pred_test = abrbl.predict(test_X)
        pred_test = pred_test.reshape(-1,1)
        sample_n = pred_test.shape[0]
        feature_n = test_X.shape[1]
        mse,rmse,mae,r2,mad,mape,r2_adjusted = calculate(test_y,pred_test,sample_n,feature_n)
        all_m = [num,mse,rmse,mae,r2,mad,mape,r2_adjusted]
        all_p = [num,lo,lr,n,nf,nw,s,c,ne]
        print(all_m)
        with open(path2,"a",encoding="utf-8",newline="") as f:
            csv.writer(f).writerow(all_m)
        with open(path3,"a",encoding="utf-8",newline="") as f:
            csv.writer(f).writerow(all_p)
        print("end....",num)
        print("--------------------------------")
                                        
                                        
# Run the AdaBoost-BLS grid search on the train/test split; with name "Ada-BLS" the
# results are appended to ./Boost4/Ada-BLS_access.csv and ./Boost4/Ada-BLS_parameter.csv.
adjust_bl_ada(train_x_s,test_x_s,train_y_n,test_y_n,"Ada-BLS")

## Simple display results

### General

#### assess 1

In [None]:
# One bar chart per metric comparing all models; each chart is saved as
# ./results/parameter1/<metric>.pdf and then shown.
all_assess1 = pd.DataFrame(all_assess,columns=["Model","MSE","RMSE","MAE","R2","MAD","MAPE","R2_Adjusted"])

path1 = "./results/parameter1/"
mkdir(path1)

for metric_name in ("MSE","RMSE","MAE","MAD","MAPE","R2","R2_Adjusted"):
    plt.figure(figsize=(8,8))
    sns.barplot(x="Model",y=metric_name,data=all_assess1)
    path = path1 + metric_name + ".pdf"
    plt.savefig(path)
    plt.show()

![4](./img/4.png)

#### assess 2 

In [None]:
# Reshape the per-model metric rows into long form (one row per model and
# metric) and draw one grouped bar chart per metric.
# The MAPE label had a stray leading comma (",MAPE"), which ended up in the
# plotted data and in the output file name.
pa = ["PA","MSE","RMSE","MAE","R2","MAD","MAPE","R2_Adjusted"]
# pa[0] is a placeholder lining up with the model name in a[0].
all_assess2 = [[a[0], metric, value]
               for a in all_assess
               for metric, value in zip(pa[1:], a[1:])]
all_assess2 = pd.DataFrame(all_assess2,columns=["Model","Paremeter","Values"])

path1 = "./results/parameter2/"
mkdir(path1)

for n in np.unique(all_assess2["Paremeter"]):
    plt.figure(figsize=(8,8))
    sns.barplot(x="Paremeter",y="Values",data=all_assess2[all_assess2["Paremeter"]==n],hue="Model")
    path = path1 + n +".pdf"
    plt.savefig(path)
    plt.show()

![5](./img/5.png)

#### assess 3

In [None]:
# Long table with one row per (model, test sample): predicted value next to the real one.
all_assess3 = pd.DataFrame(all_mo,columns=["Model"])
all_assess3["Predicted"] = pd.DataFrame(all_pre)
# The real targets are repeated once per model so they line up with the predictions.
model_count = np.unique(all_assess3["Model"]).shape[0]
all_real = test_y_n.reshape(1,-1)[0].tolist() * model_count
all_assess3["Real"] = pd.DataFrame(all_real)

# Upper end of the diagonal reference line: the larger of the two maxima plus a margin.
real_peak = all_assess3["Real"].max()
predicted_peak = all_assess3["Predicted"].max()
all_max = (real_peak if real_peak > predicted_peak else predicted_peak) + 10000

path1 = "./results/"
mkdir(path1)

# Scatter of real vs. predicted for all models, with the y = x line for reference.
path = path1 + "all_pre_sca.pdf"
plt.figure(figsize=(8,8))
sns.relplot(x="Real", y="Predicted", data = all_assess3,hue="Model")
plt.plot([0,all_max],[0,all_max])
plt.title("Predict result")
plt.savefig(path,bbox_inches = 'tight')
plt.show()

#---------------------------------------------------------------------------
# One real-vs-predicted scatter plot per model column of `pred`, saved under ./results/scat/.
path1 = "./results/scat/"
mkdir(path1)

pred["Real"] = pd.DataFrame(test_y_n)
# Columns 2 .. second-to-last are iterated; the last column is the "Real" column set above.
for n in pred.columns[2:-1]:
    real_peak = pred["Real"].max()
    model_peak = pred[n].max()
    # End point of the y = x reference line: the larger maximum plus a margin.
    one_max = (real_peak if real_peak > model_peak else model_peak) + 10000
    plt.figure(figsize=(8,8))
    sns.relplot(x="Real",y=n,data = pred)
    plt.plot([0,one_max],[0,one_max])
    plt.ylabel("Predicted")
    plt.title(n)
    path = path1 + n + ".pdf"
    plt.savefig(path,bbox_inches = 'tight')
    plt.show()

#---------------------------------------------------------------------------

# Line plot of every model's predictions and the real values over the test samples.
path = path1 + "all_fit.pdf"

pred["Real"] = pd.DataFrame(test_y_n)
color_sequence = ["violet","tomato","greenyellow","deepskyblue","indigo","deeppink","cyan",
                "hotpink","aquamarine","limegreen","cornflowerblue","crimson","darkgoldenrod"]
plt.figure(figsize=(20, 10),edgecolor='white',facecolor='white')
for cs, mo in enumerate(np.unique(all_assess3["Model"])):
    all_one = all_assess3[all_assess3["Model"]==mo]
    # all_assess3 is built without an "Index" column, so fall back to the sample
    # position within the model; an existing "Index" column is still honoured.
    if "Index" in all_one.columns:
        sample_pos = all_one["Index"]
    else:
        sample_pos = np.arange(len(all_one))
    # Cycle through the palette so more models than colours cannot overrun it.
    plt.plot(sample_pos, all_one["Predicted"], '-o', label=mo,
             color=color_sequence[cs % len(color_sequence)], linewidth=3,markersize=5)
# The real values are the same for every model, so the last model's rows are used.
plt.plot(sample_pos, all_one["Real"], '-o', label="Real", color="gold", linewidth=3,markersize=5)
# plt.gcf().autofmt_xdate()  # auto-rotate the date tick labels
plt.xticks(fontsize=30)
plt.yticks(fontsize=30)
plt.xlabel("Sample",fontsize=30)
plt.ylabel("Confirmed",fontsize=30)
plt.grid()
plt.title("Predict fit",fontsize=30)
plt.legend(prop={"size":25})
plt.savefig(path)
plt.show()

![6](./img/6.png)
![7](./img/7.png)
![8](./img/8.png)

#### Plant results plot 1

In [None]:
def machine_ensemble_plot_t(true, DT, Catb, Ada, EXT, LR, SVM, KNN,
                            XG, Bagging, GBDT, RF, LGBM, dates=ts):
    """Plot the true series and the twelve model predictions over time.

    Every positional argument is one series drawn against the same x axis.
    `dates` holds strings in '%Y/%m/%d %H' format; its default is the value
    the global `ts` has when this function is defined.
    """
    # Build the x coordinates from the date strings
    xs = [datetime.strptime(stamp, '%Y/%m/%d %H') for stamp in dates]

    # Optional x-axis configuration (disabled):
    #plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y/%m/%d %H'))
    #plt.gca().xaxis.set_major_locator(mdates.DayLocator())
    palette = ['#1f77b4', '#aec7e8', '#ff7f0e', '#ffbb78', '#2ca02c',
               '#98df8a', '#d62728', '#ff9896', '#9467bd', '#c5b0d5',
               '#8c564b', '#c49c94', '#e377c2', '#f7b6d2', '#7f7f7f',
               '#c7c7c7', '#bcbd22', '#dbdb8d', '#17becf', '#9edae5']

    # (legend label, series) in drawing order; the position selects the colour
    curves = [
        ('True', true),
        ('DT', DT),
        ('Catb', Catb),
        ('Ada', Ada),
        ('EXT', EXT),
        ('LR', LR),
        ('SVM', SVM),
        ('KNN', KNN),
        ('XG', XG),
        ('Bagging', Bagging),
        ('GBDT', GBDT),
        ('RF', RF),
        ('LGBM', LGBM),
    ]

    plt.figure(num=None, figsize=(20, 10), dpi=80, facecolor='w', edgecolor='k')
    for slot, (tag, series) in enumerate(curves):
        plt.plot(xs, series, '-o', label=tag, color=palette[slot], linewidth=2)
    #plt.gcf().autofmt_xdate()  # auto-rotate the date tick labels
    plt.grid()
    plt.legend()
    plt.show()


machine_ensemble_plot_t(
    test_y,
    machine_ensemble[0], machine_ensemble[1], machine_ensemble[2],
    machine_ensemble[3], machine_ensemble[4], machine_ensemble[5],
    machine_ensemble[6], machine_ensemble[7], machine_ensemble[8],
    machine_ensemble[9], machine_ensemble[10], machine_ensemble[11],
)

![1](./img/1.png)

#### Plant results plot 2

In [None]:
def plot_evl(model_num, model_name, value, err, score_name):
    '''
    Draw one bar with an error bar per model for a single evaluation metric.

    model_num: number of models (int)
    model_name: model names, used as the x tick labels (list of str)
    value: the same evaluation metric for each model (list)
    err: the error of that metric for each model (list)
    score_name: name of the metric being drawn (str)
    '''
    palette = (
        '#1f77b4', '#aec7e8', '#ff7f0e', '#ffbb78', '#2ca02c',
        '#98df8a', '#d62728', '#ff9896', '#9467bd', '#c5b0d5',
        '#8c564b', '#c49c94', '#e377c2', '#f7b6d2', '#7f7f7f',
        '#c7c7c7', '#bcbd22', '#dbdb8d', '#17becf', '#9edae5',
    )

    # Tick positions 0 .. model_num-1, one per bar
    tick_positions = np.arange(model_num)

    plt.figure(num=None, figsize=(10, 8), dpi=80, facecolor='w', edgecolor='k')
    plt.title('Score of ' + score_name)
    slot = 0
    while slot < model_num:
        plt.bar(slot, value[slot], yerr=err[slot], color=palette[slot])
        slot += 1

    plt.xticks(tick_positions, model_name)
    
# Metric names and model names, then draw the chart for the first metric.
score_name = [
    'MSE', 'RMSE', 'MAE', 'MAD', 'MAPE', 'R square', 'R2_adjusted', 'RMSLE',
]
model_name = [
    'Ada', 'Bagging', 'Catb', 'DT', 'EXT', 'KNN', 'LGBM',
    'RF', 'SVM', 'XG', 'ANN', 'BLS', 'ELM', 'LSTM',
]
model_num = len(model_name)  # 14 models

i = 0
plot_evl(model_num, model_name, scores[i], errs[i], score_name[i])

![2](./img/2.png)

#### Latex table

In [None]:
# Print every entry of all_assess as one LaTeX table row: "v1 & v2 & ... \\".
assess = []
for a in all_assess:
    ass = []
    for i in range(len(a)):
        # Round np.float64 values to 4 decimals. The rounded value is written
        # back into `a`, so all_assess itself is modified as well.
        if isinstance(a[i], np.float64):
            a[i] = round(a[i],4)
        ass.append(str(a[i]))
    assess.append(ass)
sub = " & "
table = ""
for ae in assess:
    t = sub.join(ae) + " \\\\"
    print(t)

In [None]:
# Print every entry of all_assess as a LaTeX table row with a trailing "Test"
# cell, each row followed by an \hline on the next line.
assess = []
for a in all_assess:
    ass = []
    for i in range(len(a)):
        # Round np.float64 values to 4 decimals. The rounded value is written
        # back into `a`, so all_assess itself is modified as well.
        if isinstance(a[i], np.float64):
            a[i] = round(a[i],4)
        ass.append(str(a[i]))
    ass.append("Test")
    assess.append(ass)
sub = " & "
table = ""
for ae in assess:
    # "\\hline": the backslash has to be escaped; a bare "\h" is an invalid
    # escape sequence in a normal string literal. The printed text is unchanged.
    t = sub.join(ae) + " \\\\ \n \\hline"
    print(t)

#### Save models and results

In [None]:
import joblib

mkdir("./model/")

def save_model(wk):
    """Dump the eleven fitted models to ./model/<NAME>_<wk>.m with joblib.

    The models are read from the globals knn, dt, svr, ada, rf, gbrg, ext,
    lr, cbr, lgbr and xg, in that order, each one just before it is written.
    """
    def _dump(tag, estimator):
        # File name pattern: ./model/<tag>_<wk>.m
        joblib.dump(estimator, "./model/" + tag + "_" + wk + ".m")

    _dump("KNN", knn)
    _dump("DT", dt)
    _dump("SVR", svr)
    _dump("Ada", ada)
    _dump("RF", rf)
    _dump("GBDT", gbrg)
    _dump("ET", ext)
    _dump("LR", lr)
    _dump("CAT", cbr)
    _dump("LGB", lgbr)
    _dump("XGB", xg)

wk = "add_test"

save_model(wk)

#---------------------------------------------------------------------------

wk = "add_test"

# The plotting cells only create "./results/"; these files go to "./result/",
# so make sure that folder exists before writing into it.
mkdir("./result/")

# Save the sequence data, the predictions and the three assessment tables
# (each file is written once, without the index column).
Seq.to_csv("./result/Seq_"+wk+".csv",index=None)
pred.to_csv("./result/pred_"+wk+".csv",index=None)
all_assess1.to_csv("./result/all_assess1_"+wk+".csv",index=None)
all_assess2.to_csv("./result/all_assess2_"+wk+".csv",index=None)
all_assess3.to_csv("./result/all_assess3_"+wk+".csv",index=None)

# Data cleaning

In [None]:
# dat[3] holds Python timestamps
# Sets nums to 1 when any two consecutive entries are more than one day apart.
nums = 0
# Difference in whole days between each entry and the one before it
t = (dat[3][1:].reset_index(drop=True)  - dat[3][:-1].reset_index(drop=True)).dt.days
if(t[t>1].shape[0]>0):
    nums = nums+1
print(nums)

In [None]:
# Convert the millisecond Unix timestamps into 'YYYY-MM-DD' strings, then parse
# those strings again to get a datetime column truncated to the day.
da['Time'] = (
    pd.to_datetime(da['Time_s'], origin='unix', unit='ms')
    .apply(lambda x: x.strftime('%Y-%m-%d'))
)
da['datetime'] = pd.to_datetime(da['Time'], format='%Y-%m-%d')

![17](./img/17.png)

In [None]:
# Same conversion with second precision: millisecond Unix timestamps are
# formatted as 'YYYY-MM-DD HH:MM:SS' strings and parsed back into datetimes.
datt = pd.to_datetime(da['Time_s'],origin='unix',unit='ms').apply(lambda x: x.strftime('%Y-%m-%d %H:%M:%S'))
da['datetime'] = pd.to_datetime(datt,format='%Y-%m-%d %H:%M:%S')

![18](./img/18.png)

In [None]:
# Clean the data: turn the month-name feature into integer month numbers.
data["Release_date_month"] = (
    data["Release_date_month"]
    .astype(str)
    .replace({
        "January": "1",
        "February": "2",
        "March": "3",
        "April": "4",
        "May": "5",
        "June": "6",
        "July": "7",
        "August": "8",
        "September": "9",
        "October": "10",
        "November": "11",
        "December": "12",
    })
    .astype(int)
)

In [None]:
# Clean the data: normalise the "Release Date" strings.
# For every value other than "N/A  ": drop the last two characters, replace the
# month abbreviation with its two-digit number, remove the ordinal suffixes
# (th/st/nd/rd) and turn spaces into dashes. "N/A  " becomes "-1".
month_codes = (
    ("Jan", "01"), ("Feb", "02"), ("Mar", "03"), ("Apr", "04"),
    ("May", "05"), ("Jun", "06"), ("Jul", "07"), ("Aug", "08"),
    ("Sep", "09"), ("Oct", "10"), ("Nov", "11"), ("Dec", "12"),
)
ordinal_suffixes = ("th", "st", "nd", "rd")

ld = []
for saf in Merge_dataset["Release Date"].values:
    if saf == "N/A  ":
        ld.append("-1")
        continue
    saf = saf[:-2]
    # Months first, so the suffix removal below cannot damage a month name
    for abbr, code in month_codes:
        saf = saf.replace(abbr, code)
    for suffix in ordinal_suffixes:
        saf = saf.replace(suffix, "")
    saf = saf.replace(" ", "-")
    ld.append(saf)

# Assign the list positionally. Wrapping it in pd.DataFrame would align on
# index labels and misplace values when Merge_dataset has a non-default index.
Merge_dataset["Release Date"] = ld

In [None]:
# Extract time features from "YYYY-MM-DD HH:MM:SS" strings in d["DateTime"].
D_year = []
D_month = []
D_day = []
D_hour = []
D_minute = []
D_second = []
# The loop variables are deliberately not called `t`/`dt`: `dt` is also the
# name of the decision-tree model that save_model() dumps.
for stamp in d["DateTime"].values:
    parts = stamp.split(" ")
    ymd = parts[0].split("-")
    hms = parts[1].split(":")
    D_year.append(int(ymd[0]))
    D_month.append(int(ymd[1]))
    D_day.append(int(ymd[2]))
    D_hour.append(int(hms[0]))
    D_minute.append(int(hms[1]))
    D_second.append(int(hms[2]))
# Assign the lists positionally. Wrapping them in pd.DataFrame would align on
# index labels and misplace values when `d` has a non-default index.
d["D_year"] = D_year
d["D_month"] = D_month
d["D_day"] = D_day
d["D_hour"] = D_hour
d["D_minute"] = D_minute
d["D_second"] = D_second

In [None]:
# Compute the ISO week number and the ISO weekday (1 = Monday) of every row.
number_of_week = []
week_day = []
# The loop variable is not called `d`, which is the DataFrame used by the
# time-feature cell above.
for stamp in pd.to_datetime(data["DateTime"],format=("%Y-%m-%d %H:%M:%S")):
    iso = stamp.isocalendar()  # (ISO year, ISO week, ISO weekday)
    week_day.append(iso[2])
    number_of_week.append(iso[1])
# Assign the lists positionally. Wrapping them in pd.DataFrame would align on
# index labels and misplace values when `data` has a non-default index.
data["Number_of_week"] = number_of_week
data["Week_day"] = week_day

In [None]:
# Count missing values
def _count_flagged_files(c_name, is_flagged):
    """Scan ./video_game_csv/<c>/ for every c in c_name and count flagged files.

    Each entry of every folder is read with pd.read_csv and passed to
    `is_flagged`. Prints the flagged/total ratio (nan when no file was read,
    instead of dividing by zero) and returns (flagged_count, file_count).
    """
    num_d = 0
    num_c = 0
    for c in c_name:
        dir_name = "./video_game_csv/"+c+"/"
        for entry in os.listdir(dir_name):
            data = pd.read_csv(dir_name + entry)
            num_c = num_c+1
            if is_flagged(data):
                num_d = num_d+1
    ratio = num_d/num_c if num_c else float("nan")
    print("Missing :{}".format(ratio))
    return num_d,num_c


def lack_of_analysis1(c_name):
    """Count the files whose "Total" column contains at least one null value.

    Returns (files_with_null_total, files_read).
    """
    return _count_flagged_files(
        c_name, lambda frame: frame["Total"].isnull().any())


def lack_of_analysis2(c_name):
    """Count the files whose "Total" column contains at least one -1 marker.

    Returns (files_with_minus_one_total, files_read).
    """
    return _count_flagged_files(
        c_name, lambda frame: (frame["Total"] == -1).any())



In [None]:
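# Approach: the missing values are the weekly sales and the total sales. If a
# row's total is missing but the next week's weekly sales and total sales are
# both present, the missing total can be recovered as (next total - next weekly).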

def fill_1(c_name,num_d):
    """Replace null "Total" values with -1 in every CSV under ./video_game_csv/<c>/.

    Only files that contain a null "Total" are rewritten in place. Prints
    whether the number of rewritten files equals `num_d` and returns that
    number.
    """
    patched = 0
    for category in c_name:
        folder = "./video_game_csv/"+category+"/"
        for entry in os.listdir(folder):
            csv_path = folder + entry
            frame = pd.read_csv(csv_path)
            if not frame["Total"].isnull().any():
                continue  # nothing missing in this file
            frame["Total"] = frame["Total"].fillna(-1)
            frame.to_csv(csv_path,index=0)
            patched += 1
    print(patched == num_d)
    return patched

def fill_2(c_name):
    """Recover -1 "Total" values from the following row where possible.

    For every CSV under ./video_game_csv/<c>/ that contains a "Total" of -1:
    each such row (except the last row of the file) is set to
    int(next Total) - int(next Weekly), provided the next row's Total is not -1
    and its Weekly is not 'Pro'. The file is then rewritten in place.
    Returns the number of files that were rewritten.
    """
    touched = 0
    for category in c_name:
        folder = "./video_game_csv/"+category+"/"
        for entry in os.listdir(folder):
            csv_path = folder + entry
            frame = pd.read_csv(csv_path)
            gaps = frame[frame["Total"]==-1].index.tolist()
            if len(gaps) == 0:
                continue
            last_row = frame.shape[0]-1
            for now in gaps:
                if now >= last_row:
                    continue  # no following row to derive the value from
                next_w = frame.loc[now+1,"Weekly"]
                next_t = frame.loc[now+1,"Total"]
                if next_t == -1 or next_w == 'Pro':
                    continue
                frame.loc[now,"Total"] = int(next_t) - int(next_w)
            frame.to_csv(csv_path,index=0)
            touched += 1
    return touched

In [None]:
# Build time-series (sliding-window) data
def create_seq(data,num=2,later=1,ignore=None,y=None,drop_c=None):
    """Turn a DataFrame into a supervised sliding-window table.

    For every start row n, each windowed column `c` contributes the `num`
    consecutive values c[n : n+num] as columns "c_1" .. "c_<num>", and the
    target "Y" is the value of the target column at row n + num + later - 1.

    Parameters
    ----------
    data : pd.DataFrame
        Source table, rows in time order.
    num : int
        Window length.
    later : int
        How many rows after the window the target lies (1 = the next row).
    ignore : list or None
        Columns that are not windowed; their value at the target row is
        copied into a column of the same name instead.
    y : column label or None
        Target column; the last column is used when None.
    drop_c : list or None
        Columns dropped before anything else.

    Returns
    -------
    pd.DataFrame or None
        The window table, or None (after printing an error message) when
        `data` is not a DataFrame. Each windowed column is prepended, so the
        column blocks appear in reverse order of `data.columns`, followed by
        "Y" and then the ignored columns.
    """
    # Check the type first so that the error message is reachable
    if not isinstance(data, pd.DataFrame):
        print("Error: type is not pd.DataFrame.")
        return
    if drop_c is not None:
        data = data.drop(columns=drop_c)
    later = later-1
    offset = num+later  # row of the first target value

    # The branch number printed by the original four-way implementation
    if ignore is not None:
        print("1" if y is not None else "2")
    else:
        print("3" if y is not None else "4")

    # Target values: the named column, or the last column when y is not given
    if y is not None:
        target = data[y].values[offset:]
    else:
        target = data.iloc[offset:,-1].values
    seq = pd.DataFrame(target.tolist(),columns=["Y"])

    n_rows = data.shape[0]-offset
    for c in data.columns:
        if ignore is not None and c in ignore:
            # Ignored column: copy its value at the target row
            seq[c] = data[c].values[offset:]
            continue
        cl = data[c].values
        # One window of `num` values per start row
        cl_time = np.array([cl[n:n+num] for n in range(n_rows)])
        cl_name = [c+"_"+str(i+1) for i in range(num)]
        data2 = pd.DataFrame(cl_time.reshape(-1,num),columns=cl_name)
        seq = pd.concat([data2,seq],axis=1)  # prepend to what exists so far
    print("shape:{}".format(seq.shape))
    return seq

In [None]:
def create_seq(data,num=2,later=1,ignore=None,y=None,drop_c=None,is_sum=1):
    """Turn a DataFrame into a supervised sliding-window table with a summed target.

    For every start row n, each windowed column `c` contributes the `num`
    consecutive values c[n : n+num] as columns "c_1" .. "c_<num>". The target
    "Y" is taken from the target column starting at row n + num + later:
    that single value when is_sum == 1, otherwise the sum of the `is_sum`
    values starting there.

    NOTE(review): with later=1 the row directly after the window
    (n + num) is skipped - confirm this offset is intended.

    Parameters
    ----------
    data : pd.DataFrame
        Source table, rows in time order.
    num : int
        Window length.
    later : int
        Offset of the target relative to the window (see above).
    ignore : list or None
        Columns that are not windowed; their values from row
        num + later - 1 + is_sum onwards are copied under the same name.
    y : column label or None
        Target column; the last column is used when None.
    drop_c : list or None
        Columns dropped before anything else.
    is_sum : int
        Number of consecutive target values summed into each "Y".

    Returns
    -------
    pd.DataFrame or None
        The window table, or None (after printing an error message) when
        `data` is not a DataFrame. Each windowed column is prepended, so the
        column blocks appear in reverse order of `data.columns`, followed by
        "Y" and then the ignored columns.
    """
    # Check the type first so that the error message is reachable
    if not isinstance(data, pd.DataFrame):
        print("Error: type is not pd.DataFrame.")
        return
    if drop_c is not None:
        data = data.drop(columns=drop_c)
    later = later-1
    offset = num+later+is_sum
    n_rows = data.shape[0]-offset

    # The branch number printed by the original four-way implementation
    if ignore is not None:
        print("1" if y is not None else "2")
    else:
        print("3" if y is not None else "4")

    # Target column: the named one, or the last column by position when y is
    # not given (a label lookup with -1 or None would raise KeyError).
    target = data[y] if y is not None else data.iloc[:,-1]
    if is_sum == 1:
        seq = pd.DataFrame(target.values[offset:].tolist(),columns=["Y"])
    else:
        first = num+later+1  # first row summed for start row 0
        sy = [np.sum(target.iloc[n+first:n+first+is_sum].values)
              for n in range(n_rows)]
        seq = pd.DataFrame(sy,columns=["Y"])

    for c in data.columns:
        if ignore is not None and c in ignore:
            # Ignored column: copied without windowing
            seq[c] = data[c].values[offset:]
            continue
        cl = data[c].values
        # One window of `num` values per start row
        cl_time = np.array([cl[n:n+num] for n in range(n_rows)])
        cl_name = [c+"_"+str(i+1) for i in range(num)]
        data2 = pd.DataFrame(cl_time.reshape(-1,num),columns=cl_name)
        seq = pd.concat([data2,seq],axis=1)  # prepend to what exists so far
    print("shape:{}".format(seq.shape))
    return seq

# Data analysis

## Correlation 

In [None]:
# Heatmap of the pairwise correlation matrix of Data; colours saturate at 0.8.
f,ax = plt.subplots(figsize=(10,10))
sns.heatmap(Data.corr(),vmax=.8, square=True)
plt.show()

In [None]:
# Distribution of the absolute correlations between each column and Y.
p_name = "Y"
plt.figure(figsize=(8,8))
# sort_values()[:-1] drops the largest entry - presumably Y's correlation with
# itself (verify that nothing else sorts above it).
sns.distplot(np.abs(Data.drop(columns=["Time"]).corr()[p_name].sort_values()[:-1].values),label="pearson",bins=20)
plt.show()

In [None]:
# Distribution of the signed correlations between each column and Y.
p_name = "Y"
plt.figure(figsize=(8,8))
# sort_values()[:-1] drops the largest entry - presumably Y's correlation with
# itself (verify that nothing else sorts above it).
sns.distplot(Data.drop(columns=["Time"]).corr()[p_name].sort_values()[:-1].values,label="pearson",bins=20)
plt.show()

In [None]:
# Correlation of every column (except "Time") with Y, in ascending order.
Data.drop(columns=["Time"]).corr()["Y"].sort_values()

In [None]:
# Pairwise scatter plots of all columns of Data.
# NOTE(review): sns.pairplot builds its own figure, so this plt.figure call
# presumably has no effect on the plot size - confirm.
plt.figure(figsize=(20,20), dpi= 80)
sns.pairplot(Data, kind="scatter", plot_kws=dict(s=80, edgecolor="white", linewidth=2.5))
plt.show()

In [None]:
# Pairwise plots of df with a regression fit in each panel.
# NOTE(review): sns.pairplot builds its own figure, so this plt.figure call
# presumably has no effect on the plot size - confirm.
plt.figure(figsize=(10,8), dpi= 80)
sns.pairplot(df, kind="reg")
plt.show()

In [None]:
# Names of the columns whose absolute correlation with Y exceeds 0.40
# (this includes "Y" itself). np.abs has to wrap the values, not the boolean
# comparison, otherwise strong negative correlations are never selected.
y_corr = Train.corr().sort_values(by="Y")["Y"]
use = y_corr[np.abs(y_corr.values) > 0.40].index.tolist()

In [None]:
# Show the columns whose absolute correlation with Y exceeds 0.50.
# np.abs has to wrap the values, not the boolean comparison, otherwise strong
# negative correlations are never shown.
y_corr = Train.corr().sort_values(by="Y")["Y"]
y_corr[np.abs(y_corr.values) > 0.50]

In [None]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that scipy.stats is loaded.
import scipy.stats

# Pearson correlation and p-value of every column except "Time" against Y;
# the (correlation, p-value) results are collected in SP, keyed by column name.
y = Train["Y"]
SP = {}
for i in Train.drop(columns=["Time"]).columns:
    sp = scipy.stats.pearsonr(Train[i],y)
    print("{}  :  cor -- {} ， p-v  --- {}".format(i,sp[0],sp[1]))
    SP[i] = sp

## Importance

In [None]:
# Random-forest feature importance: standardise the features (every column of
# Train except "Y" and "Time"), fit a 100-tree regressor on Y, then print and
# plot the importances.
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

s = StandardScaler()
s.fit(Train.drop(columns=["Y","Time"]))
train_x1 = s.transform(Train.drop(columns=["Y","Time"]))
train_y1 = Train["Y"].values

# Note: this rebinds the global `rf`, which save_model() also reads.
rf = RandomForestRegressor(n_jobs=-1,n_estimators=100)
rf.fit(train_x1,train_y1)

feature = Train.drop(columns=["Y","Time"]).columns.tolist()
importances=rf.feature_importances_
indices=np.argsort(importances)[::-1] # indices ordered from largest to smallest importance
# Ranked listing: position, feature name (padded to 30 characters), importance
for f in range(train_x1.shape[1]):
    print ("%2d) %-*s %f" % (f+1,30,feature[indices[f]],importances[indices[f]]))
    
# Horizontal bars in ranked order (y tick labels are left at their defaults)
plt.figure(figsize=(10,10))
plt.title('Feature Importance-RandomForest')
plt.barh(range(train_x1.shape[1]),importances[indices],color='lightblue',align='center')
# plt.yticks(range(train_x_n.shape[1]),feature,rotation=90)
plt.ylim([-1,train_x1.shape[1]])
plt.tight_layout()
plt.show()

# Distribution of the importance values
plt.figure(figsize=(8,8))
sns.distplot(importances)
plt.show()

## Combine importance and correlation

In [None]:
# Rank-based feature score: importance rank plus correlation rank.
score = {}
n = train_x1.shape[1]
# `indices` orders the features from most to least important, so the most
# important feature starts with n points and the least important with 1.
for f in range(train_x1.shape[1]):
    score[feature[indices[f]]] = n
    n = n-1

# Column names ordered by ascending (signed) correlation with Y; the last entry
# is dropped - presumably Y itself, whose self-correlation sorts last.
# NOTE(review): assumes Train.corr() contains exactly the feature columns plus Y
# (no "Time"); otherwise score[fe[f]] below raises KeyError - confirm.
fe = Train.corr().sort_values(by="Y")["Y"].index.tolist()[:-1]
n = 1
# Add 1 point for the lowest correlation, 2 for the next one, and so on.
for f in range(train_x1.shape[1]):
    score[fe[f]] = score[fe[f]] + n
    n = n+1

In [None]:
# Sort the features by ascending combined score, then split the pairs into
# the names (y) and the scores (x) used by the bar charts.
score_results = sorted(score.items(), key=lambda pair: pair[1])
y = [name for name, _ in score_results]
x = [total for _, total in score_results]

In [None]:
# Horizontal bar chart of the combined feature scores (x), labelled with the
# feature names (y).
plt.rcParams['font.sans-serif'] = ['SimHei']  # Chinese font setting: SimHei
plt.rcParams['axes.unicode_minus'] = False  # fix the minus sign '-' being drawn as a box in saved figures
sns.set(font='SimHei',font_scale=1.5)

plt.figure(figsize=(10,15))
plt.title('Feature score')
plt.barh(range(train_x1.shape[1]),x,color='lightblue',align='center',label=y)
# One tick per bar, labelled with the feature name
y_pos = np.arange(len(y))
plt.yticks(y_pos, y)
plt.tight_layout()
plt.show()

In [None]:
# Vertical bar chart of the combined feature scores (x), labelled with the
# feature names (y) rotated by 90 degrees.
plt.rcParams['font.sans-serif'] = ['SimHei']  # Chinese font setting: SimHei
plt.rcParams['axes.unicode_minus'] = False  # fix the minus sign '-' being drawn as a box in saved figures
sns.set(font='SimHei',font_scale=1.5)

plt.figure(figsize=(15,10))
plt.title('Feature score')
plt.bar(range(train_x1.shape[1]),x,color='lightblue',align='center',label=y)
# One tick per bar, labelled with the feature name
x_pos = np.arange(len(y))
plt.xticks(x_pos, y,rotation=90)
plt.tight_layout()
plt.show()

## Variance inflation factor

![53](./img/53.png)

In [2]:
# Variance inflation factor (VIF) demo on a small four-column example frame.
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd
from sklearn.preprocessing import StandardScaler
df= pd.DataFrame(
{'a': [1, 1, 2, 3, 4],
'b': [2, 2, 3, 2, 1],
'c': [4, 6, 7, 8, 9],
'd': [4, 3, 4, 5, 4]}
)
# Standardise the columns, then compute one VIF per column index
X= StandardScaler().fit_transform(df)
vif= pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(X, i) for i in range(X.shape[1])]
vif['columns'] = df.columns
print(vif)

   VIF Factor columns
0       22.95       a
1        3.00       b
2       12.95       c
3        3.00       d


In [39]:
# Recompute the VIF of the first column by hand: regress column 0 on the
# remaining columns and print R^2 and 1 / (1 - R^2).
from statsmodels.regression.linear_model import OLS

results = OLS(X[:,0], X[:,1:]).fit()
print(results.rsquared)
print(1/(1-results.rsquared))

0.9564270152505446
22.949999999999985


解决多重共线的方法一般有以下三种：

（1）向后消除法(Backward elimination)：每次循环，遍历当前还没有剔除的变量，依次计算对应的 VIF，再去除最差的那个变量（也就是VIF值最大的变量），一直循环，直至变量数目少于预期个数或者所有的变量VIF值都小于VIF阈值。一般而言 VIF > 10，认为存在共线性。

（2）PCA降维：PCA降维后，所有提取的主成分两两正交（互不相关），所以不会再有共线性。

（3）岭回归分析法：岭回归在线性回归的基础上新增了一个 L2 惩罚项，从而缓解共线性带来的问题。

## VarianceThreshold

In [None]:
# VarianceThreshold demo: the first CSV column is the label, the rest are features.
import pandas as pd
data = pd.read_csv("digit recognizor.csv")
X = data.iloc[:,1:]
y = data.iloc[:,0]

from sklearn.feature_selection import VarianceThreshold
 
selector = VarianceThreshold()         # instantiate; with no argument the variance threshold defaults to 0
X_var0 = selector.fit_transform(X)         # new feature matrix after removing the features that fail the threshold
# This can also be written directly as X = VarianceThreshold().fit_transform(X)
# The result may get better, which means most of the filtered-out features were noise
# It may also get worse, which means many of the filtered-out features were useful