https://archive.ics.uci.edu/ml/datasets/student+performance

In [2]:
import numpy as np
import pandas as pd
from copy import deepcopy
from sklearn.utils import shuffle
from scipy.stats import ranksums
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
# from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder
import warnings
from math import log2
from time import time

def KFoldCV(model, data, n_fold=10):
#     num = len(data)
    diff = int(len(data)/n_fold)
    results = np.zeros((n_fold, 4))
    for i in range(n_fold):
        begin = diff*i
        end = diff*(i+1)
#         if i == n_fold-1:
#             end = -1
        test = data[begin:end]
        train = deepcopy(data)
        train = np.delete(train, range(begin, end),axis=0)
        X_train, y_train = train[:,:-1], train[:,-1]
        X_test, y = test[:,:-1], test[:,-1]
        predictY = model.fit(X_train, y_train).predict(X_test)
        mae = np.mean(abs((y-predictY)))
        stdErr = np.std(((y-predictY)))
        error=sum((y-predictY)**2)
        RMSE=np.sqrt(error/len(y))
        MAC = np.dot(y,predictY)**2/(np.dot(y, y)*np.dot(predictY, predictY))
#         print(mae, stdErr, RMSE, MAC)
        results[i,:] = [mae, stdErr, RMSE, MAC]
    return results

warnings.filterwarnings('ignore')

# data = pd.read_csv('./student-mat.csv', sep=';')
# por = pd.read_csv('./student-por.csv', sep=';')
# combine = pd.merge(math, por, on=["school","sex","age","address","famsize","Pstatus","Medu","Fedu","Mjob","Fjob","reason","nursery","internet"])
# data = data.values
# data = shuffle(data)

In [11]:
data=pd.read_csv('./exams.csv')
data['math_score']=data['math score']
data.drop(columns=['math score'],inplace=True)
data.drop(columns=['reading score'],inplace=True)
data.drop(columns=['writing score'],inplace=True)

cats = ['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course']
oe = OrdinalEncoder()
for c in cats:
    oe.fit(data[c].values.reshape(-1,1))
    data[c] = np.asarray(oe.transform(data[c].values.reshape(-1,1)),dtype=int)
#     np.asarray(data['gender'],dtype=int)
data = data.values
data = shuffle(data)

In [12]:
# data.shape (395, 33)
# np.savetxt('math.txt',data)
data.shape

(1000, 6)

In [4]:
t1 = time()
hidden_layer_sizes  = [100, 300, 500]
max_iter = [500, 1000,1500]
# hidden_layer_sizes  = [500]
# max_iter = [1000]
epochs=1
mlps = np.zeros((epochs,4))
for i in hidden_layer_sizes:
    for j in max_iter:
        for epoch in range(epochs):
            data = shuffle(data)
            model = MLPRegressor(hidden_layer_sizes=i, max_iter=j)
            result = KFoldCV(model, data)
            mlps[epoch,:] = np.mean(result, axis=0)
#             print(np.mean(result, axis=0))
        print(np.mean(mlps, axis=0))
#     [0.96503638 1.28823634 1.30661604 0.97660302]
print(time()-t1)

[10.7704353  13.13808694 13.19360253  0.96325136]
[10.53802629 12.79354589 12.93117427  0.96500862]
[10.50202469 12.86493812 12.90573321  0.96466753]
[10.49990913 12.85830608 12.89136609  0.96479579]
[10.50890642 12.78323161 12.9110008   0.96508799]
[10.49691524 12.86306954 12.8889675   0.96469626]
[10.55639212 12.87166213 12.92573326  0.96471062]
[10.48971926 12.82259669 12.88380252  0.96497915]
[10.51428861 12.85645728 12.92091978  0.96483108]
150.53565382957458


In [14]:
mlps

array([[10.49708927, 12.78568516, 12.88564314,  0.96506534],
       [10.58147643, 12.85437211, 12.94835564,  0.96464657],
       [10.51136407, 12.79752304, 12.86320654,  0.96492994],
       [10.47790754, 12.80428571, 12.85632311,  0.96492226],
       [10.52447424, 12.84432187, 12.90651716,  0.96489199],
       [10.49910328, 12.77616124, 12.8904822 ,  0.96522214],
       [10.5578901 , 12.83660394, 12.93931126,  0.96485764],
       [10.52107536, 12.81853855, 12.90387554,  0.96505331],
       [10.5286978 , 12.863341  , 12.93575292,  0.96471475],
       [10.50522347, 12.85753958, 12.90199089,  0.96481261],
       [10.51651894, 12.87767554, 12.91485625,  0.96463246],
       [10.51182459, 12.85346298, 12.88399365,  0.96475729],
       [10.51790596, 12.77564868, 12.9150444 ,  0.96518826],
       [10.48896818, 12.757954  , 12.83458862,  0.96499535],
       [10.51183577, 12.86958061, 12.90885101,  0.96465234],
       [10.51815204, 12.78844878, 12.87294337,  0.96497814],
       [10.51418004, 12.

In [5]:
t1 = time()
Cs = [10, 100, 1000]
gammas = [0.001, 0.01, 0.1]
# Cs = [1000]
# gammas = [0.001]
svms = np.zeros((epochs,4))
for i in range(len(Cs)):
    for j in range(len(gammas)):
        lc = len(Cs)
        lg = len(gammas)
        for epoch in range(epochs):
            data = shuffle(data)
            model = SVR(kernel = 'rbf', C = Cs[i], gamma = gammas[j])
            result = KFoldCV(model, data)
            svms[epoch,:] = np.mean(result, axis=0)
        print(np.mean(svms, axis=0))
#         [0.90084585 1.28205627 1.30138029 0.97616123]
print(time()-t1)

[11.36366477 13.86199146 13.97713241  0.95873437]
[10.6894108  13.04009821 13.09536214  0.96366152]
[10.73411193 13.0647072  13.13626223  0.96365063]
[10.66115976 12.99812131 13.09013294  0.96372076]
[10.53071363 12.81290342 12.9427764   0.9650078 ]
[10.68061617 12.87418181 13.10050357  0.96438947]
[10.55401268 12.80406678 12.95717199  0.96496384]
[10.58678854 12.87759021 12.9955522   0.96456892]
[10.84346636 13.27192395 13.40200551  0.96252197]
3.0399978160858154


In [15]:
svms

array([[10.55265309, 12.91135932, 12.96121856,  0.96445691],
       [10.53613806, 12.89480847, 12.95836585,  0.96453308],
       [10.56349846, 12.89703144, 12.9781195 ,  0.96458261],
       [10.54322054, 12.93271206, 12.97159624,  0.96441919],
       [10.57989849, 12.91855038, 12.97419243,  0.96441568],
       [10.54422089, 12.87879022, 12.94868123,  0.96449911],
       [10.53242466, 12.87917265, 12.94775777,  0.96459812],
       [10.54091675, 12.83903791, 12.92374584,  0.96477851],
       [10.54477993, 12.86189174, 12.96509862,  0.96465578],
       [10.55305094, 12.89110089, 12.96975662,  0.96459745],
       [10.54284385, 12.84146348, 12.97185249,  0.96464989],
       [10.49953087, 12.88643536, 12.92031439,  0.96462218],
       [10.54168813, 12.88879764, 12.96915171,  0.96473607],
       [10.5435933 , 12.92276307, 12.96614552,  0.96440535],
       [10.55600866, 12.89222411, 12.98503688,  0.96457528],
       [10.56713815, 12.87436489, 12.98802271,  0.96475304],
       [10.51646547, 12.

In [6]:
t1 = time()
n_estimators  = [100, 300, 500]
max_depth = [3, 4, 5]
# n_estimators  = [500]
# max_depth = [4]
rfs = np.zeros((epochs,4))
for i in n_estimators:
    for j in max_depth:
        for epoch in range(epochs):
            data = shuffle(data)
            model = RandomForestRegressor(n_estimators=i, max_depth=j)
            result = KFoldCV(model, data)
            rfs[epoch,:] = np.mean(result, axis=0)
        print(np.mean(rfs, axis=0))
#         [0.82798772 1.18901865 1.20639261 0.97973146]
print(time()-t1)

[10.62693427 13.00526258 13.0960595   0.96397901]
[10.54951289 12.90458019 12.99225627  0.9644773 ]
[10.51086126 12.87602    12.95920541  0.96455808]
[10.61073989 13.03838074 13.07541274  0.96383331]
[10.54239409 12.90507198 12.96048796  0.96434844]
[10.51117145 12.82912876 12.91653734  0.96480317]
[10.6051706  13.00312395 13.06839696  0.96384172]
[10.50530719 12.84148735 12.92496509  0.96489466]
[10.46907333 12.82539465 12.87922688  0.96478279]
18.1609947681427


In [16]:
rfs

array([[10.50371981, 12.83035242, 12.96062913,  0.96476865],
       [10.51447094, 12.93000406, 12.96474692,  0.96433851],
       [10.47177346, 12.83814261, 12.91418275,  0.96483733],
       [10.48295228, 12.83789353, 12.92986802,  0.96479649],
       [10.55029421, 12.9498245 , 12.98944056,  0.96418019],
       [10.48837408, 12.87853633, 12.89201561,  0.96461188],
       [10.51022087, 12.89210757, 12.92357099,  0.96442648],
       [10.50261109, 12.73921989, 12.92390231,  0.96537501],
       [10.54213681, 12.87338071, 12.97446532,  0.96458503],
       [10.55185355, 12.85583568, 12.98244224,  0.96455437],
       [10.51053395, 12.8460504 , 12.93470633,  0.96486805],
       [10.53985992, 12.82989303, 12.95838634,  0.96479536],
       [10.56097272, 12.91791281, 13.00311593,  0.96433031],
       [10.51935891, 12.93000476, 12.94252727,  0.96439548],
       [10.48156401, 12.8493928 , 12.89600681,  0.96474567],
       [10.49113719, 12.85744123, 12.92113676,  0.96465012],
       [10.51242014, 12.

In [7]:
t1 = time()
n_estimators= [100, 300, 500]
max_depth = [3, 4, 5]
# n_estimators  = [100]
# max_depth = [3]
xgbs = np.zeros((epochs,4))
for i in n_estimators:
    for j in max_depth:
        for epoch in range(epochs):
            data = shuffle(data)
            model = XGBRegressor(n_estimators=i, max_depth=j)
            result = KFoldCV(model, data)
            xgbs[epoch,:] = np.mean(result, axis=0)
        print(np.mean(xgbs, axis=0))
#         [0.89059432 1.2367164  1.25472816 0.97802632]
print(time()-t1)

[10.5337089  12.96381401 12.99323168  0.96423234]
[10.89562072 13.31154558 13.41640657  0.96226039]
[11.03628894 13.56497613 13.66044379  0.96060372]
[10.72734224 13.14371941 13.2341252   0.96312108]
[11.29624695 13.86559754 13.92460357  0.95904617]
[11.36702077 14.06432499 14.11065934  0.95758407]
[10.74045938 13.20964211 13.2789101   0.96285298]
[11.26637544 13.82333309 13.94620375  0.95933583]
[11.50977552 14.22714875 14.27494127  0.95705736]
16.167994260787964


In [17]:
xgbs

array([[10.57886198, 12.99494391, 13.04114744,  0.96410549],
       [10.51842529, 12.90851125, 12.95365621,  0.96446914],
       [10.52392659, 12.99800914, 13.04292003,  0.96396464],
       [10.52849908, 12.9542272 , 12.9955638 ,  0.96436901],
       [10.5531063 , 12.97114243, 12.98883651,  0.96419041],
       [10.52065941, 12.95613619, 13.0276614 ,  0.96418176],
       [10.44807971, 12.88234509, 12.89572535,  0.96441466],
       [10.57878542, 12.96204618, 13.06274521,  0.96418091],
       [10.58631935, 12.98783152, 13.03866286,  0.96398684],
       [10.56918261, 12.960393  , 13.04075313,  0.96420731],
       [10.43504967, 12.81233638, 12.90253259,  0.96500598],
       [10.48650392, 12.92163877, 12.94822504,  0.96440278],
       [10.49646539, 12.89561921, 12.96143396,  0.96445078],
       [10.43440881, 12.85675147, 12.87738679,  0.96484895],
       [10.52388782, 12.9139282 , 12.98902812,  0.96441421],
       [10.47606824, 12.80632693, 12.94821616,  0.96503541],
       [10.53881771, 12.

In [8]:
t1 = time()
n_estimators= [100,300,500]
max_depth = [3,4, 5]
# n_estimators  = [500]
# max_depth = [3]
# epochs=20
cats = np.zeros((epochs,4))
for i in n_estimators:
    for j in max_depth:
        for epoch in range(epochs):
            data = shuffle(data)
            model = CatBoostRegressor(n_estimators=i, max_depth=j,verbose=False)
            result = KFoldCV(model, data)
            cats[epoch,:] = np.mean(result, axis=0)
#             print(np.mean(result, axis=0))
        print(np.mean(cats, axis=0))
#     [0.86499577 1.17583629 1.19355378 0.98050501]
# cats=np.array([[0.84773321, 1.18530512, 1.19306331, 0.98084826],
#        [0.87318519, 1.19095516, 1.20408149, 0.97999474],
#        [0.85981984, 1.16350233, 1.1858332 , 0.98023445],
#        [0.86596327, 1.16306753, 1.18090649, 0.98112577],
#        [0.86587085, 1.15946447, 1.16665111, 0.98154846],
#        [0.85216804, 1.18104583, 1.20084337, 0.98039339],
#        [0.85173353, 1.15397286, 1.19102704, 0.98076268],
#        [0.8645997 , 1.17639364, 1.18768641, 0.98105376],
#        [0.84649538, 1.16063863, 1.17129066, 0.98086062],
#        [0.87962123, 1.18580982, 1.20171304, 0.98037885],
#        [0.86219365, 1.15704304, 1.19524932, 0.98061188],
#        [0.87495255, 1.17701208, 1.1898502 , 0.98088003],
#        [0.87474074, 1.17512204, 1.17834407, 0.98084475],
#        [0.87032843, 1.21080409, 1.22664114, 0.97900683],
#        [0.86057597, 1.17325285, 1.19423038, 0.98005067],
#        [0.85329401, 1.15267492, 1.1737014 , 0.98045111],
#        [0.87063903, 1.16437776, 1.18565864, 0.98124465],
#        [0.89037259, 1.20667152, 1.23498653, 0.97889956],
#        [0.8647609 , 1.18759166, 1.19780891, 0.98053437],
#        [0.87086728, 1.19202033, 1.21150895, 0.98037537]])
print(time()-t1)

[10.37747275 12.72436499 12.79980348  0.96551372]
[10.58519651 12.91870298 13.05337143  0.96429616]
[10.73015337 13.16642433 13.23401671  0.96307028]
[10.35538067 12.75758874 12.79504926  0.96537849]
[10.67308177 13.05974276 13.17616899  0.9635503 ]
[10.8512124  13.33207559 13.43049909  0.96213997]
[10.53612196 12.91370757 12.99000992  0.96447462]
[10.77519322 13.14887232 13.23097987  0.9630522 ]
[11.04043806 13.54038365 13.64262577  0.96086491]
12.043314695358276


In [18]:
cats

array([[10.37925663, 12.78408265, 12.82481093,  0.96516762],
       [10.39282988, 12.76265586, 12.84776208,  0.9651003 ],
       [10.46903899, 12.75896029, 12.89643248,  0.96525215],
       [10.41397   , 12.8195005 , 12.85017683,  0.96499637],
       [10.4249591 , 12.69964867, 12.86365769,  0.9657004 ],
       [10.43008498, 12.75747407, 12.85532296,  0.96525789],
       [10.42255747, 12.81274752, 12.85425426,  0.96490365],
       [10.37894366, 12.74096763, 12.81729596,  0.9654078 ],
       [10.43873645, 12.85028132, 12.90209008,  0.96478502],
       [10.43445176, 12.71339938, 12.89477133,  0.96539347],
       [10.46154382, 12.82042994, 12.91211589,  0.96502546],
       [10.4431002 , 12.80465856, 12.89202044,  0.96501678],
       [10.50638007, 12.85535096, 12.95584764,  0.96477818],
       [10.40182231, 12.7935813 , 12.86503258,  0.96516637],
       [10.41661251, 12.78548677, 12.82951229,  0.96494549],
       [10.44134227, 12.84633906, 12.88016714,  0.96484527],
       [10.43944458, 12.

# hierarchical CatBoost

In [4]:
from sklearn.model_selection import GridSearchCV


def deepCatBoostBackward(train, test,importances,ind,n_estimators, max_depth):
    s = 0
    inds = []
    inds2 = []
    inds3 = []
    for i in ind:
        s = s + importances[i]
        inds.append(i)
#     70% importance for the first layer
        if (s > 0.9):
            model = gridSearch4CatBoost(train[:,inds], train[:,-1],n_estimators, max_depth)
            predY = model.fit(train[:,inds], train[:,-1]).predict(train[:,inds])
            predY2 = model.predict(test[:,inds])
        else:
            inds2.append(i)
#     20% importance for the second layer
            if (s > 0.95):
                tempTrain = np.array([train[:,inds2],predY])
                tempTest = np.array([test[:,inds2],predY2])
                model = gridSearch4CatBoost(tempTrain, train[:,-1],n_estimators, max_depth)
                predYY = model.fit(tempTrain, train[:,-1]).predict(tempTrain)
                predYY2 = model.predict(tempTest)
            else:
                inds3.append(i)
#     10% importance for the third layer
                if (s == 1):
                    tempTrain = np.array([train[:,inds3],predY,predYY])
                    tempTest = np.array([test[:,inds3],predY2,predYY2])
                    model = gridSearch4CatBoost(tempTrain, train[:,-1],n_estimators, max_depth)
#                     predY = model.fit(tempTrain, train[:,-1]).predict(tempTrain)
                    predY2 = model.predict(tempTest)
    return predY2

def deepCatBoostForward(train, test,importances,ind,n_estimators, max_depth):
#     ind = ind[::-1]
    s = 0
    inds = []
    inds2 = []
    inds3 = []
    for i in ind:
        s = s + importances[i]
        inds.append(i)
#     10% importance for the first layer
        if (s > 0.05):
            model = gridSearch4CatBoost(train[:,inds], train[:,-1],n_estimators, max_depth)
            predY = model.fit(train[:,inds], train[:,-1]).predict(train[:,inds])
            predY2 = model.predict(test[:,inds])
        else:
            inds2.append(i)
#     20% importance for the second layer
            if (s > 0.1):
                tempTrain = np.array([train[:,inds2],predY])
                tempTest = np.array([test[:,inds2],predY2])
                model = gridSearch4CatBoost(tempTrain, train[:,-1],n_estimators, max_depth)
                predYY = model.fit(tempTrain, train[:,-1]).predict(tempTrain)
                predYY2 = model.predict(tempTest)
            else:
                inds3.append(i)
#     10% importance for the third layer
                if (s == 1):
                    tempTrain = np.array([train[:,inds3],predY,predYY])
                    tempTest = np.array([test[:,inds3],predY2,predYY2])
                    model = gridSearch4CatBoost(tempTrain, train[:,-1],n_estimators, max_depth)
#                     predY = model.fit(tempTrain, train[:,-1]).predict(tempTrain)
                    predY2 = model.predict(tempTest)
    return predY2

def gridSearch4CatBoost(X,y,n_estimators, max_depth):
#     cbc = CatBoostRegressor(verbose=False,task_type="GPU",devices='0:1')
# #     cbc = RandomForestRegressor()
#     grid = {'max_depth': [3,4,5],'n_estimators':[100,300,500]}
#     gscv = GridSearchCV (estimator = cbc, param_grid = grid, cv = 10)
#     gscv.fit(X,y)
#     return gscv.best_estimator_
    cbc = CatBoostRegressor(n_estimators=n_estimators, max_depth=max_depth,verbose=False,task_type="GPU",devices='0:1').fit(X,y)
    return cbc
#     return model.fit(X,y).predict(testX)

def KFoldForward(data, ind,n_estimators, max_depth, n_fold=10):
#     num = len(data)
    diff = int(len(data)/n_fold)
    results = np.zeros((n_fold, 4))
    for i in range(n_fold):
        begin = diff*i
        end = diff*(i+1)
#         if i == n_fold-1:
#             end = -1
        test = data[begin:end]
        train = deepcopy(data)
        train = np.delete(train, range(begin, end),axis=0)
        X_train, y_train = train[:,:-1], train[:,-1]
        X_test, y = test[:,:-1], test[:,-1]
#         predictY = model.fit(X_train, y_train).predict(X_test)
        predictY = deepCatBoostForward(train, test,importances,ind,n_estimators, max_depth)
        mae = np.mean(abs((y-predictY)))
        stdErr = np.std(((y-predictY)))
        error=sum((y-predictY)**2)
        RMSE=np.sqrt(error/len(y))
        MAC = np.dot(y,predictY)**2/(np.dot(y, y)*np.dot(predictY, predictY))
#         print(mae, stdErr, RMSE, MAC)
        results[i,:] = [mae, stdErr, RMSE, MAC]
    return results

def KFoldBackward(data, ind,n_estimators, max_depth, n_fold=10):
#     num = len(data)
    diff = int(len(data)/n_fold)
    results = np.zeros((n_fold, 4))
    for i in range(n_fold):
        begin = diff*i
        end = diff*(i+1)
#         if i == n_fold-1:
#             end = -1
        test = data[begin:end]
        train = deepcopy(data)
        train = np.delete(train, range(begin, end),axis=0)
        X_train, y_train = train[:,:-1], train[:,-1]
        X_test, y = test[:,:-1], test[:,-1]
#         predictY = model.fit(X_train, y_train).predict(X_test)
        predictY = deepCatBoostBackward(train, test,importances,ind,n_estimators, max_depth)
        mae = np.mean(abs((y-predictY)))
        stdErr = np.std(((y-predictY)))
        error=sum((y-predictY)**2)
        RMSE=np.sqrt(error/len(y))
        MAC = np.dot(y,predictY)**2/(np.dot(y, y)*np.dot(predictY, predictY))
#         print(mae, stdErr, RMSE, MAC)
        results[i,:] = [mae, stdErr, RMSE, MAC]
    return results

In [5]:
t1 = time()
n_estimators= [100,300,500]
max_depth = [3,4, 5]
# n_estimators  = [500]
# max_depth = [5]
epochs=1
cats = np.zeros((epochs,4))
for i in n_estimators:
    for j in max_depth:
        for epoch in range(epochs):
            data = shuffle(data)
            model = RandomForestRegressor(n_estimators=100, max_depth=3).fit(data[:,:-1], data[:,-1])
#             model = CatBoostRegressor(n_estimators=300, max_depth=3,verbose=False,task_type="GPU",devices='0:1').fit(data[:,:-1], data[:,-1])#n_estimators=i, max_depth=j,
            importances = model.feature_importances_
            ind = np.argsort(importances)
        #     model = CatBoostRegressor(n_estimators=i, max_depth=j,verbose=False,task_type="GPU",devices='0:1')
            result = KFoldForward(data, ind,n_estimators=i, max_depth=j)
            cats[epoch,:] = np.mean(result, axis=0)
        print(np.mean(cats, axis=0))
print(time()-t1)

[10.31794775 12.59744585 12.71826218  0.9661297 ]
[10.50016582 12.86498933 12.9332439   0.96462085]
[10.67538711 13.13454818 13.18518262  0.96317516]
[10.34480213 12.73678409 12.76954871  0.96540485]
[10.5283942  12.91824589 13.00903076  0.96442466]
[10.82209096 13.25896912 13.35505148  0.9625322 ]
[10.29692961 12.61213251 12.6673503   0.96594111]
[10.32715859 12.57061756 12.69966222  0.96634898]
[10.37048235 12.65949302 12.75112944  0.96587143]
982.847273349762


In [8]:
t1 = time()
n_estimators= [100,300,500]
max_depth = [3,4, 5]
n_estimators  = [500]
max_depth = [3]
epochs=20
cats = np.zeros((epochs,4))
for i in n_estimators:
    for j in max_depth:
        for epoch in range(epochs):
            data = shuffle(data)
            model = RandomForestRegressor().fit(data[:,:-1], data[:,-1])#n_estimators=500, max_depth=5
#             model = CatBoostRegressor(n_estimators=300, max_depth=3,verbose=False,task_type="GPU",devices='0:1').fit(data[:,:-1], data[:,-1])#n_estimators=i, max_depth=j,
            importances = model.feature_importances_
            ind = np.argsort(importances)
        #     model = CatBoostRegressor(n_estimators=i, max_depth=j,verbose=False,task_type="GPU",devices='0:1')
            result = KFoldForward(data, ind,n_estimators=i, max_depth=j)
            cats[epoch,:] = np.mean(result, axis=0)
        print(np.mean(cats, axis=0))
print(time()-t1)
# cats = np.array([[10.28976097, 12.59481721, 12.65070932,  0.96624157],
#        [10.30024397, 12.63062878, 12.69241147,  0.96602366],
#        [10.33083445, 12.59531248, 12.69207518,  0.96621038],
#        [10.24792874, 12.58840255, 12.61996367,  0.96629239],
#        [10.30794189, 12.6137253 , 12.69108725,  0.96614657],
#        [10.29798972, 12.62405592, 12.67647327,  0.96602566],
#        [10.32873924, 12.61653428, 12.71578044,  0.9661013 ],
#        [10.28083714, 12.57314053, 12.62959809,  0.96616244],
#        [10.30403658, 12.63575339, 12.68464483,  0.9660592 ],
#        [10.26307977, 12.59914586, 12.64763497,  0.96622729],
#        [10.32173914, 12.6809491 , 12.72575812,  0.96570388],
#        [10.28004193, 12.61886929, 12.65076916,  0.96597413],
#        [10.3444059 , 12.61383646, 12.74207457,  0.96609326],
#        [10.3266248 , 12.59553247, 12.74308068,  0.96601364],
#        [10.34485756, 12.65096954, 12.7355711 ,  0.96579763],
#        [10.30052081, 12.5984135 , 12.68527344,  0.96601561],
#        [10.26237152, 12.56257937, 12.61511786,  0.96626345],
#        [10.36245171, 12.62669499, 12.71238474,  0.96595044],
#        [10.32223432, 12.68663689, 12.72457832,  0.96568758],
#        [10.28316186, 12.56110305, 12.67896506,  0.96627807]])



[10.3049901  12.61335505 12.68569758  0.96606341]
9377.573452949524


In [3]:
catsf = np.array([[10.28976097, 12.59481721, 12.65070932,  0.96624157],
       [10.30024397, 12.63062878, 12.69241147,  0.96602366],
       [10.33083445, 12.59531248, 12.69207518,  0.96621038],
       [10.24792874, 12.58840255, 12.61996367,  0.96629239],
       [10.30794189, 12.6137253 , 12.69108725,  0.96614657],
       [10.29798972, 12.62405592, 12.67647327,  0.96602566],
       [10.32873924, 12.61653428, 12.71578044,  0.9661013 ],
       [10.28083714, 12.57314053, 12.62959809,  0.96616244],
       [10.30403658, 12.63575339, 12.68464483,  0.9660592 ],
       [10.26307977, 12.59914586, 12.64763497,  0.96622729],
       [10.32173914, 12.6809491 , 12.72575812,  0.96570388],
       [10.28004193, 12.61886929, 12.65076916,  0.96597413],
       [10.3444059 , 12.61383646, 12.74207457,  0.96609326],
       [10.3266248 , 12.59553247, 12.74308068,  0.96601364],
       [10.34485756, 12.65096954, 12.7355711 ,  0.96579763],
       [10.30052081, 12.5984135 , 12.68527344,  0.96601561],
       [10.26237152, 12.56257937, 12.61511786,  0.96626345],
       [10.26237152, 12.56257937, 12.61511786,  0.96626345],
       [10.32223432, 12.68663689, 12.72457832,  0.96568758],
       [10.28316186, 12.56110305, 12.67896506,  0.96627807]])

In [13]:
t1 = time()
n_estimators= [100,300,500]
max_depth = [3,4, 5]
# n_estimators  = [500]
# max_depth = [3]
# epochs=20
cats = np.zeros((epochs,4))
for i in n_estimators:
    for j in max_depth:
        for epoch in range(epochs):
            data = shuffle(data)
            model = RandomForestRegressor(n_estimators=100, max_depth=3).fit(data[:,:-1], data[:,-1])
#             model = CatBoostRegressor(n_estimators=300, max_depth=3,verbose=False,task_type="GPU",devices='0:1').fit(data[:,:-1], data[:,-1])#n_estimators=i, max_depth=j,
            importances = model.feature_importances_
            ind = np.argsort(importances)
        #     model = CatBoostRegressor(n_estimators=i, max_depth=j,verbose=False,task_type="GPU",devices='0:1')
            result = KFoldBackward(data, ind[::-1],n_estimators=i, max_depth=j)
            cats[epoch,:] = np.mean(result, axis=0)
        print(np.mean(cats, axis=0))
print(time()-t1)

[10.36614959 12.69420403 12.79676066  0.96565374]
[10.47310438 12.91071287 12.9638704   0.96450822]
[10.62711653 13.06638053 13.08613655  0.96356736]
[10.42653544 12.74791228 12.78826384  0.9652562 ]
[10.62697537 13.02775326 13.09210136  0.9638165 ]
[10.70271048 13.04682205 13.17844813  0.96358623]
[10.35925177 12.69167011 12.73485574  0.9656374 ]
[10.2762919  12.55890107 12.63178806  0.96627053]
[10.28438036 12.6139164  12.69082185  0.9660747 ]
756.545571565628


In [19]:
mlps = np.array([[10.49708927, 12.78568516, 12.88564314,  0.96506534],
       [10.58147643, 12.85437211, 12.94835564,  0.96464657],
       [10.51136407, 12.79752304, 12.86320654,  0.96492994],
       [10.47790754, 12.80428571, 12.85632311,  0.96492226],
       [10.52447424, 12.84432187, 12.90651716,  0.96489199],
       [10.49910328, 12.77616124, 12.8904822 ,  0.96522214],
       [10.5578901 , 12.83660394, 12.93931126,  0.96485764],
       [10.52107536, 12.81853855, 12.90387554,  0.96505331],
       [10.5286978 , 12.863341  , 12.93575292,  0.96471475],
       [10.50522347, 12.85753958, 12.90199089,  0.96481261],
       [10.51651894, 12.87767554, 12.91485625,  0.96463246],
       [10.51182459, 12.85346298, 12.88399365,  0.96475729],
       [10.51790596, 12.77564868, 12.9150444 ,  0.96518826],
       [10.48896818, 12.757954  , 12.83458862,  0.96499535],
       [10.51183577, 12.86958061, 12.90885101,  0.96465234],
       [10.51815204, 12.78844878, 12.87294337,  0.96497814],
       [10.51418004, 12.80720822, 12.87322749,  0.96489738],
       [10.54494063, 12.90340229, 12.94266005,  0.96458269],
       [10.55739559, 12.82326593, 12.93500399,  0.96500314],
       [10.53123342, 12.82465294, 12.92804465,  0.96487665]])
svms = np.array([[10.55265309, 12.91135932, 12.96121856,  0.96445691],
       [10.53613806, 12.89480847, 12.95836585,  0.96453308],
       [10.56349846, 12.89703144, 12.9781195 ,  0.96458261],
       [10.54322054, 12.93271206, 12.97159624,  0.96441919],
       [10.57989849, 12.91855038, 12.97419243,  0.96441568],
       [10.54422089, 12.87879022, 12.94868123,  0.96449911],
       [10.53242466, 12.87917265, 12.94775777,  0.96459812],
       [10.54091675, 12.83903791, 12.92374584,  0.96477851],
       [10.54477993, 12.86189174, 12.96509862,  0.96465578],
       [10.55305094, 12.89110089, 12.96975662,  0.96459745],
       [10.54284385, 12.84146348, 12.97185249,  0.96464989],
       [10.49953087, 12.88643536, 12.92031439,  0.96462218],
       [10.54168813, 12.88879764, 12.96915171,  0.96473607],
       [10.5435933 , 12.92276307, 12.96614552,  0.96440535],
       [10.55600866, 12.89222411, 12.98503688,  0.96457528],
       [10.56713815, 12.87436489, 12.98802271,  0.96475304],
       [10.51646547, 12.85539206, 12.94128389,  0.96488786],
       [10.5703738 , 12.92927482, 12.97138278,  0.96439332],
       [10.52164072, 12.9118744 , 12.94880407,  0.96448353],
       [10.55874059, 12.93593009, 12.97524587,  0.96440243]])
rfs = np.array([[10.50371981, 12.83035242, 12.96062913,  0.96476865],
       [10.51447094, 12.93000406, 12.96474692,  0.96433851],
       [10.47177346, 12.83814261, 12.91418275,  0.96483733],
       [10.48295228, 12.83789353, 12.92986802,  0.96479649],
       [10.55029421, 12.9498245 , 12.98944056,  0.96418019],
       [10.48837408, 12.87853633, 12.89201561,  0.96461188],
       [10.51022087, 12.89210757, 12.92357099,  0.96442648],
       [10.50261109, 12.73921989, 12.92390231,  0.96537501],
       [10.54213681, 12.87338071, 12.97446532,  0.96458503],
       [10.55185355, 12.85583568, 12.98244224,  0.96455437],
       [10.51053395, 12.8460504 , 12.93470633,  0.96486805],
       [10.53985992, 12.82989303, 12.95838634,  0.96479536],
       [10.56097272, 12.91791281, 13.00311593,  0.96433031],
       [10.51935891, 12.93000476, 12.94252727,  0.96439548],
       [10.48156401, 12.8493928 , 12.89600681,  0.96474567],
       [10.49113719, 12.85744123, 12.92113676,  0.96465012],
       [10.51242014, 12.78956126, 12.94886437,  0.96515911],
       [10.52942762, 12.87599571, 12.94635579,  0.96465507],
       [10.52257925, 12.86119259, 12.90738945,  0.96461612],
       [10.4734361 , 12.81667768, 12.89220562,  0.96496302]])
xgbs = np.array([[10.57886198, 12.99494391, 13.04114744,  0.96410549],
       [10.51842529, 12.90851125, 12.95365621,  0.96446914],
       [10.52392659, 12.99800914, 13.04292003,  0.96396464],
       [10.52849908, 12.9542272 , 12.9955638 ,  0.96436901],
       [10.5531063 , 12.97114243, 12.98883651,  0.96419041],
       [10.52065941, 12.95613619, 13.0276614 ,  0.96418176],
       [10.44807971, 12.88234509, 12.89572535,  0.96441466],
       [10.57878542, 12.96204618, 13.06274521,  0.96418091],
       [10.58631935, 12.98783152, 13.03866286,  0.96398684],
       [10.56918261, 12.960393  , 13.04075313,  0.96420731],
       [10.43504967, 12.81233638, 12.90253259,  0.96500598],
       [10.48650392, 12.92163877, 12.94822504,  0.96440278],
       [10.49646539, 12.89561921, 12.96143396,  0.96445078],
       [10.43440881, 12.85675147, 12.87738679,  0.96484895],
       [10.52388782, 12.9139282 , 12.98902812,  0.96441421],
       [10.47606824, 12.80632693, 12.94821616,  0.96503541],
       [10.53881771, 12.93339016, 12.98809264,  0.96431833],
       [10.54238486, 12.95703225, 13.02064334,  0.9642589 ],
       [10.44647316, 12.86593061, 12.91987862,  0.96466773],
       [10.55382651, 12.85465195, 12.99457095,  0.96475651]])
cats = np.array([[10.37925663, 12.78408265, 12.82481093,  0.96516762],
       [10.39282988, 12.76265586, 12.84776208,  0.9651003 ],
       [10.46903899, 12.75896029, 12.89643248,  0.96525215],
       [10.41397   , 12.8195005 , 12.85017683,  0.96499637],
       [10.4249591 , 12.69964867, 12.86365769,  0.9657004 ],
       [10.43008498, 12.75747407, 12.85532296,  0.96525789],
       [10.42255747, 12.81274752, 12.85425426,  0.96490365],
       [10.37894366, 12.74096763, 12.81729596,  0.9654078 ],
       [10.43873645, 12.85028132, 12.90209008,  0.96478502],
       [10.43445176, 12.71339938, 12.89477133,  0.96539347],
       [10.46154382, 12.82042994, 12.91211589,  0.96502546],
       [10.4431002 , 12.80465856, 12.89202044,  0.96501678],
       [10.50638007, 12.85535096, 12.95584764,  0.96477818],
       [10.40182231, 12.7935813 , 12.86503258,  0.96516637],
       [10.41661251, 12.78548677, 12.82951229,  0.96494549],
       [10.44134227, 12.84633906, 12.88016714,  0.96484527],
       [10.43944458, 12.78307635, 12.85192703,  0.9650288 ],
       [10.45495306, 12.77353222, 12.83484946,  0.96495781],
       [10.3927987 , 12.7923809 , 12.84358586,  0.96504306],
       [10.41528807, 12.80266504, 12.87288684,  0.96488931]])
catsf = np.array([[10.28976097, 12.59481721, 12.65070932,  0.96624157],
       [10.30024397, 12.63062878, 12.69241147,  0.96602366],
       [10.33083445, 12.59531248, 12.69207518,  0.96621038],
       [10.24792874, 12.58840255, 12.61996367,  0.96629239],
       [10.30794189, 12.6137253 , 12.69108725,  0.96614657],
       [10.29798972, 12.62405592, 12.67647327,  0.96602566],
       [10.32873924, 12.61653428, 12.71578044,  0.9661013 ],
       [10.28083714, 12.57314053, 12.62959809,  0.96616244],
       [10.30403658, 12.63575339, 12.68464483,  0.9660592 ],
       [10.26307977, 12.59914586, 12.64763497,  0.96622729],
       [10.32173914, 12.6809491 , 12.72575812,  0.96570388],
       [10.28004193, 12.61886929, 12.65076916,  0.96597413],
       [10.3444059 , 12.61383646, 12.74207457,  0.96609326],
       [10.3266248 , 12.59553247, 12.74308068,  0.96601364],
       [10.26307977, 12.59914586, 12.64763497,  0.96622729],
       [10.30052081, 12.5984135 , 12.68527344,  0.96601561],
       [10.26237152, 12.56257937, 12.61511786,  0.96626345],
       [10.26237152, 12.56257937, 12.61511786,  0.96626345],
       [10.32223432, 12.68663689, 12.72457832,  0.96568758],
       [10.28316186, 12.56110305, 12.67896506,  0.96627807]])

In [20]:
print(np.mean(catsf, axis=0))

[10.2958972  12.60755808 12.67643743  0.96610054]


In [21]:
print(mlps.std(axis=0))
print(svms.std(axis=0))
print(rfs.std(axis=0))
print(xgbs.std(axis=0))
print(cats.std(axis=0))
print(catsf.std(axis=0))

[0.02410408 0.03830307 0.03085411 0.00017641]
[0.01847376 0.02804305 0.01789502 0.00013844]
[0.02610086 0.04875628 0.03176833 0.00027935]
[0.04768866 0.05667229 0.05315785 0.00030229]
[0.03069864 0.04078814 0.03321369 0.00022571]
[0.02701373 0.03313825 0.03987293 0.00016783]


In [22]:
def getTestResult(f1,f2,i):
    print(ranksums(np.asarray(f1[:,i]), np.asarray(f2[:,i])).pvalue)

i = 0
print(getTestResult(mlps, catsf,i))
print(getTestResult(svms, catsf,i))
print(getTestResult(rfs, catsf,i))
print(getTestResult(xgbs, catsf,i))
print(getTestResult(cats, catsf,i))

i = 2
print(getTestResult(mlps, catsf,i))
print(getTestResult(svms, catsf,i))
print(getTestResult(rfs, catsf,i))
print(getTestResult(xgbs, catsf,i))
print(getTestResult(cats, catsf,i))

6.301848221392269e-08
None
6.301848221392269e-08
None
6.301848221392269e-08
None
6.301848221392269e-08
None
6.301848221392269e-08
None
6.301848221392269e-08
None
6.301848221392269e-08
None
6.301848221392269e-08
None
6.301848221392269e-08
None
6.301848221392269e-08
None
