In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
import pandas.plotting
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.model_selection import RepeatedKFold
from autorank import autorank, plot_stats
from sklearn.impute import SimpleImputer
from sklearn import metrics
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest

ALGORITHM_NAMES = ["random_forest","decision_stump","unpruned_decision_tree","pruned_decision_tree"]

labels = pd.read_csv("labels_A2.csv", header = None)
labels.columns = ["target_variable"]
data = pd.read_csv("data_A2.csv", header = None)

#Task One, dealing with missing values
imputer = SimpleImputer(missing_values=np.NaN, strategy='mean')
data=pd.DataFrame(imputer.fit_transform(data))
data.head(5) #only showing 5 for space

#Creating dataset with Target noise
flip = np.random.binomial(1, 0.05, np.array(labels.iloc[:,0]).shape).astype(bool)
task_6_targets = pd.DataFrame(np.where(flip, 1 - np.array(labels.iloc[:,0]),np.array(labels.iloc[:,0])))

repeated_k_fold = RepeatedKFold(n_splits=10, n_repeats=10)

In [2]:
#clean dataset
dataframes = [data,labels]
full = pd.concat(dataframes, axis=1)
correlations = full.corr(method="pearson")
target_variable_correlation = np.array(correlations.iloc[:,-1])[:-1]
copy = [x for x in target_variable_correlation]
copy.sort()
copy = copy[::-1]
indexes = [list(target_variable_correlation).index(value) for value in copy]


top_ten_indexes = []
while len(top_ten_indexes) < 10:
    correlation_value = copy[0]
    if len(top_ten_indexes)!= 0:
        similarity_count  = 0 
        index = indexes[0]
        column1 = data.iloc[:,index]
        for a in range(len(top_ten_indexes)):
            count = 0
            column2 = data.iloc[:,top_ten_indexes[a]]
            for number in range(1000):
                if column1[number] == column2[number]:
                    count +=1
            if similarity_count < count:
                similarity_count = count
        if similarity_count < 500:
            top_ten_indexes.append(index)     
    else:
        top_ten_indexes.append(indexes[0])
    copy = copy[1:]
    indexes = indexes[1:]
    
new_columns = [data.iloc[:,index] for index in top_ten_indexes]
cleaned_features = pd.concat(new_columns, axis=1)

cleaned_features.head(5)

Unnamed: 0,78,77,92,64,95,4,96,43,67,38
0,-0.7229,-0.215309,-0.867395,1.424084,4.107535,-1.366659,3.817897,-1.919015,-0.916539,0.294604
1,2.492973,-2.245505,-0.382098,0.471146,-1.267888,1.65677,-1.843256,1.394421,0.87401,-1.731359
2,-0.247858,-3.41649,-4.660688,-3.615905,-4.317565,-1.733342,-2.506476,-0.290845,-1.840549,-0.520724
3,-1.338379,1.198258,7.150844,2.767716,-0.08195,-0.41629,0.186089,1.046033,-0.279751,-2.133925
4,3.575898,-1.122903,3.576609,2.773314,-2.613114,-0.075077,-3.117484,1.371734,-1.755619,0.251593


In [3]:
#TASK 3
ALGORITHM_NAMES = ["random_forest","decision_stump","unpruned_decision_tree","pruned_decision_tree"]
task_3_dataframe = pd.DataFrame (columns = ALGORITHM_NAMES)
random_forest = RandomForestClassifier()
decision_stump = DecisionTreeClassifier(max_depth=1)
decision_tree_unpruned = DecisionTreeClassifier()
random_forest_task_3 = cross_val_score(random_forest, cleaned_features, labels, cv=repeated_k_fold, scoring='accuracy',n_jobs = -1)
decision_stump_task_3 = cross_val_score(decision_stump, cleaned_features, labels, cv=repeated_k_fold, scoring='accuracy',n_jobs = -1)
unpruned_decision_tree_task_3 = cross_val_score(decision_tree_unpruned, cleaned_features, labels, cv=repeated_k_fold, scoring='accuracy',n_jobs = -1)
task_3_dataframe[ALGORITHM_NAMES[0]] = random_forest_task_3
task_3_dataframe[ALGORITHM_NAMES[1]]= decision_stump_task_3
task_3_dataframe[ALGORITHM_NAMES[2]]= unpruned_decision_tree_task_3

pruned_decision_tree_task_3 = []
for train_index, test_index in repeated_k_fold.split(cleaned_features):
    train_indexes = list(train_index)
    training_x = cleaned_features.loc[train_indexes]
    training_y = labels.loc[train_indexes]
    x_train,x_validate,y_train, y_validate = train_test_split(training_x, training_y,test_size=0.30)
    path = DecisionTreeClassifier().cost_complexity_pruning_path(x_train, y_train)
    highest_accuracy,ccp_alpha_value = 0.0, 0
    for value in path.ccp_alphas:
        accuracy = DecisionTreeClassifier(ccp_alpha=value).fit(x_train, y_train.values.ravel()).score(x_validate, y_validate)
        if accuracy > highest_accuracy:
            highest_accuracy, ccp_alpha_value = accuracy, value
    test_indexes = list(test_index)
    test_x = cleaned_features.loc[test_indexes]
    test_y = labels.loc[test_indexes]
    decision_tree_pruned = DecisionTreeClassifier(ccp_alpha = ccp_alpha_value)
    accuracy = decision_tree_pruned.fit(training_x,training_y.values.ravel()).score(test_x,test_y)
    pruned_decision_tree_task_3.append(accuracy)
task_3_dataframe[ALGORITHM_NAMES[3]]= pruned_decision_tree_task_3
task_3_result = autorank(task_3_dataframe, alpha=0.05, verbose=False)
print(task_3_result)

RankResult(rankdf=
                        meanrank    mean       std  ci_lower  ci_upper  \
random_forest              1.250  0.7216  0.041772  0.713789  0.729411   
unpruned_decision_tree     2.420  0.6485  0.043794  0.640689  0.656311   
pruned_decision_tree       2.965  0.6317  0.041707  0.623889  0.639511   
decision_stump             3.365  0.6080  0.043924  0.600189  0.615811   

                       effect_size   magnitude  
random_forest                    0  negligible  
unpruned_decision_tree     1.70814       large  
pruned_decision_tree       2.15383       large  
decision_stump             2.65041       large  
pvalue=1.5783662832414107e-53
cd=None
omnibus=anova
posthoc=tukeyhsd
all_normal=True
pvals_shapiro=[0.5841171145439148, 0.4552447497844696, 0.023776041343808174, 0.15958285331726074]
homoscedastic=True
pval_homogeneity=0.9222673283626363
homogeneity_test=bartlett
alpha=0.05
alpha_normality=0.0125
num_samples=100
posterior_matrix=
None
decision_matrix=
None
rope=N

In [4]:
#Task 4

#Creating dataset with additive noise, taken from piazza
a_noise = np.random.normal(0, 0.2, np.shape(cleaned_features))
a_noise_data = cleaned_features + np.multiply(a_noise, np.average(cleaned_features, axis=0))
task_4_features = pd.DataFrame(a_noise_data)

task_4_dataframe = pd.DataFrame (columns = ALGORITHM_NAMES)
random_forest = RandomForestClassifier()
decision_stump = DecisionTreeClassifier(max_depth=1)
decision_tree_unpruned = DecisionTreeClassifier()
random_forest_task_4 = cross_val_score(random_forest, task_4_features, labels, cv=repeated_k_fold, scoring='accuracy',n_jobs = -1)
decision_stump_task_4 = cross_val_score(decision_stump, task_4_features, labels, cv=repeated_k_fold, scoring='accuracy',n_jobs = -1)
unpruned_decision_tree_task_4 = cross_val_score(decision_tree_unpruned, task_4_features, labels, cv=repeated_k_fold, scoring='accuracy',n_jobs = -1)
task_4_dataframe[ALGORITHM_NAMES[0]] = random_forest_task_4
task_4_dataframe[ALGORITHM_NAMES[1]]= decision_stump_task_4
task_4_dataframe[ALGORITHM_NAMES[2]]= unpruned_decision_tree_task_4

pruned_decision_tree_task_4 = []
for train_index, test_index in repeated_k_fold.split(task_4_features):
    train_indexes = list(train_index)
    training_x = task_4_features.loc[train_indexes]
    training_y = labels.loc[train_indexes]
    x_train,x_validate,y_train, y_validate = train_test_split(training_x, training_y,test_size=0.30)
    path = DecisionTreeClassifier().cost_complexity_pruning_path(x_train, y_train)
    highest_accuracy,ccp_alpha_value = 0.0, 0
    for value in path.ccp_alphas:
        accuracy = DecisionTreeClassifier(ccp_alpha=value).fit(x_train, y_train.values.ravel()).score(x_validate, y_validate)
        if accuracy > highest_accuracy:
            highest_accuracy, ccp_alpha_value = accuracy, value
    test_indexes = list(test_index)
    test_x = task_4_features.loc[test_indexes]
    test_y = labels.loc[test_indexes]
    decision_tree_pruned = DecisionTreeClassifier(ccp_alpha = ccp_alpha_value)
    accuracy = decision_tree_pruned.fit(training_x,training_y.values.ravel()).score(test_x,test_y)
    pruned_decision_tree_task_4.append(accuracy)
task_4_dataframe[ALGORITHM_NAMES[3]]= pruned_decision_tree_task_4
task_4_result = autorank(task_4_dataframe, alpha=0.05, verbose=False)
print(task_4_result)

RankResult(rankdf=
                        meanrank    mean       std  ci_lower  ci_upper  \
random_forest              1.250  0.7201  0.042604  0.711656  0.728544   
pruned_decision_tree       2.640  0.6425  0.049121  0.634056  0.650944   
unpruned_decision_tree     2.755  0.6401  0.045848  0.631656  0.648544   
decision_stump             3.355  0.6138  0.047328  0.605356  0.622244   

                       effect_size   magnitude  
random_forest                    0  negligible  
pruned_decision_tree       1.68775       large  
unpruned_decision_tree     1.80767       large  
decision_stump             2.36074       large  
pvalue=1.678738485751227e-45
cd=None
omnibus=anova
posthoc=tukeyhsd
all_normal=True
pvals_shapiro=[0.050294648855924606, 0.5874094367027283, 0.8740835785865784, 0.636618435382843]
homoscedastic=True
pval_homogeneity=0.546912328718421
homogeneity_test=bartlett
alpha=0.05
alpha_normality=0.0125
num_samples=100
posterior_matrix=
None
decision_matrix=
None
rope=None


In [5]:
#Task 5
#Creating dataset with muliplicative noise, taken from piazza
m_noise = np.random.normal(1, 0.2, np.shape(cleaned_features))
m_noise_data = np.multiply(cleaned_features, m_noise)
task_5_features = pd.DataFrame(m_noise_data)

task_5_dataframe = pd.DataFrame (columns = ALGORITHM_NAMES)
random_forest = RandomForestClassifier()
decision_stump = DecisionTreeClassifier(max_depth=1)
decision_tree_unpruned = DecisionTreeClassifier()
random_forest_task_5 = cross_val_score(random_forest, task_5_features, labels.iloc[:,0], cv=repeated_k_fold, scoring='accuracy',n_jobs = -1)
decision_stump_task_5 = cross_val_score(decision_stump, task_5_features, labels.iloc[:,0], cv=repeated_k_fold, scoring='accuracy',n_jobs = -1)
unpruned_decision_tree_task_5 = cross_val_score(decision_tree_unpruned, task_5_features, labels.iloc[:,0], cv=repeated_k_fold, scoring='accuracy',n_jobs = -1)
task_5_dataframe[ALGORITHM_NAMES[0]] = random_forest_task_5
task_5_dataframe[ALGORITHM_NAMES[1]]= decision_stump_task_5
task_5_dataframe[ALGORITHM_NAMES[2]]= unpruned_decision_tree_task_5

pruned_decision_tree_task_5 = []
for train_index, test_index in repeated_k_fold.split(cleaned_features):
    train_indexes = list(train_index)
    training_x = task_5_features.loc[train_indexes]
    training_y = labels.loc[train_indexes]
    x_train,x_validate,y_train, y_validate = train_test_split(training_x, training_y,test_size=0.30)
    path = DecisionTreeClassifier().cost_complexity_pruning_path(x_train, y_train)
    highest_accuracy,ccp_alpha_value = 0.0, 0
    for value in path.ccp_alphas:
        accuracy = DecisionTreeClassifier(ccp_alpha=value).fit(x_train, y_train.values.ravel()).score(x_validate, y_validate)
        if accuracy > highest_accuracy:
            highest_accuracy, ccp_alpha_value = accuracy, value
    test_indexes = list(test_index)
    test_x = task_5_features.loc[test_indexes]
    test_y = labels.loc[test_indexes]
    decision_tree_pruned = DecisionTreeClassifier(ccp_alpha = ccp_alpha_value)
    accuracy = decision_tree_pruned.fit(training_x,training_y.values.ravel()).score(test_x,test_y)
    pruned_decision_tree_task_5.append(accuracy)
task_5_dataframe[ALGORITHM_NAMES[3]]= pruned_decision_tree_task_5
task_5_result = autorank(task_5_dataframe, alpha=0.05, verbose=False)
print(task_5_result)

RankResult(rankdf=
                        meanrank    mean       std  ci_lower  ci_upper  \
random_forest              1.330  0.7149  0.047001  0.706268  0.723532   
pruned_decision_tree       2.525  0.6487  0.052081  0.640068  0.657332   
unpruned_decision_tree     2.775  0.6398  0.043251  0.631168  0.648432   
decision_stump             3.370  0.6100  0.046493  0.601368  0.618632   

                       effect_size   magnitude  
random_forest                    0  negligible  
pruned_decision_tree       1.33452       large  
unpruned_decision_tree     1.66279       large  
decision_stump             2.24396       large  
pvalue=9.201963221579113e-41
cd=None
omnibus=anova
posthoc=tukeyhsd
all_normal=True
pvals_shapiro=[0.34087854623794556, 0.05607712268829346, 0.3257023096084595, 0.2571146786212921]
homoscedastic=True
pval_homogeneity=0.3200294581372539
homogeneity_test=bartlett
alpha=0.05
alpha_normality=0.0125
num_samples=100
posterior_matrix=
None
decision_matrix=
None
rope=Non

In [6]:
#task 6
task_6_dataframe = pd.DataFrame (columns = ALGORITHM_NAMES)
random_forest = RandomForestClassifier()
decision_stump = DecisionTreeClassifier(max_depth=1)
decision_tree_unpruned = DecisionTreeClassifier()
random_forest_task_6 = cross_val_score(random_forest, cleaned_features, task_6_targets, cv=repeated_k_fold, scoring='accuracy',n_jobs = -1)
decision_stump_task_6 = cross_val_score(decision_stump, cleaned_features, task_6_targets, cv=repeated_k_fold, scoring='accuracy',n_jobs = -1)
unpruned_decision_tree_task_6 = cross_val_score(decision_tree_unpruned, cleaned_features, task_6_targets, cv=repeated_k_fold, scoring='accuracy',n_jobs = -1)
task_6_dataframe[ALGORITHM_NAMES[0]] = random_forest_task_6
task_6_dataframe[ALGORITHM_NAMES[1]]= decision_stump_task_6
task_6_dataframe[ALGORITHM_NAMES[2]]= unpruned_decision_tree_task_6

pruned_decision_tree_task_6 = []
for train_index, test_index in repeated_k_fold.split(cleaned_features):
    train_indexes = list(train_index)
    training_x = cleaned_features.loc[train_indexes]
    training_y = task_6_targets.loc[train_indexes]
    x_train,x_validate,y_train, y_validate = train_test_split(training_x, training_y,test_size=0.30)
    path = DecisionTreeClassifier().cost_complexity_pruning_path(x_train, y_train)
    highest_accuracy,ccp_alpha_value = 0.0, 0
    for value in path.ccp_alphas:
        accuracy = DecisionTreeClassifier(ccp_alpha=value).fit(x_train, y_train.values.ravel()).score(x_validate, y_validate)
        if accuracy > highest_accuracy:
            highest_accuracy, ccp_alpha_value = accuracy, value
    test_indexes = list(test_index)
    test_x = cleaned_features.loc[test_indexes]
    test_y = task_6_targets.loc[test_indexes]
    decision_tree_pruned = DecisionTreeClassifier(ccp_alpha = ccp_alpha_value)
    accuracy = decision_tree_pruned.fit(training_x,training_y.values.ravel()).score(test_x,test_y)
    pruned_decision_tree_task_6.append(accuracy)
task_6_dataframe[ALGORITHM_NAMES[3]]= pruned_decision_tree_task_6
task_6_result = autorank(task_6_dataframe, alpha=0.05, verbose=False)
print(task_6_result)

RankResult(rankdf=
                        meanrank    mean       std  ci_lower  ci_upper  \
random_forest              1.375  0.6795  0.046696  0.671049  0.687951   
pruned_decision_tree       2.630  0.6132  0.049561  0.604749  0.621651   
unpruned_decision_tree     2.870  0.6024  0.046648  0.593949  0.610851   
decision_stump             3.125  0.5915  0.042077  0.583049  0.599951   

                       effect_size   magnitude  
random_forest                    0  negligible  
pruned_decision_tree       1.37694       large  
unpruned_decision_tree     1.65195       large  
decision_stump              1.9799       large  
pvalue=1.4672756472598458e-34
cd=None
omnibus=anova
posthoc=tukeyhsd
all_normal=True
pvals_shapiro=[0.8119857907295227, 0.6176819205284119, 0.29801955819129944, 0.48048049211502075]
homoscedastic=True
pval_homogeneity=0.44435675954245457
homogeneity_test=bartlett
alpha=0.05
alpha_normality=0.0125
num_samples=100
posterior_matrix=
None
decision_matrix=
None
rope=N

In [7]:
#task 7a mulitplicative noise to training only


task_7_dataframe = pd.DataFrame (columns = ALGORITHM_NAMES)
random_forest = RandomForestClassifier()
decision_stump = DecisionTreeClassifier(max_depth=1)
decision_tree_unpruned = DecisionTreeClassifier()
algorithms = [random_forest, decision_stump, decision_tree_unpruned,decision_tree_pruned]

results = {value:[] for value in ALGORITHM_NAMES}
for train_index, test_index in repeated_k_fold.split(cleaned_features):
    train_indexes = list(train_index)
    training_x = cleaned_features.loc[train_indexes]
    training_y = labels.loc[train_indexes]
    normal_train = cleaned_features.loc[train_index]
    m_noise = np.random.normal(1, 0.2, np.shape(training_x))
    m_noise_training = np.multiply(training_x, m_noise)
    noise_training = pd.DataFrame(m_noise_training)
    

    x_train,x_validate,y_train, y_validate = train_test_split(noise_training, training_y,test_size=0.30)
    path = DecisionTreeClassifier().cost_complexity_pruning_path(x_train, y_train)
    highest_accuracy,ccp_alpha_value = 0.0, 0
    for value in path.ccp_alphas:
        accuracy = DecisionTreeClassifier(ccp_alpha=value).fit(x_train, y_train.values.ravel()).score(x_validate, y_validate)
        if accuracy > highest_accuracy:
            highest_accuracy, ccp_alpha_value = accuracy, value
    decision_tree_pruned = DecisionTreeClassifier(ccp_alpha = ccp_alpha_value)
    test_indexes = list(test_index)
    test_x = cleaned_features.loc[test_indexes]
    test_y = labels.loc[test_indexes]

    for num in range(len(ALGORITHM_NAMES)):
        accuracy = algorithms[num].fit(training_x,training_y.values.ravel()).score(test_x,test_y.values.ravel())
        results[ALGORITHM_NAMES[num]].append(accuracy)
for p in range(len(ALGORITHM_NAMES)):
    task_7_dataframe[ALGORITHM_NAMES[p]] = results[ALGORITHM_NAMES[p]]
result = autorank(task_7_dataframe, alpha=0.05, verbose=False)
print(result)

RankResult(rankdf=
                        meanrank    mean       std  ci_lower  ci_upper  \
random_forest              1.155  0.7173  0.045767  0.709104  0.725496   
pruned_decision_tree       2.590  0.6465  0.041081  0.638304  0.654696   
unpruned_decision_tree     2.850  0.6382  0.041643  0.630004  0.646396   
decision_stump             3.405  0.6116  0.050567  0.603404  0.619796   

                       effect_size   magnitude  
random_forest                    0  negligible  
pruned_decision_tree       1.62806       large  
unpruned_decision_tree     1.80785       large  
decision_stump             2.19172       large  
pvalue=6.629142541543662e-62
cd=None
omnibus=anova
posthoc=tukeyhsd
all_normal=True
pvals_shapiro=[0.7044161558151245, 0.20814861357212067, 0.33433419466018677, 0.1336606740951538]
homoscedastic=True
pval_homogeneity=0.13123447720280806
homogeneity_test=bartlett
alpha=0.05
alpha_normality=0.0125
num_samples=100
posterior_matrix=
None
decision_matrix=
None
rope=No

In [8]:
#task 7b mulitplicative noise to test only


task_7b_dataframe = pd.DataFrame (columns = ALGORITHM_NAMES)
random_forest = RandomForestClassifier()
decision_stump = DecisionTreeClassifier(max_depth=1)
decision_tree_unpruned = DecisionTreeClassifier()
algorithms = [random_forest, decision_stump, decision_tree_unpruned,decision_tree_pruned]

results = {value:[] for value in ALGORITHM_NAMES}
for train_index, test_index in repeated_k_fold.split(cleaned_features):
    train_indexes = list(train_index)
    training_x = cleaned_features.loc[train_indexes]
    training_y = labels.loc[train_indexes]
    x_train,x_validate,y_train, y_validate = train_test_split(noise_training, training_y,test_size=0.30)
    path = DecisionTreeClassifier().cost_complexity_pruning_path(x_train, y_train)
    highest_accuracy,ccp_alpha_value = 0.0, 0
    for value in path.ccp_alphas:
        accuracy = DecisionTreeClassifier(ccp_alpha=value).fit(x_train, y_train.values.ravel()).score(x_validate, y_validate)
        if accuracy > highest_accuracy:
            highest_accuracy, ccp_alpha_value = accuracy, value
    test_indexes = list(test_index)
    test_x = cleaned_features.loc[test_indexes]
    test_y = labels.loc[test_indexes]
    m_noise = np.random.normal(1, 0.2, np.shape(test_x))
    m_noise_test = np.multiply(test_x,m_noise)
    noise_test = pd.DataFrame(m_noise_test)
    decision_tree_pruned = DecisionTreeClassifier(ccp_alpha = ccp_alpha_value)
    for num in range(len(ALGORITHM_NAMES)):
        accuracy = algorithms[num].fit(training_x,training_y.values.ravel()).score(test_x,test_y)
        results[ALGORITHM_NAMES[num]].append(accuracy)
for p in range(len(ALGORITHM_NAMES)):
    task_7b_dataframe[ALGORITHM_NAMES[p]] = results[ALGORITHM_NAMES[p]]
result = autorank(task_7b_dataframe, alpha=0.05, verbose=False)
print(result)

RankResult(rankdf=
                        meanrank    mean       std  ci_lower  ci_upper  \
random_forest              1.125  0.7203  0.040984  0.711988  0.728612   
unpruned_decision_tree     2.720  0.6397  0.048272  0.631388  0.648012   
pruned_decision_tree       2.730  0.6397  0.045226  0.631388  0.648012   
decision_stump             3.425  0.6139  0.047416  0.605588  0.622212   

                       effect_size   magnitude  
random_forest                    0  negligible  
unpruned_decision_tree     1.80004       large  
pruned_decision_tree        1.8676       large  
decision_stump             2.40089       large  
pvalue=9.826818346502524e-64
cd=None
omnibus=anova
posthoc=tukeyhsd
all_normal=True
pvals_shapiro=[0.3564307391643524, 0.22958727180957794, 0.07281657308340073, 0.16306118667125702]
homoscedastic=True
pval_homogeneity=0.37946682411442395
homogeneity_test=bartlett
alpha=0.05
alpha_normality=0.0125
num_samples=100
posterior_matrix=
None
decision_matrix=
None
rope=N

In [14]:
#Analysis for task 4

for algorithm in ALGORITHM_NAMES:
    new = pd.DataFrame()
    new["Task3"] = list(task_3_dataframe[algorithm])
    new["Task4"] = list(task_4_dataframe[algorithm])
    analysis_task_4 = autorank(new, alpha=0.05, verbose=False)
    print(algorithm)
    print(analysis_task_4)
    print("")

random_forest
RankResult(rankdf=
       meanrank    mean       std  ci_lower  ci_upper effect_size   magnitude
Task3     1.465  0.7216  0.041772  0.712093  0.731107           0  negligible
Task4     1.535  0.7201  0.042604  0.710403  0.729797   0.0355533  negligible
pvalue=0.7868765259832449
cd=None
omnibus=ttest
posthoc=None
all_normal=True
pvals_shapiro=[0.5841171145439148, 0.050294648855924606]
homoscedastic=True
pval_homogeneity=0.844709707878143
homogeneity_test=bartlett
alpha=0.05
alpha_normality=0.025
num_samples=100
posterior_matrix=
None
decision_matrix=
None
rope=None
rope_mode=None
effect_size=cohen_d)

decision_stump
RankResult(rankdf=
       meanrank    mean       std  ci_lower  ci_upper effect_size   magnitude
Task3     1.465  0.6080  0.043924  0.598003  0.617997           0  negligible
Task4     1.535  0.6138  0.047328  0.603028  0.624572   -0.127032  negligible
pvalue=0.42899567568126606
cd=None
omnibus=ttest
posthoc=None
all_normal=True
pvals_shapiro=[0.455244749784469

In [10]:
#Analysis for task 5

for algorithm in ALGORITHM_NAMES:
    new = pd.DataFrame()
    new["Task3"] = list(task_3_dataframe[algorithm])
    new["Task5"] = list(task_5_dataframe[algorithm])
    analysis_task_5 = autorank(new, alpha=0.05, verbose=False)
    print(algorithm)
    print(analysis_task_5)
    print("")

random_forest
RankResult(rankdf=
       meanrank    mean       std  ci_lower  ci_upper effect_size   magnitude
Task3      1.44  0.7216  0.041772  0.712093  0.731107           0  negligible
Task5      1.56  0.7149  0.047001  0.704203  0.725597    0.150686  negligible
pvalue=0.3028126354910162
cd=None
omnibus=ttest
posthoc=None
all_normal=True
pvals_shapiro=[0.5841171145439148, 0.34087854623794556]
homoscedastic=True
pval_homogeneity=0.24231853956936572
homogeneity_test=bartlett
alpha=0.05
alpha_normality=0.025
num_samples=100
posterior_matrix=
None
decision_matrix=
None
rope=None
rope_mode=None
effect_size=cohen_d)

decision_stump
RankResult(rankdf=
       meanrank   mean       std  ci_lower  ci_upper effect_size   magnitude
Task5      1.46  0.610  0.046493  0.599418  0.620582           0  negligible
Task3      1.54  0.608  0.043924  0.598003  0.617997   0.0442217  negligible
pvalue=0.7566198484676405
cd=None
omnibus=ttest
posthoc=None
all_normal=True
pvals_shapiro=[0.4552447497844696, 

In [11]:
#Analysis for task 6

for algorithm in ALGORITHM_NAMES:
    new = pd.DataFrame()
    new["Task3"] = list(task_3_dataframe[algorithm])
    new["Task6"] = list(task_6_dataframe[algorithm])
    analysis_task_6 = autorank(new, alpha=0.05, verbose=False)
    print(algorithm)
    print(analysis_task_6)
    print("")

random_forest
RankResult(rankdf=
       meanrank    mean       std  ci_lower  ci_upper effect_size   magnitude
Task3     1.275  0.7216  0.041772  0.712093  0.731107           0  negligible
Task6     1.725  0.6795  0.046696  0.668872  0.690128    0.950282       large
pvalue=5.442079396224627e-09
cd=None
omnibus=ttest
posthoc=None
all_normal=True
pvals_shapiro=[0.5841171145439148, 0.8119857907295227]
homoscedastic=True
pval_homogeneity=0.26919137379090113
homogeneity_test=bartlett
alpha=0.05
alpha_normality=0.025
num_samples=100
posterior_matrix=
None
decision_matrix=
None
rope=None
rope_mode=None
effect_size=cohen_d)

decision_stump
RankResult(rankdf=
       meanrank    mean       std  ci_lower  ci_upper effect_size   magnitude
Task3     1.355  0.6080  0.043924  0.598003  0.617997           0  negligible
Task6     1.645  0.5915  0.042077  0.581923  0.601077     0.38363       small
pvalue=0.007452525828782583
cd=None
omnibus=ttest
posthoc=None
all_normal=True
pvals_shapiro=[0.45524474978

In [12]:
#Analysis for task 7a

for algorithm in ALGORITHM_NAMES:
    new = pd.DataFrame()
    new["Task3"] = list(task_3_dataframe[algorithm])
    new["Task7"] = list(task_7_dataframe[algorithm])
    analysis_task_7 = autorank(new, alpha=0.05, verbose=False)
    print(algorithm)
    print(analysis_task_7)
    print("")

random_forest
RankResult(rankdf=
       meanrank    mean       std  ci_lower  ci_upper effect_size   magnitude
Task3      1.44  0.7216  0.041772  0.712093  0.731107           0  negligible
Task7      1.56  0.7173  0.045767  0.706883  0.727717   0.0981394  negligible
pvalue=0.5161298654129525
cd=None
omnibus=ttest
posthoc=None
all_normal=True
pvals_shapiro=[0.5841171145439148, 0.7044161558151245]
homoscedastic=True
pval_homogeneity=0.36493646529537194
homogeneity_test=bartlett
alpha=0.05
alpha_normality=0.025
num_samples=100
posterior_matrix=
None
decision_matrix=
None
rope=None
rope_mode=None
effect_size=cohen_d)

decision_stump
RankResult(rankdf=
       meanrank    mean       std  ci_lower  ci_upper effect_size   magnitude
Task7     1.475  0.6116  0.050567  0.600091  0.623109           0  negligible
Task3     1.525  0.6080  0.043924  0.598003  0.617997   0.0760104  negligible
pvalue=0.5559567167479427
cd=None
omnibus=ttest
posthoc=None
all_normal=True
pvals_shapiro=[0.4552447497844696

In [13]:
#Analysis for task 7b

for algorithm in ALGORITHM_NAMES:
    new = pd.DataFrame()
    new["Task3"] = list(task_3_dataframe[algorithm])
    new["Task7b"] = list(task_7b_dataframe[algorithm])
    analysis_task_7b = autorank(new, alpha=0.05, verbose=False)
    print(algorithm)
    print(analysis_task_7b)
    print("")

random_forest
RankResult(rankdf=
        meanrank    mean       std  ci_lower  ci_upper effect_size   magnitude
Task3      1.495  0.7216  0.041772  0.712093  0.731107           0  negligible
Task7b     1.505  0.7203  0.040984  0.710972  0.729628   0.0314162  negligible
pvalue=0.8151637307167183
cd=None
omnibus=ttest
posthoc=None
all_normal=True
pvals_shapiro=[0.5841171145439148, 0.3564307391643524]
homoscedastic=True
pval_homogeneity=0.8501547724701844
homogeneity_test=bartlett
alpha=0.05
alpha_normality=0.025
num_samples=100
posterior_matrix=
None
decision_matrix=
None
rope=None
rope_mode=None
effect_size=cohen_d)

decision_stump
RankResult(rankdf=
        meanrank    mean       std  ci_lower  ci_upper effect_size   magnitude
Task7b     1.485  0.6139  0.047416  0.603108  0.624692           0  negligible
Task3      1.515  0.6080  0.043924  0.598003  0.617997    0.129094  negligible
pvalue=0.35171010850281226
cd=None
omnibus=ttest
posthoc=None
all_normal=True
pvals_shapiro=[0.4552447497