# Deodel: New Algorithm vs Scikit-Learn on Titanic Dataset

Deodel is a new algorithm for supervised classification. It processes input data robustly and is particularly well suited to handling mixed attributes/features.
Here is a demonstration of the algorithm's accuracy on the Titanic dataset compared to a selection of scikit-learn classifiers.

More info about deodel at: https://github.com/c4pub/deodel


## Summary

In this specific setting, the new algorithm (**deodel.DeodataDelangaClassifier**) outperforms a selection of sklearn classifiers. Only sklearn classifiers that could run with default configurations were included in the evaluation.

    - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    accuracy: 0.8049327354260087  DeodataDelangaClassifier({})
    accuracy: 0.8043946188340807  NuSVC()
    accuracy: 0.8029147982062781  SVC()
    accuracy: 0.798878923766816   MLPClassifier()
    accuracy: 0.7967713004484309  CalibratedClassifierCV()
    accuracy: 0.7966367713004484  GaussianNB()
    accuracy: 0.7965919282511212  LogisticRegression()
    accuracy: 0.7962331838565025  LinearSVC()
    accuracy: 0.7951121076233189  LogisticRegressionCV()
    accuracy: 0.7939910313901346  RidgeClassifier()
    accuracy: 0.7939461883408073  RidgeClassifierCV()
    accuracy: 0.7937668161434975  AdaBoostClassifier()
    accuracy: 0.7936322869955157  LinearDiscriminantAnalysis()
    accuracy: 0.7927802690582959  GaussianProcessClassifier()
    accuracy: 0.7921076233183855  RandomForestClassifier(max_depth=5, random_state=1)
    accuracy: 0.7890582959641256  BernoulliNB()
    accuracy: 0.7871300448430495  HistGradientBoostingClassifier()
    accuracy: 0.7866367713004486  GradientBoostingClassifier()
    accuracy: 0.7853811659192824  LabelPropagation()
    accuracy: 0.7851121076233183  LabelSpreading()
    accuracy: 0.7847533632286995  MultinomialNB()
    accuracy: 0.7829596412556054  ExtraTreesClassifier()
    accuracy: 0.7827354260089683  BaggingClassifier()
    accuracy: 0.7825112107623317  ExtraTreeClassifier()
    accuracy: 0.7822421524663676  DecisionTreeClassifier()
    accuracy: 0.7818834080717488  RandomForestClassifier()
    accuracy: 0.773946188340807   KNeighborsClassifier()
    accuracy: 0.755605381165919   NearestCentroid()
    accuracy: 0.7405381165919285  SGDClassifier()
    accuracy: 0.7263228699551572  KNeighborsClassifier(n_neighbors=1)
    accuracy: 0.7169058295964125  Perceptron()
    accuracy: 0.7143049327354261  PassiveAggressiveClassifier()
    accuracy: 0.6643946188340807  QuadraticDiscriminantAnalysis()
    accuracy: 0.6187892376681613  GaussianMixture()
    accuracy: 0.6187892376681613  BayesianGaussianMixture()
    accuracy: 0.15242152466367714 OneClassSVM()

    - - - - - - - - - - - - - - - - - - - - - - - - - - - -


Running deodel on the raw columns, not processed with *get_dummies()*, yields:


    - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    accuracy: 0.8052017937219729  DeodataDelangaClassifier({})

    - - - - - - - - - - - - - - - - - - - - - - - - - - - -


Version info:

    - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    - - - - execution end: 2023-06-14 06:00 PM (UTC)
    - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    - - - - python version: 3.10.12 (main, Jun  7 2023, 12:45:35) [GCC 9.4.0]
    - - - - scikit-learn version: 1.2.2
    - - - - deodel version: 1.75
    - - - - - - - - - - - - - - - - - - - - - - - - - - - -



### Compare test code

In [1]:
#@title
# >- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
## Helper functions
# >- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def GetTimeStr() :
    """Return the current UTC time as text, e.g. ``2023-06-14 06:00 PM (UTC)``.

    The time is read as a timezone-aware value because
    ``datetime.datetime.utcnow()`` is deprecated since Python 3.12; the
    formatted result is the same.
    """
    import datetime

    now = datetime.datetime.now(datetime.timezone.utc)
    # strftime already returns a str, no extra conversion needed
    return now.strftime("%Y-%m-%d %I:%M %p (UTC)")

# >- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def GetDummiesForSelectAttr(train_data, col_target, col_features_select, drop_first=False) :
    """Build a one-hot encoded feature table plus the matching target column.

    Parameters:
        train_data: pandas DataFrame containing the target and feature columns.
        col_target: list whose first element is the name of the target column.
        col_features_select: list with the names of the feature columns to use.
        drop_first: forwarded unchanged to ``pd.get_dummies``.

    Returns:
        Tuple ``(X_pd_data, y_pd_data)``: the selected features after
        ``pd.get_dummies`` and the target column exactly as stored in
        ``train_data``.
    """
    # Encode the feature columns only. Running get_dummies over the target as
    # well would rename a non-numeric target (e.g. "Outcome_yes"), and the
    # target could then no longer be separated from the features by name.
    X_pd_data = pd.get_dummies(train_data[col_features_select], drop_first=drop_first)

    # The target is returned unencoded.
    y_pd_data = train_data[col_target[0]]

    return X_pd_data, y_pd_data

# >- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def ListToTabStr(in_data_list, in_tab_list = 8) :
    """Format a row of values as fixed-width, space separated columns.

    Parameters:
        in_data_list: sequence of values; each one is converted with ``str``.
        in_tab_list: column width. Either a single int applied to every
            column or a list holding one width per column.

    Returns:
        One string in which every column except the last occupies exactly
        ``width`` characters: the text is padded with spaces, or cut and
        marked with ``'>'`` when it does not fit. The last value is written
        in full. Every column, including the last, is followed by one space.
        An empty ``in_data_list`` yields ``""``.
    """
    more_char = '>'
    space_char = ' '
    list_len = len(in_data_list)
    if list_len == 0 :
        # nothing to format; avoids indexing the last element of an empty list
        return ""
    if isinstance(in_tab_list, list) :
        use_tab_list = in_tab_list
    else :
        use_tab_list = [in_tab_list] * list_len

    cell_list = []
    for crt_idx in range(list_len - 1) :
        crt_str = str(in_data_list[crt_idx])
        # one character of every column is reserved for the separating space
        data_width = use_tab_list[crt_idx] - 1
        if crt_str and len(crt_str) > data_width :
            # too long: keep the head and flag the cut with the marker
            cell_list.append(crt_str[:(data_width - 1)] + more_char)
        else :
            cell_list.append(crt_str.ljust(data_width))
    # last column element can be any length
    cell_list.append(str(in_data_list[-1]))
    return "".join(crt_cell + space_char for crt_cell in cell_list)

# >- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
## Average accuracy evaluation function
# >- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def AvgAccuracyTest(x_data, y_target, classifier, iterations = 1, random_seed = None, test_fraction = 0.5) :
    """Average the accuracy of a classifier over several random train/test splits.

    Parameters:
        x_data: feature table handed to ``train_test_split``.
        y_target: target values handed to ``train_test_split``.
        classifier: object exposing ``fit(x, y)`` and ``predict(x)``; it is
            fitted again on every split.
        iterations: number of splits to evaluate. The sum is divided by this
            value, so 0 raises ZeroDivisionError.
        random_seed: ``None`` leaves every split unseeded; otherwise split
            number ``i`` uses ``random_seed + i``.
        test_fraction: passed as ``test_size`` to ``train_test_split``.

    Returns:
        Tuple ``(avg_accuracy, delta_secs)``: the mean ``accuracy_score`` over
        all splits and the elapsed wall-clock time in seconds.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score
    import datetime

    start_time_ref = datetime.datetime.now()
    cumulate_acc = 0

    for crt_idx in range(iterations) :
        # a different seed per split, unless seeding is disabled altogether
        crt_rand_seed = None if random_seed is None else random_seed + crt_idx
        x_train, x_test, y_train, y_test = train_test_split(
            x_data, y_target, test_size = test_fraction, random_state = crt_rand_seed)
        classifier.fit(x_train, y_train)
        predictions = classifier.predict(x_test)
        cumulate_acc += accuracy_score(y_test, predictions)

    delta_secs = (datetime.datetime.now() - start_time_ref).total_seconds()

    avg_accuracy = (cumulate_acc * 1.0) / iterations
    return avg_accuracy, delta_secs

# >- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
## Accuracy compare test
# >- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def GroupAccuracyTest(X_data, y_target, classifier_lst, iterations = 1, random_seed = 42, test_fraction = 0.5) :
    """Run AvgAccuracyTest for every classifier and collect the results.

    Each successful classifier gets one printed line (accuracy followed by
    the classifier's text form) and one ``[accuracy, line]`` entry in the
    returned list. A classifier that raises ValueError or KeyError is
    reported on the console and left out of the list.
    """
    column_width = 20
    outcome_rows = []
    for estimator in classifier_lst :
        try :
            # the elapsed time returned alongside the score is not used here
            score, _elapsed = AvgAccuracyTest(
                X_data, y_target, estimator, iterations,
                random_seed=random_seed, test_fraction=test_fraction)
            row_text = ListToTabStr([score, estimator], column_width)
            print(row_text)
            outcome_rows.append([score, row_text])
        except (ValueError, KeyError) as failure :
            print("exception for:", str(estimator))
            print("error:", failure)
    return outcome_rows

# >- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
## Prediction data display
# >- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def PredictionDataShow(x_pd_data, y_pd_target) :
    """Print info(), head() and shape for the feature table and for the target.

    Parameters:
        x_pd_data: pandas object holding the features.
        y_pd_target: pandas object holding the target values.

    Only the two arguments are inspected; no notebook-level variable is read.
    """
    separator = "- - - - - - - - - - - - - - - - - - - - - - - - - - - - "

    print(separator)
    print("x_pd_data.info:")
    # info() writes its report itself and returns None, so the report is
    # followed by a line reading "None"
    print(x_pd_data.info())
    print(separator)
    print("x_pd_data.head:")
    print(x_pd_data.head())
    print(separator)
    print("x_pd_data.shape:")
    print(x_pd_data.shape)
    print(separator)
    print()
    print(separator)
    print("y_pd_data.info:")
    # report on the argument, not on a same-purpose global of the notebook
    print(y_pd_target.info())
    print(separator)
    print("y_pd_target.head:")
    print(y_pd_target.head())
    print(separator)
    print("y_pd_target.shape:")
    print(y_pd_target.shape)
    print(separator)
    print()

# >- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
## Ranked accuracy report
# >- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def DisplayCompareAccuracyTest(X_pd_data, y_pd_target, test_tbl, *, shown_iterations=None, shown_random_seed=None, shown_test_fraction=None) :
    """Print the run settings followed by the results ranked best first.

    Parameters:
        X_pd_data: not used; kept so existing calls stay valid.
        y_pd_target: not used; kept so existing calls stay valid.
        test_tbl: list of ``[accuracy, line_str]`` entries. Entries are
            sorted in descending order and ``line_str`` is printed for each.
        shown_iterations, shown_random_seed, shown_test_fraction:
            keyword-only values to display in the header. When left as
            ``None`` the module-level variables ``iterations``,
            ``random_seed`` and ``test_fraction`` are displayed instead
            (NameError if such a variable does not exist).
    """
    separator = "- - - - - - - - - - - - - - - - - - - - - - - - - - - - "

    # explicit values win; the notebook globals are only the fallback
    run_iterations = iterations if shown_iterations is None else shown_iterations
    run_random_seed = random_seed if shown_random_seed is None else shown_random_seed
    run_test_fraction = test_fraction if shown_test_fraction is None else shown_test_fraction

    print(separator)
    print("- - - CompareAccuracyTest - Start")
    print(separator)
    print("- - - - average accuracy test")
    print()
    print("- - - - iterations:", run_iterations)
    print("- - - - random_seed:", run_random_seed)
    print("- - - - test_fraction:", run_test_fraction)
    print(separator)
    print()
    print("- - - - ranked accuracy result")
    print()
    print(separator)
    print()

    # entries compare by accuracy first, then by their text line
    for crt_elem in sorted(test_tbl, reverse=True) :
        print( "accuracy: %s"%(crt_elem[1]) )

    print()
    print(separator)
    print("- - - CompareAccuracyTest - Stop")
    print(separator)
    print()

# >- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



### Get remote resources

In [2]:
#@title
# >- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
print("*** Get remote files - start")
# >- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

import urllib
import shutil
try:
  import validators
except:
  !pip install validators
  import validators

remote_list = [
                {'file': 'deodel.py', 'url': "https://raw.githubusercontent.com/c4pub/deodel/main/deodel.py"},
                {'file': 'usap_csv_eval.py', 'url': "https://raw.githubusercontent.com/c4pub/deodel/main/usap_csv_eval.py"},
                {'file': 'titanic.csv', 'url': "https://raw.githubusercontent.com/c4pub/misc/main/data/titanic.csv"},
            ]

for remote_entry in remote_list :
    file_name = remote_entry['file']
    url = remote_entry['url']
    with urllib.request.urlopen(url) as response, open(file_name, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)

# >- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
print("*** Get remote files - stop")
# >- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


*** Get remote files - start
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting validators
  Downloading validators-0.20.0.tar.gz (30 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: validators
  Building wheel for validators (setup.py) ... [?25l[?25hdone
  Created wheel for validators: filename=validators-0.20.0-py3-none-any.whl size=19579 sha256=a848982ca45364dbf510d3ad7de2125e18752b8830663a5026387eb387c5b5e8
  Stored in directory: /root/.cache/pip/wheels/f2/ed/dd/d3a556ad245ef9dc570c6bcd2f22886d17b0b408dd3bbb9ac3
Successfully built validators
Installing collected packages: validators
Successfully installed validators-0.20.0
*** Get remote files - stop


### Begin test


In [3]:
#@title

# Announce the start of the run and import the data libraries used by the
# cells that follow.
print("- - - test begin")

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import sys

# Time stamp (UTC) of the moment the run started.
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
print("- - - - execution begin:", GetTimeStr())
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")


- - - test begin
- - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - execution begin: 2023-06-14 07:39 PM (UTC)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - 


### Create list of classifiers



In [4]:
#@title
# >- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
## Create list of classifiers
# >- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

import deodel
import sklearn
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import OneClassSVM
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OutputCodeClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import GaussianNB
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import NearestCentroid
from sklearn.svm import NuSVC
from sklearn.linear_model import Perceptron
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.mixture import BayesianGaussianMixture
from sklearn.mixture import GaussianMixture

# Classifiers to compare. Apart from a few explicitly parameterized entries,
# each one is created with its default configuration. deodel's classifier is
# the last entry.
classifier_lst = [
                    ExtraTreeClassifier(),
                    HistGradientBoostingClassifier(),
                    DecisionTreeClassifier(),
                    OneClassSVM(),
                    MLPClassifier(),
                    RadiusNeighborsClassifier(), #! fails on this data ("No neighbors found for test samples"), so it is reported as an exception and gets no score
                    KNeighborsClassifier(),
                    KNeighborsClassifier(n_neighbors=1),
                    SGDClassifier(),
                    RidgeClassifierCV(),
                    RidgeClassifier(),
                    PassiveAggressiveClassifier(),
                    GaussianProcessClassifier(),
                    AdaBoostClassifier(),
                    GradientBoostingClassifier(),
                    BaggingClassifier(),
                    ExtraTreesClassifier(),
                    RandomForestClassifier(),
                    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1),
                    BernoulliNB(),
                    CalibratedClassifierCV(),
                    GaussianNB(),
                    LabelPropagation(),
                    LabelSpreading(),
                    LinearDiscriminantAnalysis(),
                    LinearSVC(),
                    LogisticRegression(),
                    LogisticRegressionCV(),
                    MultinomialNB(),
                    NearestCentroid(),
                    NuSVC(),
                    Perceptron(),
                    QuadraticDiscriminantAnalysis(),
                    SVC(),
                    BayesianGaussianMixture(),
                    GaussianMixture(),
                    # Disabled entries: presumably these cannot be created
                    # without arguments - TODO confirm.
                    #! ClassifierChain(),
                    #! MultiOutputClassifier(),
                    #! OutputCodeClassifier(),
                    #! OneVsOneClassifier(),
                    #! OneVsRestClassifier(),
                    #! VotingClassifier(),
                    deodel.DeodataDelangaClassifier(),
                ]

# Record the library versions the comparison was run with.
print("- - - - - - - - - ")
print("- - - - deodel version:", str(deodel.DeodataDelangaClassifier.version))
print("- - - - scikit-learn version:", str(sklearn.__version__))
print("- - - - - - - - - ")


- - - - - - - - - 
- - - - deodel version: 1.75
- - - - scikit-learn version: 1.2.2
- - - - - - - - - 


### Extract CSV data

In [5]:
#@title
# Load the Titanic table (the titanic.csv file in the working directory) into
# a DataFrame and print its first rows.
train_data = pd.read_csv("titanic.csv")
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
print("train_data.head:")
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
print(train_data.head())
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")

- - - - - - - - - - - - - - - - - - - - - - - - - - - - 
train_data.head:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - 
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7

In [6]:
#@title
# Bare expression as the only statement of the cell: the notebook renders its
# value (the first rows of the table) as the cell result.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
#@title
# Overview of the loaded table: number of rows, column summary and shape.
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
print("train_data entry no:", len(train_data))
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
print("train_data.info:")
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
# info() prints its report itself and returns None, which is why the report
# is followed by a line reading "None".
print(train_data.info())
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
print("train_data.shape:")
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
print(train_data.shape)
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")

- - - - - - - - - - - - - - - - - - - - - - - - - - - - 
train_data entry no: 891
- - - - - - - - - - - - - - - - - - - - - - - - - - - - 
train_data.info:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
- - - - - - - - - - - - - 

### Preparing data

In [8]:
#@title


In [9]:
#@title
# Declare the target/feature column names, check them against the columns
# actually present in the table and set the parameters of the test runs.
data_columns = train_data.columns.tolist()
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
print("data_columns:", data_columns)

# The target is kept as a one-element list; helpers read col_target[0].
col_target = ['Survived']
col_features_all = ["PassengerId", "Pclass", "Name", "Sex", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked"]

# NOTE(review): data_columns is printed here a second time.
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
print("data_columns:", data_columns)
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
print("col_features_all:", col_features_all)
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
print("col_target:", col_target)
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")

# Sanity check: the declared names must be exactly the columns of the table
# (compared as sets, so column order does not matter).
if (set(col_features_all + col_target) != set(data_columns)) :
    error_msg = "inconsistency! - column description mismatch"
    print(error_msg)
    raise ValueError(error_msg)
else :
    print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
    print("colums descriptions are consistent")

# classifier_lst = GetClassifierList()
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
print("classifier_lst: ", classifier_lst)
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")

# Run parameters used by the test cells below: number of random splits that
# are averaged, base seed for the splits, share of rows held out for testing.
# iterations = 1
iterations = 50
random_seed = 42
# random_seed = 4242
# random_seed = 4342
# random_seed = 4442

test_fraction = 0.5

- - - - - - - - - - - - - - - - - - - - - - - - - - - - 
data_columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
- - - - - - - - - - - - - - - - - - - - - - - - - - - - 
data_columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
- - - - - - - - - - - - - - - - - - - - - - - - - - - - 
col_features_all: ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
- - - - - - - - - - - - - - - - - - - - - - - - - - - - 
col_target: ['Survived']
- - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - 
colums descriptions are consistent
- - - - - - - - - - - - - - - - - - - - - - - - - - - - 
classifier_lst:  [ExtraTreeClassifier(), HistGradientBoostingClassifier(), DecisionTreeClassifier(), OneClassSVM(), MLPClassifier(), RadiusNeighborsClassifi

## Accuracy compare test
This test uses the same attribute columns as those in Alexis Cook's recommended [Kaggle Titanic Tutorial](https://www.kaggle.com/code/alexisbcook/titanic-tutorial/notebook).

### Execute test

In [10]:
#@title
# Feature columns used for the comparison.
col_features_select = ["Pclass", "Sex", "SibSp", "Parch"]
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
print("col_features_select:", col_features_select)
# One-hot encode the selected features (pd.get_dummies) and fetch the target.
X_pd_data, y_pd_data = GetDummiesForSelectAttr(train_data, col_target, col_features_select)

print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
# Evaluate every classifier; test_tbl collects [accuracy, printed line] pairs.
test_tbl = GroupAccuracyTest(X_pd_data, y_pd_data, classifier_lst, iterations, random_seed, test_fraction)
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
# Show the data the classifiers were evaluated on.
PredictionDataShow(X_pd_data, y_pd_data)
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")



- - - - - - - - - - - - - - - - - - - - - - - - - - - - 
col_features_select: ['Pclass', 'Sex', 'SibSp', 'Parch']
- - - - - - - - - - - - - - - - - - - - - - - - - - - - 
0.7823766816143496  ExtraTreeClassifier() 
0.7871300448430495  HistGradientBoostingClassifier() 
0.7820179372197309  DecisionTreeClassifier() 
0.15242152466367714 OneClassSVM() 




0.797578475336323   MLPClassifier() 
exception for: RadiusNeighborsClassifier()
error: No neighbors found for test samples array([227]), you can try using larger radius, giving a label for outliers, or considering removing them from your dataset.
0.773946188340807   KNeighborsClassifier() 
0.7263228699551572  KNeighborsClassifier(n_neighbors=1) 
0.7065919282511212  SGDClassifier() 
0.7939461883408073  RidgeClassifierCV() 
0.7939910313901346  RidgeClassifier() 
0.7125112107623319  PassiveAggressiveClassifier() 
0.7927802690582959  GaussianProcessClassifier() 
0.7937668161434975  AdaBoostClassifier() 
0.786457399103139   GradientBoostingClassifier() 
0.7817488789237668  BaggingClassifier() 
0.7834080717488788  ExtraTreesClassifier() 
0.783183856502242   RandomForestClassifier() 
0.7921076233183855  RandomForestClassifier(max_depth=5, random_state=1) 
0.7890582959641256  BernoulliNB() 




0.7967713004484309  CalibratedClassifierCV() 
0.7966367713004484  GaussianNB() 
0.7853811659192824  LabelPropagation() 
0.7851121076233183  LabelSpreading() 
0.7936322869955157  LinearDiscriminantAnalysis() 




0.7962780269058298  LinearSVC() 
0.7965919282511212  LogisticRegression() 
0.7951121076233189  LogisticRegressionCV() 
0.7847533632286995  MultinomialNB() 
0.755605381165919   NearestCentroid() 
0.8043946188340807  NuSVC() 
0.7169058295964125  Perceptron() 




0.6643946188340807  QuadraticDiscriminantAnalysis() 
0.8029147982062781  SVC() 
0.6187892376681613  BayesianGaussianMixture() 
0.6187892376681613  GaussianMixture() 
0.8049327354260087  DeodataDelangaClassifier({}) 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - 
x_pd_data.info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   Pclass      891 non-null    int64
 1   SibSp       891 non-null    int64
 2   Parch       891 non-null    int64
 3   Sex_female  891 non-null    uint8
 4   Sex_male    891 non-null    uint8
dtypes: int64(3), uint8(2)
memory usage: 22.7 KB
None
- - - - - - - - - - - - - - - - - - - - - - - - - - - - 
x_pd_data.head:
   Pclass  SibSp  Parch  Sex_female  Sex_male
0       3      1      0           0         1
1       1      1      0           1         0
2       3   

### Compare test result

In [11]:
#@title
# Print the ranked results of the comparison, followed by a time stamp.
DisplayCompareAccuracyTest(X_pd_data, y_pd_data, test_tbl)
print("- - - - - - - - -", GetTimeStr())


- - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - CompareAccuracyTest - Start
- - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - average accuracy test

- - - - iterations: 50
- - - - random_seed: 42
- - - - test_fraction: 0.5
- - - - - - - - - - - - - - - - - - - - - - - - - - - - 

- - - - ranked accuracy result

- - - - - - - - - - - - - - - - - - - - - - - - - - - - 

accuracy: 0.8049327354260087  DeodataDelangaClassifier({}) 
accuracy: 0.8043946188340807  NuSVC() 
accuracy: 0.8029147982062781  SVC() 
accuracy: 0.797578475336323   MLPClassifier() 
accuracy: 0.7967713004484309  CalibratedClassifierCV() 
accuracy: 0.7966367713004484  GaussianNB() 
accuracy: 0.7965919282511212  LogisticRegression() 
accuracy: 0.7962780269058298  LinearSVC() 
accuracy: 0.7951121076233189  LogisticRegressionCV() 
accuracy: 0.7939910313901346  RidgeClassifier() 
accuracy: 0.7939461883408073  RidgeClassifierCV() 
accuracy: 0.7937668161434975  AdaBoostClassifier() 
accuracy: 0.793

## Test with the raw attributes
In this test the table values are left unchanged. Non-numerical attributes are automatically treated as multi-valued categorical/nominal features.

### Execute test

In [12]:
#@title
# Raw feature columns, taken from the table without get_dummies encoding.
X_pd_data = train_data[col_features_select]
# col_target is a list, so this selects a one-column DataFrame; its first
# (only) column is then taken as the target.
yy_pd_data = train_data[col_target]
y_pd_data = (yy_pd_data.iloc[:,0])

print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
PredictionDataShow(X_pd_data, y_pd_data)
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
# Only deodel is evaluated on the raw columns.
test_tbl = GroupAccuracyTest(X_pd_data, y_pd_data, [deodel.DeodataDelangaClassifier()], iterations, random_seed, test_fraction)
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")



- - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - 
x_pd_data.info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Pclass  891 non-null    int64 
 1   Sex     891 non-null    object
 2   SibSp   891 non-null    int64 
 3   Parch   891 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 28.0+ KB
None
- - - - - - - - - - - - - - - - - - - - - - - - - - - - 
x_pd_data.head:
   Pclass     Sex  SibSp  Parch
0       3    male      1      0
1       1  female      1      0
2       3  female      0      0
3       1  female      1      0
4       3    male      0      0
- - - - - - - - - - - - - - - - - - - - - - - - - - - - 
x_pd_data.shape:
(891, 4)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - 

- - - - - - - - - - - - - - - - - - - - - - - - - - - - 
y_pd_data.info:
<class 'pandas.

### Raw test result

In [13]:
#@title
# Print the result of the raw-attribute run, followed by a time stamp.
# DisplayCompareAccuracyTest(X_pd_data, y_pd_data, test_tbl)

print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
print()
print("- - - - average accuracy result")
print()
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
print()
# Element [1] of a test_tbl entry is the formatted "accuracy  classifier"
# line; [0] addresses the first entry.
print("accuracy: %s"%(test_tbl[0][1]))
print()
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
print()
print("- - - - - - - - -", GetTimeStr())


- - - - - - - - - - - - - - - - - - - - - - - - - - - - 

- - - - average accuracy result

- - - - - - - - - - - - - - - - - - - - - - - - - - - - 

accuracy: 0.8052017937219729  DeodataDelangaClassifier({}) 

- - - - - - - - - - - - - - - - - - - - - - - - - - - - 

- - - - - - - - - 2023-06-14 07:41 PM (UTC)


##### end

In [14]:
#@title
# >- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Closing report: end time of the run and the versions of Python,
# scikit-learn and deodel that were used.
import sys
import sklearn

print()
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
print("- - - - execution end:", GetTimeStr())
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
print("- - - - python version:", str(sys.version))
print("- - - - scikit-learn version:", str(sklearn.__version__))
print("- - - - deodel version:", str(deodel.DeodataDelangaClassifier.version))
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
print()
print("- - - test end")
print()

# >- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



- - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - execution end: 2023-06-14 07:41 PM (UTC)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - python version: 3.10.12 (main, Jun  7 2023, 12:45:35) [GCC 9.4.0]
- - - - scikit-learn version: 1.2.2
- - - - deodel version: 1.75
- - - - - - - - - - - - - - - - - - - - - - - - - - - - 

- - - test end

