In [409]:
import pandas as pd
from sklearn import tree
def read_data():
    data = pd.read_table('./vertebral_column_data/column_3C.dat',header=None, sep=' ')
    data.columns = ['pelvic_incidence', 'pelvic_tilt', 'lumbar_lordosis_angle', 'sacral_slope', 'pelvic_radius',
                    'degree_spondylolisthesis', 'class']
    return data

def generate_data_info(data):
    features_set = data.columns[:6]
    label_name = data.columns[6]
    training_labels = data.iloc[:,6]
    overall_features_data = data.iloc[:, :6]
    return features_set, label_name, training_labels, overall_features_data
# features_set, label_name, training_labels, overall_features_data = generate_data_info(read_data())
print(read_data())


     pelvic_incidence  pelvic_tilt  lumbar_lordosis_angle  sacral_slope  \
0               63.03        22.55                  39.61         40.48   
1               39.06        10.06                  25.02         29.00   
2               68.83        22.22                  50.09         46.61   
3               69.30        24.65                  44.31         44.64   
4               49.71         9.65                  28.32         40.06   
..                ...          ...                    ...           ...   
305             47.90        13.62                  36.00         34.29   
306             53.94        20.72                  29.22         33.22   
307             61.45        22.69                  46.17         38.75   
308             45.25         8.69                  41.58         36.56   
309             33.84         5.07                  36.64         28.77   

     pelvic_radius  degree_spondylolisthesis class  
0            98.67                     -0.25  

In [394]:
from sklearn.model_selection import train_test_split
def train_split(test_size, shuffle, random_state):
    data = read_data()
    features_set, label_name, training_labels, overall_features_data = generate_data_info(data)
    data_train, data_test, labels_train, labels_test = train_test_split(overall_features_data, training_labels,
                                                                        test_size=test_size, shuffle=True,
                                                                        random_state=random_state, 
                                                                        stratify=training_labels)
    return data_train, data_test, labels_train, labels_test, features_set

In [406]:
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
import numpy as np
def predict(max_depth, criterion, data_train, labels_train, data_test, labels_test):
    model = tree.DecisionTreeClassifier(max_depth=max_depth, criterion=criterion)
    model = model.fit(data_train, labels_train)
    return model

In [421]:
import graphviz
def save_tree_file(model, test_size, random_state, max_depth, criterion, scores, features_set):
    dot_data = tree.export_graphviz(model, out_file=None,
        feature_names=features_set,
        class_names=label_name,
        filled=True, rounded=True,
        special_characters=True)
    graph = graphviz.Source(dot_data)
    graph.format='png'
    graph.render('./Decision_Tree_GraphsGraph-TestSize({})-RandomState({})-MaxDepth({})-Criterion({})-Score({})'
                 .format(test_size, random_state, max_depth, criterion, scores), view=False)

In [410]:
import matplotlib.pyplot as plt
def visualize_data(x_array, y_array, x_label, y_label, criterion,graph_name):
    plt.title(graph_name)
    plt.scatter(x_array, y_array, c=criterion)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.savefig('./Decision_Tree_Analysis_Graph/{}.jpg'.format(graph_name))
    plt.show()
    

In [405]:
import numpy as np
from sklearn import metrics
from sklearn import tree
def generate_train_split_decision_trees():
    random_state_list = [7,27, 43]
    criterion_list = ['entropy', 'gini']
    features_set=''
    max_scores_list = []
    max_test_size_list = []
    max_criterion_list = []
    max_depth_list = []
    max_node_count_list = []
    DH_Scores_list = []
    SL_Scores_list = []
    NO_Scores_list = []
    DH_SL_Scores_list = []
    DH_NO_Scores_list = []
    SL_NO_Scores_list = []

    for testSize in np.arange(0.1, 1, 0.1):
        for cri in criterion_list:
            max_score = 0
            max_test_size = 0
            max_random_state = 0
            max_depth = 0
            max_criterion = ''
            max_confusion_matrix= ''
            max_tree_value = ''
            max_decision_path = ''
            max_model = ''
            max_node_count = 0
            DH_Scores_TMP = 0
            SL_Scores_TMP = 0
            NO_Scores_TMP = 0
            DH_SL_Scores_TMP = 0
            DH_NO_Scores_TMP = 0
            SL_NO_Scores_TMP = 0
            for state in random_state_list:
                for depth in range(1, 10):
                        data_train, data_test, labels_train, labels_test, features_set=train_split(testSize, 
                                                                                                   True, state)
                        model = predict(depth, cri, data_train, labels_train, data_test, labels_test)
                        labels_predict = model.predict(data_test)
                        score = accuracy_score(labels_test, labels_predict)
                        df = pd.concat([data_test, labels_test], axis=1)
                        
                        df_DH = df[df['class']=='DH']
                        DH_data_test = df_DH.iloc[:, :6]
                        DH_labels_test = df_DH.iloc[:, 6]
                        DH_labels_predict = model.predict(DH_data_test)
                        DH_Scores = accuracy_score(DH_labels_test, DH_labels_predict)
                        
                        df_SL = df[df['class']=='SL']
                        SL_data_test = df_SL.iloc[:, :6]
                        SL_labels_test = df_SL.iloc[:, 6]
                        SL_labels_predict = model.predict(SL_data_test)
                        SL_Scores = accuracy_score(SL_labels_test, SL_labels_predict)
                        
                        df_NO = df[df['class']=='NO']
                        NO_data_test = df_NO.iloc[:, :6]
                        NO_labels_test = df_NO.iloc[:, 6]
                        NO_labels_predict = model.predict(NO_data_test)
                        NO_Scores = accuracy_score(NO_labels_test, NO_labels_predict)
                        
                        df_DH_SL = df[df['class']==('SL' or 'DH')]
                        DH_SL_data_test = df_DH_SL.iloc[:, :6]
                        DH_SL_labels_test = df_DH_SL.iloc[:, 6]
                        DH_SL_labels_test.replace('DH', 'SL')
                        DH_SL_labels_predict = model.predict(DH_SL_data_test)
                        DH_SL_labels_predict[1].replace('DH', 'SL')
                        DH_SL_Scores = accuracy_score(DH_SL_labels_test, DH_SL_labels_predict)
                        
                        df_DH_NO = df[df['class']==('NO' or 'DH')]
                        DH_NO_data_test = df_DH_NO.iloc[:, :6]
                        DH_NO_labels_test = df_DH_NO.iloc[:, 6]
                        DH_NO_labels_test.replace('DH', 'NO')
                        DH_NO_labels_predict = model.predict(DH_NO_data_test)
                        DH_NO_labels_predict[1].replace('DH', 'NO')
                        DH_NO_Scores = accuracy_score(DH_NO_labels_test, DH_NO_labels_predict)
                        
                        df_SL_NO = df[df['class']==('NO' or 'DH')]
                        SL_NO_data_test = df_SL_NO.iloc[:, :6]
                        SL_NO_labels_test = df_SL_NO.iloc[:, 6]
                        SL_NO_labels_test.replace('SL', 'NO')
                        SL_NO_labels_predict = model.predict(SL_NO_data_test)
                        SL_NO_labels_predict[1].replace('SL', 'NO')
                        SL_NO_Scores = accuracy_score(SL_NO_labels_test, SL_NO_labels_predict)
                        
                        confusion_matrix = metrics.confusion_matrix(labels_test, labels_predict)
                        tree_value = model.tree_.value
                        decision_path = model.decision_path(data_test).todense()
                        if score > max_score:
                            max_score = score
                            max_test_size = testSize
                            max_random_state = state
                            max_depth = depth
                            max_criterion = cri
                            max_confusion_matrix= confusion_matrix
                            max_tree_value = tree_value
                            max_decision_path = decision_path
                            max_model = model
                            max_node_count = model.tree_.node_count
                            DH_Scores_TMP = DH_Scores
                            SL_Scores_TMP = SL_Scores
                            NO_Scores_TMP = NO_Scores
                            DH_SL_Scores_TMP = DH_SL_Scores
                            DH_NO_Scores_TMP = DH_NO_Scores
                            SL_NO_Scores_TMP = SL_NO_Scores
            max_scores_list.append(max_score)   
            max_test_size_list.append(max_test_size)
            max_depth_list.append(max_depth)
            max_criterion_color = 'b' if max_criterion == 'gini' else 'y'
            max_criterion_list.append(max_criterion_color)
            max_node_count_list.append(max_node_count)
            DH_Scores_list.append(DH_Scores_TMP)
            SL_Scores_list.append(SL_Scores_TMP)
            NO_Scores_list.append(NO_Scores_TMP)
            DH_SL_Scores_list.append(DH_SL_Scores_TMP)
            DH_NO_Scores_list.append(DH_NO_Scores_TMP)
            SL_NO_Scores_list.append(SL_NO_Scores_TMP)
            print('Max TestSize:{} State:{} Depth:{} Criterion: {} model score: {} Node Count: {} \n DH_Scores: {} SL_Scores: {} NO_Scores: {} \n DH_SL_Scores: {} DH_NO_Scores: {} SL_NO_Scores: {}'
                  .format(max_test_size, max_random_state, max_depth, max_criterion, max_score, max_node_count, DH_Scores_TMP, SL_Scores_TMP, NO_Scores_TMP, DH_SL_Scores_TMP, DH_NO_Scores_TMP, SL_NO_Scores_TMP))
#             print('Confusion Matrix: \n', max_confusion_matrix)
#             print('Max Decision Path: ', max_decision_path)
#             save_tree_file(max_model, max_test_size, max_random_state, max_depth, 
#                            max_criterion, max_score, features_set)
#     print(len(max_scores_list))
#     print(len(max_test_size_list))
#     print(len(max_criterion_list))
#     visualize_data(max_test_size_list[0::2], max_scores_list[0::2], 'Test Set Size', 'Max Scores', max_criterion_list[0::2], 'Entropy_Graph_With_Test_Set_Size_and_Max_Scores')
#     visualize_data(max_test_size_list[1::2], max_scores_list[1::2], 'Test Set Size', 'Max Scores', max_criterion_list[1::2], 'Gini_Graph_With_Test_Set_Size_and_Max_Scores')
#     visualize_data(max_test_size_list[0::2], max_depth_list[0::2], 'Test Set Size', 'Max Depth', max_criterion_list[0::2], 'Entropy_Graph_With_Test_Set_Size_and_Max_Depth')
#     visualize_data(max_test_size_list[1::2], max_depth_list[1::2], 'Test Set Size', 'Max Depth', max_criterion_list[1::2], 'Gini_Graph_With_Test_Set_Size_and_Max_Depth')
#     visualize_data(max_depth_list[0::2], max_scores_list[0::2], 'Max Depth', 'Max Scores', max_criterion_list[0::2], 'Entropy_Graph_With_Max_Depth_and_Max_Scores')
#     visualize_data(max_depth_list[1::2], max_scores_list[1::2], 'Max Depth', 'Max Scores', max_criterion_list[1::2], 'Gini_Graph_With_Max_Depth_and_Max_Scores')
#     visualize_data(max_node_count_list[0::2], max_scores_list[0::2], 'Node Count', 'Max Scores', max_criterion_list[0::2], 'Entropy_Graph_With_Max_Node_Count_and_Max_Scores')
#     visualize_data(max_node_count_list[1::2], max_scores_list[1::2], 'Node Count', 'Max Scores', max_criterion_list[1::2], 'Gini_Graph_With_Max_Node_Count_and_Max_Scores')
#     visualize_data(max_test_size_list, DH_Scores_list, 'Test Set Size', 'DH Scores', None, 'Test_Set_Size_and_DH_Scores')
#     visualize_data(max_test_size_list, SL_Scores_list, 'Test Set Size', 'SL Scores', None, 'Test_Set_Size_and_SL_Scores')
#     visualize_data(max_test_size_list, NO_Scores_list, 'Test Set Size', 'NO Scores', None, 'Test_Set_Size_and_NO_Scores')
#     visualize_data(max_test_size_list, DH_SL_Scores_list, 'Test Set Size', 'DH_SL Scores', None, 'Test_Set_Size_and_DH_SL_Scores')
#     visualize_data(max_test_size_list, DH_NO_Scores_list, 'Test Set Size', 'DH_NO Scores', None, 'Test_Set_Size_and_DH_NO_Scores')
#     visualize_data(max_test_size_list, SL_NO_Scores_list, 'Test Set Size', 'SL_NO Scores', None, 'Test_Set_Size_and_SL_NO_Scores')
    
    
    
    
generate_train_split_decision_trees()
                    
                    
                        
        

Max TestSize:0.1 State:27 Depth:6 Criterion: entropy model score: 0.9354838709677419 Node Count: 41 
 DH_Scores: 1.0 SL_Scores: 1.0 NO_Scores: 0.8 
 DH_SL_Scores: 1.0 DH_NO_Scores: 0.8 SL_NO_Scores: 0.8
Max TestSize:0.1 State:27 Depth:4 Criterion: gini model score: 0.9032258064516129 Node Count: 25 
 DH_Scores: 0.6666666666666666 SL_Scores: 1.0 NO_Scores: 0.9 
 DH_SL_Scores: 1.0 DH_NO_Scores: 0.9 SL_NO_Scores: 0.9
Max TestSize:0.2 State:27 Depth:6 Criterion: entropy model score: 0.9193548387096774 Node Count: 41 
 DH_Scores: 0.9166666666666666 SL_Scores: 1.0 NO_Scores: 0.8 
 DH_SL_Scores: 1.0 DH_NO_Scores: 0.8 SL_NO_Scores: 0.8
Max TestSize:0.2 State:27 Depth:3 Criterion: gini model score: 0.9193548387096774 Node Count: 13 
 DH_Scores: 0.75 SL_Scores: 1.0 NO_Scores: 0.9 
 DH_SL_Scores: 1.0 DH_NO_Scores: 0.9 SL_NO_Scores: 0.9
Max TestSize:0.30000000000000004 State:27 Depth:4 Criterion: entropy model score: 0.8936170212765957 Node Count: 21 
 DH_Scores: 0.7222222222222222 SL_Scores: 1.0 

In [399]:
features_set, label_name, training_labels, overall_features_data = generate_data_info(read_data())
# print(type(training_labels))
# print(type(overall_features_data))
# result = pd.concat([overall_features_data, training_labels], axis=1)
# result_DH= result[result['class']=='DH']
# result_SL= result[result['class']=='SL']
training_labels.replace('DH', 'SL')
print(training_labels)


# print(overall_features_data[:, 0:6] + training_labels[:,:])

0      DH
1      DH
2      DH
3      DH
4      DH
       ..
305    NO
306    NO
307    NO
308    NO
309    NO
Name: class, Length: 310, dtype: object


In [265]:
data.head()

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class
0,63.03,22.55,39.61,40.48,98.67,-0.25,DH
1,39.06,10.06,25.02,29.0,114.41,4.56,DH
2,68.83,22.22,50.09,46.61,105.99,-3.53,DH
3,69.3,24.65,44.31,44.64,101.87,11.21,DH
4,49.71,9.65,28.32,40.06,108.17,7.92,DH


In [194]:
data['class'].value_counts()

SL    150
NO    100
DH     60
Name: class, dtype: int64

In [266]:
import graphviz
def save_tree_file():
    dot_data = tree.export_graphviz(model, out_file=None,
        feature_names=features_set,
        class_names=label_name,
        filled=True, rounded=True,
        special_characters=True)
    graph = graphviz.Source(dot_data)
    grpah.format='png'
    graph.render('.//Graphs//Graph-{}'.format(i), view=True)

In [13]:
# Test
from sklearn import tree
from sklearn import datasets
iris = datasets.load_iris()
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
# print
# training_labels = dftrain.iloc[:,label_column_index]
# labels = le.fit_transform(training_labels)
clf = tree.DecisionTreeClassifier(max_depth=5)
clf = clf.fit(iris.data, iris.target)
print(clf)
print(iris.data)
print(iris.target)


DecisionTreeClassifier(max_depth=5)
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.2]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.6 1.4 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 

In [5]:
import graphviz
dot_data = tree.export_graphviz(clf, out_file=None,
 feature_names=iris.feature_names,
class_names=iris.target_names,
filled=True, rounded=True,
 special_characters=True)
graph = graphviz.Source(dot_data)
graph.render('Iris', view=True)

'Iris.pdf'

In [1]:
print("Hello")

Hello
