In [178]:
# Import required dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [179]:
model_scores = pd.DataFrame()

def build_model_scores_df(name, trainacc, testacc):
    model_scores[name] = [trainacc, testacc]

In [180]:
# Import data
file_path = "vertebral-column.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class
0,63.027817,22.552586,39.609117,40.475232,98.672917,-0.2544,Hernia
1,39.056951,10.060991,25.015378,28.99596,114.405425,4.564259,Hernia
2,68.832021,22.218482,50.092194,46.613539,105.985135,-3.530317,Hernia
3,69.297008,24.652878,44.311238,44.64413,101.868495,11.211523,Hernia
4,49.712859,9.652075,28.317406,40.060784,108.168725,7.918501,Hernia


In [181]:
# Check the data types
df.dtypes

pelvic_incidence            float64
pelvic_tilt                 float64
lumbar_lordosis_angle       float64
sacral_slope                float64
pelvic_radius               float64
degree_spondylolisthesis    float64
class                        object
dtype: object

In [182]:
# Get the features (everything except the "Class" column)
X = df.copy()
X = X.drop(columns="class")
X.head()

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis
0,63.027817,22.552586,39.609117,40.475232,98.672917,-0.2544
1,39.056951,10.060991,25.015378,28.99596,114.405425,4.564259
2,68.832021,22.218482,50.092194,46.613539,105.985135,-3.530317
3,69.297008,24.652878,44.311238,44.64413,101.868495,11.211523
4,49.712859,9.652075,28.317406,40.060784,108.168725,7.918501


In [183]:
X.dtypes

pelvic_incidence            float64
pelvic_tilt                 float64
lumbar_lordosis_angle       float64
sacral_slope                float64
pelvic_radius               float64
degree_spondylolisthesis    float64
dtype: object

In [184]:
# Get the target variable (the "class" column)
y = df["class"]
y

0      Hernia
1      Hernia
2      Hernia
3      Hernia
4      Hernia
        ...  
305    Normal
306    Normal
307    Normal
308    Normal
309    Normal
Name: class, Length: 310, dtype: object

In [185]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [186]:
y_train

140    Spondylolisthesis
208    Spondylolisthesis
278               Normal
203    Spondylolisthesis
144    Spondylolisthesis
             ...        
188    Spondylolisthesis
71     Spondylolisthesis
106    Spondylolisthesis
270               Normal
102    Spondylolisthesis
Name: class, Length: 232, dtype: object

In [187]:
y_test

289               Normal
9                 Hernia
57                Hernia
60     Spondylolisthesis
25                Hernia
             ...        
30                Hernia
22                Hernia
262               Normal
56                Hernia
148    Spondylolisthesis
Name: class, Length: 78, dtype: object

In [188]:
# Since the target column is an object, we need to convert the data to numerical classes
# Encode the y data
# Create an instance of the label encoder
le = LabelEncoder()

# Fit and transform the y training and testing data using the label encoder
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)
y_train_encoded

array([2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 1, 0, 2, 2, 1, 1, 2, 0, 0, 1, 1,
       2, 2, 1, 0, 0, 0, 2, 1, 2, 2, 2, 0, 2, 2, 1, 2, 0, 1, 2, 2, 2, 2,
       1, 2, 2, 1, 2, 1, 1, 2, 2, 1, 1, 2, 0, 1, 2, 2, 2, 2, 2, 1, 1, 1,
       2, 1, 0, 2, 2, 2, 2, 1, 1, 2, 2, 0, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2,
       2, 1, 2, 0, 0, 0, 2, 2, 1, 2, 2, 0, 2, 1, 2, 1, 0, 2, 1, 2, 0, 1,
       2, 1, 0, 1, 1, 2, 2, 1, 2, 2, 2, 0, 2, 2, 1, 1, 1, 0, 1, 2, 1, 0,
       1, 0, 2, 0, 1, 2, 1, 1, 2, 2, 1, 2, 2, 1, 0, 2, 1, 2, 0, 1, 2, 0,
       2, 1, 1, 2, 0, 1, 2, 2, 1, 2, 1, 0, 1, 1, 1, 2, 0, 0, 2, 1, 1, 1,
       2, 2, 0, 1, 2, 2, 0, 0, 0, 2, 2, 0, 1, 2, 0, 1, 1, 0, 2, 1, 2, 1,
       2, 1, 1, 0, 0, 2, 2, 1, 2, 2, 0, 0, 1, 1, 0, 2, 1, 2, 1, 1, 2, 2,
       2, 2, 2, 1, 2, 1, 0, 2, 2, 2, 1, 2])

In [189]:
X_train

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis
140,69.563486,15.401139,74.438497,54.162347,105.067356,29.701211
208,87.679087,20.365613,93.822416,67.313473,120.944829,76.730629
278,40.413366,-1.329412,30.982768,41.742778,119.335655,-6.173675
203,73.635962,9.711318,63.000000,63.924644,98.727930,26.975787
144,60.044177,14.309656,58.038865,45.734521,105.131664,30.409133
...,...,...,...,...,...,...
188,85.680950,38.650035,82.680977,47.030914,120.840707,61.959034
71,86.900794,32.928168,47.794347,53.972627,135.075364,101.719092
106,65.013773,9.838262,57.735837,55.175511,94.738525,49.696955
270,51.311771,8.875541,57.000000,42.436230,126.472258,-2.144044


In [190]:
X_test

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis
289,44.430701,14.174264,32.243495,30.256437,131.717613,-3.604255
9,36.686353,5.010884,41.948751,31.675469,84.241415,0.664437
57,46.855781,15.351514,38.000000,31.504267,116.250917,1.662706
60,74.377678,32.053104,78.772013,42.324573,143.560690,56.125906
25,54.124920,26.650489,35.329747,27.474432,121.447011,1.571205
...,...,...,...,...,...,...
30,50.819268,15.402213,42.528939,35.417055,112.192804,10.869566
22,63.073611,24.413803,54.000000,38.659808,106.424329,15.779697
262,42.517272,14.375671,25.323565,28.141601,128.905689,0.757020
56,43.349606,7.467469,28.065483,35.882137,112.776187,5.753277


In [191]:
# Create the logistic regression classifier model with a random_state of 1
lr_model = LogisticRegression(max_iter=5000, random_state=1)

# Fit the model to the training data
lr_model.fit(X_train, y_train_encoded)

In [192]:
train_accuracy = lr_model.score(X_train, y_train_encoded)
test_accuracy = lr_model.score(X_test, y_test_encoded)
build_model_scores_df ( "LogisticRegression", train_accuracy, test_accuracy )

# Validate the model by checking the model accuracy with model.score
print('Train Accuracy: %.3f' % train_accuracy)
print('Test Accuracy: %.3f' % test_accuracy)

Train Accuracy: 0.892
Test Accuracy: 0.782


In [193]:
# Create the support vector machine classifier model with a 'poly' kernel
svm_model = SVC(kernel='poly')

# Fit the model to the training data
svm_model.fit(X_train, y_train_encoded)

In [194]:
train_accuracy = svm_model.score(X_train, y_train_encoded)
test_accuracy = svm_model.score(X_test, y_test_encoded)
build_model_scores_df ( "SVC", train_accuracy, test_accuracy )


# Validate the model by checking the model accuracy with model.score
print('Train Accuracy: %.3f' % train_accuracy)
print('Test Accuracy: %.3f' % test_accuracy)

Train Accuracy: 0.901
Test Accuracy: 0.833


In [195]:
# Create the KNN model with 9 neighbors
knn_model = KNeighborsClassifier(n_neighbors=9)

# Fit the model to the training data
knn_model.fit(X_train, y_train_encoded)

In [196]:
train_accuracy = knn_model.score(X_train, y_train_encoded)
test_accuracy = knn_model.score(X_test, y_test_encoded)
build_model_scores_df ( "KNN", train_accuracy, test_accuracy )


# Validate the model by checking the model accuracy with model.score
print('Train Accuracy: %.3f' % train_accuracy)
print('Test Accuracy: %.3f' % test_accuracy)

Train Accuracy: 0.892
Test Accuracy: 0.782


In [197]:
# Create the random forest classifier model
# with n_estimators=128 and random_state=1
rf_model = RandomForestClassifier(n_estimators=128, random_state=42)

# Fit the model to the training data
rf_model.fit(X_train, y_train_encoded)

In [198]:
train_accuracy = rf_model.score(X_train, y_train_encoded)
test_accuracy = rf_model.score(X_test, y_test_encoded)
build_model_scores_df ( "RF", train_accuracy, test_accuracy )


# Validate the model by checking the model accuracy with model.score
print('Train Accuracy: %.3f' % train_accuracy)
print('Test Accuracy: %.3f' % test_accuracy)

Train Accuracy: 1.000
Test Accuracy: 0.795


In [199]:
# Create the decision tree classifier model
dt_model = DecisionTreeClassifier()

# Fit the model to the training data
dt_model.fit(X_train, y_train_encoded)

In [200]:
train_accuracy = dt_model.score(X_train, y_train_encoded)
test_accuracy = dt_model.score(X_test, y_test_encoded)
build_model_scores_df ( "DT", train_accuracy, test_accuracy )

# Validate the model by checking the model accuracy with model.score
print('Train Accuracy: %.3f' % train_accuracy)
print('Test Accuracy: %.3f' % test_accuracy)

Train Accuracy: 1.000
Test Accuracy: 0.731


In [201]:
model_scores

Unnamed: 0,LogisticRegression,SVC,KNN,RF,DT
0,0.892241,0.900862,0.892241,1.0,1.0
1,0.782051,0.833333,0.782051,0.794872,0.730769


In [202]:
sample = df.sample ( frac=.6 )

In [203]:
import pandas as pd
y = sample["class"]

y_sample_encoded = le.transform(y)

sample = sample.drop("class", axis=1)

sample_accuracy = dt_model.score(sample, y_sample_encoded)
print('Sample Accuracy: %.3f' % sample_accuracy)


Sample Accuracy: 0.925


In [204]:
y_sample_encoded

array([0, 2, 0, 1, 2, 0, 1, 2, 2, 0, 1, 0, 2, 1, 1, 0, 0, 1, 2, 2, 1, 2,
       1, 0, 0, 0, 2, 2, 1, 0, 2, 1, 2, 2, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2,
       2, 1, 2, 0, 2, 2, 1, 2, 1, 2, 0, 2, 1, 2, 2, 2, 2, 0, 1, 2, 2, 2,
       2, 1, 2, 2, 1, 0, 1, 0, 0, 1, 1, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 0,
       1, 1, 2, 2, 1, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 2, 1, 2, 1, 2, 1, 1,
       1, 0, 2, 0, 2, 2, 2, 1, 2, 2, 2, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2,
       1, 2, 2, 1, 0, 2, 2, 1, 2, 1, 2, 2, 0, 2, 1, 2, 1, 2, 2, 1, 2, 1,
       0, 2, 1, 2, 2, 1, 0, 2, 2, 1, 1, 0, 2, 0, 0, 1, 0, 2, 2, 0, 1, 2,
       2, 1, 0, 0, 2, 0, 2, 2, 1, 2])

In [205]:
sample

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis
37,35.703458,19.443253,20.700000,16.260205,137.540613,-0.263490
102,70.399308,13.469986,61.200000,56.929322,102.337524,25.538429
20,43.922840,14.177959,37.832547,29.744881,134.461016,6.451648
298,66.507179,20.897672,31.727471,45.609507,128.902905,1.517203
170,64.809541,15.174078,58.839994,49.635463,111.679961,21.407198
...,...,...,...,...,...,...
10,49.706610,13.040974,31.334500,36.665635,108.648265,-7.825986
198,74.854480,13.909084,62.693259,60.945396,115.208701,33.172255
147,55.080766,-3.759930,56.000000,58.840695,109.915367,31.773583
215,30.741938,13.354966,35.903526,17.386972,142.410107,-2.005373


In [206]:
y_hat = dt_model.predict(sample)
y_hat

array([0, 2, 0, 0, 2, 0, 1, 2, 2, 0, 1, 0, 2, 1, 1, 0, 1, 1, 2, 2, 1, 2,
       1, 0, 1, 0, 2, 1, 1, 0, 2, 1, 2, 2, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2,
       2, 1, 2, 0, 2, 2, 1, 2, 1, 2, 0, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 1, 0, 1, 0, 0, 1, 1, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 0,
       1, 1, 2, 2, 1, 1, 0, 1, 2, 2, 2, 0, 1, 0, 1, 2, 1, 2, 1, 2, 1, 1,
       1, 1, 2, 0, 2, 2, 2, 1, 2, 2, 2, 0, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2,
       1, 2, 2, 1, 1, 2, 2, 0, 2, 0, 2, 2, 0, 2, 1, 2, 1, 2, 2, 1, 2, 1,
       0, 2, 1, 2, 2, 1, 0, 2, 2, 1, 1, 0, 2, 1, 0, 1, 0, 2, 2, 0, 1, 2,
       2, 1, 0, 0, 2, 0, 2, 2, 1, 2])

In [207]:
sample = df.sample ( frac=.3 )

y = sample["class"]

y_sample_encoded = le.transform(y)

sample = sample.drop("class", axis=1)

sample_accuracy = dt_model.score(sample, y_sample_encoded)
print('Sample Accuracy: %.3f' % sample_accuracy)


Sample Accuracy: 0.935


In [208]:
sample = df.sample ( frac=.6 )

y = sample["class"]

y_sample_encoded = le.transform(y)

sample = sample.drop("class", axis=1)

sample_accuracy = svm_model.score(sample, y_sample_encoded)
print('Sample Accuracy: %.3f' % sample_accuracy)

Sample Accuracy: 0.855


In [209]:
sample = df.sample ( frac=.6 )

y = sample["class"]

y_sample_encoded = le.transform(y)

sample = sample.drop("class", axis=1)

sample_accuracy = rf_model.score(sample, y_sample_encoded)
print('Sample Accuracy: %.3f' % sample_accuracy)

Sample Accuracy: 0.946


In [210]:
sample = df.sample ( frac=.6 )

y = sample["class"]

y_sample_encoded = le.transform(y)

sample = sample.drop("class", axis=1)

sample_accuracy = knn_model.score(sample, y_sample_encoded)
print('Sample Accuracy: %.3f' % sample_accuracy)

Sample Accuracy: 0.887
