In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
#from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import cross_val_score


In [3]:
# Location of the CSV file and its field delimiter.
file = 'csv_result-column_2C_weka.csv'
sep=','

# `sep` has to be passed by keyword: since pandas 2.0 every read_csv argument
# after the path is keyword-only, so the positional form raises a TypeError.
data = pd.read_csv(file, sep=sep)
data

Unnamed: 0,id,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class
0,1,63.027817,22.552586,39.609117,40.475232,98.672917,-0.254400,Abnormal
1,2,39.056951,10.060991,25.015378,28.995960,114.405425,4.564259,Abnormal
2,3,68.832021,22.218482,50.092194,46.613539,105.985135,-3.530317,Abnormal
3,4,69.297008,24.652878,44.311238,44.644130,101.868495,11.211523,Abnormal
4,5,49.712859,9.652075,28.317406,40.060784,108.168725,7.918501,Abnormal
...,...,...,...,...,...,...,...,...
305,306,47.903565,13.616688,36.000000,34.286877,117.449062,-4.245395,Normal
306,307,53.936748,20.721496,29.220534,33.215251,114.365845,-0.421010,Normal
307,308,61.446597,22.694968,46.170347,38.751628,125.670725,-2.707880,Normal
308,309,45.252792,8.693157,41.583126,36.559635,118.545842,0.214750,Normal


In [6]:
# Drop the row identifier and encode the label: Abnormal -> 1, Normal -> 0.
#
# The `id` column is removed here so the notebook works from a fresh kernel:
# the frame displayed below has no `id`, but no remaining cell drops it.
# errors='ignore' keeps the cell safe to re-run once `id` is already gone.
data = data.drop(columns=['id'], errors='ignore')

# Assign the encoded column back explicitly instead of calling
# replace(..., inplace=True) on data['class']: an inplace call on a selected
# column is chained assignment and does not update the frame under pandas
# Copy-on-Write. The identity entries (1 -> 1, 0 -> 0) make a re-run on
# already-encoded labels a no-op; any unexpected label becomes NaN and makes
# astype(int) fail loudly instead of leaking into the model.
class_codes = {'Abnormal': 1, 'Normal': 0, 1: 1, 0: 0}
data['class'] = data['class'].map(class_codes).astype(int)

data

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class
0,63.027817,22.552586,39.609117,40.475232,98.672917,-0.254400,1
1,39.056951,10.060991,25.015378,28.995960,114.405425,4.564259,1
2,68.832021,22.218482,50.092194,46.613539,105.985135,-3.530317,1
3,69.297008,24.652878,44.311238,44.644130,101.868495,11.211523,1
4,49.712859,9.652075,28.317406,40.060784,108.168725,7.918501,1
...,...,...,...,...,...,...,...
305,47.903565,13.616688,36.000000,34.286877,117.449062,-4.245395,0
306,53.936748,20.721496,29.220534,33.215251,114.365845,-0.421010,0
307,61.446597,22.694968,46.170347,38.751628,125.670725,-2.707880,0
308,45.252792,8.693157,41.583126,36.559635,118.545842,0.214750,0


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
data.describe()

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class
count,310.0,310.0,310.0,310.0,310.0,310.0,310.0
mean,60.496653,17.542822,51.93093,42.953831,117.920655,26.296694,0.677419
std,17.23652,10.00833,18.554064,13.423102,13.317377,37.559027,0.46822
min,26.147921,-6.554948,14.0,13.366931,70.082575,-11.058179,0.0
25%,46.430294,10.667069,37.0,33.347122,110.709196,1.603727,0.0
50%,58.691038,16.357689,49.562398,42.404912,118.268178,11.767934,1.0
75%,72.877696,22.120395,63.0,52.695888,125.467674,41.287352,1.0
max,129.834041,49.431864,125.742385,121.429566,163.071041,418.543082,1.0


In [12]:
# split into explanatory and response variables
# Columns are selected by name rather than by position, so the split does not
# depend on which columns (e.g. `id`) happen to be present in `data` or on
# their order. The six features are listed in the order they appear in the frame.
feature_columns = [
    'pelvic_incidence',
    'pelvic_tilt',
    'lumbar_lordosis_angle',
    'sacral_slope',
    'pelvic_radius',
    'degree_spondylolisthesis',
]
X = data[feature_columns]
Y = data['class']

In [13]:
# Fit a logistic-regression classifier on the full data set
# (fit() returns the estimator itself, so construction and fitting chain)
reg = LogisticRegression().fit(X, Y)

print("Coefficients: ", reg.coef_)
print("Intercept: ", reg.intercept_)

# Predict labels for the same observations the model was trained on
Y_pred = reg.predict(X)

cm = confusion_matrix(Y, Y_pred)
print("Confusion matrix:\n", cm)

# Accuracy = correctly classified (main diagonal) / all observations
accuracy = cm.trace() / cm.sum()
print("Accuracy calculated from the training set = %.3f" % accuracy)

# Per-class precision / recall / F1, again measured on the training data
print(classification_report(Y, Y_pred, target_names=['no', 'yes']))

Coefficients:  [[-0.0070301   0.08258075 -0.01870319 -0.08961076 -0.1067678   0.16811009]]
Intercept:  [15.15499493]
Confusion matrix:
 [[ 78  22]
 [ 22 188]]
Accuracy calculated from the training set = 0.858
              precision    recall  f1-score   support

          no       0.78      0.78      0.78       100
         yes       0.90      0.90      0.90       210

    accuracy                           0.86       310
   macro avg       0.84      0.84      0.84       310
weighted avg       0.86      0.86      0.86       310



In [14]:
# Estimate accuracy with k-fold cross-validation
k = 10  # number of folds
scores = cross_val_score(reg, X, Y, scoring="accuracy", cv=k)

# One accuracy value per fold, followed by their mean
print("Accuracies from %d individual folds:" % k)
print(scores)
print("Accuracy calculated using %d-fold cross validation = %.3f" % (k, scores.mean()))

Accuracies from 10 individual folds:
[0.58064516 0.67741935 0.74193548 0.77419355 0.90322581 0.93548387
 0.93548387 0.96774194 0.83870968 0.93548387]
Accuracy calculated using 10-fold cross validation = 0.829


In [16]:
# Retrieve estimated class probabilities for the training observations.
# Each row holds one observation; columns follow reg.classes_, so with the
# 0/1 label encoding column 0 is P(class = 0) and column 1 is P(class = 1).
reg.predict_proba(X)

array([[1.64019223e-01, 8.35980777e-01],
       [2.31914918e-01, 7.68085082e-01],
       [6.26521433e-01, 3.73478567e-01],
       [5.30092783e-02, 9.46990722e-01],
       [2.20058752e-01, 7.79941248e-01],
       [5.85969115e-01, 4.14030885e-01],
       [4.59643014e-01, 5.40356986e-01],
       [9.03415180e-01, 9.65848199e-02],
       [2.07058106e-01, 7.92941894e-01],
       [5.70396052e-02, 9.42960395e-01],
       [7.11969377e-01, 2.88030623e-01],
       [1.03099814e-01, 8.96900186e-01],
       [1.51552123e-01, 8.48447877e-01],
       [9.81039213e-02, 9.01896079e-01],
       [1.97527130e-01, 8.02472870e-01],
       [4.96342187e-01, 5.03657813e-01],
       [6.51086090e-01, 3.48913910e-01],
       [7.50314686e-01, 2.49685314e-01],
       [4.39969334e-01, 5.60030666e-01],
       [4.96990386e-01, 5.03009614e-01],
       [6.51929413e-01, 3.48070587e-01],
       [5.68254994e-01, 4.31745006e-01],
       [2.81017589e-02, 9.71898241e-01],
       [7.30517186e-01, 2.69482814e-01],
       [1.037655