In [148]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [41]:
#read in data
df = pd.read_table("Datasets/schooldropout.csv", sep=';')

#encode Target variable as integers, with graduate = enrolled so there are 2 classes
df['Target'].replace(['Dropout', 'Graduate', 'Enrolled'], [0, 1, 1], inplace=True)

#split into train and test sets
train, test = train_test_split(df, test_size=0.2)

#split into x and y arrays
trainX = train.drop('Target', axis=1)
trainY = train['Target']

testX = np.asarray(test.drop('Target', axis=1))
testY = np.asarray(test['Target'])

In [122]:
#can't visualize with many predictors and no useful components from FAMD
#try different kernal types, using default parameters, except dual in LinearSVC
#default dual=True is more applicable when n_samples < n_features

#linear
svmLinear = svm.LinearSVC(dual=False)
fitLinear = svmLinear.fit(trainX, trainY)
testLinear = svmLinear.score(testX, testY)

#poly, degree = 3
svmSquare = svm.SVC(kernel='poly', degree=2)
fitSquare = svmSquare.fit(trainX, trainY)
testSquare = svmSquare.score(testX, testY)

#poly, degree = 3
svmCube = svm.SVC(kernel='poly', degree=3)
fitCube = svmCube.fit(trainX, trainY)
testCube = svmCube.score(testX, testY)

#radial
svmRadial = svm.SVC(kernel='rbf')
fitRadial = svmRadial.fit(trainX, trainY)
testRadial = svmRadial.score(testX, testY)

#sigmoid
svmSigmoid = svm.SVC(kernel='sigmoid')
fitSigmoid = svmSigmoid.fit(trainX, trainY)
testSigmoid = svmSigmoid.score(testX, testY)




In [123]:
print(testLinear)
print(testSquare)
print(testCube)
print(testRadial)
print(testSigmoid)


0.8655367231638418
0.688135593220339
0.688135593220339
0.688135593220339
0.6689265536723163


The linear kernel clearly produces the best accuracy, so let's tune the regularization parameter C:

In [133]:
#regularization parameter C
svm1 = svm.LinearSVC(C=9, dual=False)
fit1 = svm1.fit(trainX, trainY)
test1 = svm1.score(testX, testY)
print(test1)

0.8632768361581921




The default value C=1 achieves the highest accuracy. <br>
Let's look at the model. 

In [137]:
#confusion matrix
confusion_matrix(testY, svm1.predict(testX))



array([[174, 102],
       [ 19, 590]], dtype=int64)

9.8% false negative<br>
14.7% false positive<br>
Model has higher tendency to underestimate students' likelihood to dropout, might call it "conservative."

In [158]:
trainY = np.asarray(trainY)
trainY.reshape(1,-1)
np.shape(trainY)

(3539,)

In [154]:
#plot confidence scores of training data
trainY = np.asarray(trainY)
trainY.reshape(-1,1)
trainConfidence = pd.DataFrame(data=[trainY, svm1.decision_function(trainY)], columns=['Labels', 'Dist'])
trainConfidence.pivot(columns='Labels', values='Dist').plot.hist()
plt.show()



ValueError: Expected 2D array, got 1D array instead:
array=[1 1 1 ... 1 1 0].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.