In [86]:
# imports
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.svm import SVC

In [80]:
# import the uci dataset

uci_train = []
x_train_data = []
y_train_data = []
uci_dir = 'uci_dataset'
training_data_file = 'shuttle.trn'

# load the training data
with open('/'.join([os.getcwd(), uci_dir, training_data_file])) as f:
    
    for line in f:
        uci.append(line) # for checking if split of data is correct
        split = line.strip().split(' ')
        x_values = [ float(x) for x in split[:-1]]
        y_values = int(split[-1])
        
        x_train_data.append(x_values)
        y_train_data.append([y_values])
        
print('x_train_data length: ', len(x_train_data))
print('y_train_data length: ', len(y_train_data))
# 
# print('uci[0]: %s' % uci[-1])
# print('x_data[0]: %s' % x_data[-1])
# print('y_data[0]: %s' % y_data[-1])

np_x_data = np.array([ np.array(x) for x in x_data])
np_y_data = np.array([ np.array(y) for y in y_data])

# print('np_x_data.shape: %s' % str(np_x_data.shape))
# print('np_y_data.shape: %s' % str(np_y_data.shape))

uci_test = []
x_test_data = []
y_test_data = []
test_data_file = 'shuttle.tst'

# load the training data
with open('/'.join([os.getcwd(), uci_dir, test_data_file])) as f:
    
    for line in f:
        uci_test.append(line) # for checking if split of data is correct
        split = line.strip().split(' ')
        x_values = [ float(x) for x in split[:-1]]
        y_values = int(split[-1])
        
        x_test_data.append(x_values)
        y_test_data.append([y_values])
        
print('x_test_data length: ', len(x_test_data))
print('y_test_data length: ', len(y_test_data))

print('total dataset: %d' % (len(x_train_data) + len(x_test_data)))

x_train_data length:  43500
y_train_data length:  43500
x_test_data length:  14500
y_test_data length:  14500
total dataset: 58000


In [84]:
# count the number of data for each classes - training dataset
total_num_data = len(x_train_data)
train_bins = [[] for _ in range(7)]
for x, y in zip(x_train_data, y_train_data):
    train_bins[y[0] - 1].append(x)
    
print('Training dataset:')
for idx, category in enumerate(train_bins):
    print('class %d: count: %d ratio: %.2f%%' % (idx + 1, len(category), (len(category)/float(total_num_data)) * 100) )

Training dataset:
class 1: count: 34108 ratio: 78.41%
class 2: count: 37 ratio: 0.09%
class 3: count: 132 ratio: 0.30%
class 4: count: 6748 ratio: 15.51%
class 5: count: 2458 ratio: 5.65%
class 6: count: 6 ratio: 0.01%
class 7: count: 11 ratio: 0.03%


In [85]:
# count the number of data for each classes - training dataset
total_num_data = len(x_test_data)
test_bins = [[] for _ in range(7)]
for x, y in zip(x_test_data, y_test_data):
    test_bins[y[0] - 1].append(x)
    
print('Test dataset:')
for idx, category in enumerate(test_bins):
    print('class %d: count: %d ratio: %.2f%%' % (idx + 1, len(category), (len(category)/float(total_num_data)) * 100) )

Test dataset:
class 1: count: 11478 ratio: 79.16%
class 2: count: 13 ratio: 0.09%
class 3: count: 39 ratio: 0.27%
class 4: count: 2155 ratio: 14.86%
class 5: count: 809 ratio: 5.58%
class 6: count: 4 ratio: 0.03%
class 7: count: 2 ratio: 0.01%


In [87]:
# Use an svm with rbf kernel function
svm = SVC(C=0.1, kernel='rbf', gamma=1)
svm.fit(x_train_data, y_train_data)

  y = column_or_1d(y, warn=True)


SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [88]:
# create predictions
svm_predictions = svm.predict(x_test_data)

In [101]:
# accuracy
acc = [1 if y_pred == y_true[0] else 0 for y_pred, y_true in zip(svm_predictions, y_test_data)]
accuracy = np.sum(acc) / float(len(acc))
print('Accuracy: %f' % accuracy)

Accuracy: 0.791586


In [102]:
# TODO: Do cross validation
# Possible approaches
# 1. k-fold cross validation
# 2. Try different algorithm
# 3. Under/Oversampling method
# 4. Penalized Models (penalized SVM)
# 5. Look into Anomaly Detection

# Cross Validation techniques (Non-exhaustive methods)
# 1. Holdout Method
# 2. k-fold cross validation
# 3. Stratified k-fold cross validation
#    - each fold contains approximately the same 
#    percentage of samples of each target class
#    in the case of prediction problems, the mean response value is approximately equal in all folds
# (Exhaustive methods)
# 4. Leave-P-Out Cross Validation

In [111]:
# Approach 1:
# reduce class 1's number
half_class_one = train_bins[0][:len(train_bins[0])//5]
print(len(half_class_one))
hold_out_train_bin = []
hold_out_train_bin.append(half_class_one)
for bin_class in train_bins[1:]:
    hold_out_train_bin.append(bin_class)

for idx, bin_class in enumerate(hold_out_train_bin):
    print('class %d: %d' % (idx + 1, len(bin_class)))
    
holdout_x_train = []
holdout_y_train = []

6821
class 1: 6821
class 2: 37
class 3: 132
class 4: 6748
class 5: 2458
class 6: 6
class 7: 11


In [None]:
svm_holdout = SVC(C=1, kernel='rbf')
svm.