In [1]:
import pandas as pd
from random import seed
from random import randrange
from math import exp
from math import log
from math import floor

In [2]:
def cross_val_split(data_X,data_Y,test_size,seed_val):
	"""Randomly split samples and labels into a training part and a test part.

	Args:
		data_X: Object exposing ``tolist()``; one entry per sample.
		data_Y: Object exposing ``tolist()``; labels aligned with ``data_X``.
		test_size: Fraction of the samples to leave in the test part.
		seed_val: Value passed to ``random.seed`` so the split is repeatable.

	Returns:
		Tuple ``(train_x, train_y, test_x, test_y)`` of Python lists. The
		training part holds ``floor((1 - test_size) * n)`` samples.

	Note:
		Re-seeds the module-level ``random`` generator as a side effect.
	"""
	pool_x = data_X.tolist()
	pool_y = data_Y.tolist()
	seed(seed_val)
	wanted = floor((1 - test_size)*len(pool_x))
	picked_x, picked_y = [], []
	# Draw without replacement: every pick is removed from the pool, and
	# whatever is still in the pool afterwards becomes the test part.
	for _ in range(wanted):
		pick = randrange(len(pool_x))
		picked_x.append(pool_x.pop(pick))
		picked_y.append(pool_y.pop(pick))
	return picked_x, picked_y, pool_x, pool_y

In [4]:
def statistics(x):
	"""Return the ``[min, max]`` pair of every column of ``x``.

	Args:
		x: Iterable of equally long rows.

	Returns:
		List with one ``[minimum, maximum]`` entry per column, in column order.
	"""
	# zip(*x) transposes the rows so each tuple holds one column's values.
	return [[min(column), max(column)] for column in zip(*x)]

In [None]:
def scale(x, stat):
	"""Min-max scale every row of ``x`` in place.

	Each value becomes ``(value - min) / (max - min)`` using the column's
	``[min, max]`` pair from ``stat``.

	Args:
		x: Iterable of mutable rows; the rows are overwritten in place.
		stat: Per-column ``[min, max]`` pairs, e.g. the result of ``statistics``.

	Returns:
		None. The result is written back into ``x``.
	"""
	for row in x:
		for i in range(len(row)):
			low = stat[i][0]
			span = stat[i][1] - low
			if span == 0:
				# Constant column: min == max, so the ratio is undefined.
				# Map it to 0.0 instead of dividing by zero.
				row[i] = 0.0
			else:
				row[i] = (row[i] - low)/span

In [5]:
def one_vs_all_cols(s):
	"""Replace every label in ``s`` with a one-hot list, in place.

	Args:
		s: Mutable, indexable sequence of hashable, mutually sortable labels.

	Returns:
		Sorted list of the distinct labels. Position ``k`` in this list is the
		label whose one-hot vector has its 1 at index ``k``.
	"""
	labels = sorted(set(s))
	width = len(labels)
	for pos in range(len(s)):
		one_hot = [0]*width
		one_hot[labels.index(s[pos])] = 1
		s[pos] = one_hot
	return labels

In [6]:
def ThetaTX(Q,X):
	"""Return the dot product of weight vector ``Q`` and sample ``X``.

	Only the first ``len(Q)`` entries of ``X`` are used.

	Args:
		Q: Sequence of weights.
		X: Indexable sequence with at least ``len(Q)`` entries.

	Returns:
		The accumulated sum, starting from ``0.0``.
	"""
	total = 0.0
	# Accumulate term by term, left to right, to keep the rounding order fixed.
	for idx, weight in enumerate(Q):
		total += X[idx]*weight
	return total

In [7]:
def LinearSVM_cost0(z):
	"""Hinge-style cost for a negative (label 0) sample.

	Args:
		z: Raw score of the sample.

	Returns:
		``0`` when ``z`` is below the -1 margin, otherwise ``z + 1``.
	"""
	# Scores beyond the margin on the correct side cost nothing.
	return 0 if z < -1 else z + 1

In [9]:
def LinearSVM_cost1(z):
	"""Hinge-style cost for a positive (label 1) sample.

	Args:
		z: Raw score of the sample.

	Returns:
		``0`` when ``z`` is above the +1 margin, otherwise ``-z + 1``.
	"""
	# Scores beyond the margin on the correct side cost nothing.
	return 0 if z > 1 else -z + 1

In [8]:
def sigmoid(z):
	"""Logistic function ``1 / (1 + e^-z)``.

	Args:
		z: Real-valued input.

	Returns:
		Float in ``[0.0, 1.0]``.
	"""
	try:
		return 1.0/(1.0 + exp(-z))
	except OverflowError:
		# exp(-z) exceeds the float range for very negative z (roughly
		# z < -709). The true value there is below the smallest normal
		# float, so return the limit instead of aborting the computation.
		return 0.0

In [10]:
def cost(theta,c,x,y):
	"""Total hinge-style cost of class ``c``'s weights over all samples.

	Args:
		theta: List of weight vectors, one per class.
		c: Index of the class whose weights are evaluated.
		x: Sequence of samples.
		y: 0/1 targets for class ``c``, aligned with ``x``.

	Returns:
		Sum over the samples of
		``y*LinearSVM_cost1(z) + (1 - y)*LinearSVM_cost0(z)`` with
		``z = ThetaTX(theta[c], sample)``.
	"""
	total = 0.0
	for idx, sample in enumerate(x):
		z = ThetaTX(theta[c], sample)
		positive_part = y[idx]*LinearSVM_cost1(z)
		negative_part = (1 - y[idx])*LinearSVM_cost0(z)
		# A logistic (cross-entropy) variant would instead add
		# -(y*log(sigmoid(z)) + (1 - y)*log(1 - sigmoid(z))).
		total += positive_part + negative_part
	return total

In [11]:
def gradDescent(theta,c,x,y,learning_rate):
	"""Run one batch gradient-descent step on the weights of class ``c``.

	The step uses the logistic-regression gradient: for each weight ``Q`` the
	derivative is the sum over samples of
	``(sigmoid(ThetaTX(weights, x[i])) - y[i]) * x[i][Q]``.

	Args:
		theta: List of weight vectors, one per class; ``theta[c]`` is updated
			in place.
		c: Index of the class being trained.
		x: Sequence of samples.
		y: 0/1 targets for class ``c``, aligned with ``x``.
		learning_rate: Step size multiplied into the summed derivative.

	Returns:
		None. The result is written back into ``theta[c]``.
	"""
	# Copy the weights: a plain reference would see each component's update
	# immediately, so later components would be computed from a half-updated
	# vector instead of the weights at the start of the step.
	oldTheta = list(theta[c])
	# The prediction error of a sample does not depend on Q, so compute it
	# once per sample rather than once per (sample, weight) pair.
	errors = [sigmoid(ThetaTX(oldTheta, x[i])) - y[i] for i in range(len(x))]
	for Q in range(len(oldTheta)):
		derivative_sum = 0
		for i in range(len(x)):
			derivative_sum += errors[i]*x[i][Q]
		theta[c][Q] -= learning_rate*derivative_sum

In [12]:
def predict(data,theta):
	"""Predict a one-hot class vector for every row of ``data``.

	Args:
		data: Iterable of samples.
		theta: List of weight vectors, one per class.

	Returns:
		List with one entry per row: a list of ``len(theta)`` zeros with a
		single 1 at the class whose ``sigmoid(ThetaTX(theta[c], row))`` is
		largest. Ties go to the lowest class index.
	"""
	predictions = []
	for row in data:
		# Weights first, sample second, matching the ThetaTX(Q, X) signature.
		hypothesis = [sigmoid(ThetaTX(theta[c], row)) for c in range(len(theta))]
		multiclass_ans = [0]*len(theta)
		multiclass_ans[hypothesis.index(max(hypothesis))] = 1
		predictions.append(multiclass_ans)
	return predictions

In [13]:
def accuracy(predicted, actual):
	"""Fraction of positions where ``predicted`` equals ``actual``.

	Args:
		predicted: Sequence of predictions.
		actual: Sequence of expected values, at least as long as ``predicted``.

	Returns:
		Number of equal positions divided by ``len(predicted)``.
	"""
	total = len(predicted)
	hits = sum(1 for idx in range(total) if predicted[idx]==actual[idx])
	return hits/total

In [14]:
def cross_validation(x,y,test_data_size,validations,learning_rate,epoch):
	"""Train and score the one-vs-all model on repeated random splits.

	Each round calls ``cross_val_split`` with seed ``round number`` (1-based),
	trains one weight vector per class with ``gradDescent`` for ``epoch``
	iterations, predicts the held-out part and records its accuracy.

	Args:
		x: Samples, in the form accepted by ``cross_val_split``.
		y: One-hot label rows aligned with ``x`` (as written in place by
			``one_vs_all_cols``).
		test_data_size: Fraction of the data held out in every round.
		validations: Number of rounds to run.
		learning_rate: Step size forwarded to ``gradDescent``.
		epoch: Number of gradient-descent iterations per round.

	Returns:
		Mean accuracy over all rounds.

	Raises:
		ZeroDivisionError: If ``validations`` is 0 (no scores to average).
	"""
	print("No. of validation checks to be performed: ",validations)
	print("No. of Iterations per validation: ",epoch)
	accuracies = []
	for valid in range(validations):
		print("\nRunning Validation",valid+1)
		x_train, y_train, x_test, y_test = cross_val_split(x,y,test_data_size,valid+1)
		# Turn y_train into one 0/1 target column per class. The class count
		# comes from the width of the one-hot rows, so the function does not
		# depend on a module-level variable defined elsewhere.
		n_classes = len(y_train[0])
		classes = [[row[i] for row in y_train] for i in range(n_classes)]
		# Initialise theta (weights): one zero vector per class
		theta = [[0]*len(x_train[0]) for _ in range(n_classes)]
		# Train the model
		for i in range(epoch):
			for class_type in range(n_classes):
				gradDescent(theta,class_type,x_train,classes[class_type],learning_rate)
			# Integer form of "i is a multiple of epoch/10"; avoids float
			# modulo, which is unreliable when epoch/10 is not exact.
			if((i*10)%epoch==0):
				print("Processed", i*100/epoch,"%")
		print("Completed")
		# Predict on the held-out data
		y_pred = predict(x_test,theta)
		# Compute accuracy
		accuracies.append(accuracy(y_pred,y_test))
		print("Validation",valid+1,"accuracy score: ",accuracies[valid])
	return sum(accuracies)/len(accuracies)

In [16]:
# Driver cell: load the dataset, scale the features, one-hot encode the
# labels, run the repeated-split validation and print a summary report.
# Path/URL of the dataset to import
print("Running Forest Cover Detection using Linear SVM\n")
url = "dataset.csv"
dataset = pd.read_csv(url)
data = dataset.values
# Assign x and y: features and class labels
# NOTE(review): features are columns 0-25 and the label is column 27, so
# column 26 is not used — confirm this is intentional.
x = data[:,:26]
y = data[:,27]
# Feature scaling using the per-column minimum and maximum
# (scale() overwrites the rows of x in place and returns nothing;
# x is a slice of data, so presumably data is modified too — verify)
stats = statistics(x)
scale(x,stats)
# Convert the distinct labels into one-hot columns (y is rewritten in place)
# label_map can be used later to recover a predicted class label in its original form (string format)
label_map = one_vs_all_cols(y)
# Settings for splitting the dataset into training and test data, and for training
test_data_size = 0.2
learning_rate = 0.01
epoch = 500
validations = 5
final_score = cross_validation(x,y,test_data_size,validations,learning_rate,epoch)
# Print the final statistics
# NOTE(review): the report labels the model a linear SVM, but gradDescent
# uses a sigmoid-based gradient — confirm the label is accurate.
print("\nReporte")
print("Modelo usado: ","SVM lineal con descenso de gradiente")
print("Tasa de aprendizaje: ", learning_rate)
print("Nº de iteraciones: ",epoch)
print("Nº de características: ", len(x[0]))
print("Tamaño de los datos de entrenamiento: ", floor(len(x)*(1 - test_data_size)))
print("Tamaño de datos de prueba: ", len(x) - floor(len(x)*(1 - test_data_size)))
print("Nº de pruebas de validación realizadas: ", validations)
print("Precisión: ",final_score*100,"%")

Running Forest Cover Detection using Linear SVM

No. of validation checks to be performed:  5
No. of Iterations per validation:  500

Running Validation 1
Processed 0.0 %
Processed 10.0 %
Processed 20.0 %
Processed 30.0 %
Processed 40.0 %
Processed 50.0 %
Processed 60.0 %
Processed 70.0 %
Processed 80.0 %
Processed 90.0 %
Completed
Validation 1 accuracy score:  0.9142857142857143

Running Validation 2
Processed 0.0 %
Processed 10.0 %
Processed 20.0 %
Processed 30.0 %
Processed 40.0 %
Processed 50.0 %
Processed 60.0 %
Processed 70.0 %
Processed 80.0 %
Processed 90.0 %
Completed
Validation 2 accuracy score:  0.8857142857142857

Running Validation 3
Processed 0.0 %
Processed 10.0 %
Processed 20.0 %
Processed 30.0 %
Processed 40.0 %
Processed 50.0 %
Processed 60.0 %
Processed 70.0 %
Processed 80.0 %
Processed 90.0 %
Completed
Validation 3 accuracy score:  0.8857142857142857

Running Validation 4
Processed 0.0 %
Processed 10.0 %
Processed 20.0 %
Processed 30.0 %
Processed 40.0 %
Processed 5