In [1]:
import numpy as np
import h5py
from sklearn.linear_model import LinearRegression
from ranking import *
from sklearn.svm import SVR



In [2]:
def MSE(a,b):
    mse =0.0
    print(a.shape, b.shape)
    if a.shape != b.shape:
        print("Size of vector mixmatch - cannot calculate Mean Squared error")
    for i in range(0,len(a)):
        mse += np.linalg.norm(b[i] - a[i])**2
    mse = 1.0/len(a)*mse
    return mse

f = h5py.File('l2_normalized_semantic_SVM_full_data_with_val_291labels_no_zero.mat' )

In [3]:
def F1_score(tags_pred, tags_actual,k1=3, k2=5):
    k1_count=0.0
    k2_count=0.0
    for i in range(0,k1):
        if tags_actual[tags_pred[i]] == 1:
            k1_count += 1

    for i in range(0,k2):
        if tags_actual[tags_pred[i]] == 1:
            k2_count += 1

    num_tags = 1.0*(len(tag_word_vectors) + sum(tags_actual ))/2
    
    k1_recall = 1.0*k1_count/num_tags
    k2_recall = 1.0*k2_count/num_tags 
    k1_precision = k1_count/k1
    k2_precision = k2_count/k2
    
    tmp1 = 2.0*k1_precision*k1_recall/(k1_precision + k1_recall)
    tmp2 = 2.0*k2_precision*k2_recall/(k2_precision+k2_recall)
    if tmp1 >= 0 and tmp2 >= 0:
        return [tmp1,tmp2]
    elif tmp1 >= 0:
        return [tmp1,0]
    elif tmp2 >= 0:
        return [0,tmp2]
    else:
        return [0,0]


In [4]:


print("-----------------------------------------\nLoading Data")
n_all = 1000

training_data = np.transpose(f["prepared_training_data"])
training_label = np.transpose(f["prepared_training_label"])
valid_data = np.transpose(f["prepared_val_data"])
valid_label = np.transpose(f["prepared_val_label"])
testing_data = np.transpose(f["prepared_testing_data"])
testing_label = np.transpose(f["prepared_testing_label"])

n_training = len(training_data)
n_valid = len(valid_data)
n_testing = len(testing_data)

tag_word_vectors = np.transpose(h5py.File('291labels.mat')["semantic_mat"])

print("Done")

-----------------------------------------
Loading Data
Done


In [5]:
print("Ranking SVM for Training Data")
r=RankSVM()
w_list=np.zeros([n_training,300])
for i in range(0,len(training_data)):
    r.fit(tag_word_vectors,training_label[i])
    w_list[i] = r.coef_
print("Done")

Ranking SVM for Training Data
Done


In [6]:
print("Fitting Linear Regression model")
lin_reg = LinearRegression(normalize=True)
lin_reg.fit(training_data, w_list)
print(lin_reg.score(training_data, w_list))
A = lin_reg.coef_
print(w_list.shape," = ", A.shape, training_data.shape)
print("Done")

Fitting Linear Regression model
0.46558391979
((15015, 300), ' = ', (300, 4096), (15015, 4096))
Done


In [7]:
print("Fitting Support Vector Kernelized Regression model")
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
svr_lin = SVR(kernel='linear', C=1e3)
svr_poly = SVR(kernel='poly', C=1e3, degree=2)
#y_rbf = svr_rbf.fit(training_data, w_list)
y_lin = svr_lin.fit(training_data, w_list)
y_poly = svr_poly.fit(training_data, w_list)
print("Done")

Fitting Support Vector Kernelized Regression model


ValueError: bad input shape (15015, 300)

In [50]:
print("Accuracy for Training Data")
r=RankSVM()
avg1 = 0
avg2 = 0

for j in range(0,n_training):
    w = np.dot(training_data[j], np.transpose(A))
    tags_pred_score = np.dot(w,np.transpose(tag_word_vectors))

    tag_pred_ranked = [i[0] for i in sorted(enumerate(tags_pred_score), key=lambda x:x[1])]
    tag_pred_ranked.reverse()
    [tmp1, tmp2] = F1_score(tag_pred_ranked, training_label[j],3,5)
    
    avg1 += tmp1
    avg2 += tmp2

print("Averages : " + str(avg1/n_training) + " : "+ str(avg2/n_training))

Ranking SVM for Training Data




Averages : 0.506236493471 : 0.543220654681


In [49]:
print("Accuracy for Testing Data")
avg1 =0 
avg2 = 0
for j in range(0,n_testing):
    w = np.dot(testing_data[j], np.transpose(A))
    tags_pred_score = np.dot(w,np.transpose(tag_word_vectors)) 
    tag_pred_ranked = [i[0] for i in sorted(enumerate(tags_pred_score), key=lambda x:x[1])]
    tag_pred_ranked.reverse()
    
    [tmp1, tmp2] = F1_score(tag_pred_ranked, testing_label[j],3,5)
    
    avg1 += tmp1
    avg2 += tmp2

print("Averages : " + str(avg1/n_testing) + " : "+ str(avg2/n_testing))

Ranking SVM for Training Data




Averages : 0.283574736374 : 0.301731812447


In [48]:
print("Accuracy for Validation Data")
avg1 =0 
avg2 = 0
for j in range(0,n_valid):
    w = np.dot(valid_data[j], np.transpose(A))
    tags_pred_score = np.dot(w,np.transpose(tag_word_vectors)) 
    tag_pred_ranked = [i[0] for i in sorted(enumerate(tags_pred_score), key=lambda x:x[1])]
    tag_pred_ranked.reverse()
    
    [tmp1, tmp2] = F1_score(tag_pred_ranked, valid_label[j],3,5)
    
    avg1 += tmp1
    avg2 += tmp2

print("Averages : " + str(avg1/n_valid) + " : "+ str(avg2/n_valid))

Ranking SVM for Training Data




Averages : 0.277139618367 : 0.29699567689


In [None]:
print("Accuracy for Testing Data with Kernelized SVR rbf")
avg1 =0 
avg2 = 0
for j in range(0,n_testing):
    w = svr_rbf(testing_data[j])
    tags_pred_score = np.dot(w,np.transpose(tag_word_vectors)) 
    tag_pred_ranked = [i[0] for i in sorted(enumerate(tags_pred_score), key=lambda x:x[1])]
    tag_pred_ranked.reverse()
    
    [tmp1, tmp2] = F1_score(tag_pred_ranked, testing_label[j],3,5)
    
    avg1 += tmp1
    avg2 += tmp2

print("Averages : " + str(avg1/n_testing) + " : "+ str(avg2/n_testing))