In [1]:
import numpy as np
import scipy.io as io
from random import randint
from scipy.io import loadmat
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [2]:
def load_data(filename):
    data = loadmat(filename)
    X = data['data']
    Y = data['info'][0]
    X_train = np.zeros((len(X),X[0,0].shape[1]))
    for i in range(len(X)):
        X_train[i,:]=X[i,0] 
        
    # build word-id dictionary
    with open('dictionary.txt') as f:
        words = [l.strip() for l in f if l.strip()]
    Y_train = np.zeros((360,2))
    
    # preprocess Y train
    for i in range(360):
        Y_train[i,0] = words.index(Y[i]['word'][0])+1 # get the word id from the dictionary
        Y_train[i,1] = randint(1, 60)
        
    return X_train,Y_train,data

In [4]:
Yp = 218
ntrain = 2700
ntest = 540
print('load data...') 
X1,wid1,data1 = load_data('data-science-P1.mat')
X2,wid2,data2 = load_data('data-science-P2.mat')
X3,wid3,data3 = load_data('data-science-P3.mat')
X4,wid4,data4 = load_data('data-science-P4.mat')
X5,wid5,data5 = load_data('data-science-P5.mat')
X6,wid6,data6 = load_data('data-science-P6.mat')
X7,wid7,data7 = load_data('data-science-P7.mat')
X8,wid8,data8 = load_data('data-science-P8.mat')
X9,wid9,data9 = load_data('data-science-P9.mat')

dim = min(X1.shape[1],X2.shape[1],X3.shape[1],X4.shape[1],X5.shape[1],X6.shape[1],X7.shape[1],X8.shape[1],X9.shape[1])
X = np.vstack((X1[:,:dim],X2[:,:dim],X3[:,:dim],X4[:,:dim],X5[:,:dim],X6[:,:dim],X7[:,:dim],X8[:,:dim],X9[:,:dim]))
wid = np.vstack((wid1,wid2,wid3,wid4,wid5,wid6,wid7,wid8,wid9))
data = np.vstack((data1,data2,data3,data4,data5,data6,data7,data8,data9))

from sklearn.decomposition import PCA
pca = PCA(n_components=500)
X = pca.fit_transform(X)

word_feature=io.mmread('word_feature_centered.mtx')

X_train,X_test,wid_train,wid_test = train_test_split(X,wid,test_size=1/6,random_state=42)

# separate data set
X_train = preprocessing.scale(X_train)
X_test = preprocessing.scale(X_test)

# from word id to Y_train (300,1) and Y_test (60,2)
Y_train = np.zeros((ntrain,Yp))
Y_test = np.zeros((ntest,Yp))
Y2_test = np.zeros((ntest,Yp))
for i in range(ntrain):
    for j in range(Yp):
        Y_train[i][j]=word_feature[int(wid_train[i,0])-1][j]
for i in range(ntest):
    for j in range(Yp):
        Y_test[i][j]=word_feature[int(wid_test[i,0])-1][j]
        Y2_test[i][j]=word_feature[int(wid_test[i,1])-1][j]

print('shape of X train:',X_train.shape)
print('shape of X test:',X_test.shape)
print('shape of Y train:',Y_train.shape)
print('shape of Y test:',Y_test.shape)

load data...
shape of X train: (2700, 500)
shape of X test: (540, 500)
shape of Y train: (2700, 218)
shape of Y test: (540, 218)


In [18]:
class LassoShooting:
    def __init__(self, X, Y, lamda):
        self.X = X
        self.Y = Y
        self.n = X.shape[0]
        self.p = X.shape[1]
        self.lamda = lamda
        self.w = np.zeros(self.p)
 
    def shoot(self):
        stop=False
        X = self.X
        w = self.w
        t=0
        epsilon=1e-4
        max_iter=10
        while not stop and t<max_iter:
            w_pre = w.copy()
            for j in range(self.p):
                #j = randint(0,self.p-1)
                w_nonj = np.delete(w,j)
                X_nonj = np.delete(X,j,1)  # delete k_th column
                w_j = w[j]*np.ones((self.n,1))
                r_j = X_nonj.dot(w_nonj)-self.Y
                c_j = np.sum(np.multiply(X[:,j],r_j))
                a_j = np.sum(X[:,j]**2)
                if c_j > self.lamda:
                    w[j] = (-c_j+self.lamda)/a_j
                elif c_j < (-self.lamda):
                    w[j] = (-c_j-self.lamda)/a_j
                else:
                    w[j] = 0  
            if max(abs(w-w_pre))<=epsilon:
                stop=True
            t+=1    
        self.w =w         
    
    def predict(self,X,w):
        return X.dot(w)

In [19]:
def L1(pred,y_true,y_wrong):
    score = 0
    for i in range(len(pred)):
        d1 = np.sum(np.abs(pred[i]-y_true[i]))
        d2 = np.sum(np.abs(pred[i]-y_wrong[i]))
        if d1<d2:
            score+=1
        elif d1==d2:
            score+=0.5
    return score
    
def L2(pred,y_true,y_wrong):
    score = 0
    for i in range(len(pred)):
        d1 = np.sum((pred[i]-y_true[i])**2)
        d2 = np.sum((pred[i]-y_wrong[i])**2)
        if d1<d2:
            score+=1
        elif d1==d2:
            score+=0.5
    return score

train the model

In [20]:
weight = np.zeros((X_train.shape[1],Y_train.shape[1]))
for j in range(Yp):
    print(j)
    ls = LassoShooting(X_train,Y_train[:,j],0.15)
    ls.shoot()
    weight[:,j]=ls.w
y_hat = ls.predict(X_test,weight)

print test results

In [22]:
print('---------All Persons--------')
print('Accuracy for L1:',L1(y_hat,Y_test,Y2_test)/len(Y_test))
print('Accuracy for L2:',L2(y_hat,Y_test,Y2_test)/len(Y_test))

---------All Persons--------
Accuracy for L1: 0.649074074074074
Accuracy for L2: 0.6509259259259259
