In [2]:
import numpy as np

In [29]:
# Toy problem for the dense logistic-regression helpers:
# four examples (rows of X) with five binary features each.
X = np.array([
    [0, 0, 1, 0, 1],
    [0, 1, 0, 0, 0],
    [0, 1, 1, 0, 0],
    [1, 0, 0, 1, 0],
])
# Labels and the starting parameters are stored as column vectors.
y = np.array([[0], [1], [1], [0]])
theta = np.array([[1.5], [2.0], [1.0], [2.0], [3.0]])

In [30]:
def log_likelihood_function(theta,X,y):
    """Return the average negative log-likelihood of a logistic model.

    Computes ``(1/n) * sum_i [-y_i * z_i + log(1 + exp(z_i))]`` where
    ``z = X . theta`` and ``n = len(y)``. Despite the function's name, the
    value returned is the *negative* log-likelihood (the leading term is
    ``-y_i * z_i``), i.e. the quantity gradient descent minimises.

    Parameters
    ----------
    theta : array-like, column of weights such that ``np.dot(X, theta)`` is valid.
    X : array-like, one row per example.
    y : array-like of 0/1 labels with one entry per row of ``X``.

    Returns
    -------
    float
        The mean negative log-likelihood over the ``len(y)`` examples.
    """
    lr_result = np.dot(X, theta)
    # logaddexp(0, z) == log(1 + exp(z)) but does not overflow for large z.
    # Summing it directly also removes the former hard-coded 4-row ones vector,
    # so the function works for any number of examples.
    softplus_total = np.sum(np.logaddexp(0, lr_result))
    return (-np.vdot(y, lr_result) + softplus_total) / len(y)

In [31]:
def gradient_function(theta,X,y):
    """Return the gradient of the average negative log-likelihood.

    Computes ``(X.T . sigmoid(X . theta) - X.T . y) / len(y)``.

    Parameters
    ----------
    theta : array-like, column of weights such that ``np.dot(X, theta)`` is valid.
    X : array-like, one row per example.
    y : array-like of 0/1 labels, shaped like ``np.dot(X, theta)``.

    Returns
    -------
    numpy.ndarray
        Gradient with the same shape as ``np.dot(X.T, y)``.
    """
    lr_result = np.dot(X, theta)
    # sigmoid(z) = 1 / (1 + exp(-z)) = exp(-log(1 + exp(-z))).
    # Writing it with logaddexp avoids the inf/inf = nan that
    # exp(z) / (1 + exp(z)) produces once exp(z) overflows.
    prob = np.exp(-np.logaddexp(0, -lr_result))
    return (-np.dot(X.T, y) + np.dot(X.T, prob)) / len(y)

In [32]:
# Evaluate the gradient at the initial theta on the toy data; the cell output
# displays the resulting (5, 1) array.
gradient_function(theta,X,y)

array([[ 0.24267194],
       [-0.0416572 ],
       [ 0.23364698],
       [ 0.24267194],
       [ 0.24550345]])

In [5]:
def batch_gradient_descent(X,y,alpha,theta=None,tol=1e-5,max_iter=None):
    """Minimise the logistic loss with full-batch gradient descent.

    Repeats ``theta <- theta - alpha * gradient_function(theta, X, y)`` until
    every component of the gradient has absolute value ``<= tol``.

    Parameters
    ----------
    X : array-like, one row per example.
    y : array-like of 0/1 labels as a column vector.
    alpha : float, step size.
    theta : array-like, optional
        Starting point as an ``(n_features, 1)`` column. When omitted, the
        original five-feature start ``[1.5, 2, 1, 2, 3]`` is used, so existing
        three-argument calls behave as before.
    tol : float, optional
        Convergence threshold on each gradient component (default ``1e-5``).
    max_iter : int or None, optional
        Upper bound on the number of updates; ``None`` (default) means no bound.

    Returns
    -------
    numpy.ndarray
        The final parameter column vector.

    Raises
    ------
    FloatingPointError
        If the gradient becomes NaN or infinite; the loop could otherwise
        never satisfy the convergence test and would spin forever.
    """
    if theta is None:
        theta = np.array([1.5,2,1,2,3]).reshape(5,1)
    else:
        theta = np.asarray(theta, dtype=float)

    gradient = gradient_function(theta,X,y)
    iterations = 0
    while not np.all(np.abs(gradient) <= tol):
        if max_iter is not None and iterations >= max_iter:
            break
        if not np.all(np.isfinite(gradient)):
            raise FloatingPointError(
                "gradient became non-finite during gradient descent; "
                "try a smaller alpha")
        theta = theta - alpha*gradient
        gradient = gradient_function(theta,X,y)
        iterations += 1
    return theta

In [1]:
from utils import WordMapper
from feature import Preprocessor

# Create the word mapper and load its dictionary from 'dict.txt'.
word_mapper = WordMapper()
word_mapper.load_dict('dict.txt')

# Two preprocessors sharing the same mapper. The first argument (1 or 2) is
# presumably the feature-model selector -- TODO confirm in feature.py.
preprocessor_model1 = Preprocessor(1,word_mapper)
preprocessor_model2 = Preprocessor(2,word_mapper)

In [2]:
# Format the small train/test/validation splits with both preprocessors.
# Each job is (preprocessor, input path, output path); jobs run in list order.
_format_jobs = [
    (preprocessor_model1, 'smalldata/train_data.tsv', 'eric_smalloutput/model1_formatted_train_data.tsv'),
    (preprocessor_model2, 'smalldata/train_data.tsv', 'eric_smalloutput/model2_formatted_train_data.tsv'),
    (preprocessor_model1, 'smalldata/test_data.tsv', 'eric_smalloutput/model1_formatted_test_data.tsv'),
    (preprocessor_model2, 'smalldata/test_data.tsv', 'eric_smalloutput/model2_formatted_test_data.tsv'),
    (preprocessor_model1, 'smalldata/valid_data.tsv', 'eric_smalloutput/model1_formatted_validation_data.tsv'),
    (preprocessor_model2, 'smalldata/valid_data.tsv', 'eric_smalloutput/model2_formatted_validation_data.tsv'),
]
for _preprocessor, _src_path, _dst_path in _format_jobs:
    _preprocessor.process(_src_path, _dst_path)

In [7]:
import numpy as np
# Sparse toy data: labels are strings, and each example is a tab-separated
# list of "index:value" tokens.
y = np.array([['0'], ['1'], ['1'], ['0']])
X = np.array([
    '0:1\t2:1',
    '0:1\t1:1\t2:1\t3:1\t4:1',
    '1:1\t3:1\t4:1',
    '4:1',
])
# Six parameters as a column vector, and a zeroed gradient buffer of the same shape.
theta = np.array([[1.5], [2.0], [1.0], [2.0], [3.0], [2.0]])
gradient = np.zeros((6, 1))

In [8]:
def negative_log_likelihood_function(theta,X,y):
    """Return the average negative log-likelihood over sparse examples.

    For every example ``i`` the term ``-label * z + log(1 + exp(z))`` is
    accumulated, where ``z = get_lm_result(theta, X[i].split('\\t'))``; the
    total is divided by ``len(y)``.

    Parameters
    ----------
    theta : parameter column vector passed through to ``get_lm_result``.
    X : sequence of strings, each a tab-separated list of feature tokens.
    y : sequence of labels convertible to ``int``; entries may be scalars or
        length-1 arrays (as when ``y`` has shape ``(n, 1)``).

    Returns
    -------
    float
        Mean negative log-likelihood.
    """
    num_of_instances = len(y)
    result = 0
    for i in range(num_of_instances):
        # Extract the scalar explicitly: calling int() on a length-1 array is
        # deprecated in recent NumPy releases.
        label = int(np.ravel(y[i])[0])
        x = X[i].split('\t')
        lm_result = get_lm_result(theta,x)
        # logaddexp(0, z) == log(1 + exp(z)) without overflow for large z.
        result += -label*lm_result + np.logaddexp(0, lm_result)
    return result/num_of_instances


def get_lm_result(theta,x):
    """Return the linear-model score of one sparse example.

    The score is ``sum(value * theta[idx][0])`` over the ``"idx:value"``
    tokens in ``x``, plus the bias stored in the last row of ``theta``.

    Parameters
    ----------
    theta : indexable column of parameters; ``theta[k][0]`` is the weight of
        feature ``k`` and ``theta[-1][0]`` is the bias.
    x : iterable of ``"idx:value"`` strings with integer index and value.
        Empty strings are ignored, so an example with no active features
        (``''.split('\\t') == ['']``) scores just the bias.

    Returns
    -------
    The weighted sum plus bias.
    """
    result = 0
    bias = theta[-1][0]
    for feature in x:
        if not feature:
            # Empty token: contributes nothing instead of raising on int('').
            continue
        parts = feature.split(':')
        idx = int(parts[0])
        value = int(parts[1])
        result += value*theta[idx][0]
    result += bias
    return result

def update_gradient(theta,gradient,x,label):
    """Fill ``gradient`` with the per-example logistic-loss gradient.

    With ``p = sigmoid(get_lm_result(theta, x))`` the gradient entry for each
    feature token ``"idx:value"`` in ``x`` is ``value * (p - label)``, the
    bias entry (last row) is ``p - label``, and every other entry is zero.

    Parameters
    ----------
    theta : parameter column vector passed through to ``get_lm_result``.
    gradient : numpy array buffer shaped like ``theta``; overwritten in place.
    x : iterable of ``"idx:value"`` strings for one example.
    label : int, 0 or 1.

    Returns
    -------
    numpy.ndarray
        The same ``gradient`` buffer, now holding this example's gradient.
    """
    lm = get_lm_result(theta,x)
    # Stable sigmoid: exp(-log(1 + exp(-lm))) never evaluates inf/inf.
    residual = np.exp(-np.logaddexp(0, -lm)) - label

    # Reset the buffer first. Otherwise entries written for a previous example
    # survive and features absent from this example receive a stale,
    # non-zero gradient.
    gradient[:] = 0
    for feature in x:
        if not feature:
            # Skip empty tokens (e.g. from splitting an empty feature string).
            continue
        parts = feature.split(':')
        idx = int(parts[0])
        value = int(parts[1])
        gradient[idx] = value*residual
    gradient[-1] = residual
    return gradient

def stochastic_gradient_descent(theta,gradient,X,y,num_epoch,alpha=0.1):
    """Run stochastic gradient descent over sparse examples.

    For ``num_epoch`` passes over the data, each example in order produces a
    gradient via ``update_gradient`` and ``theta`` is moved by
    ``-alpha * gradient``.

    Parameters
    ----------
    theta : initial parameter column vector (not modified in place).
    gradient : numpy array work buffer shaped like ``theta``; it is
        overwritten in place.
    X : sequence of strings, each a tab-separated list of ``"idx:value"`` tokens.
    y : sequence of labels convertible to ``int``; entries may be scalars or
        length-1 arrays.
    num_epoch : number of passes over the data.
    alpha : float, optional
        Learning rate. Defaults to ``0.1``, the value that used to be
        hard-coded, so existing calls are unaffected.

    Returns
    -------
    The parameter vector after the final update.
    """
    count = 0
    while count < num_epoch:
        count += 1
        for i in range(len(y)):
            # Extract the scalar explicitly: int() on a length-1 array is
            # deprecated in recent NumPy releases.
            label = int(np.ravel(y[i])[0])
            x = X[i].split('\t')
            # Start every example from a zeroed buffer so that features not
            # present in this example get no update from earlier examples.
            gradient[:] = 0
            gradient = update_gradient(theta,gradient,x,label)
            theta = theta - alpha * gradient
    return theta

In [None]:
# NOTE(review): the lines below are shell commands, not Python. To run them
# from a notebook cell they need a leading `!` (or a `%%bash` cell).
# The trailing 1 / 2 on the feature.py commands presumably selects the feature
# model, and the trailing 30 / 200 on the lr.py commands is presumably the
# number of training epochs -- TODO confirm against feature.py and lr.py.
# --- small dataset: smalldata/ -> eric_smalloutput/ ---
## feature.py model1
python feature.py smalldata/train_data.tsv smalldata/valid_data.tsv smalldata/test_data.tsv dict.txt eric_smalloutput/model1_formatted_train.tsv eric_smalloutput/model1_formatted_valid.tsv eric_smalloutput/model1_formatted_test.tsv 1
## feature.py model2
python feature.py smalldata/train_data.tsv smalldata/valid_data.tsv smalldata/test_data.tsv dict.txt eric_smalloutput/model2_formatted_train.tsv eric_smalloutput/model2_formatted_valid.tsv eric_smalloutput/model2_formatted_test.tsv 2
## lr.py model1
python lr.py eric_smalloutput/model1_formatted_train.tsv eric_smalloutput/model1_formatted_valid.tsv eric_smalloutput/model1_formatted_test.tsv dict.txt eric_smalloutput/model1_train_out.labels eric_smalloutput/model1_test_out.labels eric_smalloutput/model1_metrics_out.txt 30
## lr.py model2
python lr.py eric_smalloutput/model2_formatted_train.tsv eric_smalloutput/model2_formatted_valid.tsv eric_smalloutput/model2_formatted_test.tsv dict.txt eric_smalloutput/model2_train_out.labels eric_smalloutput/model2_test_out.labels eric_smalloutput/model2_metrics_out.txt 30

# --- large dataset: largedata/ -> eric_largeoutput/ ---
## feature.py model1
python feature.py largedata/train_data.tsv largedata/valid_data.tsv largedata/test_data.tsv dict.txt eric_largeoutput/model1_formatted_train.tsv eric_largeoutput/model1_formatted_valid.tsv eric_largeoutput/model1_formatted_test.tsv 1
## feature.py model2
python feature.py largedata/train_data.tsv largedata/valid_data.tsv largedata/test_data.tsv dict.txt eric_largeoutput/model2_formatted_train.tsv eric_largeoutput/model2_formatted_valid.tsv eric_largeoutput/model2_formatted_test.tsv 2
## lr.py model1
python lr.py eric_largeoutput/model1_formatted_train.tsv eric_largeoutput/model1_formatted_valid.tsv eric_largeoutput/model1_formatted_test.tsv dict.txt eric_largeoutput/model1_train_out.labels eric_largeoutput/model1_test_out.labels eric_largeoutput/model1_metrics_out.txt 200
## lr.py model2
python lr.py eric_largeoutput/model2_formatted_train.tsv eric_largeoutput/model2_formatted_valid.tsv eric_largeoutput/model2_formatted_test.tsv dict.txt eric_largeoutput/model2_train_out.labels eric_largeoutput/model2_test_out.labels eric_largeoutput/model2_metrics_out.txt 200

In [3]:
# Integer axis 0, 1, ..., 200 (201 values in total).
epochs = np.arange(0, 201)

In [4]:
# Display the array to inspect its contents.
epochs

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [5]:
# Two dicts holding the same key/value pairs, inserted in a different order.
cg = dict(s=1, d=2, f=3)
cf = dict(d=2, f=3, s=1)

In [6]:
# Dict equality compares key/value pairs only; insertion order does not matter.
cg == cf

True