In [3]:
from functions import mle , gradient, init_graph, matching_func, regularized_vector, sigmoid, noise_generation

import random, csv, time, os, pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy import optimize
from scipy.stats import bernoulli
from scipy.io import savemat

%matplotlib inline

# Functions
 * **mle(w, pairs)**
 
 Calculate the MLE of w.
 
 `optimize.minimize` only optimizes over the first argument `w`, so the pair information is supplied as the extra argument `pairs` (passed with `args=(pairs,)`).
 
 
 * **gradient(w,pairs)**
 
 Calculate the gradient of MLE. 
 
 Returns a numpy array with value of gradient.
 
 
 * **hessian(w, pairs)**
 
 Calculate the hessian of MLE. 
 
 Returns a numpy matrix with value of hessian.
 
 
 * **compare_rank(video_score, results, verbose=False, hist=False)**
 
 Compute the true rankings and rankings from results. 
 
 `verbose` will output with columns: Result Order, True Order, Result Score, True Score
    
 `hist` will output a histogram


 * **init_graph(num_nodes, num_edges, reference_matrix=None)**
 
 Marc's method of generating pairs
 
 
 * **matching_func(param, video_score, w_hat)**

 Function used to calculate L2 norm of v and w_star. Used to find a and b.
 Here param is [a, b]

    
 * **regularized_vector(video_score,w_hat)**

 Function to generate vector v after using matching_func to find optimal a and b
 
 
 
 * **performance_isabelle(video_score, video_num, w_hat)**

 Performance evaluation using method proposed by Isabelle.
 
 
 * **performance_nihar(video_score, video_num, w_hat)**

 Performance evaluation using method proposed by Nihar.
 
 
 * **noise_generation(video_score, pair)**

 Generate a decision of whether to flip a pair: a flip probability is derived from a normal distribution, and a Bernoulli draw with that probability decides whether to flip.
 
 
 * **sigmoid(x)**
 
 

# Data Generation
The cell below generates data for $video\_num$ videos, calculates all possible pairs and stores them in $pairs\_truth$; $total\_pairs$ is the number of pairs.

In [5]:
# Generate a pair graph over 10000 nodes with 200000 edges and persist it.
test_pairs = init_graph(10000, 200000)
# Use a context manager so the pickle file is flushed and closed
# deterministically instead of relying on garbage collection.
with open('AMT.p', "wb") as amt_file:
    pickle.dump(dict({'Pairs':test_pairs}), amt_file)

In [26]:
video_nums = [5000,10000]
# num_edges_pct = [0.01,0.1,1,10,100]

# video_nums = [10000]
num_edges_pct = [0.1, 0.5, 1, 2.17, 3]

# Results are pickled under result/; create it up front so a long run
# cannot fail at the very end because the directory is missing.
if not os.path.isdir('result'):
    os.makedirs('result')

for iteration in range(1):
    for video_num in video_nums:
        for num_edge_pct in num_edges_pct:

            # Edge budget is pct * n * ln(n), clamped to [n, n*(n-1)/2].
            num_edge = int(num_edge_pct*video_num*np.log(video_num))

            if num_edge < video_num:
                num_edge = video_num
            # Floor division keeps num_edge an integer on Python 3 as well.
            if num_edge > video_num*(video_num-1)//2:
                num_edge = video_num*(video_num-1)//2

            # Ground-truth scores: uniform in [-5, 5), rounded to multiples of `resolution`.
            video_score = np.random.uniform(-5,5,video_num)
            resolution = 0.1
            video_score = np.round(video_score/resolution)*resolution

            # Starting point for the optimizer.
            w = np.ones(video_num)
            # The pairs come from init_graph only. The previous extra sampling from
            # `pairs_truth` depended on globals left behind by another cell and was
            # overwritten immediately, so it has been removed.
            test_pairs = init_graph(video_num, num_edge, video_score)
            # Flip each pair's order whenever noise_generation says so.
            for i in range(len(test_pairs)):
                flip = noise_generation(video_score, test_pairs[i])
                if flip:
                    test_pairs[i] = (test_pairs[i][1], test_pairs[i][0])
            start_time = time.time()

            res = optimize.minimize(mle, w,
                                    method='Newton-CG',
                                    jac=gradient,
                                    args=(test_pairs,),
                                    tol = 10,
                                    options={'disp': False})

            filename = 'result/random_error_'+ str(video_num)+'_at_'+str(num_edge)+'_iter_00'+str(iteration)+'.p'
            # Context manager closes the result file even if pickling fails.
            with open(filename, "wb") as result_file:
                pickle.dump(dict({'x':res.x, 'video_score':video_score}), result_file)
            # Single parenthesised argument: valid on both Python 2 and Python 3.
            print('Time on Iteration %d with Video Number %d and edges %.2f : %.1f seconds' %(iteration, video_num, num_edge, float(time.time() - start_time)))

Time on Iteration 0 with Video Number 625 and edges 625.00 : 0.3 seconds
Time on Iteration 0 with Video Number 625 and edges 2011.00 : 1.5 seconds
Time on Iteration 0 with Video Number 625 and edges 4023.00 : 3.1 seconds
Time on Iteration 0 with Video Number 625 and edges 8731.00 : 7.6 seconds
Time on Iteration 0 with Video Number 625 and edges 12070.00 : 9.6 seconds
Time on Iteration 0 with Video Number 1250 and edges 1250.00 : 1.3 seconds
Time on Iteration 0 with Video Number 1250 and edges 4456.00 : 6.3 seconds
Time on Iteration 0 with Video Number 1250 and edges 8913.00 : 13.4 seconds
Time on Iteration 0 with Video Number 1250 and edges 19342.00 : 31.7 seconds
Time on Iteration 0 with Video Number 1250 and edges 26740.00 : 47.2 seconds
Time on Iteration 0 with Video Number 2500 and edges 2500.00 : 5.0 seconds
Time on Iteration 0 with Video Number 2500 and edges 9780.00 : 31.8 seconds
Time on Iteration 0 with Video Number 2500 and edges 19560.00 : 66.6 seconds
Time on Iteration 0 wi

In [6]:
video_nums = [5000,10000]

num_edges_pct = [0.1, 0.5, 1, 2.17, 3]

# Results are pickled under result/; create it up front so a long run
# cannot fail at the very end because the directory is missing.
if not os.path.isdir('result'):
    os.makedirs('result')

for iteration in range(1):
    for video_num in video_nums:
        for num_edge_pct in num_edges_pct:

            # Edge budget is pct * n * ln(n), clamped to [n, n*(n-1)/2].
            num_edge = int(num_edge_pct*video_num*np.log(video_num))

            if num_edge < video_num:
                num_edge = video_num
            # Floor division keeps num_edge an integer on Python 3, which
            # random.sample requires for its sample size.
            if num_edge > video_num*(video_num-1)//2:
                num_edge = video_num*(video_num-1)//2

            # Ground-truth scores: uniform in [-5, 5), rounded to multiples of `resolution`.
            video_score = np.random.uniform(-5,5,video_num)
            resolution = 0.1
            video_score = np.round(video_score/resolution)*resolution

            # Every unordered pair, stored with the strictly higher-scored index
            # first; ties are stored as (j, i).
            pairs_truth = [
                (i, j) if video_score[i] > video_score[j] else (j, i)
                for i in range(len(video_score)-1)
                for j in range(i+1, len(video_score))
            ]

            total_pairs = len(pairs_truth)

            # Starting point for the optimizer.
            w = np.ones(video_num)

            # Draw num_edge distinct pairs uniformly at random.
            test_pairs = [pairs_truth[i] for i in random.sample(range(total_pairs),num_edge)]

            # Flip each pair's order whenever noise_generation says so.
            for i in range(len(test_pairs)):
                flip = noise_generation(video_score, test_pairs[i])
                if flip:
                    test_pairs[i] = (test_pairs[i][1], test_pairs[i][0])

            start_time = time.time()

            res = optimize.minimize(mle, w,
                                    method='Newton-CG',
                                    jac=gradient,
                                    args=(test_pairs,),
                                    tol = 10,
                                    options={'disp': False})

            filename = 'result/random_error_'+ str(video_num)+'_at_'+str(num_edge)+'.p'
            # Context manager closes the result file even if pickling fails.
            with open(filename, "wb") as result_file:
                pickle.dump(dict({'x':res.x, 'video_score':video_score}), result_file)
            # Single parenthesised argument: valid on both Python 2 and Python 3.
            print('Time on Iteration %d with Video Number %d and edges %.2f : %.1f seconds' %(iteration, video_num, num_edge, float(time.time() - start_time)))

Time on Iteration 0 with Video Number 5000 and edges 5000.00 : 29.1 seconds
Time on Iteration 0 with Video Number 5000 and edges 21292.00 : 201.8 seconds
Time on Iteration 0 with Video Number 5000 and edges 42585.00 : 407.4 seconds
Time on Iteration 0 with Video Number 5000 and edges 92411.00 : 839.9 seconds
Time on Iteration 0 with Video Number 5000 and edges 127757.00 : 1226.2 seconds
Time on Iteration 0 with Video Number 10000 and edges 10000.00 : 207.4 seconds
Time on Iteration 0 with Video Number 10000 and edges 46051.00 : 999.9 seconds
Time on Iteration 0 with Video Number 10000 and edges 92103.00 : 2041.0 seconds
Time on Iteration 0 with Video Number 10000 and edges 199864.00 : 4695.4 seconds


KeyboardInterrupt: 

In [7]:
video_nums = [10000]

num_edges_pct = [3]

# Results are pickled under result/; create it up front so a long run
# cannot fail at the very end because the directory is missing.
if not os.path.isdir('result'):
    os.makedirs('result')

for iteration in range(1):
    for video_num in video_nums:
        for num_edge_pct in num_edges_pct:

            # Edge budget is pct * n * ln(n), clamped to [n, n*(n-1)/2].
            num_edge = int(num_edge_pct*video_num*np.log(video_num))

            if num_edge < video_num:
                num_edge = video_num
            # Floor division keeps num_edge an integer on Python 3, which
            # random.sample requires for its sample size.
            if num_edge > video_num*(video_num-1)//2:
                num_edge = video_num*(video_num-1)//2

            # Ground-truth scores: uniform in [-5, 5), rounded to multiples of `resolution`.
            video_score = np.random.uniform(-5,5,video_num)
            resolution = 0.1
            video_score = np.round(video_score/resolution)*resolution

            # Every unordered pair, stored with the strictly higher-scored index
            # first; ties are stored as (j, i).
            pairs_truth = [
                (i, j) if video_score[i] > video_score[j] else (j, i)
                for i in range(len(video_score)-1)
                for j in range(i+1, len(video_score))
            ]

            total_pairs = len(pairs_truth)

            # Starting point for the optimizer.
            w = np.ones(video_num)

            # Draw num_edge distinct pairs uniformly at random.
            test_pairs = [pairs_truth[i] for i in random.sample(range(total_pairs),num_edge)]

            # Flip each pair's order whenever noise_generation says so.
            for i in range(len(test_pairs)):
                flip = noise_generation(video_score, test_pairs[i])
                if flip:
                    test_pairs[i] = (test_pairs[i][1], test_pairs[i][0])

            start_time = time.time()

            res = optimize.minimize(mle, w,
                                    method='Newton-CG',
                                    jac=gradient,
                                    args=(test_pairs,),
                                    tol = 10,
                                    options={'disp': False})

            filename = 'result/random_error_'+ str(video_num)+'_at_'+str(num_edge)+'.p'
            # Context manager closes the result file even if pickling fails.
            with open(filename, "wb") as result_file:
                pickle.dump(dict({'x':res.x, 'video_score':video_score}), result_file)
            # Single parenthesised argument: valid on both Python 2 and Python 3.
            print('Time on Iteration %d with Video Number %d and edges %.2f : %.1f seconds' %(iteration, video_num, num_edge, float(time.time() - start_time)))

Time on Iteration 0 with Video Number 10000 and edges 276310.00 : 7812.3 seconds


# Sigma's effect on $R^2$

In [50]:
def mle(w, pairs):
    """Negative log-likelihood of the observed pairs under a logistic model.

    Each pair ``(a, b)`` contributes the factor
    ``1 / (1 + exp(w[b] - w[a]))``, so the likelihood is larger when
    ``w[a] > w[b]``. Pairs in which either index is ``-1`` are skipped.

    The value equals ``-log(15 * prod(factors))``. It is accumulated as a sum
    of logs because the plain product underflows to 0 after roughly a
    thousand pairs, which made the original return ``inf``.

    NOTE(review): the leading factor 15 only shifts the objective by the
    constant ``-log(15)``, and the exponent is divided by 1. This looks as
    though it may have been meant as ``sigma = 15`` (as in ``gradient_sigma``).
    The arithmetic is kept as it was; confirm the intended model.

    Parameters
    ----------
    w : sequence of float
        Current score estimate, indexed by the node ids stored in ``pairs``.
    pairs : iterable of (int, int)
        Pairwise outcomes.

    Returns
    -------
    float
        The negative log-likelihood (including the ``-log(15)`` offset).
    """
    neg_log_lik = 0.0
    for pair in pairs:
        if pair[0] == -1 or pair[1] == -1:
            continue
        # -log(1 / (1 + exp(d))) == log(1 + exp(d)) == logaddexp(0, d),
        # which stays finite and accurate for large |d|.
        neg_log_lik += np.logaddexp(0.0, (-w[pair[0]] + w[pair[1]])/1)
    return neg_log_lik - np.log(15)

def gradient_sigma(w,pairs):
    """Per-node gradient vector used as ``jac`` with ``sigma = 15``.

    For every pair ``(a, b)`` the term
    ``(exp((w[b] - w[a]) / sigma) + 1) / sigma`` is subtracted from entry
    ``a`` and added to entry ``b``. A pair with ``a == b`` contributes only
    the ``a`` (negative) term, and an index outside ``range(len(w))``
    (e.g. ``-1``) receives no contribution.

    The pairs are visited once, giving O(len(pairs)) work rather than a scan
    of all pairs for every node. The per-node accumulation order is unchanged,
    so the returned values are identical to the nested-loop version.

    NOTE(review): the per-pair term is ``(1 + exp(d / sigma)) / sigma``,
    whereas the derivative of ``log(1 + exp(d / sigma))`` would be
    ``sigmoid(d / sigma) / sigma``. The ``mle`` defined in the same cell also
    divides by 1, not by sigma. The formula is preserved as written; confirm
    it is what was intended.

    Parameters
    ----------
    w : sequence of float
        Current score estimate.
    pairs : iterable of (int, int)
        Pairwise outcomes; entries are expected to be integer node indices.

    Returns
    -------
    numpy.ndarray
        One gradient entry per element of ``w``.
    """
    sigma = 15
    node_count = len(w)
    # Integer zeros so nodes that appear in no pair keep the original value 0.
    running = [0] * node_count

    for pair in pairs:
        first, second = pair[0], pair[1]

        # (node, sign) contributions of this pair, mirroring the original
        # `if i == pair[0] ... elif i == pair[1]` test for i in range(len(w)).
        touched = []
        if 0 <= first < node_count:
            touched.append((first, -1))
        if 0 <= second < node_count and second != first:
            touched.append((second, 1))
        if not touched:
            continue

        inverse_term = 1/(np.exp((w[second]-w[first])/sigma) +1 )
        for node, sign in touched:
            running[node] -= sign / inverse_term / sigma

    return np.array([-value for value in running])

def R2(yhat, y):
    """Ratio of the sum of squares of ``yhat`` about mean(``y``) to that of ``y``.

    Both sums of squared deviations are taken about the mean of ``y``; the
    result is ``sum((yhat - mean)**2) / sum((y - mean)**2)``.
    """
    center = np.sum(y)/len(y)

    def _sum_sq_about_center(values):
        # Sum of squared deviations of `values` from the mean of y.
        return np.sum((values - center)**2)

    explained = _sum_sq_about_center(yhat)
    total = _sum_sq_about_center(y)
    return explained/total

In [None]:
# Single run of the sigma experiment: fit scores for 1250 videos from
# generated pairs and report R2 of the rescaled estimate against the truth.
video_num = 1250

# Edge budget: 1.5 * n * ln(n).
num_edge = int(video_num*np.log(video_num)*1.5)

# Ground-truth scores: uniform in [-5, 5), rounded to multiples of `resolution`.
video_score = np.random.uniform(-5,5,video_num)
resolution = 0.1
video_score = np.round(video_score/resolution)*resolution

# Starting point for the optimizer.
w = np.ones(video_num)

# No noise flipping is applied to the pairs in this cell.
test_pairs = init_graph(video_num, num_edge, video_score)

# NOTE(review): start_time is recorded but never read in this cell.
start_time = time.time()

# NOTE(review): `mle` here is the version defined in the preceding cell
# (exponent divided by 1) while `gradient_sigma` uses sigma = 15, so the
# objective and the supplied jacobian may not correspond -- confirm.
res = optimize.minimize(mle, w, 
                        method='Newton-CG',
                        jac=gradient_sigma,
                        args=(test_pairs,),
                        tol = 10,
                        options={'disp': False})
# Rescale the estimate before scoring; presumably an affine match to
# video_score, per the notebook's function notes -- verify in functions.py.
w_hat = regularized_vector(video_score,res.x)
# Last expression of the cell: displayed as the cell output.
R2(w_hat, video_score)

# Age data

In [3]:
# Load the age data. The array is bound to `age_table`, not `csv`, so the
# standard-library `csv` module imported at the top of the notebook is not
# shadowed for later cells.
age_table = np.genfromtxt ('age.csv', delimiter=",")
# The first column is used as the scores.
scores = age_table[:,0]

In [23]:
# video_num = len(scores)
video_num = 2500

# Synthetic scores: uniform in [-5, 5), rounded to multiples of `resolution`.
video_score = np.random.uniform(-5,5,video_num)
resolution = 0.1
video_score = np.round(video_score/resolution)*resolution

num_edge_pct = 1.56

# Edge budget: pct * n * ln(n).
num_edge = int(num_edge_pct*video_num*np.log(video_num))

# video_score = scores
# resolution = 0.1
# video_score = np.round(video_score/resolution)*resolution

# Starting point for the optimizer.
w = np.ones(video_num)

test_pairs = init_graph(video_num, num_edge, video_score)

# Flip each pair's order whenever noise_generation says so.
for i in range(len(test_pairs)):
    flip = noise_generation(video_score, test_pairs[i])
    if flip:
        test_pairs[i] = (test_pairs[i][1], test_pairs[i][0])

start_time = time.time()

res = optimize.minimize(mle, w,
                        method='Newton-CG',
                        jac=gradient,
                        args=(test_pairs,),
                        tol = 10,
                        options={'disp': False})

filename = 'age1.p'
# Context manager closes the result file even if pickling fails.
with open(filename, "wb") as result_file:
    pickle.dump(dict({'x':res.x, 'video_score':video_score}), result_file)
# Single parenthesised argument: valid on both Python 2 and Python 3.
print('Time on age data %.1f seconds' %(float(time.time() - start_time)))

Time on age data 116.8 seconds


In [18]:
# Scratch check of the flip probability for a single pair.
# NOTE(review): `pair` is not assigned at top level in any visible cell, so
# this relies on state from elsewhere in the kernel session -- confirm or
# define it here.
difference = (45 - video_score[pair[1]])/4
# s * (1 - s) is at most 0.25, reached when s == 0.5 (difference == 0 if
# sigmoid is the logistic function -- TODO confirm against functions.py).
p = sigmoid(difference)*(1-sigmoid(difference))
# One Bernoulli draw with success probability p.
decision = bernoulli.rvs(p,size=1)

0.25

In [21]:
# Persist the fitted scores together with the ground-truth scores.
filename = 'age1.p'
# Context manager closes the file deterministically after the dump.
with open(filename, "wb") as result_file:
    pickle.dump(dict({'x':res.x, 'video_score':video_score}), result_file)
