In [1]:
import sys
from random import sample
import math
import numpy as np

In [2]:
# Read every line of the training file into memory. The `with` block closes
# the handle as soon as the lines are loaded instead of leaving it open for
# the lifetime of the kernel; `fp` remains bound (to the closed file) and
# `data` is the list of raw text lines, one per record.
with open('../Datasets/MQ2008/Fold1/train.txt', 'r') as fp:
    data = fp.readlines()

In [3]:
def parse(line):
    """Turn one whitespace-separated record into ``(query_id, features, label)``.

    Layout read from the line:
      * token 0  -- integer label; a label of 2 is collapsed to 1, any other
        value is kept as-is.
      * token 1  -- ``key:value`` pair whose value is the integer query id.
      * tokens 2..47 -- ``key:value`` pairs whose values are the 46 float
        features.
    Anything after token 47 is ignored.

    Returns:
        tuple: ``(query_id, features, label)`` where ``features`` is a list of
        46 floats.

    Raises:
        IndexError: if the line has fewer than 48 tokens or a token has no ':'.
        ValueError: if a numeric field cannot be converted.
    """
    tokens = line.split()
    label = int(tokens[0])
    label = 1 if label == 2 else label
    query_id = int(tokens[1].split(':')[1])
    # Index explicitly (rather than slicing) so that a short line still fails loudly.
    features = [float(tokens[pos].split(':')[1]) for pos in range(2, 48)]
    return query_id, features, label

In [28]:
def populate(data):
    """Parse every record and group features/labels both globally and per query.

    Each line is handed to ``parse``, which yields ``(query_id, features,
    label)``. Rows are accumulated in plain lists and converted to arrays once
    at the end, instead of calling ``np.append`` per row (which copies the
    whole array on every call).

    The original body also appended to an ``id_global`` list that was never
    defined or returned, so the function raised ``NameError`` on a fresh
    kernel; that statement has been dropped.

    Args:
        data: iterable of text lines accepted by ``parse``.

    Returns:
        tuple: ``(X_dict, y_dict, X_global, y_global, index_dict)`` where
          * ``X_dict[qid]`` is a float array with one row per record of that query,
          * ``y_dict[qid]`` is the list of labels for that query,
          * ``X_global`` is a float array with one row per record, in input order,
          * ``y_global`` is the list of all labels, in input order,
          * ``index_dict[qid]`` lists the row positions in ``X_global`` /
            ``y_global`` that belong to that query.
        Empty arrays have shape ``(0, 46)``.
    """
    n_features = 46  # width used for empty matrices, matching the 46 features parse() reads

    def _stack(rows):
        # np.array([]) would be 1-D, so keep the (0, 46) shape for "no rows".
        if not rows:
            return np.empty((0, n_features), float)
        return np.array(rows, dtype=float)

    rows_by_query = {}
    y_dict = {}
    index_dict = {}
    all_rows = []
    y_global = []
    for ind, line in enumerate(data):
        qid, features, label = parse(line)
        all_rows.append(features)
        y_global.append(label)
        if qid not in rows_by_query:
            rows_by_query[qid] = []
            y_dict[qid] = []
            index_dict[qid] = []
        rows_by_query[qid].append(features)
        y_dict[qid].append(label)
        index_dict[qid].append(ind)

    X_dict = {qid: _stack(rows) for qid, rows in rows_by_query.items()}
    X_global = _stack(all_rows)
    return X_dict, y_dict, X_global, y_global, index_dict

In [29]:
# Parse the raw lines once. X / y map each query id to its feature matrix and
# label list; X_global / y_global hold every record in file order; index_dict
# maps each query id to the row positions of its records in the global arrays.
X, y, X_global, y_global, index_dict = populate(data)

In [31]:
# Spot check: global row positions of the records belonging to query 10032.
print(index_dict[10032])

[8, 9, 10, 11, 12, 13, 14, 15]


In [32]:
def getalpha(y):
    """Return the class proportions ``[share_of_zeros, share_of_non_zeros]``.

    Every label equal to 0 counts towards the first entry; every other label
    counts towards the second. Both entries are divided by ``len(y)``.

    Raises:
        ZeroDivisionError: if ``y`` is empty.
    """
    total = len(y)
    zeros = sum(1 for label in y if label == 0)
    non_zeros = total - zeros
    return [zeros / total, non_zeros / total]

In [33]:
def quantize(f, b):
    """Discretise each column of ``f`` into integer levels ``1 .. b + 1``.

    A column is min-max scaled to ``[0, 1]`` using its own minimum and
    maximum, multiplied by ``b``, rounded with Python's ``round`` and shifted
    by one. A column whose minimum equals its maximum is mapped to level 1
    everywhere.

    Args:
        f: 2-D numpy array (rows are samples, columns are features).
        b: scale factor; the highest level produced is ``b + 1``.

    Returns:
        Integer numpy array with the same number of rows and columns as ``f``.
    """
    col_low = np.min(f, axis=0)
    col_high = np.max(f, axis=0)
    n_rows = f.shape[0]
    n_cols = f.shape[1]
    levels = np.zeros((n_rows, n_cols), dtype=int)
    for col in range(n_cols):
        low, high = col_low[col], col_high[col]
        if low == high:
            # Constant column: there is no range to scale over.
            levels[:, col] = 1
            continue
        for row in range(n_rows):
            levels[row, col] = round(b * (f[row][col] - low) / (high - low)) + 1
    return levels

In [34]:
def getbeta(X, y, b):
    """Estimate per-class histograms of the quantised features.

    ``X`` is quantised with ``quantize(X, b - 1)``, which yields levels
    ``1 .. b``. ``beta[c][j][k]`` then holds the number of rows with label
    ``c`` whose feature ``j`` fell into level ``k + 1``, divided by the number
    of rows counted for class ``c``. A class with no rows keeps all-zero
    entries.

    Args:
        X: 2-D numpy array of features.
        y: sequence of labels indexable by row position; labels are used
            directly as the first index of ``beta`` (size 2).
        b: number of quantisation levels.

    Returns:
        Float numpy array of shape ``(2, n_features, b)``.
    """
    levels = quantize(X, b - 1)
    n_rows = X.shape[0]
    n_cols = X.shape[1]
    beta = np.zeros((2, n_cols, b), dtype=float)
    # class_sizes[0] counts labels equal to 0, class_sizes[1] everything else.
    class_sizes = [0, 0]
    for row in range(n_rows):
        label = y[row]
        class_sizes[0 if label == 0 else 1] += 1
        for col in range(n_cols):
            beta[label][col][levels[row][col] - 1] += 1
    for cls, size in enumerate(class_sizes):
        if size != 0:
            # Turn the counts of this class into relative frequencies.
            beta[cls] /= size
    return beta

In [35]:
def estimateparameters(X, y, X_global, y_global, b, lamb):
    """Blend per-query estimates with the estimates over all records.

    For every query id in ``X`` the result is
    ``lamb * per_query_estimate + (1 - lamb) * global_estimate``, computed
    for both the class proportions (``getalpha``) and the per-class feature
    histograms (``getbeta``).

    Args:
        X: mapping query id -> feature matrix of that query.
        y: mapping query id -> labels of that query.
        X_global: feature matrix over all records.
        y_global: labels over all records.
        b: number of quantisation levels passed on to ``getbeta``.
        lamb: weight given to the per-query estimate.

    Returns:
        tuple: ``(alphas, betas)``, two dicts keyed by query id holding numpy
        arrays.
    """
    global_weight = 1 - lamb
    # The global share is the same for every query, so compute it once.
    alpha_shared = global_weight * np.array(getalpha(y_global))
    beta_shared = global_weight * np.array(getbeta(X_global, y_global, b))
    alphas, betas = {}, {}
    for qid in X:
        labels = y[qid]
        alphas[qid] = lamb * np.array(getalpha(labels)) + alpha_shared
        betas[qid] = lamb * np.array(getbeta(X[qid], labels, b)) + beta_shared
    return alphas, betas

In [36]:
# Fit the blended parameters with b = 5 quantisation levels and lamb = 0.75,
# i.e. 0.75 weight on each query's own estimate and 0.25 on the global one.
alphas, betas = estimateparameters(X, y, X_global, y_global, 5, 0.75)

In [37]:
def predictcondprob(X, y, alphas, betas, b):
    """Compute, per record, the probability the model gives to its own label.

    For each query the features are quantised with ``quantize(X[qid], b - 1)``.
    For each record two scores are built: the class weight ``alphas[qid][c]``
    multiplied by ``betas[qid][c][feature][level - 1]`` over all features, for
    ``c`` in ``{0, 1}``. The stored value is the score of the record's
    observed class (0 if the label equals 0, otherwise 1) divided by the sum
    of both scores.

    Args:
        X: mapping query id -> feature matrix.
        y: mapping query id -> labels, indexable by row position.
        alphas: mapping query id -> two class weights.
        betas: mapping query id -> array indexed ``[class][feature][level - 1]``.
        b: number of quantisation levels; must match the last axis of ``betas``.

    Returns:
        dict: query id -> list with one probability per record, in row order.
    """
    condprob = {}
    for qid in X:
        levels = quantize(X[qid], b - 1)
        n_rows, n_cols = levels.shape[0], levels.shape[1]
        per_record = condprob[qid] = []
        for row in range(n_rows):
            # joint[c] accumulates the unnormalised score of class c.
            joint = [alphas[qid][0], alphas[qid][1]]
            for col in range(n_cols):
                slot = levels[row][col] - 1
                joint[0] *= betas[qid][0][col][slot]
                joint[1] *= betas[qid][1][col][slot]
            observed = 0 if y[qid][row] == 0 else 1
            per_record.append(joint[observed] / (joint[0] + joint[1]))
    return condprob

In [38]:
# Probability assigned to each record's observed label, per query. The last
# argument (5) is the number of quantisation levels and has to match the b
# that estimateparameters was called with, because it drives the indexing
# into the last axis of betas.
conditional_probability = predictcondprob(X, y, alphas, betas, 5)

In [39]:
def getindexedresults(condprob, index_dict, m):
    """Scatter per-query values back into one flat array of length ``m``.

    The ``i``-th value of ``condprob[qid]`` is written to position
    ``index_dict[qid][i]``. Positions that are never written stay 0.0.

    Args:
        condprob: mapping query id -> sequence of values.
        index_dict: mapping query id -> target positions, aligned with
            ``condprob[qid]``.
        m: length of the returned array.

    Returns:
        Float numpy array of length ``m``.
    """
    flat = np.zeros(m, dtype=float)
    for qid in condprob:
        for offset, value in enumerate(condprob[qid]):
            flat[index_dict[qid][offset]] = value
    return flat

In [47]:
# Put the per-query probabilities back into global record order, then report
# the percentage of records whose observed label gets probability below 0.5.
correctprob = getindexedresults(conditional_probability, index_dict, X_global.shape[0])
noise = sum(1 for prob in correctprob if prob < 0.5)
print(noise / X_global.shape[0] * 100)

6.749740394600208


In [24]:
# Look for records of query 10215 whose feature vectors are identical and
# print every ordered pair (i, j), i != j, of matching row positions.
mq = X[10215].shape[0]
nf = X[10215].shape[1]
for i in range(mq):
    for j in range(mq):
        if i == j:
            continue
        # Two rows match only when every feature compares equal.
        if all(X[10215][i][f] == X[10215][j][f] for f in range(nf)):
            print(i, j)

14 16
16 14


In [25]:
# Compare rows 14 and 16 of query 10215: first the probability assigned to
# each row's observed label, then the two labels themselves.
print(conditional_probability[10215][14], conditional_probability[10215][16])
print(y[10215][14], y[10215][16])

0.9999473525199924 5.264748000751682e-05
0 1


In [41]:
# Dump the complete query id -> probability list mapping (very large output).
print(conditional_probability)

{10002: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 10032: [0.999999999999999, 1.0, 0.9999999999981533, 0.9999999999994799, 0.9999997094157138, 1.0, 1.0, 1.0], 10035: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 10036: [1.0, 0.9999999999998098, 0.9999999999999978, 0.9953104881474952, 0.9999999997849138, 0.9999999999999997, 0.9976196269781864, 0.9999999999996179], 10050: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 10056: [1.0, 1.0, 0.9999999999999997, 0.9999969806090598, 1.0, 1.0, 0.9999999611986582, 0.9999900029374327, 0.9999999216448584, 0.99999999999172, 0.9999999976341697, 1.0, 1.0, 1.0, 1.0, 1.0], 10066: [0.991940340534572, 0.9999998055758521, 0.9999999999953662, 0.9999999770518646, 1.0, 1.0, 1.0, 0.9999999696627991], 10078: [0.9999999999967908, 0.9999999999781103, 0.9999981778767824, 1.0, 0.9999999999999867, 1.0, 0.9999999867947955, 0.9999999679683661, 0.9999999999979254, 0.9999999997636183, 0.9999999999862983, 1.0, 0.9999999999996056, 0.9999990731858944, 0.9999999984800176, 0.9999658623