In [1]:
import csv
import networkx as nx
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import torchvision
from sklearn.linear_model import LogisticRegression

In [2]:
print(torch.cuda.is_available())

False


In [4]:
# prepare dataset

f = open('train.csv', 'r', encoding='utf-8')
reader = csv.reader(f)
trains = []
for line in reader:
    trains.append(line)
f.close()    
print("trains", len(trains))

f = open('node_ingredient.csv', 'r', encoding='utf-8')
reader = csv.reader(f)
node_ingredients = []
for line in reader:
    node_ingredients.append(line)
f.close()    
print("node_ingredients", len(node_ingredients))
num_ing = len(node_ingredients)

# f = open('validation_classification_question.csv', 'r', encoding='utf-8')
# reader = csv.reader(f)
# val_cls_q = []
# for line in reader:
#     val_cls_q.append(line)
# f.close()
# print("val_cls_q", len(val_cls_q))

# f = open('validation_classification_answer.csv', 'r', encoding='utf-8')
# reader = csv.reader(f)
# val_cls_a = []
# for line in reader:
#     val_cls_a.append(line)
# f.close()
# print("val_cls_a", len(val_cls_a))

f = open('validation_completion_question.csv', 'r', encoding='utf-8')
reader = csv.reader(f)
val_cpt_q = []
for line in reader:
    val_cpt_q.append(line)
f.close()
print("val_cpt_q", len(val_cpt_q))

f = open('validation_completion_answer.csv', 'r', encoding='utf-8')
reader = csv.reader(f)
val_cpt_a = []
for line in reader:
    val_cpt_a.append(line)
f.close()
print("val_cpt_a", len(val_cpt_a))

trains 23547
node_ingredients 6714
val_cpt_q 7848
val_cpt_a 7848


## Completion Task : Method 0
find neighbors of largest weight (no use of embedding)

In [62]:
# make graph of ingredients
G = nx.Graph()
for i in range(len(node_ingredients)):
    G.add_node(str(i))
    
print(G.number_of_nodes())

for data in trains:
    for i in range(len(data) - 2):
        for j in range(i+1, len(data) - 1):
            if G.has_edge(data[i], data[j]):
                G[data[i]][data[j]]['weight'] += 1
            else:
                G.add_edge(data[i], data[j], weight=1)

print(G.number_of_edges())

6714
355816


In [67]:
acc = 0
for i, data in tqdm(enumerate(val_cpt_q)):
#     print(data)
    weight_dict = {}
    for node in data:
        for adv, w in G.adj[node].items():
            if adv in weight_dict.keys():
                weight_dict[adv] += w['weight']
            else:
                weight_dict[adv] = w['weight']
    for node in data:
        if node in weight_dict.keys():
            del weight_dict[node]
    
    weight_dict = sorted(weight_dict.items(), key=(lambda x: x[1]), reverse=True)
    
#     print(weight_dict)

    if weight_dict[0][0] == val_cpt_a[i][0]:
        acc += 1
        
print("accuracy: ", acc / len(val_cpt_q) * 100, "%")

7848it [01:20, 97.01it/s] 

accuracy:  6.167176350662589 %





## Completion Task : Method 1
cosine similarity of embedding

In [5]:
def cos_sim(X,y):
    return np.dot(X, y) / (np.linalg.norm(X, axis=1) * np.linalg.norm(y))

In [69]:
# prepare embeddings as numpy
f = open('Embedding/Embp1q10.csv', 'r', encoding='utf-8')
reader = csv.reader(f)
f.readline()
embp1q10 = np.zeros((6714, 64))
for line in reader:
    i = int(line[1])
    j = 0
    for node in line[2][2:-1].split(' '):
        if node != '':
            embp1q10[i][j] = float(node.strip())
            j += 1
    
f.close()
print("emb1q10", embp1q10.shape)

emb1q10 (6714, 64)


In [84]:
# method 1-1: similarity of average

acc = 0
for i, data in tqdm(enumerate(val_cpt_q)):
    nodes = [int(node) for node in data]
    avg_node = np.average(embp1q10[nodes][:], axis=0)
    sims = cos_sim(embp1q10, avg_node)
    ranks = np.argsort(-sims)
        
    target = int(val_cpt_a[i][0])
    
#     print("input: ", data)
#     print("target: ", target)
#     print("estimation: ", ranks[0:10])
#     print("rank: ", np.where(ranks == target))
#     print("")

    j = 0
    while ranks[j] in data:
        j += 1
        
    estimation = ranks[j]
    
    if estimation == target:
        acc += 1
        
print("accuracy: ", acc / len(val_cpt_q) * 100, "%")

7848it [00:07, 1093.45it/s]

accuracy:  0.6116207951070336 %





In [94]:
# method 1-2: average of similarity

acc = 0
for i, data in tqdm(enumerate(val_cpt_q)):
    nodes = [int(node) for node in data]
    sims = []
    for node in nodes:
        sims.append(cos_sim(embp1q10, embp1q10[node]))
    sims = np.stack(sims, axis=0)
    avg_sim = np.average(sims, axis=0)
    ranks = np.argsort(-avg_sim)
        
    target = int(val_cpt_a[i][0])
    
    j = 0
    while ranks[j] in data:
        j += 1
        
    estimation = ranks[j]
    
    if estimation == target:
        acc += 1
        
print("accuracy: ", acc / len(val_cpt_q) * 100, "%")

  
7848it [00:18, 429.91it/s]

accuracy:  0.5224260958205913 %





## Completion Task : Method 2
Logistic Regression

In [7]:
def make_trainset(dataset):
    xs = []
    ys = []
    for data in tqdm(dataset):
        nodes = [int(node) for node in data[:-1]]
        x = np.zeros((len(nodes), num_ing))
        y = np.zeros(len(nodes))
        for i, node in enumerate(nodes):
            other_nodes = [nd for nd in nodes if nd != node]
            for other in other_nodes:
                x[i][other] = 1
                y[i] = node
        xs.append(x)
        ys.append(y)
    train_x = np.concatenate(xs, axis=0)
    train_y = np.concatenate(ys, axis=0)
    return train_x, train_y

In [None]:
train_x, train_y = make_trainset(trains)

100%|███████████████████████████████████| 23547/23547 [00:05<00:00, 4207.67it/s]


In [113]:
clf = LogisticRegression(penalty='l2', max_iter=10, verbose=True).fit(train_x, train_y)
clf.score(train_x, train_y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 45.0min finished


0.10297918006462584

In [124]:
def make_valset(question, answer):
    val_x = np.zeros((len(question), num_ing))
    val_y = np.zeros(len(question))
    for i in range(len(question)):
        nodes = [int(node) for node in question[i]]
        target = int(answer[i][0])
        for node in nodes:
            val_x[i][node] = 1
        val_y[i] = target
        
    return val_x, val_y

In [125]:
val_x, val_y = make_valset(val_cpt_q, val_cpt_a)

In [127]:
clf.score(val_x, val_y)

0.09174311926605505

In [12]:
# 0,1,2 => (1100 - 0010), (0110 - 1000), (1010 - 0100)

class TrainDataset(torch.utils.data.Dataset):
    def __init__(self, train_list, transform):
        self.data = []
        self.label = [] # missing ingredient
        self.transform = transform
        
        for data in train_list[0:1]:
            nodes = [int(node) for node in data[0:-1]]
            print(nodes)
            np_data = np.zeros((len(nodes), num_ing))
            np_label = np.zeros((len(nodes), num_ing))
            print(np_data.shape)
            for mis_node in nodes:
                other_node = [node for node in nodes if node!=mis_node]
                print(mis_node, other_node)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        data = self.data[idx]
        label = self.label[idx]
        data = self.transform(data)
        label = self.transform(label)
        return data, label

train_loader = torch.utils.data.DataLoader(
    TrainDataset(trains, transform=torchvision.transforms.ToTensor()), 
    batch_size=32, 
    shuffle=True
)

for i, (data, label) in enumerate(train_loader):
    print(i, data, label)

[2813, 3146, 3229, 3885, 4379, 4390, 5250, 5456, 6187]
(9, 6714)
2813 [3146, 3229, 3885, 4379, 4390, 5250, 5456, 6187]
3146 [2813, 3229, 3885, 4379, 4390, 5250, 5456, 6187]
3229 [2813, 3146, 3885, 4379, 4390, 5250, 5456, 6187]
3885 [2813, 3146, 3229, 4379, 4390, 5250, 5456, 6187]
4379 [2813, 3146, 3229, 3885, 4390, 5250, 5456, 6187]
4390 [2813, 3146, 3229, 3885, 4379, 5250, 5456, 6187]
5250 [2813, 3146, 3229, 3885, 4379, 4390, 5456, 6187]
5456 [2813, 3146, 3229, 3885, 4379, 4390, 5250, 6187]
6187 [2813, 3146, 3229, 3885, 4379, 4390, 5250, 5456]


ValueError: num_samples should be a positive integer value, but got num_samples=0