In [12]:
# Library imports
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.semi_supervised import LabelPropagation
import csv
import pickle
from tqdm.autonotebook import tqdm

In [2]:
# Read data from data/four_area
author_df = pd.read_csv("data/four_area/author.txt", sep = "\t", names=["ID", "Author name"],encoding='utf8')
conf_df = pd.read_csv("data/four_area/conf.txt", sep = "\t", names=["ID", "Conference name"])
paper_df = pd.read_csv("data/four_area/paper.txt", sep = "\t", names=["ID", "Paper title"])
term_df = pd.read_csv("data/four_area/term.txt", sep = "\t", names=["ID", "Term"])
paper_author = pd.read_csv("data/four_area/paper_author.txt", sep = "\t", names=["paperID", "authorID"])
paper_conf = pd.read_csv("data/four_area/paper_conf.txt", sep = "\t", names=["paperID", "confID"])
paper_term = pd.read_csv("data/four_area/paper_term.txt", sep = "\t", names=["paperID", "termID"])

### ID in DBLP_four_area are differents from four_area. four_area's ID are rehashed.
### Label are defined as follows
* Database->1
* Data Mining->2
* Machine Learning->3
* Information Retrieval->4

In [51]:
# Read data from data/DBLP_four_area
author_dict = pd.read_csv("data/DBLP_four_area/cleaned_author_dict.txt", sep = "\t", names=["ID", "Author name"], encoding='utf8')
conf_dict = pd.read_csv("data/DBLP_four_area/conf_dict.txt", sep = "\t", names=["ID", "Conference name"])
term_dict = pd.read_csv("data/DBLP_four_area/term_dict.txt", sep = "\t", names=["ID", "Term"])
author_label = pd.read_csv("data/DBLP_four_area/author_label.txt", sep = "\t", names=["authorID", "Label"])
conf_label = pd.read_csv("data/DBLP_four_area/conf_label.txt", sep = "\t", names=["confID", "Conference name", "Label"])

In [4]:
# Merge
conf_dict_m = pd.merge(conf_dict, conf_df, on='Conference name')
author_dict_m = pd.merge(author_dict, author_df, on='Author name')

In [5]:
print ("Number of nodes %d"%(author_df.shape[0] + conf_df.shape[0] + paper_df.shape[0] + term_df.shape[0]))
print ("Number of edges %d"%(paper_author.shape[0] + paper_conf.shape[0]+paper_term.shape[0]) )

Number of nodes 70647
Number of edges 332388


### WTF are we doing here
* Define G = (V, E, YL, W), where G stands for graph, V is the set of all vertices, E is the set of all edges, YL is the set of observed labels, and W is the weight.
* V size is 70647. E size is 332388.
* Our goal is to find a function f that maps G to Y. In other words, we want to know a prediction function that, given G, can predict the label Y.
* New Param = old param + gradient * learning rate
* Gradiant can be expressed explicitly in a formula, that requires LBP to get expectation. 
* We need a way to evaluate the performance

* What's the percentage of unobserved labels? 
* (author_label.shape[0] / author.shape[0] * 100) = 14.758553410912132%

# 1 Hyperparameters

In [6]:
learning_rate = 0.01
learning_rates = [0.001, 0.01, 0.1] 
learning_iteration = 100
learning_iterations = [50, 100, 200]

# 2 Setup

In [None]:
author_matrix = pd.DataFrame(np.zeros(shape=(28702,28702)),
                             columns=author_dict['ID'].unique(), 
                             index=author_dict['ID'].unique())

In [None]:
conf_matrix= pd.DataFrame(np.zeros(shape=(28702,20)),
                             columns=conf_dict['ID'].unique(), 
                             index=author_dict['ID'].unique())

In [7]:
index = np.arange(1,28723)

In [8]:
adj_matrix = pd.DataFrame(np.zeros(shape=(28722,28722)),
                             columns=index, 
                             index=index)

In [9]:
# Brute Force

for paper in tqdm(paper_author['paperID']):
    authors = list(paper_author[paper_author['paperID'] == paper]['authorID'])
    confID = int(paper_conf[paper_conf['paperID']==paper]['confID'].to_string(index=False).strip())
    if (len(authors) > 1) :
        for i in range(len(authors)):
            authorID1 = authors[i]
            authorDictID1 = author_dict_m[author_dict_m["ID_y"] == authorID1]['ID_x'].to_string(index=False).strip()        
            confDictID = conf_dict_m[conf_dict_m['ID_y']==confID]['ID_x'].to_string(index=False).strip()
            adj_matrix.at[int(authorDictID1),int(confDictID)+28702] += 1
            adj_matrix.at[int(confDictID)+28702,int(authorDictID1)] += 1
            for j in range(i+1, len(authors)):
                authorID2 = authors[j]
                authorDictID2 = author_dict_m[author_dict_m["ID_y"] == authorID2]['ID_x'].to_string(index=False).strip()
                adj_matrix.at[int(authorDictID1),int(authorDictID2)] += 1
                adj_matrix.at[int(authorDictID2),int(authorDictID1)] += 1
                adj_matrix.at[int(authorDictID2),int(confDictID)+28702] += 1
                adj_matrix.at[int(confDictID)+28702,int(authorDictID2)] += 1
    else:
        authorID1 = authors[0]
        authorDictID1 = author_dict_m[author_dict_m["ID_y"] == authorID1]['ID_x'].to_string(index=False).strip()        
        confDictID = conf_dict_m[conf_dict_m['ID_y']==confID]['ID_x'].to_string(index=False).strip()
        adj_matrix.at[int(authorDictID1),int(authorDictID1)] += 1
        adj_matrix.at[int(authorDictID1),int(confDictID)+28702] += 1
        adj_matrix.at[int(confDictID)+28702,int(authorDictID1)] += 1
                

HBox(children=(IntProgress(value=0, max=74632), HTML(value='')))




In [None]:
# author_matrix = author_matrix.astype('int32',copy=True, errors='raise')
# author_matrix.to_pickle(r'F:\author_matrix.pickle')

# conf_matrix = conf_matrix.astype('int16')
# conf_matrix.to_pickle('F:\conf_matrix.pickle')

In [None]:
# Read
author_matrix = pd.read_pickle('./author_matrix.pickle')
conf_matrix = pd.read_pickle('./conf_matrix.pickle')

In [10]:
adj_matrix = adj_matrix.astype('int32')

In [11]:
adj_matrix.to_pickle(r'F:\adj_matrix.pickle')

In [16]:
adj_data = adj_matrix.to_numpy()

In [29]:
label = np.zeros(28702)

In [31]:
c_label = conf_label['Label'].to_numpy()

In [64]:
author_label.loc[author_label.sample(frac=0.7).index,'Label'] = 0

In [76]:
for index, row in author_label.iterrows():
    if (row['Label'] != 0):
        label[row['authorID']-1] = row['Label']

2
2
4
1
3
2
1
1
1
2
3
1
3
3
4
3
3
2
1
3
4
2
2
4
2
1
2
1
1
1
1
2
1
1
1
2
1
3
3
3
1
2
3
3
2
4
3
3
1
2
3
2
3
2
2
4
4
4
1
2
1
1
3
3
4
1
2
1
1
1
1
3
3
4
3
3
3
4
1
3
1
4
4
1
3
1
4
4
4
1
4
1
3
4
3
1
3
2
2
3
2
4
1
3
2
4
1
3
1
3
3
3
1
1
3
1
4
2
4
1
1
4
4
3
3
3
4
3
4
1
4
4
1
3
1
3
3
4
1
1
3
1
1
1
3
2
2
4
3
1
2
2
4
1
3
4
1
2
3
4
2
4
4
2
2
3
1
2
1
1
3
2
3
1
1
3
3
3
1
1
3
3
2
2
1
4
1
4
1
3
1
2
1
1
2
3
1
2
2
1
4
3
3
4
2
3
3
3
2
1
2
1
3
2
3
3
3
1
3
1
1
3
3
3
2
1
4
2
4
2
1
1
1
2
2
1
3
2
2
4
2
4
2
4
2
3
3
1
1
1
3
4
2
4
3
3
4
1
4
3
4
1
4
3
2
2
4
3
1
2
4
4
1
1
1
3
2
2
1
3
4
3
1
2
1
3
2
1
4
4
2
3
4
3
1
1
3
4
3
1
1
3
2
3
2
3
2
3
4
1
2
4
1
2
4
2
2
3
2
2
4
4
1
3
3
1
1
3
1
1
1
2
1
3
1
2
2
1
1
3
1
4
1
4
3
4
1
1
1
3
1
2
1
3
1
4
2
1
1
4
2
3
2
3
2
3
4
3
4
1
3
1
3
3
2
4
1
4
4
1
1
3
3
3
4
2
3
2
1
4
4
4
4
2
3
3
2
3
1
3
4
4
3
1
1
3
1
2
1
4
2
1
2
2
3
2
2
1
1
1
2
3
1
3
4
4
1
3
3
2
3
3
2
4
3
1
2
3
1
3
3
3
4
2
1
1
2
3
1
1
4
1
2
3
4
3
1
1
4
3
3
1
3
3
4
4
1
3
3
4
1
2
4
3
1
1
1
3
3
3
4
1
3
3
2
1
2
4
3
1
4
4
2
3
1
2
1
1
2
2


In [79]:
labels = np.concatenate((label, c_label))

In [85]:
labels[labels == 0] = -1

In [84]:
label_prop_model = LabelPropagation(n_neighbors = 4)
label_prop_model

LabelPropagation(gamma=20, kernel='rbf', max_iter=1000, n_jobs=None,
                 n_neighbors=4, tol=0.001)

In [86]:
labels

array([-1., -1.,  2., ...,  2.,  4.,  4.])