In [16]:
# Library imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
import pickle
from tqdm.autonotebook import tqdm
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [2]:
def _read_tsv(path, columns, **read_kwargs):
    """Read a headerless tab-separated file, naming its columns with `columns`."""
    return pd.read_csv(path, sep="\t", names=columns, **read_kwargs)


# --- Tables under data/four_area ---
author_df = _read_tsv("data/four_area/author.txt", ["ID", "Author name"], encoding='utf8')
conf_df = _read_tsv("data/four_area/conf.txt", ["ID", "Conference name"])
paper_df = _read_tsv("data/four_area/paper.txt", ["ID", "Paper title"])
term_df = _read_tsv("data/four_area/term.txt", ["ID", "Term"])
paper_author = _read_tsv("data/four_area/paper_author.txt", ["paperID", "authorID"])
paper_conf = _read_tsv("data/four_area/paper_conf.txt", ["paperID", "confID"])
paper_term = _read_tsv("data/four_area/paper_term.txt", ["paperID", "termID"])

# --- Dictionaries and labels under data/DBLP_four_area ---
author_dict = _read_tsv("data/DBLP_four_area/cleaned_author_dict.txt", ["ID", "Author name"], encoding='utf8')
conf_dict = _read_tsv("data/DBLP_four_area/conf_dict.txt", ["ID", "Conference name"])
term_dict = _read_tsv("data/DBLP_four_area/term_dict.txt", ["ID", "Term"])
author_label = _read_tsv("data/DBLP_four_area/author_label.txt", ["authorID", "Label"])
conf_label = _read_tsv("data/DBLP_four_area/conf_label.txt", ["confID", "Conference name", "Label"])

# --- ID mapping between the two directories ---
# Both sides carry an `ID` column, so after joining on the name the dictionary ID
# becomes `ID_x` (left frame) and the four_area ID becomes `ID_y` (right frame).
conf_dict_m = conf_dict.merge(conf_df, on='Conference name')
author_dict_m = author_dict.merge(author_df, on='Author name')

# paperID -> dictionary conference ID (`ID_x`)
paper_conf_m = (
    conf_dict_m
    .merge(paper_conf, left_on='ID_y', right_on='confID')
    .drop(columns=['Conference name', 'ID_y', 'confID'])
)

# paperID -> Label of the conference the paper is linked to
paper_label_m = (
    paper_conf_m
    .merge(conf_label, left_on='ID_x', right_on='confID')
    .drop(columns=['Conference name', 'ID_x', 'confID'])
)

# One row per (paper, author) pair, carrying the paper's Label
author_paper_label_m = paper_label_m.merge(paper_author, on='paperID')

In [3]:
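# NOTE(review): disabled code, kept for reference. It appears to be how the per-author
# label counts in `author_feature` were filled in before that frame was cached to the
# pickle loaded in the next cell — confirm before deleting. `author_feature` is not
# created anywhere above in this notebook, so this loop cannot simply be re-enabled as is;
# on pandas >= 2.0 `Series.iteritems()` would also need to become `Series.items()`.
# for author in tqdm(author_paper_label_m['authorID'].unique()):
#     author_dict_ID = int(author_dict_m[author_dict_m["ID_y"] == author]['ID_x'].to_string(index=False).strip())
#     value_count = author_paper_label_m[author_paper_label_m['authorID'] == author]['Label'].value_counts()
#     for vc in value_count.iteritems():
#         label = vc[0]
#         count = vc[1]
#         author_feature.at[author_dict_ID, label]=count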

In [4]:
# Load the cached `author_feature` frame from a pickle file.
# Presumably one row per author with one count column per label 1-4 (later cells index
# columns 1..4 and look rows up by author ID) — TODO confirm against the file.
# NOTE(review): the path is a hard-coded absolute drive path (F:\), so this cell only runs
# on a machine that has that file; consider a path relative to the data directory.
# NOTE(review): unpickling can execute arbitrary code — only load a file you produced yourself.
author_feature = pd.read_pickle(r'F:\author_feature.pickle')

In [5]:
# Keep only the rows of `author_feature` whose index label appears in
# `author_label['authorID']` (one output row per entry of that column, in that order).
labelled_author_ids = author_label['authorID']
author_features = author_feature.loc[labelled_author_ids, :]

In [6]:
# Row-wise total over every column of the full `author_feature` frame
# (all authors, not only the labelled subset).
author_sum = author_feature.sum(axis=1)

In [7]:
# Attach the row total and two threshold flags to the labelled-author frame.
# `author_sum` is a Series, so each assignment is aligned on the frame's index.
paper_count_columns = {
    'sum': author_sum,
    'lgt_median': author_sum > 2,
    'lgt_mean': author_sum > 5,
}
for column_name, column_values in paper_count_columns.items():
    author_features[column_name] = column_values

In [8]:
# Area prefix -> integer column label in `author_features`.
area_to_label = {'DB': 1, 'DM': 2, 'ML': 3, 'IR': 4}

# Flag: the author's count for the area is strictly positive.
for area, label in area_to_label.items():
    author_features[area + '_lgt_0'] = author_features[label] > 0

# Flag: the area's share of the author's row total exceeds 0.25.
for area, label in area_to_label.items():
    area_share = author_features[label] / author_features['sum']
    author_features[area + '_lgt_0.25'] = area_share > 0.25

# Co-author counters, one per label, all starting at zero.
for label in area_to_label.values():
    author_features['co_' + str(label)] = 0

In [9]:
# For every row of `author_features`, count how many labelled co-authors carry each
# label (1-4) and store the counts in the `co_1` .. `co_4` columns.
#
# ID spaces: `author_features` and `author_label` are keyed by the dictionary ID
# (`ID_x` in `author_dict_m`), while `paper_author` uses the four_area ID (`ID_y`),
# so every lookup is translated through `author_dict_m`.
#
# Iterating the index (which has a length) instead of `iterrows()` lets tqdm show a
# real total; the row contents were never used.
for adid in tqdm(author_features.index):
    # Dictionary ID -> four_area ID. Exactly one match is required; the previous
    # to_string()/int() round-trip also raised ValueError otherwise, just less clearly.
    aid_matches = author_dict_m.loc[author_dict_m["ID_x"] == adid, 'ID_y']
    if len(aid_matches) != 1:
        raise ValueError(
            "expected exactly one author_dict_m row with ID_x == %r, found %d"
            % (adid, len(aid_matches))
        )
    aid = int(aid_matches.iloc[0])

    # Every author appearing on at least one of this author's papers, excluding the
    # author themself. One vectorised pass replaces the per-paper rescans.
    all_paper = paper_author.loc[paper_author['authorID'] == aid, 'paperID']
    shares_paper = paper_author['paperID'].isin(all_paper)
    is_other_author = paper_author['authorID'] != aid
    coauthor = paper_author.loc[shares_paper & is_other_author, 'authorID'].unique()

    # Back to dictionary IDs, then count the labels of those co-authors that are labelled.
    coauthor_hashed = author_dict_m.loc[author_dict_m['ID_y'].isin(coauthor), 'ID_x']
    vc = author_label.loc[author_label['authorID'].isin(coauthor_hashed), 'Label'].value_counts()

    # Plain assignment (not +=) so re-running the cell does not double the counts.
    for label in (1, 2, 3, 4):
        author_features.loc[adid, 'co_' + str(label)] = vc.get(label, 0)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [10]:
# Turn each co-author counter into a flag: True when the count is strictly positive.
coauthor_flag_sources = {
    'DB_co': 'co_1',
    'DM_co': 'co_2',
    'ML_co': 'co_3',
    'IR_co': 'co_4',
}
for flag_column, count_column in coauthor_flag_sources.items():
    author_features[flag_column] = author_features[count_column] > 0

In [11]:
# Attach each author's row from `author_label` by matching the feature frame's index
# against `author_label['authorID']`.
author_features = author_features.merge(
    author_label,
    left_index=True,
    right_on='authorID',
)

In [None]:
# Rebind `author_features` to a frame loaded from a pickle file, replacing whatever the
# cells above assigned to that name.
# NOTE(review): presumably this is a cached copy of the frame built above — no cell visible
# in this notebook writes the file, so confirm it matches the current feature code.
# NOTE(review): the path is a hard-coded absolute drive path (F:\), so this cell only runs
# on a machine that has that file; consider a path relative to the data directory.
# NOTE(review): unpickling can execute arbitrary code — only load a file you produced yourself.
author_features = pd.read_pickle(r'F:\author_features.pickle')

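Variables of the MRF: 
1. Number of papers by the author > 2 (median)
2. Number of papers by the author > 5 (mean)
7. Share of the author's papers in DB > 0.25
8. Share of the author's papers in DM > 0.25
9. Share of the author's papers in ML > 0.25
10. Share of the author's papers in IR > 0.25
11. Co-authored with at least one DB-labelled author
12. Co-authored with at least one DM-labelled author
13. Co-authored with at least one ML-labelled author
14. Co-authored with at least one IR-labelled author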

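3. ~~Number of DB papers > 0~~
4. ~~Number of DM papers > 0~~
5. ~~Number of ML papers > 0~~
6. ~~Number of IR papers > 0~~

Note: items 3–6 are struck through, but the corresponding `*_lgt_0` columns are still selected into `X` in the next code cell.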


In [17]:
# Columns of `author_features` used as model inputs, grouped by kind.
feature_columns = [
    # paper-count thresholds
    'lgt_median', 'lgt_mean',
    # per-area count > 0
    'DB_lgt_0', 'DM_lgt_0', 'ML_lgt_0', 'IR_lgt_0',
    # per-area share > 0.25
    'DB_lgt_0.25', 'DM_lgt_0.25', 'ML_lgt_0.25', 'IR_lgt_0.25',
    # has a labelled co-author in the area
    'DB_co', 'DM_co', 'ML_co', 'IR_co',
]

X = author_features[feature_columns]
# Double brackets keep `y` a one-column DataFrame rather than a Series.
y = author_features[['Label']]

# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [18]:
# Fresh, independent containers for the sequence-formatted train/test data.
X_train_actual, y_train_actual = [], []
X_test_actual, y_test_actual = [], []

In [19]:
# Convert every feature row into a one-item sequence: a list holding a single
# {column name: value} dict, which is the list-of-sequences layout passed to the CRF below.
# The lists are rebuilt from scratch (not appended to), so re-running this cell cannot
# add a second copy of every sample.
X_train_actual = [[row.to_dict()] for _, row in X_train.iterrows()]
X_test_actual = [[row.to_dict()] for _, row in X_test.iterrows()]

In [20]:
# Convert every label row into a one-item sequence holding the row rendered as a
# stripped string, parallel to the one-item feature sequences.
# The lists are rebuilt from scratch (not appended to), so re-running this cell cannot
# add a second copy of every label.
y_train_actual = [[row.to_string(index = False).strip()] for _, row in y_train.iterrows()]
y_test_actual = [[row.to_string(index = False).strip()] for _, row in y_test.iterrows()]

In [21]:
# Hyper-parameters for the CRF, gathered in one place so they are easy to tune.
# c1 / c2 are the L1 / L2 regularisation coefficients (see the sklearn-crfsuite CRF docs).
crf_params = {
    'algorithm': 'lbfgs',
    'c1': 0.1,
    'c2': 0.1,
    'max_iterations': 100,
    'all_possible_transitions': True,
}
crf = sklearn_crfsuite.CRF(**crf_params)

In [22]:
# Train the CRF on the training sequences; the fitted estimator's repr is the cell output.
# NOTE(review): every sequence built in the cells above contains exactly one item, so there
# are no adjacent labels within a sequence — presumably the model then acts as a per-author
# classifier and the transition settings have little effect; confirm this is intended.
crf.fit(X_train_actual, y_train_actual)

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [23]:
# Snapshot of the fitted model's `classes_` as a plain list; passed as `labels=` to the
# F1 computation in the next cell.
labels = list(crf.classes_)

In [24]:
# Predict label sequences for the held-out set, then score them with the weighted F1
# over the labels taken from the fitted model.
y_pred = crf.predict(X_test_actual)
weighted_f1 = metrics.flat_f1_score(
    y_test_actual,
    y_pred,
    labels=labels,
    average='weighted',
)
weighted_f1

0.9142716068506009